25 changes: 21 additions & 4 deletions extras/chat_template_examples/chat_template_gpt_oss.jinja
@@ -351,11 +351,28 @@
{{- raise_exception("Message has tool role, but there was no previous assistant message with a tool call!") }}
{%- endif %}
{{- "<|start|>functions." + last_tool_call.name }}
{#- Original: {{- " to=assistant<|channel|>commentary<|message|>" + message.content|tojson + "<|end|>" }} #}
{#- Actual version that works, does not escape and allows non-json: #}
{{- " to=assistant<|channel|>commentary<|message|>" + message.content + "<|end|>" -}}
{#- When content is a plain string we render it directly. #}
{#- When content is an array (OpenAI multipart format) we join the text items with newlines, #}
{#- matching what the model was trained on. JSON-serialising the array would add noise. #}
{%- if message.content is string -%}
{{- " to=assistant<|channel|>commentary<|message|>" + message.content + "<|end|>" -}}
{%- else -%}
{%- set ns = namespace(parts=[]) -%}
{%- for item in message.content if item.type == "text" -%}
{%- set ns.parts = ns.parts + [item.text] -%}
{%- endfor -%}
{{- " to=assistant<|channel|>commentary<|message|>" + ns.parts | join("\n") + "<|end|>" -}}
{%- endif -%}
{%- elif message.role == 'user' -%}
{{- "<|start|>user<|message|>" + message.content + "<|end|>" }}
{%- if message.content is string -%}
{{- "<|start|>user<|message|>" + message.content + "<|end|>" }}
{%- else -%}
{%- set ns = namespace(parts=[]) -%}
{%- for item in message.content if item.type == "text" -%}
{%- set ns.parts = ns.parts + [item.text] -%}
{%- endfor -%}
{{- "<|start|>user<|message|>" + ns.parts | join("\n") + "<|end|>" }}
{%- endif -%}
{%- endif -%}
{%- endfor -%}
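As a sketch of the intended behaviour (hypothetical payload, not part of this diff): a multipart user message such as

{"role": "user", "content": [
  {"type": "text", "text": "Summarise the following."},
  {"type": "text", "text": "OVMS now accepts multipart content."}
]}

now renders as

<|start|>user<|message|>Summarise the following.
OVMS now accepts multipart content.<|end|>

which the previous template, assuming content was always a plain string, could not render.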

1 change: 1 addition & 0 deletions src/llm/apis/openai_api_handler.hpp
@@ -146,6 +146,7 @@ class OpenAIApiHandler {
std::optional<int> getNumReturnSequences() const;
StreamOptions getStreamOptions() const;
const std::string& getProcessedJson() const;
// Returns the flat ordered list of image tensors (one per image_url item, in document order).
const ImageHistory& getImageHistory() const;
ov::genai::ChatHistory& getChatHistory();
std::optional<int> getMaxTokens() const;
42 changes: 25 additions & 17 deletions src/llm/apis/openai_completions.cpp
@@ -196,13 +196,14 @@ absl::Status OpenAIChatCompletionsHandler::parseMessages(std::optional<std::stri
continue;
}
if (memberName == "content" && member->value.IsArray()) {
// Adjust content field format when it is passed as an array of objects (typically with images)
// Content passed as an array of objects (OpenAI multipart format).
// We preserve the array structure so chat templates can decide how to render it.
// image_url items are translated to {type:image} so that VLM chat templates
// (which use the OpenVINO GenAI MULTIPART_CONTENT convention) see them correctly.
// The corresponding decoded tensors are appended in document order to imageHistory.
if (member->value.GetArray().Size() == 0) {
return absl::InvalidArgumentError("Invalid message structure - content array is empty");
}
jsonChanged = true;
Value contentText(rapidjson::kStringType);
contentText.SetString("", doc.GetAllocator());
for (auto& v : member->value.GetArray()) {
if (!v.IsObject()) {
return absl::InvalidArgumentError("Invalid message structure - content array should contain objects");
@@ -211,14 +212,12 @@
if (!entry.HasMember("type") || !entry["type"].IsString()) {
return absl::InvalidArgumentError("Invalid message structure - content object type missing");
}
auto entryType = entry["type"].GetString();
if (entryType == std::string("text")) {
std::string entryType = entry["type"].GetString();
if (entryType == "text") {
if (!entry.HasMember("text") || !entry["text"].IsString()) {
return absl::InvalidArgumentError("Invalid message structure - content text missing");
}
contentText = entry["text"];
continue;
} else if (entryType == std::string("image_url")) {
} else if (entryType == "image_url") {
if (!entry.HasMember("image_url") || !entry["image_url"].IsObject()) {
return absl::InvalidArgumentError("Invalid message structure - content image_url missing");
}
@@ -231,18 +230,27 @@
if (!tensorResult.ok()) {
return tensorResult.status();
}
request.imageHistory.push_back({i, tensorResult.value()});
// Store tensor in flat image list (document order = template rendering order)
request.imageHistory.push_back(std::move(tensorResult.value()));
// Translate image_url item to {type:image} so VLM chat templates
// (which use GenAI MULTIPART_CONTENT convention) see the image in context.
while (v.MemberBegin() != v.MemberEnd()) {
v.RemoveMember(v.MemberBegin());
}
v.AddMember(rapidjson::Value("type", doc.GetAllocator()),
rapidjson::Value("image", doc.GetAllocator()),
doc.GetAllocator());
jsonChanged = true;
} else {
return absl::InvalidArgumentError("Unsupported content type");
}
}
// Pulling out text from nested structure to the "content" field for text and replace whole "content" value for image data
// with empty string, since images are stored separately in request.images
member->value = contentText;
// Add new field to the last message in history if content is text
if (member->value.IsString()) {
request.chatHistory.last()[member->name.GetString()] = member->value.GetString();
}
// Preserve the array (with any image_url translated to {type:image}) in chatHistory.
// For the Python Jinja path, processedJson is only written when jsonChanged is true
// (i.e. when image_url items were translated or tool_call arguments were injected).
// Otherwise the template falls back to payload.body and sees the original OpenAI
// format, which is equally valid; the template decides how to render content arrays.
request.chatHistory.last()[memberName] = rapidJsonValueToJsonContainer(member->value);
}
}
auto lastMessage = request.chatHistory.last();
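To illustrate the in-place translation this hunk performs (payload hypothetical): an incoming message like

{"role": "user", "content": [
  {"type": "text", "text": "What is in this picture?"},
  {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}}
]}

is rewritten to

{"role": "user", "content": [
  {"type": "text", "text": "What is in this picture?"},
  {"type": "image"}
]}

while the decoded tensor for the image is appended to request.imageHistory. Text-only requests are left untouched (jsonChanged stays false), so the template falls back to the original request body.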
6 changes: 5 additions & 1 deletion src/llm/apis/openai_request.hpp
@@ -32,7 +32,11 @@
#include "tool_schema_wrapper.hpp"

namespace ovms {
using ImageHistory = std::vector<std::pair<size_t, ov::Tensor>>;
// Flat ordered list of image tensors extracted from content arrays.
// Order matches the sequence of image_url items across all messages,
// which corresponds to the order of {"type":"image"} items in chatHistory
// after image_url → image translation.
using ImageHistory = std::vector<ov::Tensor>;

struct StreamOptions {
bool includeUsage = false;
32 changes: 28 additions & 4 deletions src/llm/apis/openai_responses.cpp
@@ -120,7 +120,13 @@ absl::Status OpenAIResponsesHandler::parseInput(std::optional<std::string> allow
return absl::InvalidArgumentError("Invalid message structure - content array is empty");
}

std::string contentText = "";
// Translate Responses API content array into the canonical multipart format
// used by chatHistory and VLM chat templates:
// input_text → {"type":"text", "text": "<value>"}
// input_image → {"type":"image"} (tensor appended to imageHistory in order)
// This mirrors the Chat Completions image_url → image translation so that
// both VLM chat templates (GenAI MULTIPART_CONTENT) and Python Jinja2 templates
// receive a uniform representation.
for (auto& contentItem : contentIt->value.GetArray()) {
if (!contentItem.IsObject()) {
return absl::InvalidArgumentError("input content items must be objects");
@@ -137,7 +143,17 @@
if (textIt == contentObj.MemberEnd() || !textIt->value.IsString()) {
return absl::InvalidArgumentError("input_text requires a valid text field");
}
contentText = textIt->value.GetString();
// Normalise to {"type":"text","text":"..."} in-place.
std::string textValue = textIt->value.GetString();
while (contentItem.MemberBegin() != contentItem.MemberEnd()) {
contentItem.RemoveMember(contentItem.MemberBegin());
}
contentItem.AddMember(rapidjson::Value("type", doc.GetAllocator()),
rapidjson::Value("text", doc.GetAllocator()),
doc.GetAllocator());
contentItem.AddMember(rapidjson::Value("text", doc.GetAllocator()),
rapidjson::Value(textValue.c_str(), doc.GetAllocator()),
doc.GetAllocator());
} else if (type == "input_image") {
std::string imageUrl;
auto imageUrlIt = contentObj.FindMember("image_url");
@@ -161,13 +177,21 @@
if (!tensorResult.ok()) {
return tensorResult.status();
}
request.imageHistory.push_back({i, tensorResult.value()});
request.imageHistory.push_back(std::move(tensorResult.value()));
// Translate to {"type":"image"} in-place so VLM chat templates see
// the image at the correct position in the content array.
while (contentItem.MemberBegin() != contentItem.MemberEnd()) {
contentItem.RemoveMember(contentItem.MemberBegin());
}
contentItem.AddMember(rapidjson::Value("type", doc.GetAllocator()),
rapidjson::Value("image", doc.GetAllocator()),
doc.GetAllocator());
} else {
return absl::InvalidArgumentError("Unsupported content type. Supported types are input_text and input_image.");
}
}

request.chatHistory.last()["content"] = contentText;
request.chatHistory.last()["content"] = rapidJsonValueToJsonContainer(contentIt->value);
}
} else {
return absl::InvalidArgumentError("input is not a string or array");
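For comparison (hypothetical input): a Responses API content array

[{"type": "input_text", "text": "Describe this image."},
 {"type": "input_image", "image_url": "https://example.com/cat.png"}]

is normalised in place to the same canonical multipart form used by Chat Completions:

[{"type": "text", "text": "Describe this image."},
 {"type": "image"}]

with the decoded tensor appended to request.imageHistory in document order.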
40 changes: 25 additions & 15 deletions src/llm/visual_language_model/continuous_batching/servable.cpp
@@ -19,7 +19,6 @@
#include <memory>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <vector>

#include "../../../logging.hpp"
@@ -74,26 +73,36 @@ absl::Status VisualLanguageModelServable::prepareInputs(std::shared_ptr<GenAiSer

for (size_t i = 0; i < chatHistory.size(); i++) {
const auto& message = chatHistory[i];
if (message["content"].as_string().value_or("").find("<ov_genai_image_") != std::string::npos) {
const auto& contentField = message["content"];
if (contentField.is_array()) {
for (size_t j = 0; j < contentField.size(); j++) {
const auto& item = contentField[j];
if (item["type"].as_string().value_or("") == "text" &&
item["text"].as_string().value_or("").find("<ov_genai_image_") != std::string::npos) {
return absl::InvalidArgumentError("Message contains restricted <ov_genai_image> tag");
}
}
} else if (contentField.as_string().value_or("").find("<ov_genai_image_") != std::string::npos) {
return absl::InvalidArgumentError("Message contains restricted <ov_genai_image> tag");
}
}

const ImageHistory& imageHistory = vlmExecutionContext->apiHandler->getImageHistory();
size_t imageIndex = 0;
std::unordered_map<size_t, std::string> imageTags;
for (const auto& image : imageHistory) {
const auto& [chatTurnIndex, imageTensor] = image;
std::string imageTag = "<ov_genai_image_" + std::to_string(imageIndex++) + ">\n";
imageTags[chatTurnIndex] = imageTags[chatTurnIndex] + imageTag;
vlmExecutionContext->inputImages.push_back(imageTensor);
}
// imageHistory is a flat ordered list of tensors matching the {type:image} items in
// chatHistory. Pass them directly to add_request; the chat template applied below will
// emit the model-specific image tokens at the correct positions.
vlmExecutionContext->inputImages = vlmExecutionContext->apiHandler->getImageHistory();

for (const auto& [chatTurnIndex, imageTagString] : imageTags) {
std::string messageContent = chatHistory[chatTurnIndex]["content"].as_string().value_or("");
chatHistory[chatTurnIndex]["content"] = imageTagString + messageContent;
#if (PYTHON_DISABLE == 0)
bool success;
if (vlmExecutionContext->apiHandler->getProcessedJson().size() > 0) {
success = PyJinjaTemplateProcessor::applyChatTemplate(getProperties()->templateProcessor, getProperties()->modelsPath, vlmExecutionContext->apiHandler->getProcessedJson(), vlmExecutionContext->inputText);
} else {
success = PyJinjaTemplateProcessor::applyChatTemplate(getProperties()->templateProcessor, getProperties()->modelsPath, vlmExecutionContext->payload.body, vlmExecutionContext->inputText);
}

if (!success) {
return absl::Status(absl::StatusCode::kInvalidArgument, vlmExecutionContext->inputText);
}
#else
constexpr bool addGenerationPrompt = true; // confirm it should be hardcoded
auto toolsStatus = vlmExecutionContext->apiHandler->parseToolsToJsonContainer();
if (!toolsStatus.ok()) {
Expand All @@ -106,6 +115,7 @@ absl::Status VisualLanguageModelServable::prepareInputs(std::shared_ptr<GenAiSer
}
const auto& chatTemplateKwargs = chatTemplateKwargsStatus.value();
vlmExecutionContext->inputText = properties->tokenizer.apply_chat_template(chatHistory, addGenerationPrompt, {}, tools, chatTemplateKwargs);
#endif
} else {
return absl::InvalidArgumentError("Unsupported endpoint");
}
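A minimal standalone sketch (stand-in types, not project code) of the ordering invariant the two servables now rely on: the n-th {"type":"image"} item across the flattened chat history corresponds to the n-th tensor in ImageHistory, so no per-turn tag bookkeeping is needed.

#include <cassert>
#include <string>
#include <vector>

// Hypothetical stand-ins for the real chat-history types.
struct ContentItem { std::string type; std::string text; };
struct Message { std::string role; std::vector<ContentItem> content; };

// Counts {type:image} items across all messages. Because both the content
// arrays and the flat ImageHistory are filled in document order, this count
// must equal imageHistory.size() for a well-formed request.
size_t countImageItems(const std::vector<Message>& history) {
    size_t count = 0;
    for (const auto& message : history)
        for (const auto& item : message.content)
            if (item.type == "image")
                ++count;
    return count;
}

int main() {
    std::vector<Message> history{
        {"user", {{"text", "What is in this picture?"}, {"image", ""}}},
        {"assistant", {{"text", "A cat."}}},
        {"user", {{"text", "And this one?"}, {"image", ""}}}};
    size_t imageTensorCount = 2;  // imageHistory.size() in the real code
    assert(countImageItems(history) == imageTensorCount);
    return 0;
}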
39 changes: 25 additions & 14 deletions src/llm/visual_language_model/legacy/servable.cpp
@@ -17,7 +17,6 @@
#include <memory>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <vector>

#include "../../../logging.hpp"
@@ -256,25 +255,36 @@ absl::Status VisualLanguageModelLegacyServable::prepareInputs(std::shared_ptr<Ge

for (size_t i = 0; i < chatHistory.size(); i++) {
const auto& message = chatHistory[i];
if (message["content"].as_string().value_or("").find("<ov_genai_image_") != std::string::npos) {
const auto& contentField = message["content"];
if (contentField.is_array()) {
for (size_t j = 0; j < contentField.size(); j++) {
const auto& item = contentField[j];
if (item["type"].as_string().value_or("") == "text" &&
item["text"].as_string().value_or("").find("<ov_genai_image_") != std::string::npos) {
return absl::InvalidArgumentError("Message contains restricted <ov_genai_image> tag");
}
}
} else if (contentField.as_string().value_or("").find("<ov_genai_image_") != std::string::npos) {
return absl::InvalidArgumentError("Message contains restricted <ov_genai_image> tag");
}
}

const ImageHistory& imageHistory = vlmExecutionContext->apiHandler->getImageHistory();
size_t imageIndex = 0;
std::unordered_map<size_t, std::string> imageTags;
for (const auto& image : imageHistory) {
const auto& [chatTurnIndex, imageTensor] = image;
std::string imageTag = "<ov_genai_image_" + std::to_string(imageIndex++) + ">\n";
imageTags[chatTurnIndex] = imageTags[chatTurnIndex] + imageTag;
vlmExecutionContext->inputImages.push_back(imageTensor);
// imageHistory is a flat ordered list of tensors matching the {type:image} items in
// chatHistory. Pass them directly; the chat template applied below will emit the
// model-specific image tokens at the correct positions.
vlmExecutionContext->inputImages = vlmExecutionContext->apiHandler->getImageHistory();

#if (PYTHON_DISABLE == 0)
bool success;
if (vlmExecutionContext->apiHandler->getProcessedJson().size() > 0) {
success = PyJinjaTemplateProcessor::applyChatTemplate(getProperties()->templateProcessor, getProperties()->modelsPath, vlmExecutionContext->apiHandler->getProcessedJson(), vlmExecutionContext->inputText);
} else {
success = PyJinjaTemplateProcessor::applyChatTemplate(getProperties()->templateProcessor, getProperties()->modelsPath, vlmExecutionContext->payload.body, vlmExecutionContext->inputText);
}
for (const auto& [chatTurnIndex, imageTagString] : imageTags) {
std::string messageContent = chatHistory[chatTurnIndex]["content"].as_string().value_or("");
chatHistory[chatTurnIndex]["content"] = imageTagString + messageContent;
if (!success) {
return absl::Status(absl::StatusCode::kInvalidArgument, vlmExecutionContext->inputText);
}

#else
constexpr bool addGenerationPrompt = true; // confirm it should be hardcoded
auto toolsStatus = vlmExecutionContext->apiHandler->parseToolsToJsonContainer();
if (!toolsStatus.ok()) {
Expand All @@ -287,6 +297,7 @@ absl::Status VisualLanguageModelLegacyServable::prepareInputs(std::shared_ptr<Ge
}
const auto& chatTemplateKwargs = chatTemplateKwargsStatus.value();
vlmExecutionContext->inputText = properties->tokenizer.apply_chat_template(chatHistory, addGenerationPrompt, {}, tools, chatTemplateKwargs);
#endif
} else {
return absl::InvalidArgumentError("Unsupported endpoint");
}