diff --git a/extras/chat_template_examples/chat_template_gpt_oss.jinja b/extras/chat_template_examples/chat_template_gpt_oss.jinja index 8549cdf1be..f86a050b7d 100644 --- a/extras/chat_template_examples/chat_template_gpt_oss.jinja +++ b/extras/chat_template_examples/chat_template_gpt_oss.jinja @@ -351,11 +351,28 @@ {{- raise_exception("Message has tool role, but there was no previous assistant message with a tool call!") }} {%- endif %} {{- "<|start|>functions." + last_tool_call.name }} - {#- Original: {{- " to=assistant<|channel|>commentary<|message|>" + message.content|tojson + "<|end|>" }} #} - {#- Actual version that works, does not escape and allows non-json: #} - {{- " to=assistant<|channel|>commentary<|message|>" + message.content + "<|end|>" -}} + {#- When content is a plain string we render it directly. #} + {#- When content is an array (OpenAI multipart format) we join the text items with newlines, #} + {#- matching what the model was trained on. JSON-serialising the array would add noise. #} + {%- if message.content is string -%} + {{- " to=assistant<|channel|>commentary<|message|>" + message.content + "<|end|>" -}} + {%- else -%} + {%- set ns = namespace(parts=[]) -%} + {%- for item in message.content if item.type == "text" -%} + {%- set ns.parts = ns.parts + [item.text] -%} + {%- endfor -%} + {{- " to=assistant<|channel|>commentary<|message|>" + ns.parts | join("\n") + "<|end|>" -}} + {%- endif -%} {%- elif message.role == 'user' -%} - {{- "<|start|>user<|message|>" + message.content + "<|end|>" }} + {%- if message.content is string -%} + {{- "<|start|>user<|message|>" + message.content + "<|end|>" }} + {%- else -%} + {%- set ns = namespace(parts=[]) -%} + {%- for item in message.content if item.type == "text" -%} + {%- set ns.parts = ns.parts + [item.text] -%} + {%- endfor -%} + {{- "<|start|>user<|message|>" + ns.parts | join("\n") + "<|end|>" }} + {%- endif -%} {%- endif -%} {%- endfor -%} diff --git a/src/llm/apis/openai_api_handler.hpp b/src/llm/apis/openai_api_handler.hpp index 9071e6addc..5f9c0ba22d 100644 --- a/src/llm/apis/openai_api_handler.hpp +++ b/src/llm/apis/openai_api_handler.hpp @@ -146,6 +146,7 @@ class OpenAIApiHandler { std::optional getNumReturnSequences() const; StreamOptions getStreamOptions() const; const std::string& getProcessedJson() const; + // Returns the flat ordered list of image tensors (one per image_url item, in document order). const ImageHistory& getImageHistory() const; ov::genai::ChatHistory& getChatHistory(); std::optional getMaxTokens() const; diff --git a/src/llm/apis/openai_completions.cpp b/src/llm/apis/openai_completions.cpp index b69b682611..be98b8317e 100644 --- a/src/llm/apis/openai_completions.cpp +++ b/src/llm/apis/openai_completions.cpp @@ -196,13 +196,14 @@ absl::Status OpenAIChatCompletionsHandler::parseMessages(std::optionalvalue.IsArray()) { - // Adjust content field format when it is passed as an array of objects (typically with images) + // Content passed as an array of objects (OpenAI multipart format). + // We preserve the array structure so chat templates can decide how to render it. + // image_url items are translated to {type:image} so that VLM chat templates + // (which use the OpenVINO GenAI MULTIPART_CONTENT convention) see them correctly. + // The corresponding decoded tensors are appended in document order to imageHistory. if (member->value.GetArray().Size() == 0) { return absl::InvalidArgumentError("Invalid message structure - content array is empty"); } - jsonChanged = true; - Value contentText(rapidjson::kStringType); - contentText.SetString("", doc.GetAllocator()); for (auto& v : member->value.GetArray()) { if (!v.IsObject()) { return absl::InvalidArgumentError("Invalid message structure - content array should contain objects"); @@ -211,14 +212,12 @@ absl::Status OpenAIChatCompletionsHandler::parseMessages(std::optionalvalue = contentText; - // Add new field to the last message in history if content is text - if (member->value.IsString()) { - request.chatHistory.last()[member->name.GetString()] = member->value.GetString(); - } + // Preserve the array (with any image_url translated to {type:image}) in chatHistory. + // For the Python Jinja path, processedJson is only written when jsonChanged is true + // (i.e. when image_url items were translated or tool_call arguments were injected). + // Otherwise the template falls back to payload.body and sees the original OpenAI + // format, which is equally correct — template decides how to render content arrays. + request.chatHistory.last()[memberName] = rapidJsonValueToJsonContainer(member->value); } } auto lastMessage = request.chatHistory.last(); diff --git a/src/llm/apis/openai_request.hpp b/src/llm/apis/openai_request.hpp index e1d05282eb..419322f67b 100644 --- a/src/llm/apis/openai_request.hpp +++ b/src/llm/apis/openai_request.hpp @@ -32,7 +32,11 @@ #include "tool_schema_wrapper.hpp" namespace ovms { -using ImageHistory = std::vector>; +// Flat ordered list of image tensors extracted from content arrays. +// Order matches the sequence of image_url items across all messages, +// which corresponds to the order of {"type":"image"} items in chatHistory +// after image_url → image translation. +using ImageHistory = std::vector; struct StreamOptions { bool includeUsage = false; diff --git a/src/llm/apis/openai_responses.cpp b/src/llm/apis/openai_responses.cpp index e5d63985e6..b355363aad 100644 --- a/src/llm/apis/openai_responses.cpp +++ b/src/llm/apis/openai_responses.cpp @@ -120,7 +120,13 @@ absl::Status OpenAIResponsesHandler::parseInput(std::optional allow return absl::InvalidArgumentError("Invalid message structure - content array is empty"); } - std::string contentText = ""; + // Translate Responses API content array into the canonical multipart format + // used by chatHistory and VLM chat templates: + // input_text → {"type":"text", "text": ""} + // input_image → {"type":"image"} (tensor appended to imageHistory in order) + // This mirrors the Chat Completions image_url → image translation so that + // both VLM chat templates (GenAI MULTIPART_CONTENT) and Python Jinja2 templates + // receive a uniform representation. for (auto& contentItem : contentIt->value.GetArray()) { if (!contentItem.IsObject()) { return absl::InvalidArgumentError("input content items must be objects"); @@ -137,7 +143,17 @@ absl::Status OpenAIResponsesHandler::parseInput(std::optional allow if (textIt == contentObj.MemberEnd() || !textIt->value.IsString()) { return absl::InvalidArgumentError("input_text requires a valid text field"); } - contentText = textIt->value.GetString(); + // Normalise to {"type":"text","text":"..."} in-place. + std::string textValue = textIt->value.GetString(); + while (contentItem.MemberBegin() != contentItem.MemberEnd()) { + contentItem.RemoveMember(contentItem.MemberBegin()); + } + contentItem.AddMember(rapidjson::Value("type", doc.GetAllocator()), + rapidjson::Value("text", doc.GetAllocator()), + doc.GetAllocator()); + contentItem.AddMember(rapidjson::Value("text", doc.GetAllocator()), + rapidjson::Value(textValue.c_str(), doc.GetAllocator()), + doc.GetAllocator()); } else if (type == "input_image") { std::string imageUrl; auto imageUrlIt = contentObj.FindMember("image_url"); @@ -161,13 +177,21 @@ absl::Status OpenAIResponsesHandler::parseInput(std::optional allow if (!tensorResult.ok()) { return tensorResult.status(); } - request.imageHistory.push_back({i, tensorResult.value()}); + request.imageHistory.push_back(std::move(tensorResult.value())); + // Translate to {"type":"image"} in-place so VLM chat templates see + // the image at the correct position in the content array. + while (contentItem.MemberBegin() != contentItem.MemberEnd()) { + contentItem.RemoveMember(contentItem.MemberBegin()); + } + contentItem.AddMember(rapidjson::Value("type", doc.GetAllocator()), + rapidjson::Value("image", doc.GetAllocator()), + doc.GetAllocator()); } else { return absl::InvalidArgumentError("Unsupported content type. Supported types are input_text and input_image."); } } - request.chatHistory.last()["content"] = contentText; + request.chatHistory.last()["content"] = rapidJsonValueToJsonContainer(contentIt->value); } } else { return absl::InvalidArgumentError("input is not a string or array"); diff --git a/src/llm/visual_language_model/continuous_batching/servable.cpp b/src/llm/visual_language_model/continuous_batching/servable.cpp index 0ef06d22df..3c094aab19 100644 --- a/src/llm/visual_language_model/continuous_batching/servable.cpp +++ b/src/llm/visual_language_model/continuous_batching/servable.cpp @@ -19,7 +19,6 @@ #include #include #include -#include #include #include "../../../logging.hpp" @@ -74,26 +73,36 @@ absl::Status VisualLanguageModelServable::prepareInputs(std::shared_ptr tag"); + } + } + } else if (contentField.as_string().value_or("").find(" tag"); } } - const ImageHistory& imageHistory = vlmExecutionContext->apiHandler->getImageHistory(); - size_t imageIndex = 0; - std::unordered_map imageTags; - for (const auto& image : imageHistory) { - const auto& [chatTurnIndex, imageTensor] = image; - std::string imageTag = "\n"; - imageTags[chatTurnIndex] = imageTags[chatTurnIndex] + imageTag; - vlmExecutionContext->inputImages.push_back(imageTensor); - } + // imageHistory is a flat ordered list of tensors matching the {type:image} items in + // chatHistory. Pass them directly to add_request; the chat template applied below will + // emit the model-specific image tokens at the correct positions. + vlmExecutionContext->inputImages = vlmExecutionContext->apiHandler->getImageHistory(); - for (const auto& [chatTurnIndex, imageTagString] : imageTags) { - std::string messageContent = chatHistory[chatTurnIndex]["content"].as_string().value_or(""); - chatHistory[chatTurnIndex]["content"] = imageTagString + messageContent; +#if (PYTHON_DISABLE == 0) + bool success; + if (vlmExecutionContext->apiHandler->getProcessedJson().size() > 0) { + success = PyJinjaTemplateProcessor::applyChatTemplate(getProperties()->templateProcessor, getProperties()->modelsPath, vlmExecutionContext->apiHandler->getProcessedJson(), vlmExecutionContext->inputText); + } else { + success = PyJinjaTemplateProcessor::applyChatTemplate(getProperties()->templateProcessor, getProperties()->modelsPath, vlmExecutionContext->payload.body, vlmExecutionContext->inputText); } - + if (!success) { + return absl::Status(absl::StatusCode::kInvalidArgument, vlmExecutionContext->inputText); + } +#else constexpr bool addGenerationPrompt = true; // confirm it should be hardcoded auto toolsStatus = vlmExecutionContext->apiHandler->parseToolsToJsonContainer(); if (!toolsStatus.ok()) { @@ -106,6 +115,7 @@ absl::Status VisualLanguageModelServable::prepareInputs(std::shared_ptrinputText = properties->tokenizer.apply_chat_template(chatHistory, addGenerationPrompt, {}, tools, chatTemplateKwargs); +#endif } else { return absl::InvalidArgumentError("Unsupported endpoint"); } diff --git a/src/llm/visual_language_model/legacy/servable.cpp b/src/llm/visual_language_model/legacy/servable.cpp index bc3ecaf71f..4f36c115c4 100644 --- a/src/llm/visual_language_model/legacy/servable.cpp +++ b/src/llm/visual_language_model/legacy/servable.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include "../../../logging.hpp" @@ -256,25 +255,36 @@ absl::Status VisualLanguageModelLegacyServable::prepareInputs(std::shared_ptr tag"); + } + } + } else if (contentField.as_string().value_or("").find(" tag"); } } - const ImageHistory& imageHistory = vlmExecutionContext->apiHandler->getImageHistory(); - size_t imageIndex = 0; - std::unordered_map imageTags; - for (const auto& image : imageHistory) { - const auto& [chatTurnIndex, imageTensor] = image; - std::string imageTag = "\n"; - imageTags[chatTurnIndex] = imageTags[chatTurnIndex] + imageTag; - vlmExecutionContext->inputImages.push_back(imageTensor); + // imageHistory is a flat ordered list of tensors matching the {type:image} items in + // chatHistory. Pass them directly; the chat template applied below will emit the + // model-specific image tokens at the correct positions. + vlmExecutionContext->inputImages = vlmExecutionContext->apiHandler->getImageHistory(); + +#if (PYTHON_DISABLE == 0) + bool success; + if (vlmExecutionContext->apiHandler->getProcessedJson().size() > 0) { + success = PyJinjaTemplateProcessor::applyChatTemplate(getProperties()->templateProcessor, getProperties()->modelsPath, vlmExecutionContext->apiHandler->getProcessedJson(), vlmExecutionContext->inputText); + } else { + success = PyJinjaTemplateProcessor::applyChatTemplate(getProperties()->templateProcessor, getProperties()->modelsPath, vlmExecutionContext->payload.body, vlmExecutionContext->inputText); } - for (const auto& [chatTurnIndex, imageTagString] : imageTags) { - std::string messageContent = chatHistory[chatTurnIndex]["content"].as_string().value_or(""); - chatHistory[chatTurnIndex]["content"] = imageTagString + messageContent; + if (!success) { + return absl::Status(absl::StatusCode::kInvalidArgument, vlmExecutionContext->inputText); } - +#else constexpr bool addGenerationPrompt = true; // confirm it should be hardcoded auto toolsStatus = vlmExecutionContext->apiHandler->parseToolsToJsonContainer(); if (!toolsStatus.ok()) { @@ -287,6 +297,7 @@ absl::Status VisualLanguageModelLegacyServable::prepareInputs(std::shared_ptrinputText = properties->tokenizer.apply_chat_template(chatHistory, addGenerationPrompt, {}, tools, chatTemplateKwargs); +#endif } else { return absl::InvalidArgumentError("Unsupported endpoint"); } diff --git a/src/test/http_openai_handler_test.cpp b/src/test/http_openai_handler_test.cpp index a4e6585af0..0ac5e27c3c 100644 --- a/src/test/http_openai_handler_test.cpp +++ b/src/test/http_openai_handler_test.cpp @@ -1586,8 +1586,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesSucceedsBase64) { ASSERT_EQ(apiHandler->parseMessages(), absl::OkStatus()); const ovms::ImageHistory& imageHistory = apiHandler->getImageHistory(); ASSERT_EQ(imageHistory.size(), 1); - auto [index, image] = imageHistory[0]; - EXPECT_EQ(index, 0); + const auto& image = imageHistory[0]; EXPECT_EQ(image.get_element_type(), ov::element::u8); EXPECT_EQ(image.get_size(), 3); std::vector expectedBytes = {110, 181, 160}; @@ -1595,7 +1594,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesSucceedsBase64) { EXPECT_EQ(expectedBytes[i], ((uint8_t*)image.data())[i]); } json = apiHandler->getProcessedJson(); - EXPECT_EQ(json, std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":\"What is in this image?\"}]}")); + EXPECT_EQ(json, std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":[{\"type\":\"text\",\"text\":\"What is in this image?\"},{\"type\":\"image\"}]}]}")); } TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesSucceedsUrlHttp) { @@ -1627,12 +1626,11 @@ TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesSucceedsUrlHttp) { ASSERT_EQ(apiHandler->parseMessages(std::nullopt, allowedDomains), absl::OkStatus()); const ovms::ImageHistory& imageHistory = apiHandler->getImageHistory(); ASSERT_EQ(imageHistory.size(), 1); - auto [index, image] = imageHistory[0]; - EXPECT_EQ(index, 0); + const auto& image = imageHistory[0]; EXPECT_EQ(image.get_element_type(), ov::element::u8); EXPECT_EQ(image.get_size(), 225792); json = apiHandler->getProcessedJson(); - EXPECT_EQ(json, std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":\"What is in this image?\"}]}")); + EXPECT_EQ(json, std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":[{\"type\":\"text\",\"text\":\"What is in this image?\"},{\"type\":\"image\"}]}]}")); } TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesSucceedsUrlHttpMultipleAllowedDomains) { @@ -1664,12 +1662,11 @@ TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesSucceedsUrlHttpMultipleAllow ASSERT_EQ(apiHandler->parseMessages(std::nullopt, allowedDomains), absl::OkStatus()); const ovms::ImageHistory& imageHistory = apiHandler->getImageHistory(); ASSERT_EQ(imageHistory.size(), 1); - auto [index, image] = imageHistory[0]; - EXPECT_EQ(index, 0); + const auto& image = imageHistory[0]; EXPECT_EQ(image.get_element_type(), ov::element::u8); EXPECT_EQ(image.get_size(), 225792); json = apiHandler->getProcessedJson(); - EXPECT_EQ(json, std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":\"What is in this image?\"}]}")); + EXPECT_EQ(json, std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":[{\"type\":\"text\",\"text\":\"What is in this image?\"},{\"type\":\"image\"}]}]}")); } TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesSucceedsUrlHttps) { @@ -1701,12 +1698,11 @@ TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesSucceedsUrlHttps) { ASSERT_EQ(apiHandler->parseMessages(std::nullopt, allowedDomains), absl::OkStatus()); const ovms::ImageHistory& imageHistory = apiHandler->getImageHistory(); ASSERT_EQ(imageHistory.size(), 1); - auto [index, image] = imageHistory[0]; - EXPECT_EQ(index, 0); + const auto& image = imageHistory[0]; EXPECT_EQ(image.get_element_type(), ov::element::u8); EXPECT_EQ(image.get_size(), 225792); json = apiHandler->getProcessedJson(); - EXPECT_EQ(json, std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":\"What is in this image?\"}]}")); + EXPECT_EQ(json, std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":[{\"type\":\"text\",\"text\":\"What is in this image?\"},{\"type\":\"image\"}]}]}")); } TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesSucceedsUrlHttpsAllowedDomainAll) { @@ -1738,12 +1734,11 @@ TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesSucceedsUrlHttpsAllowedDomai ASSERT_EQ(apiHandler->parseMessages(std::nullopt, allowedDomains), absl::OkStatus()); const ovms::ImageHistory& imageHistory = apiHandler->getImageHistory(); ASSERT_EQ(imageHistory.size(), 1); - auto [index, image] = imageHistory[0]; - EXPECT_EQ(index, 0); + const auto& image = imageHistory[0]; EXPECT_EQ(image.get_element_type(), ov::element::u8); EXPECT_EQ(image.get_size(), 225792); json = apiHandler->getProcessedJson(); - EXPECT_EQ(json, std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":\"What is in this image?\"}]}")); + EXPECT_EQ(json, std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":[{\"type\":\"text\",\"text\":\"What is in this image?\"},{\"type\":\"image\"}]}]}")); } TEST_F(HttpOpenAIHandlerParsingTest, ParsingImageJpegWithNoTextSucceeds) { @@ -1769,8 +1764,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, ParsingImageJpegWithNoTextSucceeds) { ASSERT_EQ(apiHandler->parseMessages(), absl::OkStatus()); const ovms::ImageHistory& imageHistory = apiHandler->getImageHistory(); ASSERT_EQ(imageHistory.size(), 1); - auto [index, image] = imageHistory[0]; - EXPECT_EQ(index, 0); + const auto& image = imageHistory[0]; EXPECT_EQ(image.get_element_type(), ov::element::u8); EXPECT_EQ(image.get_size(), 3); std::vector expectedBytes = {54, 245, 241}; @@ -1778,7 +1772,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, ParsingImageJpegWithNoTextSucceeds) { EXPECT_EQ(expectedBytes[i], ((uint8_t*)image.data())[i]); } json = apiHandler->getProcessedJson(); - EXPECT_EQ(json, std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":\"\"}]}")); + EXPECT_EQ(json, std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":[{\"type\":\"image\"}]}]}")); } TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesImageStringWithNoPrefixFails) { @@ -1948,12 +1942,11 @@ TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesImageLocalFilesystem) { ASSERT_EQ(apiHandler->parseMessages(getGenericFullPathForSrcTest("/ovms/src/test")), absl::OkStatus()); const ovms::ImageHistory& imageHistory = apiHandler->getImageHistory(); ASSERT_EQ(imageHistory.size(), 1); - auto [index, image] = imageHistory[0]; - EXPECT_EQ(index, 0); + const auto& image = imageHistory[0]; EXPECT_EQ(image.get_element_type(), ov::element::u8); EXPECT_EQ(image.get_size(), 3); json = apiHandler->getProcessedJson(); - EXPECT_EQ(json, std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":\"What is in this image?\"}]}")); + EXPECT_EQ(json, std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":[{\"type\":\"text\",\"text\":\"What is in this image?\"},{\"type\":\"image\"}]}]}")); } TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesImageLocalFilesystemWithinAllowedPath) { @@ -1984,12 +1977,11 @@ TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesImageLocalFilesystemWithinAl ASSERT_EQ(apiHandler->parseMessages(getGenericFullPathForSrcTest("/ovms/src/test/binaryutils")), absl::OkStatus()); const ovms::ImageHistory& imageHistory = apiHandler->getImageHistory(); ASSERT_EQ(imageHistory.size(), 1); - auto [index, image] = imageHistory[0]; - EXPECT_EQ(index, 0); + const auto& image = imageHistory[0]; EXPECT_EQ(image.get_element_type(), ov::element::u8); EXPECT_EQ(image.get_size(), 3); json = apiHandler->getProcessedJson(); - EXPECT_EQ(json, std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":\"What is in this image?\"}]}")); + EXPECT_EQ(json, std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":[{\"type\":\"text\",\"text\":\"What is in this image?\"},{\"type\":\"image\"}]}]}")); } TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesImageLocalFilesystemNotWithinAllowedPath) { @@ -2140,10 +2132,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, ParsingMultipleMessagesSucceeds) { const ovms::ImageHistory& imageHistory = apiHandler->getImageHistory(); ASSERT_EQ(imageHistory.size(), 2); std::vector expectedBytes = {110, 181, 160}; - std::vector expectedImageIndexes = {0, 2}; - size_t i = 0; - for (auto [index, image] : imageHistory) { - EXPECT_EQ(index, expectedImageIndexes[i++]); + for (const auto& image : imageHistory) { EXPECT_EQ(image.get_element_type(), ov::element::u8); EXPECT_EQ(image.get_size(), 3); for (size_t i = 0; i < image.get_size(); i++) { @@ -2151,10 +2140,10 @@ TEST_F(HttpOpenAIHandlerParsingTest, ParsingMultipleMessagesSucceeds) { } } json = apiHandler->getProcessedJson(); - EXPECT_EQ(json, std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":\"What is in this image?\"}," - "{\"role\":\"assistant\",\"content\":\"No idea my friend.\"}," - "{\"role\":\"user\",\"content\":\"What about this one?\"}," - "{\"role\":\"assistant\",\"content\":\"Same thing. I'm not very good with images.\"}," + EXPECT_EQ(json, std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":[{\"type\":\"text\",\"text\":\"What is in this image?\"},{\"type\":\"image\"}]}," + "{\"role\":\"assistant\",\"content\":[{\"type\":\"text\",\"text\":\"No idea my friend.\"}]}," + "{\"role\":\"user\",\"content\":[{\"type\":\"text\",\"text\":\"What about this one?\"},{\"type\":\"image\"}]}," + "{\"role\":\"assistant\",\"content\":[{\"type\":\"text\",\"text\":\"Same thing. I'm not very good with images.\"}]}," "{\"role\":\"user\",\"content\":\"You were not trained with images, were you?\"}]}")); } @@ -3610,3 +3599,76 @@ TEST_F(HttpOpenAIHandlerParsingTest, ParseMessagesRegularMessageHasNoToolFields) EXPECT_FALSE(history[1].contains("tool_call_id")); EXPECT_FALSE(history[1].contains("name")); } + +TEST_F(HttpOpenAIHandlerParsingTest, ParseMessagesToolContentArrayPreservedInChatHistory) { + // Tool responses may arrive with content as an array of {type, text} objects (OpenAI multipart + // format). The array is preserved end-to-end so the chat template can decide how to render it. + std::string json = R"({ + "model": "llama", + "messages": [ + {"role": "user", "content": "list jobs"}, + {"role": "assistant", "content": null, "tool_calls": [{"id": "call_1", "type": "function", "function": {"name": "list_jobs", "arguments": "{\"folder_path\":\"ovmsc\"}"}}]}, + {"role": "tool", "tool_call_id": "call_1", "content": [ + {"type": "text", "text": "ubuntu"}, + {"type": "text", "text": "redhat"}, + {"type": "text", "text": "windows"} + ]} + ] + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + auto apiHandler = std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); + ASSERT_EQ(apiHandler->parseMessages(), absl::OkStatus()); + + ov::genai::ChatHistory& history = apiHandler->getChatHistory(); + ASSERT_EQ(history.size(), 3); + + // Tool message content must be an array with all three items preserved + auto toolMsg = history[2]; + ASSERT_TRUE(toolMsg.contains("content")); + ASSERT_TRUE(toolMsg["content"].is_array()); + ASSERT_EQ(toolMsg["content"].size(), 3); + EXPECT_EQ(toolMsg["content"][0]["type"].get_string(), "text"); + EXPECT_EQ(toolMsg["content"][0]["text"].get_string(), "ubuntu"); + EXPECT_EQ(toolMsg["content"][1]["text"].get_string(), "redhat"); + EXPECT_EQ(toolMsg["content"][2]["text"].get_string(), "windows"); +} + +TEST_F(HttpOpenAIHandlerParsingTest, ParseMessagesContentArrayPreservedInProcessedJson) { + // Content arrays are preserved as-is in processedJson when another mutation triggers + // jsonChanged (here: ensureArgumentsInToolCalls adds "arguments" to the tool call). + // The Python Jinja template receives the full structured content and decides how to render it. + std::string json = R"({ + "model": "llama", + "messages": [ + {"role": "user", "content": "list jobs"}, + {"role": "assistant", "content": null, "tool_calls": [{"id": "call_1", "type": "function", "function": {"name": "list_jobs"}}]}, + {"role": "tool", "tool_call_id": "call_1", "content": [ + {"type": "text", "text": "ubuntu"}, + {"type": "text", "text": "redhat"}, + {"type": "text", "text": "windows"} + ]} + ] + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + auto apiHandler = std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); + ASSERT_EQ(apiHandler->parseMessages(), absl::OkStatus()); + + // ensureArgumentsInToolCalls adds "arguments": "{}" → jsonChanged → processedJson is written + const std::string& processed = apiHandler->getProcessedJson(); + ASSERT_FALSE(processed.empty()); + + // The tool message content must remain an array in processedJson + rapidjson::Document processedDoc; + processedDoc.Parse(processed.c_str()); + ASSERT_FALSE(processedDoc.HasParseError()); + const auto& messages = processedDoc["messages"].GetArray(); + ASSERT_EQ(messages.Size(), 3); + const auto& toolContent = messages[2]["content"]; + ASSERT_TRUE(toolContent.IsArray()); + ASSERT_EQ(toolContent.GetArray().Size(), 3); + EXPECT_STREQ(toolContent.GetArray()[0]["text"].GetString(), "ubuntu"); + EXPECT_STREQ(toolContent.GetArray()[1]["text"].GetString(), "redhat"); + EXPECT_STREQ(toolContent.GetArray()[2]["text"].GetString(), "windows"); +}