diff --git a/menu/navigation.ts b/menu/navigation.ts index c127afda26..2dcbe84727 100644 --- a/menu/navigation.ts +++ b/menu/navigation.ts @@ -44,7 +44,6 @@ import { localStorageMenu } from "../pages/local-storage/menu" import { mailboxMenu } from "../pages/mailbox/menu" import { managedDatabasesForPostgresqlAndMysqlMenu } from "../pages/managed-databases-for-postgresql-and-mysql/menu" import { managedDatabasesForRedisMenu } from "../pages/managed-databases-for-redis/menu" -import { managedInferenceMenu } from "../pages/managed-inference/menu" import { managedMongodbDatabasesMenu } from "../pages/managed-mongodb-databases/menu" import { natsMenu } from "../pages/nats/menu" import { objectStorageMenu } from "../pages/object-storage/menu" @@ -109,7 +108,6 @@ export default [ icon: 'AiCategoryIcon', items: [ generativeApisMenu, - managedInferenceMenu, ], label: 'AI', category: 'ai-data', diff --git a/pages/generative-apis/api-cli/understanding-errors.mdx b/pages/generative-apis/api-cli/understanding-errors.mdx index 5a81e8d056..d020ded595 100644 --- a/pages/generative-apis/api-cli/understanding-errors.mdx +++ b/pages/generative-apis/api-cli/understanding-errors.mdx @@ -8,9 +8,12 @@ dates: --- Scaleway uses conventional HTTP response codes to indicate the success or failure of an API request. -In general, codes in the 2xx range indicate success, codes in the 4xx range indicate an error caused by the information provided, and codes in the 5xx range show an error from Scaleway servers. +In general: +- codes in the `2xx` range indicate success +- codes in the `4xx` range indicate an error caused by the information provided +- codes in the `5xx` range show an error from Scaleway servers -If the response code is not within the 2xx range, the API returns an error object. The structure of the error object depends on how recent the model being used is: +If the response code is not within the `2xx` range, the API returns an error object. The structure of the error object depends on how recent the model being used is: @@ -45,14 +48,14 @@ If the response code is not within the 2xx range, the API returns an error objec Below are common HTTP error codes: -- 400 - **Bad Request**: The format or content of your payload is incorrect. The body may be too large, or fail to parse, or the content-type is mismatched. -- 401 - **Unauthorized**: The `authorization` header is missing. Find required headers in [this page](/generative-apis/api-cli/using-generative-apis/) -- 403 - **Forbidden**: Your API key does not exist or does not have the necessary permissions to access the requested resource. Find required permission sets in [this page](/generative-apis/api-cli/using-generative-apis/) -- 404 - **Route Not Found**: The requested resource could not be found. Check your request is being made to the correct endpoint. +- 400 - **Bad Request**: The format or content of your payload is incorrect. The body may be too large, or fails to parse, or the `content-type` is mismatched. +- 401 - **Unauthorized**: The `authorization` header is missing. For information about the required headers, see the [Using Generative APIs](/generative-apis/api-cli/using-generative-apis/) page. +- 403 - **Forbidden**: Your API key does not exist or does not have the necessary permissions to access the requested resource. For information about the required permission sets, see the [Using Generative APIs](/generative-apis/api-cli/using-generative-apis/) page. +- 404 - **Route Not Found**: The requested resource could not be found. 
Check if your request is being made to the correct endpoint. - 422 - **Model Not Found**: The `model` key is present in the request payload, but the corresponding model is not found. - 422 - **Missing Model**: The `model` key is missing from the request payload. -- 429 - **Too Many Requests**: You are exceeding your current quota for the requested model, calculated in requests per minute. Find rate limits on [this page](/generative-apis/reference-content/rate-limits/) -- 429 - **Too Many Tokens**: You are exceeding your current quota for the requested model, calculated in tokens per minute. Find rate limits on [this page](/generative-apis/reference-content/rate-limits/) -- 500 - **API error**: An unexpected internal error has occurred within Scaleway's systems. If the issue persists, please [open a support ticket](https://console.scaleway.com/support/tickets/create). +- 429 - **Too Many Requests**: You are exceeding your current quota for the requested model, calculated in requests per minute. For information about rate limits, see the [Rate limits](/generative-apis/reference-content/rate-limits/) page. +- 429 - **Too Many Tokens**: You are exceeding your current quota for the requested model, calculated in tokens per minute. For information about rate limits, see the [Rate limits](/generative-apis/reference-content/rate-limits/) page. +- 500 - **API error**: An unexpected internal error has occurred within Scaleway's systems. If the issue persists, [open a Support ticket](https://console.scaleway.com/support/tickets/create). -For streaming responses via SSE, 5xx errors may occur after a 200 response has been returned. +For streaming responses via SSE, 5xx errors may occur after a 200 response has been returned. \ No newline at end of file diff --git a/pages/generative-apis/api-cli/using-chat-api.mdx b/pages/generative-apis/api-cli/using-chat-api.mdx index ba83dfc7b7..4cddc6f040 100644 --- a/pages/generative-apis/api-cli/using-chat-api.mdx +++ b/pages/generative-apis/api-cli/using-chat-api.mdx @@ -1,5 +1,5 @@ --- -title: Using Chat API +title: Using the Chat API description: This page explains how to use the Chat API to query models tags: generative-apis ai-data chat-api dates: @@ -7,7 +7,7 @@ dates: posted: 2024-09-03 --- -Scaleway Generative APIs are designed as a drop-in replacement for the OpenAI APIs. If you have an LLM-driven application that uses one of OpenAI's client libraries, you can easily configure it to point to Scaleway Chat API, and get your existing applications running using open-weight instruct models hosted at Scaleway. +Scaleway Generative APIs are designed as a drop-in replacement for the OpenAI APIs. If you have an LLM-driven application that uses one of OpenAI's client libraries, you can easily configure it to point to Scaleway's Chat API, and get your existing applications running using open-weight instruct models hosted at Scaleway. You can also try the **Responses** API for LLM-driven tasks. Released in 2025, it is designed to combine the simplicity of Chat Completions with the ability to do more agentic tasks and reasoning. [Find out more about the Responses API](/generative-apis/how-to/query-language-models/#chat-completions-api-or-responses-api). @@ -48,7 +48,7 @@ curl --request POST \ ## Headers -Find required headers in [this page](/generative-apis/api-cli/using-generative-apis/). +For information about the required headers, see the [Using Generative APIs](/generative-apis/api-cli/using-generative-apis/) page. 
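
If you call the API through the OpenAI Python SDK, the required `Authorization` header is added for you from the API key supplied at client initialization. Below is a minimal sketch; the environment variable name is an assumption, and any supported chat model can be substituted:

```python
import os
from openai import OpenAI  # pip install openai

client = OpenAI(
    base_url="https://api.scaleway.ai/v1",        # Scaleway Generative APIs endpoint
    api_key=os.environ["SCALEWAY_API_KEY"],       # assumed variable name; use your IAM API key
)

# The SDK sends the `Authorization: Bearer <key>` header on every request.
response = client.chat.completions.create(
    model="mistral-small-3.1-24b-instruct-2503",  # any chat model from the supported models list
    messages=[{"role": "user", "content": "Say hello in one short sentence."}],
    max_tokens=100,
)
print(response.choices[0].message.content)
```

Set `stream=True` in the same call if you want tokens returned as they are generated.
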
## Body @@ -59,35 +59,35 @@ Find required headers in [this page](/generative-apis/api-cli/using-generative-a | **messages** | array of objects | A list of messages comprising the conversation so far. | | **model** | string | The name of the model to query. | -Our chat API is OpenAI compatible. Use OpenAI’s [API reference](https://platform.openai.com/docs/api-reference/chat/create) for more detailed information on the usage. +Our Chat API is OpenAI compatible. Refer to OpenAI’s [API reference](https://platform.openai.com/docs/api-reference/chat/create) for detailed information on usage. ### Supported parameters -- temperature -- top_p -- max_tokens -- stream -- stream_options -- presence_penalty -- [response_format](/generative-apis/how-to/use-structured-outputs) -- logprobs -- stop -- seed -- [tools](/generative-apis/how-to/use-function-calling) -- [tool_choice](/generative-apis/how-to/use-function-calling) +- `temperature` +- `top_p` +- `max_tokens` +- `stream` +- `stream_options` +- `presence_penalty` +- `response_format` (For more information, see [How to use structured outputs](/generative-apis/how-to/use-structured-outputs).) +- `logprobs` +- `stop` +- `seed` +- `tools` (For more information, see [How to use function calling](/generative-apis/how-to/use-function-calling).) +- `tool_choice` (For more information, see [How to use function calling](/generative-apis/how-to/use-function-calling).) ### Unsupported parameters -- frequency_penalty -- n -- top_logprobs -- logit_bias -- user +- `frequency_penalty` +- `n` +- `top_logprobs` +- `logit_bias` +- `user` -If you have a use case requiring one of these unsupported parameters, please [contact us via Slack](https://slack.scaleway.com/) on #ai channel. +If you have a use case requiring one of these unsupported parameters, [contact us via Slack](https://slack.scaleway.com/) using the `#ai` channel. ## Going further 1. [Python code examples](/generative-apis/how-to/query-language-models/#querying-language-models-via-api) to query text models using Scaleway's Chat API 2. [How to use structured outputs](/generative-apis/how-to/use-structured-outputs) with the `response_format` parameter -3. [How to use function calling](/generative-apis/how-to/use-function-calling) with `tools` and `tool_choice` +3. [How to use function calling](/generative-apis/how-to/use-function-calling) with the `tools` and `tool_choice` parameters diff --git a/pages/generative-apis/api-cli/using-embeddings-api.mdx b/pages/generative-apis/api-cli/using-embeddings-api.mdx index b34152415d..225002d0b2 100644 --- a/pages/generative-apis/api-cli/using-embeddings-api.mdx +++ b/pages/generative-apis/api-cli/using-embeddings-api.mdx @@ -27,7 +27,7 @@ curl https://api.scaleway.ai/v1/embeddings \ ## Headers -Find required headers in [this page](/generative-apis/api-cli/using-generative-apis/). +For information about the required headers, see the [Using Generative APIs](/generative-apis/api-cli/using-generative-apis/) page. ## Body @@ -35,17 +35,17 @@ Find required headers in [this page](/generative-apis/api-cli/using-generative-a | Param | Type | Description | | ------------- |-------------|-------------| -| **input** | string or array | Input text to embed, encoded as a string or array of strings. It cannot be an empty string. | -| **model** | string | The name of the model to query. | +| `input` | string or array | Input text to embed, encoded as a string or array of strings. It cannot be an empty string. | +| `model` | string | The name of the model to query. 
| -Our embeddings API is OpenAI compatible. Use OpenAI’s [API reference](https://platform.openai.com/docs/api-reference/embeddings) for more detailed information on the usage. +Our Embeddings API is OpenAI compatible. Refer to OpenAI’s [API reference](https://platform.openai.com/docs/api-reference/embeddings) for detailed information on usage. ### Unsupported parameters -- encoding_format (default float) -- dimensions +- `encoding_format` (default float) +- `dimensions` -If you have a use case requiring one of these unsupported parameters, please [contact us via Slack](https://slack.scaleway.com/) on #ai channel. +If you have a use case requiring one of these unsupported parameters, [contact us via Slack](https://slack.scaleway.com/) using the `#ai` channel. Check our [Python code examples](/generative-apis/how-to/query-embedding-models/#querying-embedding-models-via-api) to query embedding models using Scaleway's Embeddings API. diff --git a/pages/generative-apis/api-cli/using-generative-apis.mdx b/pages/generative-apis/api-cli/using-generative-apis.mdx index 69645deddb..80571544d8 100644 --- a/pages/generative-apis/api-cli/using-generative-apis.mdx +++ b/pages/generative-apis/api-cli/using-generative-apis.mdx @@ -29,8 +29,8 @@ curl -X GET \ "https://api.scaleway.ai/v1/models" ``` -When using the OpenAI Python SDK, the API key is set once during client initialization, and the SDK automatically manages the inclusion of the Authorization header in all API requests. -In contrast, when directly integrating with the Scaleway Generative APIs, you are responsible for manually setting the Authorization header with the API key for each request to ensure proper authentication. +When using the OpenAI Python SDK, the API key is set once during client initialization, and the SDK automatically manages the inclusion of the `Authorization` header in all API requests. +In contrast, when directly integrating with the Scaleway Generative APIs, you are responsible for manually setting the `Authorization` header with the API key for each request to ensure proper authentication. ## Content types @@ -57,11 +57,17 @@ Querying AI models hosted by Scaleway Generative APIs will require any of the fo - **GenerativeApisFullAccess** - **AllProductsFullAccess** + + Due to a product name change, the permission set names `InferenceFullAccess` and `InferenceReadOnly` are also changing. If you are automatically provisioning IAM policies (using Terraform, CLI, or APIs) with permission sets `InferenceFullAccess` and `InferenceReadOnly`, then you should edit your existing scripts and replace these permissions with `GenerativeApisFullAccess` and `GenerativeApisModelAccess`, respectively. + + For now, both `InferenceFullAccess` and `InferenceReadOnly` will remain available at least until 1 June 2026. If you may be impacted by the permission set name update, you will receive a dedicated communication including the definitive End Of Life date for these permission sets. + + ## Projects You can scope your Generative APIs consumption to a [Project](/iam/concepts/#project). This is helpful to restrict IAM users’ access to only the Project they are working on, or to isolate your bills between Projects. -1. Find your Project ID in your [Project settings](https://console.scaleway.com/project/settings) +1. Find your Project ID in your [Project settings](https://console.scaleway.com/project/settings). 2. 
Insert your Project ID in the Generative APIs service URL, for example: ``` diff --git a/pages/generative-apis/api-cli/using-models-api.mdx b/pages/generative-apis/api-cli/using-models-api.mdx index 4a191b6ad8..1576912b8d 100644 --- a/pages/generative-apis/api-cli/using-models-api.mdx +++ b/pages/generative-apis/api-cli/using-models-api.mdx @@ -1,5 +1,5 @@ --- -title: Using Models API +title: Using the Models API description: This page explains how to use the Models API tags: generative-apis ai-data embeddings-api dates: @@ -7,7 +7,7 @@ dates: posted: 2024-09-02 --- -Scaleway Generative APIs are designed as drop-in replacement for the OpenAI APIs. +Scaleway Generative APIs are designed as a drop-in replacement for the OpenAI APIs. The Models API allows you to easily list the various AI models available at Scaleway. ## List models diff --git a/pages/generative-apis/concepts.mdx b/pages/generative-apis/concepts.mdx index dfb27a2405..fb573283e9 100644 --- a/pages/generative-apis/concepts.mdx +++ b/pages/generative-apis/concepts.mdx @@ -3,87 +3,143 @@ title: Generative APIs - Concepts description: This page explains all the concepts related to Generative APIs tags: dates: - validation: 2025-09-03 + validation: 2026-04-16 --- -## API rate limits -API rate limits define the maximum number of requests a user can make to the Generative APIs within a specific time frame. Rate limiting helps to manage resource allocation, prevent abuse, and ensure fair access for all users. Understanding and adhering to these limits is essential for maintaining optimal application performance using these APIs. +## Allowed IPs {/* Dedicated */} + +The **Allowed IPs** feature is no longer available for Generative APIs - Dedicated Deployment. Use one of the alternative methods detailed in our [documentation about access management](/generative-apis/how-to/manage-allowed-ips/) to restrict access to your dedicated Generative APIs deployments. + +## API rate limits {/* Serverless */} + +API rate limits define the maximum number of requests a user can make to the Generative APIs - Serverless endpoints within a specific time frame. Rate limiting helps to manage resource allocation, prevent abuse, and ensure fair access for all users. Understanding and adhering to these limits is essential for maintaining optimal application performance using these APIs. Refer to the [Rate limits](/generative-apis/reference-content/rate-limits/) documentation for more information. -## Batch processing +## Batch processing {/* Serverless */} Batch jobs are processed asynchronously, offering reduced costs (see [pricing page](https://www.scaleway.com/en/pricing/model-as-a-service/)) and no rate limits. They are designed for high-volume workloads and are typically completed within 24 hours. -## Context window +## Context window {/* Serverless + Dedicated (Context size) */} A context window is the maximum amount of prompt data considered by the model to generate a response. Using models with high context length, you can provide more information to generate relevant responses. The context is measured in tokens. -## Function calling +## Deployment {/* Dedicated */} -Function calling allows a large language model (LLM) to interact with external tools or APIs, executing specific tasks based on user requests. The LLM identifies the appropriate function, extracts the required parameters, and returns the results as structured data, typically in JSON format. +A deployment makes a trained language model available for real-world applications. 
It encompasses tasks such as integrating the model into existing systems, optimizing its performance, and ensuring scalability and reliability. -Refer to [How to use function calling](/generative-apis/how-to/use-function-calling/) for more information. +## Embeddings {/* Serverless + Dedicated (Embedding models) */} -## Embeddings - -Embeddings are numerical representations of text data that capture semantic information in a dense vector format. In Generative APIs, embeddings are essential for tasks such as similarity matching, clustering, and serving as inputs for downstream models. These vectors enable the model to understand and generate text based on the underlying meaning rather than just the surface-level words. +Embeddings are numerical representations of text data that capture semantic information in a dense vector format. In Generative APIs, embeddings are essential for tasks such as similarity matching, clustering, and serving as input for downstream models, or algorithms. These vectors enable the model to understand and generate text based on the underlying meaning rather than just the surface-level words. Refer to [How to query embedding models](/generative-apis/how-to/query-embedding-models/) for more information. -## Error handling +## Endpoint {/* Dedicated */} + +In the context of LLMs, an endpoint refers to a network-accessible URL or interface through which clients can interact with the model for inference tasks. It exposes methods for sending input data and receiving model predictions or responses. + +## Error handling {/* Serverless */} Error handling refers to the strategies and mechanisms in place to manage and respond to errors during API requests. This includes handling network issues, invalid inputs, or server-side errors. Proper error handling ensures that applications using Generative APIs can gracefully recover from failures and provide meaningful feedback to users. Refer to [Understanding errors](/generative-apis/api-cli/understanding-errors/) for more information. -## Parameters +## Fine-tuning {/* Dedicated */} -Parameters are settings that control the behavior and performance of generative models. These include temperature, max tokens, and top-p sampling, among others. Adjusting parameters allows users to tweak the model's output, balancing factors like creativity, accuracy, and response length to suit specific use cases. +Fine-tuning involves further training a pre-trained language model on domain-specific or task-specific data to improve performance on a particular task. This process often includes updating the model's parameters using a smaller, task-specific dataset. + +## Few-shot prompting {/* Dedicated */} + +Few-shot prompting uses the power of language models to generate responses with minimal input, relying on just a handful of examples or prompts. +It demonstrates the model's ability to generalize from limited training data to produce coherent and contextually relevant outputs. + +## Function calling {/* Serverless + Dedicated */} + +Function calling allows a large language model (LLM) to interact with external tools or APIs, executing specific tasks based on user requests. The LLM identifies the appropriate function, extracts the required parameters, and returns the results as structured data, typically in JSON format. + +Refer to [How to use function calling](/generative-apis/how-to/use-function-calling/) for more information. 
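
As a rough sketch of that flow (illustrative only; the tool name, schema, and client setup below are assumptions rather than part of the API itself):

```python
from openai import OpenAI

client = OpenAI(base_url="https://api.scaleway.ai/v1", api_key="<YOUR_API_KEY>")

# A hypothetical tool definition the model can decide to call.
tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",  # placeholder: implemented by your application, not by the API
        "description": "Get the current weather for a city.",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
    },
}]

response = client.chat.completions.create(
    model="mistral-small-3.1-24b-instruct-2503",
    messages=[{"role": "user", "content": "What is the weather like in Paris?"}],
    tools=tools,
    tool_choice="auto",
)
# The model returns the chosen function name and its JSON-formatted arguments,
# which your code executes before sending the result back to the model.
print(response.choices[0].message.tool_calls)
```
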
+ +## Hallucinations {/* Dedicated */} -## Inter-token Latency (ITL) +Hallucinations in LLMs refer to instances where generative AI models generate responses that, while grammatically coherent, contain inaccuracies or nonsensical information. These inaccuracies are termed "hallucinations" because the models create false or misleading content. Hallucinations can occur because of constraints in the training data, biases embedded within the models, or the complex nature of language itself. -The inter-token latency (ITL) corresponds to the average time elapsed between two generated tokens. It is usually expressed in milliseconds. +## Inference {/* Dedicated */} -## JSON mode +Inference is the process of deriving logical conclusions or predictions from available data. This concept involves using statistical methods, machine learning algorithms, and reasoning techniques to make decisions or draw insights based on observed patterns or evidence. +Inference is fundamental in various AI applications, including natural language processing, image recognition, and autonomous systems. + +## Inter-token Latency (ITL) {/* Serverless */} + +Inter-token latency (ITL) corresponds to the average time elapsed between two generated tokens. It is usually expressed in milliseconds. + +## JSON mode {/* Serverless + Dedicated */} JSON mode allows you to guide the language model in outputting well-structured JSON data. To activate JSON mode, provide the `response_format` parameter with `{"type": "json_object"}`. -JSON mode is useful for applications like chatbots or APIs, where a machine-readable format is essential for easy processing. +JSON mode is useful for applications such as chatbots or APIs, where a machine-readable format is essential for easy processing. + +## Large Language Models (LLMs) {/* Dedicated */} + +LLMs are advanced artificial intelligence systems capable of understanding and generating human-like text on various topics. +These models, such as Llama-3, are trained on vast amounts of data to learn the patterns and structures of language, enabling them to generate coherent and contextually relevant responses to queries or prompts. +LLMs have applications in natural language processing, text generation, translation, and other tasks requiring sophisticated language understanding and production. + +## Large Language Model Applications {/* Dedicated */} + +LLM Applications are applications or software tools that leverage the capabilities of LLMs for various tasks, such as text generation, summarization, or translation. These apps provide user-friendly interfaces for interacting with the models and accessing their functionalities. + +## Node number {/* Dedicated */} + +The node number (or node count) defines the number of nodes, or Instances, that are running your dedicated Generative APIs deployment. [Increasing the node number](/generative-apis/how-to/configure-autoscaling/) scales your deployment, so that it can handle more load. + +## Parameters {/* Serverless */} + +Parameters are settings that control the behavior and performance of generative models. These include temperature, max tokens, and top-p sampling, among others. Adjusting parameters allows users to tweak the model's output, balancing factors like creativity, accuracy, and response length to suit specific use cases. + +## Prompt {/* Dedicated */} + +In the context of generative AI models, a prompt refers to the input provided to the model to generate a desired response. 
+It typically consists of a sentence, paragraph, or series of keywords or instructions that guide the model in producing text relevant to the given context or task. +The quality and specificity of the prompt greatly influence the generated output, as the model uses it to understand the user's intent and create responses accordingly. + +## Prompt Engineering {/* Serverless */} + +Prompt engineering involves crafting specific and well-structured inputs (prompts) to guide the model toward generating the desired output. Effective prompt design is crucial for generating relevant responses, particularly in complex or creative tasks. It often requires experimentation to find the right balance between specificity and flexibility. -## Prompt Engineering +## Quantization {/* Dedicated */} -Prompt engineering involves crafting specific and well-structured inputs (prompts) to guide the model towards generating the desired output. Effective prompt design is crucial for generating relevant responses, particularly in complex or creative tasks. It often requires experimentation to find the right balance between specificity and flexibility. +Quantization is a technique used to reduce the precision of numerical values in a model's parameters or activations to improve efficiency and reduce memory footprint during inference. It involves representing floating-point values with fewer bits while minimizing the loss of accuracy. +AI models provided for deployment are named with suffixes that denote their quantization levels, such as `:int8`, `:fp8`, and `:fp16`. -## Retrieval Augmented Generation (RAG) +## Retrieval Augmented Generation (RAG) {/* Serverless + Dedicated */} -Retrieval Augmented Generation (RAG) is a technique that enhances generative models by integrating information retrieval methods. By fetching relevant data from external sources before generating a response, RAG ensures that the output is more accurate and contextually relevant, especially in scenarios requiring up-to-date or specific information. +Retrieval Augmented Generation (RAG) is a technique that enhances generative models by combining information retrieval elements with language generation to enhance the capabilities of LLMs. By fetching relevant data from external sources before generating a response, RAG ensures that the output is more accurate and contextually relevant, especially in scenarios requiring up-to-date or specific information. -## Stop words +## Stop words {/* Serverless */} Stop words are a parameter set to tell the model to stop generating further tokens after one or more chosen tokens have been generated. This is useful for controlling the end of the model output, as it will cut off at the first occurrence of any of these strings. -## Streaming +## Streaming {/* Serverless */} Streaming is a parameter allowing responses to be delivered in real-time, showing parts of the output as they are generated rather than waiting for the full response. Scaleway is following the [Server-sent events](https://html.spec.whatwg.org/multipage/server-sent-events.html#server-sent-events) standard. This behavior usually enhances user experience by providing immediate feedback and a more interactive conversation. -## Structured outputs +## Structured outputs {/* Serverless + Dedicated */} Structured outputs enable you to format the model's responses to suit specific use cases. To activate structured outputs, provide the `response_format` parameter with `"type": "json_schema"` and define its `"json_schema": {}`. 
By customizing the structure, such as using lists, tables, or key-value pairs, you ensure that the data returned is in a form that is easy to extract and process. By specifying the expected response format through the API, you can make the model consistently deliver the output your system requires. -Refer to [How to use structured outputs](/generative-apis/how-to/query-vision-models/) for more information. +Refer to [How to use structured outputs](/generative-apis/how-to/use-structured-outputs/) for more information. -## Temperature +## Temperature {/* Serverless */} Temperature is a parameter that controls the randomness of the model's output during text generation. A higher temperature produces more creative and diverse outputs, while a lower temperature makes the model's responses more deterministic and focused. Adjusting the temperature allows users to balance creativity with coherence in the generated text. -## Time to First Token (TTFT) +## Time to First Token (TTFT) {/* Serverless */} Time to First Token (TTFT) measures the time elapsed from the moment a request is made to the point when the first token of the generated text is returned. TTFT is a crucial performance metric for evaluating the responsiveness of generative models, especially in interactive applications where users expect immediate feedback. -## Tokens +## Tokens {/* Serverless */} Tokens are the basic units of text that a generative model processes. Depending on the tokenization strategy, these can be words, subwords, or even characters. The number of tokens directly affects the context window size and the computational cost of using the model. Understanding token usage is essential for optimizing API requests and managing costs effectively. diff --git a/pages/generative-apis/faq.mdx b/pages/generative-apis/faq.mdx index af97c99d97..50013b5585 100644 --- a/pages/generative-apis/faq.mdx +++ b/pages/generative-apis/faq.mdx @@ -2,47 +2,79 @@ title: Generative APIs FAQ description: Get answers to the most frequently asked questions about Scaleway Generative APIs. dates: - validation: 2025-12-19 + validation: 2026-04-13 productIcon: GenerativeApiProductIcon --- ## Overview -### What are Scaleway Generative APIs? -Scaleway's Generative APIs provide access to pre-configured, serverless endpoints of leading AI models, hosted in European data centers. This allows you to integrate advanced AI capabilities into your applications without managing underlying infrastructure. +### What is Scaleway Generative APIs - Serverless? +Scaleway's Generative APIs - Serverless (formerly known as Scaleway Generative APIs) provides access to pre-configured, serverless endpoints of leading AI models, hosted in European data centers. This allows you to integrate advanced AI capabilities into your applications without managing underlying infrastructure. -### What is the difference between Generative APIs and Managed Inference? -- **Generative APIs**: A serverless service providing access to pre-configured AI models via API, billed per token usage. -- **Managed Inference**: Allows deployment of curated or custom models with chosen quantization and Instances, offering predictable throughput and enhanced security features like private network isolation and access control. Managed Inference is billed based on hourly usage, regardless of whether the provisioned capacity is receiving traffic or not. +### What is Scaleway Generative APIs - Dedicated Deployment? 
+Scaleway's Generative APIs - Dedicated Deployment (formerly known as Scaleway Managed Inference) is a fully managed service that allows you to deploy, run, and scale AI models in a dedicated environment. +It provides optimized infrastructure, customizable deployment options, and secure access controls to meet the needs of enterprises and developers looking for high-performance inference solutions. -### How do I get started with Generative APIs? +### What is the difference between Serverless and Dedicated Deployment? +- **Generative APIs - Serverless** (formerly known as Scaleway Generative APIs): A serverless service providing access to pre-configured AI models via API, billed per token usage. +- **Generative APIs - Dedicated Deployment** (formerly known as Scaleway Managed Inference): Allows deployment of curated or custom models with chosen quantization and Instances, offering predictable throughput and enhanced security features such as private network isolation and access control. The service is billed based on hourly usage, regardless of whether the provisioned capacity is receiving traffic or not. + +### I'm looking for Scaleway Managed Inference. Has it been discontinued? +Managed Inference has been renamed to Generative APIs - Dedicated Deployment. It is the same product, just with a new name. All features and functionality remain unchanged. + +### Is Generative APIs - Dedicated Deployment suitable for real-time applications? {/* Dedicated */} +Yes, Generative APIs - Dedicated Deployment is designed for low-latency, high-throughput applications, making it suitable for real-time use cases such as chatbots, recommendation systems, fraud detection, and live video processing. + +### Can I fine-tune or retrain my models within Generative APIs - Dedicated Deployment? +Generative APIs - Dedicated Deployment is primarily designed for deploying and running inference workloads. If you need to fine-tune or retrain models, you may need to use a separate training environment, such as [Scaleway’s GPU Instances](/gpu/quickstart/), and then deploy the trained model in Generative APIs - Dedicated Deployment. + +## Getting started + +### How do I get started with Generative APIs - Serverless? To get started, explore the [Generative APIs Playground](/generative-apis/quickstart/#interacting-with-generative-apis-via-the-playground) in the Scaleway console. For application integration, refer to our [Quickstart guide](/generative-apis/quickstart/), which provides step-by-step instructions on accessing, configuring, and using a Generative APIs endpoint. -## Offering and availability +### How do I deploy a model using Generative APIs - Dedicated Deployment? +Deployment is done through Scaleway's [console](https://console.scaleway.com/generative-api/deployments) or [API](https://www.scaleway.com/en/developers/api/managed-inference/). You can choose a model from Scaleway’s selection or import your own directly from Hugging Face's repositories, configure [Instance types](/gpu/reference-content/choosing-gpu-instance-type/), set up networking options, and start inference with minimal setup. For details, see the [document about how to create a deployment](/generative-apis/how-to/create-deployment/). {/* COMMENT: Developer Reference link has to be updated. */} -### Where are the inference servers located? -All models are currently hosted in a secure data center located in Paris, France, operated by [OPCORE](https://www.opcore.com/). 
This ensures low latency for European users and compliance with European data privacy regulations. +### Can I run inference on private models? {/* Dedicated */} +Yes, Generative APIs - Dedicated Deployment allows you to deploy private models with access control settings. You can restrict access to specific users, teams, or networks. + +## Offering and availability ### Which models are supported by Generative APIs? Our Generative APIs support a range of popular models, including: -- Chat / Text Generation models: Refer to our dedicated [documentation](/generative-apis/reference-content/supported-models/#chat-models) for a list of supported chat models. -- Vision models: Refer to our dedicated [documentation](/generative-apis/reference-content/supported-models/#vision-models) for a list of supported vision models. -- Embedding models: Refer to our dedicated [documentation](/generative-apis/reference-content/supported-models/#embedding-models) for a list of supported embedding models. +- Large Language Models (LLMs) +- Chat / Text Generation models +- Vision models +- Embedding models +- Audio recognition models +- Custom AI models (through API only yet) + +For details, refer to our [Supported models](/generative-apis/reference-content/supported-models) catalog. -### What is the model lifecycle for Generative APIs? +Generative APIs - Dedicated Deployment supports both open-source models and your own uploaded proprietary models. + +### What is the model lifecycle for Generative APIs - Serverless? Scaleway is dedicated to updating and offering the latest versions of generative AI models, while ensuring older models remain accessible for a significant time, and also ensuring the reliability of your production applications. Learn more in our [model lifecycle policy](/generative-apis/reference-content/model-lifecycle/). +### Where are the inference servers located? +All models are currently hosted in a secure data center located in Paris, France, operated by [OPCORE](https://www.opcore.com/). This ensures low latency for European users and compliance with European data privacy regulations. + +### What Instance types are available for inference? +Generative APIs - Dedicated Deployment offers different Instance types optimized for various workloads from Scaleway's [GPU Instances](/gpu/reference-content/choosing-gpu-instance-type/) range. +You can select the Instance type based on your model’s computational needs and compatibility. + ## Pricing and billing -### How does the free tier work? -The free tier allows you to process, without incurring any costs, up to: +### How does the Free Tier work? +There is a Free Tier available for Generative APIs - Serverless. The Free Tier allows you to process, without incurring any costs, up to: - 1,000,000 tokens for models billed by tokens - 60 minutes of audio transcription for models billed by audio minutes -After reaching this limit, you will be charged per million tokens processed and per minutes of audio processed. Free tier usage is calculated by adding all input / output tokens and audio minutes consumed by all models used. +After reaching this limit, you will be charged per million tokens processed and per minutes of audio processed. Free Tier usage is calculated by adding all input / output tokens and audio minutes consumed by all models used. 
For more information, refer to our [pricing page](https://www.scaleway.com/en/pricing/model-as-a-service/#generative-apis) or access your bills by models in the [billing section of the Scaleway Console](https://console.scaleway.com/billing/payment) (past and provisional bills for the current month). -When your consumption exceeds the free tier, you will be billed for each additional token consumed by the model and token type. **The minimum billing unit is 1,000 tokens**. +When your consumption exceeds the Free Tier, you will be billed for each additional token consumed by the model and token type. **The minimum billing unit is 1,000 tokens**. Here are two examples of low-volume consumption: Example 1: Free Tier only @@ -57,7 +89,7 @@ Example 1: Free Tier only Total tokens consumed: `900k` Total bill: `0.00€` -Example 2: Exceeding Free Tier +Example 2: Exceeding the Free Tier | Model | Token type | Tokens consumed | Price | Billed consumption | Bill | |-----------------|-----------------|-----------------|-----------------|-----------------|-----------------| @@ -72,38 +104,39 @@ Total bill: `€2.13` Note that in this example, the first line where the Free Tier applies will not display in your current Scaleway bills by model, but will instead be listed under `Offer deducted - Generative APIs Free Tier`. If you are using multiple projects (configuring a URL such as `api.scaleway.ai/{project_id}/v1/chat/completions`), Free Tier will be applied to each project proportionally to their Generative APIs consumption. Assuming that `project A` consumed `1€`, `project B` consumed `3€`, and the Free Tier amount is `2€`: `project A` Free Tier will be `(1/(3+1))*2=0.50€` and `project B` Free Tier will be `(3/(3+1))*2=1.50€`. -Free tier for audio transcription models is applied similarly for models billed by audio minutes. +Free Tier for audio transcription models is applied similarly for models billed by audio minutes. -### What are tokens, and how are they counted? +### What are tokens, and how are they counted? {/* Serverless */} A token is the minimum unit of content that is seen and processed by a model. Hence, token definitions depend on input types: -- For text, on average, `1` token corresponds to `~4` characters, and thus `0.75` words (as words are on average five characters long) +- For text, on average, `1` token corresponds to `~4` characters, and thus `0.75` words (as words are on average five characters long). - For images, `1` token corresponds to a square of pixels. For example, `mistral-small-3.1-24b-instruct-2503` model image tokens are `28x28` pixels (28-pixels height, and 28-pixels width, hence `784` pixels in total). - For audio: - `1` token corresponds to a duration of time. For example, `voxtral-small-24b-2507` model audio tokens are `80` milliseconds. - - Some models process audio in chunks having a minimum duration. For example, `voxtral-small-24b-2507` model process audio in `30` second chunks. This means audio lasting `13` seconds will be considered `375` tokens (`30` seconds / `0.08` seconds). And audio lasting `178` seconds will be considered `2 250` tokens (`30` seconds * `6` / `0.08` seconds). + - Some models process audio in chunks having a minimum duration. For example, the `voxtral-small-24b-2507` model processes audio in `30`-second chunks. This means audio lasting `13` seconds will be considered `375` tokens (`30` seconds / `0.08` seconds). And audio lasting `178` seconds will be considered `2,250` tokens (`30` seconds * `6` / `0.08` seconds); see the short sketch after this list.
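
As a quick sanity check of the chunked accounting above, here is a minimal sketch (an illustration under the assumptions stated in the list, not an official billing tool):

```python
import math

CHUNK_SECONDS = 30   # audio is processed in 30-second chunks
TOKEN_MS = 80        # one audio token covers 80 milliseconds

def estimated_audio_tokens(duration_seconds: float) -> int:
    # Partial chunks count as full chunks, as in the examples above.
    chunks = math.ceil(duration_seconds / CHUNK_SECONDS)
    return chunks * CHUNK_SECONDS * 1000 // TOKEN_MS

print(estimated_audio_tokens(13))   # 375 tokens  (1 chunk:  30 s / 0.08 s)
print(estimated_audio_tokens(178))  # 2250 tokens (6 chunks: 30 s * 6 / 0.08 s)
```
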
-The exact token count and definition depend on the [tokenizer](https://huggingface.co/learn/llm-course/en/chapter2/4) used by each model. When this difference is significant (such as for image processing), you can find detailed information in each model's documentation (for instance, in [`mistral-small-3.1-24b-instruct-2503` size limit documentation](/managed-inference/reference-content/model-catalog/#mistral-small-31-24b-instruct-2503)). When the model is open, you can also find this information in the model files on platforms such as Hugging Face, usually in the `tokenizer_config.json` file. +The exact token count and definition depend on the [tokenizer](https://huggingface.co/learn/llm-course/en/chapter2/4) used by each model. When this difference is significant (such as for image processing), you can find detailed information in each model's documentation (for instance, in [`mistral-small-3.1-24b-instruct-2503` size limit documentation](/generative-apis/reference-content/supported-models/#mistral-small-31-24b-instruct-2503)). When the model is open, you can also find this information in the model files on platforms such as Hugging Face, usually in the `tokenizer_config.json` file. -### How can I monitor my token consumption? +### How can I monitor my token consumption? {/* Serverless */} You can see your token consumption in [Scaleway Cockpit](/cockpit/). You can access it from the Scaleway console under the [Metrics tab](https://console.scaleway.com/generative-api/metrics). Note that: - Cockpits are isolated by Project, hence you first need to select the right Project in the Scaleway console before accessing Cockpit to see your token consumption for this Project (you can see the `project_id` in the Cockpit URL: `https://{project_id}.dashboard.obs.fr-par.scw.cloud/`). - Cockpit graphs can take up to 5 minutes to update token consumption. See [Troubleshooting](/generative-apis/troubleshooting/fixing-common-issues/#tokens-consumption-is-not-displayed-in-cockpit-metrics) for further details. -In these dashboards, you can view consumption by IAM principal (users or applications) by filtering on the IAM principal label or by opening the **Usage details per IAM Principal ID** panel. +In these dashboards, you can view consumption by IAM principal (users or applications) by filtering on the IAM principal label or by opening the **Usage details per IAM Principal ID** panel. Note that for low consumption volumes (less than a few million tokens per hour), the displayed values over long time ranges (e.g., several days) may be inaccurate by several percent. This is a known limitation of Grafana queries using PromQL for discrete values: while exact consumption is stored in metrics, aggregated values in graphs are extrapolated from instantaneous samples. -To minimize this effect, you can: +To minimize this effect, you can: - Select a narrower time range (e.g., 1 hour) to ensure sufficient data samples for accurate reconstruction of the consumption graph. - Query the data directly using PromQL on the data source. -### Can I configure a maximum billing threshold? + +### Can I configure a maximum billing threshold? {/* Serverless */} Currently, you cannot configure a specific threshold after which your usage will be blocked. However: - You can [configure billing alerts](/billing/how-to/use-billing-alerts/) to ensure you are warned when you hit specific budget thresholds. 
- Your total billing remains limited by the amount of tokens you can consume within [rate limits](/generative-apis/reference-content/rate-limits/). -- To ensure fixed billing, you can use [Managed Inference](https://www.scaleway.com/en/inference/), which provides the same set of OpenAI-compatible APIs and a wider range of models. +- To ensure fixed billing, you can use [Generative APIs - Dedicated Deployment](https://www.scaleway.com/en/generative-apis/), which provides the same set of OpenAI-compatible APIs and a wider range of models. -### How can I give access to token consumption to my users outside Scaleway? +### How can I give access to token consumption to my users outside Scaleway? {/* Serverless */} If your users do not have a Scaleway account, you can still give them access to their Generative API usage consumption by either: - Collecting consumption data from the [Billing API](https://www.scaleway.com/en/developers/api/billing/#path-consumption-get-monthly-consumption) and exposing it to your users. Consumption can be detailed by Projects. @@ -126,13 +159,24 @@ Note that: - Cockpits are isolated by Projects. You first need to select the right Project in the Scaleway console before accessing Cockpit to see your token consumption for the desired Project (you can see the `project_id` in the Cockpit URL: `https://{project_id}.dashboard.obs.fr-par.scw.cloud/`). - Cockpit graphs can take up to 5 minutes to update token consumption. See [Troubleshooting](/generative-apis/troubleshooting/fixing-common-issues/#tokens-consumption-is-not-displayed-in-cockpit-metrics) for further details. +### How is Generative APIs - Dedicated Deployment billed? {/* Dedicated */} +Billing is based on the Instance type and usage duration (in minutes). Unlike Generative APIs - Serverless, which are billed per token, Generative APIs - Dedicated Deployment provides predictable costs based on the allocated infrastructure. Billing only starts when a deployment is ready and can be queried. +Pricing details can be found on the [Scaleway pricing page](https://www.scaleway.com/en/pricing/model-as-a-service/#managed-inference). + +### Can I pause Generative APIs - Dedicated Deployment billing when the Instance is not in use? {/* Dedicated */} +When a dedicated Generative APIs deployment is running, corresponding resources are provisioned and thus billed. Resources can therefore not be paused. +However, you can still optimize your Generative APIs dedicated deployment to fit within specific time ranges (such as during working hours). To do so, you can automate deployment creation and deletion using the [Generative APIs - Dedicated Deployment API](https://www.scaleway.com/en/developers/api/managed-inference/), [Terraform](https://registry.terraform.io/providers/scaleway/scaleway/latest/docs/resources/inference_deployment), or [Scaleway SDKs](/scaleway-sdk/). These actions can be programmed using [Serverless Jobs](/serverless-jobs/) to be automatically carried out periodically. {/* COMMENT: Developer Reference link has to be updated. */} + ## Specifications -### What are the SLAs applicable to Generative APIs? -Generative APIs targets a 99.9% monthly availability rate, as detailed in [Service Level Agreement for Generative APIs](https://www.scaleway.com/en/generative-apis/sla/). +### What are the SLAs applicable to Generative APIs - Serverless? 
+Generative APIs - Serverless targets a 99.9% monthly availability rate, as detailed in [Service Level Agreement - Generative APIs](https://www.scaleway.com/en/generative-apis/sla/). + +### What are the SLAs applicable to Generative APIs - Dedicated Deployment? +We are currently working on defining our SLAs for Generative APIs - Dedicated Deployment. We will provide more information on this topic soon. -### What are the performance guarantees (vs Managed Inference)? -Generative APIs is optimized and monitored to provide reliable performance in most use cases, but does not strictly guarantee performance as it depends on many client-side parameters. We recommend using Managed Inference (dedicated deployment capacity) for applications with critical performance requirements. +### What are the performance guarantees for Serverless (vs Dedicated Deployment)? +Generative APIs - Serverless is optimized and monitored to provide reliable performance in most use cases, but does not strictly guarantee performance as it depends on many client-side parameters. We recommend using Generative APIs - Dedicated Deployment (dedicated deployment capacity) for applications with critical performance requirements. As an order of magnitude, for Chat models, when performing requests with `stream` activated: - Time to first token should be less than `1` second for most standard queries (with less than 1,000 input tokens) @@ -142,46 +186,59 @@ Exact performance will still vary based mainly on the following factors: - Model size and architecture: Smaller and more recent models usually provide better performance. - Model type: - Chat models' time to first token increases proportionally to the input context size after a certain threshold (usually above `1,000` tokens). - - Audio transcription models' time to first token remains mostly constant, as they only need to process small numbers of input tokens (`30` seconds audio chunk) to generate a first output. + - Audio transcription models' time to first token remains mostly constant, as they only need to process small numbers of input tokens (`30`-second audio chunk) to generate a first output. - Input and output size: In rough terms, total processing time is proportional to input and output size. However, for larger queries (usually above `10,000` tokens), processing speed may degrade with query size. For optimal performance, we recommend splitting queries into the smallest meaningful parts (`10` queries with `1,000` input tokens and `100` output tokens will be processed faster than `1` query with `10,000` input tokens and `1,000` output tokens). -### How long does a batch take to be processed using Batches API, and how do I optimize this time? +### How long does a batch take to be processed using the Serverless Batches API endpoint, and how do I optimize this time? {/* Serverless */} We aim to process any batch within **24 hours**. After this delay, batch processing will be stopped, and any remaining unprocessed queries will not be billed. Batches are processed in the order they were created. You can optimize time before receiving a batch output by splitting a batch into multiple smaller ones. For example: -- Assuming a batch of `10 000` requests will take `10 hours` to be processed. -- Splitting this batch into `10 batches` of `1.000 requests` each will take the same time (e.g., `10 hours`) to process all batches. However, the first batch output will be provided after `1 hour`, the second one after `2 hours`, and so on. 
+- Assuming a batch of `10,000` requests will take `10 hours` to be processed. +- Splitting this batch into `10 batches` of `1,000 requests`, each will take the same time (e.g., `10 hours`) to process all batches. However, the first batch output will be provided after `1 hour`, the second one after `2 hours`, and so on. + +### What are the performance guarantees for Dedicated Deployment (vs Serverless)? +Generative APIs - Dedicated Deployment provides dedicated resources, ensuring predictable performance and lower latency compared to Generative APIs - Serverless, which is a shared, serverless offering optimized for infrequent traffic with moderate peak loads. Generative APIs - Dedicated Deployment is ideal for workloads that require consistent response times, high availability, custom hardware configurations, or generate extreme peak loads during a narrow period. +Compared to Generative APIs - Serverless, no usage quota is applied to the number of tokens per second generated, since the output is limited by the GPU Instance size and the number of your dedicated Scaleway Generative APIs deployments. ## Quotas and limitations -### Are there any rate limits for API usage? +### Are there any rate limits for Serverless API usage? {/* Serverless */} Yes, API rate limits define the maximum number of requests a user can make within a specific time frame to ensure fair access and resource allocation between users. If you require increased rate limits, we recommend either: -- Using [Batches API](https://console.scaleway.com/generative-api/batches) for non-real time workloads. Requests performed through Batches API do not have a rate limit and are billed with a [-50% discount compared to standard model prices](https://www.scaleway.com/en/pricing/model-as-a-service/#generative-apis). -- Using [Managed Inference](https://console.scaleway.com/inference/deployments), which provides dedicated capacity and doesn't enforce rate limits (you remain limited by the total provisioned capacity) +- Using the [Batches API](https://console.scaleway.com/generative-api/batches) for non-real time workloads. Requests performed through the Batches API do not have a rate limit and are billed with a [-50% discount compared to standard model prices](https://www.scaleway.com/en/pricing/model-as-a-service/#generative-apis). +- Using [Generative APIs - Dedicated Deployment](https://console.scaleway.com/generative-api/deployments), which provides dedicated capacity and doesn't enforce rate limits (you remain limited by the total provisioned capacity) - Contacting your existing Scaleway account manager or our Sales team to discuss volume commitment for specific models that will allow us to increase your quota proportionally. Refer to our dedicated [documentation](/generative-apis/reference-content/rate-limits/) for more information on rate limits. -### Can I increase maximum output (completion) tokens for a model? -No, you cannot increase maximum output tokens above [limits for each model](/generative-apis/reference-content/supported-models/) in Generative APIs. +### Can I increase maximum output (completion) tokens for a model when using Serverless? {/* Serverless */} +No, you cannot increase the maximum number of output tokens beyond the [limits for each model](/generative-apis/reference-content/supported-models/). These limits are in place to protect you against: - Long generation, which may end by an HTTP timeout. Limits are designed to ensure a model will send its HTTP response in less than 5 minutes. 
- Uncontrolled billing, as several models are known to be able to enter infinite generation loops (specific prompts can make the model generate the same sentence over and over, without stopping at all). -If you require higher maximum output tokens, you can use [Managed Inference](https://console.scaleway.com/inference/deployments), where these limits do not apply (as your bill will be limited to the size of your deployment). +If you require higher maximum output tokens, you can use [Generative APIs - Dedicated Deployment](https://console.scaleway.com/generative-api/deployments), where these limits do not apply (as your bill will be limited to the size of your deployment). -### Can I increase the maximum number of concurrent requests? +### Can I increase the maximum number of concurrent requests when using Serverless? -By default, you cannot increase the maximum number of concurrent requests beyond the [limits for all model](/organizations-and-projects/additional-content/organization-quotas/#generative-apis) in Generative APIs. +By default, you cannot increase the maximum number of concurrent requests beyond the [limits for all models](/organizations-and-projects/additional-content/organization-quotas/#generative-apis). -However, for embedding models, you can batch multiple inputs by providing an [array of strings](https://www.scaleway.com/en/developers/api/generative-apis/#path-embeddings-create-an-embedding) in a single request. For example, with `qwen3-embedding-8b`, you can send up to `2048` strings of `32 000` input tokens **each**, in a single query. +However, for embedding models, you can batch multiple inputs by providing an [array of strings](https://www.scaleway.com/en/developers/api/generative-apis/#path-embeddings-create-an-embedding) in a single request. For example, with `qwen3-embedding-8b`, you can send up to `2,048` strings of `32,000` input tokens **each**, in a single query. + +If you have a specific use case that requires higher concurrency limits, we recommend using [Generative APIs - Dedicated Deployment](https://console.scaleway.com/generative-api/deployments), where these limits do not apply or, [contacting our support team](https://console.scaleway.com/support/tickets/create?for=product&productName=generativeApi) with details about your use case and your expected concurrency requirements. + +### Do model licenses apply when using Serverless? +Yes, you need to comply with model licenses when using Generative APIs - Serverless. Applicable licenses are available for [each model in our documentation](/generative-apis/reference-content/supported-models) and in the console Playground. + +### Do model licenses apply when using Dedicated Deployment? +Yes, model licenses need to be complied with when using Generative APIs - Dedicated Deployment. Applicable licenses are available for [each model in our documentation](/generative-apis/reference-content/supported-models). +- For models provided in the Scaleway catalog, you need to accept licenses (including potential EULA) before creating any dedicated Generative APIs deployment. +- For custom models you choose to import on Scaleway, you are responsible for complying with model licenses (as with any software you choose to install on a GPU Instance, for example). 
-If you have a specific use case that requires higher concurrency limits, we recommend using [Managed Inference](https://console.scaleway.com/inference/deployments), where these limits do not apply or, [contacting our support team](https://console.scaleway.com/support/tickets/create?for=product&productName=generativeApi) with details about your use case and your expected concurrency requirements. ## Compatibility and integration -### Can I use OpenAI libraries and APIs with Scaleway's Generative APIs? -Yes, Scaleway's Generative APIs are designed to be compatible with OpenAI libraries and SDKs, including the OpenAI Python client library and LangChain SDKs. This allows for seamless integration with existing workflows. +### Can I use OpenAI libraries and APIs with Scaleway's Generative APIs? {/* QUESTION: Is this true for Dedicated too? Should this section be merged with the next one? */} +Yes, Scaleway's Generative APIs - Serverless is designed to be compatible with OpenAI libraries and SDKs, including the OpenAI Python client library and LangChain SDKs. This allows for seamless integration with existing workflows. For detailed information, see [OpenAI API compatibility](/generative-apis/reference-content/openai-compatibility/) documentation. -### How can I convert audio files to a supported format? +### How can I convert audio files to a supported format? {/* Serverless */} For audio transcription, supported formats are: `flac`, `mp3`, `mpeg`, `mpga`, `oga`, `ogg`, `wav`. For unsupported formats such as `m4a`, we recommend using third-party libraries or tools to convert them to a supported format, such as [ffmpeg](https://www.ffmpeg.org/) or [VLC](https://www.videolan.org/vlc/). For example, you can convert an `m4a` file to `mp3` using `ffmpeg` with: @@ -190,7 +247,7 @@ ffmpeg -i audio-file.m4a audio-file.mp3 ``` Where `audio-file.m4a` is your original file. -### Can I transcribe audio streams? +### Can I transcribe audio streams? {/* Serverless */} Streams are currently only supported for audio output, not input. As a workaround, you can send small chunks of audio lasting a few seconds each, and activate output streaming with: ```bash @@ -201,24 +258,33 @@ curl https://api.scaleway.ai/v1/audio/transcriptions \ -F model="whisper-large-v3" \ -F stream=true ``` -Audio streaming will start as soon as the first 30-second chunk is processed, i.e. after only a few seconds. This is close enough to realtime for many user / audio agent interactions. -If you need to stitch together audio transcription and avoid word duplication between two segments, you can provide the last few words of chunk transcription as `prompt` for the next chunk. This will guide model decoding and provide a better output. +Audio streaming will start as soon as the first 30-second chunk is processed, i.e., after only a few seconds. This is close enough to realtime for many user / audio agent interactions. +If you need to stitch together audio transcriptions and avoid word duplication between two segments, you can provide the last few words of a chunk's transcription as a `prompt` for the next chunk. This will guide model decoding and provide a better output. Since billing is done per second of audio input for models such as `whisper-large-v3`, this method does not incur additional costs. Billing is based on the length (in seconds) of each audio file. -### Which vector database can I use to store embeddings? 
-Since `/embeddings` API returns a raw list of vector coordinates, all vector databases are by default compatible with this format. +### Which vector database can I use to store embeddings? {/* Serverless */} +Since the `/embeddings` API returns a raw list of vector coordinates, all vector databases are by default compatible with this format. However, some vector databases may only support a maximum number of `dimensions` below the dimensions returned by a model. -In this case, we recommend using models which support custom dimensions number (also known as [Matryoshka embeddings](https://huggingface.co/blog/matryoshka)). +In this case, we recommend using models which support custom numbers of dimensions (also known as [Matryoshka embeddings](https://huggingface.co/blog/matryoshka)). -As an example, when using the PostgreSQL `pgvector` extension, we recommend using the `qwen3-embedding-8b` embedding model with `2000` dimensions, to ensure compatibility with vector indexes such as `hnsw` or `ivfflat`. +As an example, when using the PostgreSQL `pgvector` extension, we recommend using the `qwen3-embedding-8b` embedding model with `2,000` dimensions, to ensure compatibility with vector indexes such as `hnsw` or `ivfflat`. + +### Can I use Generative APIs - Dedicated Deployment with other Scaleway services? {/* Dedicated */} +Absolutely. Generative APIs - Dedicated Deployment integrates seamlessly with other Scaleway services, such as [Object Storage](/object-storage/quickstart/) for model hosting, [Kubernetes](/kubernetes/quickstart/) for containerized applications, and [Scaleway IAM](/iam/quickstart/) for access management. + +### Does Generative APIs - Dedicated Deployment support model quantization? {/* Dedicated */} +Yes, Scaleway Generative APIs - Dedicated Deployment supports model [quantization](/generative-apis/concepts/#quantization) to optimize performance and reduce inference latency. You can select different quantization options depending on your accuracy and efficiency requirements. ## Usage and management -### Do model licenses apply when using Generative APIs? -Yes, you need to comply with model licenses when using Generative APIs. Applicable licenses are available for [each model in our documentation](/generative-apis/reference-content/supported-models/#vision-models) and in the console Playground. +### How can I monitor performance? {/* Dedicated */} +Generative APIs - Dedicated Deployment metrics and logs are available in [Scaleway Cockpit](https://console.scaleway.com/cockpit/overview). You can follow your deployment metrics in real-time, such as token throughput, request latency, GPU power usage, and GPU VRAM usage. ## Privacy and safety -### Where can I find the privacy policy regarding Generative APIs? -You can find the privacy policy applicable to all use of Generative APIs [here](/generative-apis/reference-content/data-privacy/). +### Where can I find information regarding the data, privacy, and security policies applied to Scaleway's AI services? 
+You can find detailed information regarding the policies applied to Scaleway's AI services in our dedicated documentation: + +- [Understand the Generative APIs Privacy Policy](/generative-apis/reference-content/data-privacy/) +- [Security and Reliability in Generative APIs](/generative-apis/reference-content/security-and-reliability/) diff --git a/pages/managed-inference/how-to/assets/scaleway-api-authentication.webp b/pages/generative-apis/how-to/assets/scaleway-api-authentication.webp similarity index 100% rename from pages/managed-inference/how-to/assets/scaleway-api-authentication.webp rename to pages/generative-apis/how-to/assets/scaleway-api-authentication.webp diff --git a/pages/managed-inference/how-to/assets/scaleway-inference-pn-connected.webp b/pages/generative-apis/how-to/assets/scaleway-inference-pn-connected.webp similarity index 100% rename from pages/managed-inference/how-to/assets/scaleway-inference-pn-connected.webp rename to pages/generative-apis/how-to/assets/scaleway-inference-pn-connected.webp diff --git a/pages/managed-inference/how-to/assets/scaleway-inference-pn-detach.webp b/pages/generative-apis/how-to/assets/scaleway-inference-pn-detach.webp similarity index 100% rename from pages/managed-inference/how-to/assets/scaleway-inference-pn-detach.webp rename to pages/generative-apis/how-to/assets/scaleway-inference-pn-detach.webp diff --git a/pages/managed-inference/how-to/assets/scaleway-inference-tls-dl.webp b/pages/generative-apis/how-to/assets/scaleway-inference-tls-dl.webp similarity index 100% rename from pages/managed-inference/how-to/assets/scaleway-inference-tls-dl.webp rename to pages/generative-apis/how-to/assets/scaleway-inference-tls-dl.webp diff --git a/pages/generative-apis/how-to/change-model.mdx b/pages/generative-apis/how-to/change-model.mdx new file mode 100644 index 0000000000..ef4b92fffc --- /dev/null +++ b/pages/generative-apis/how-to/change-model.mdx @@ -0,0 +1,36 @@ +--- +title: How to change the model of a dedicated Generative APIs deployment +description: Learn how to change the model of your dedicated Scaleway Generative APIs deployment in just a few easy clicks. +tags: generative-apis-dedicated-deployment ai-data change model +dates: + posted: 2025-07-18 + validation: 2026-04-15 +categories: + - ai-data +--- +import Requirements from '@macros/iam/requirements.mdx' + + +You can change the model used by your dedicated Generative APIs deployment at any time, as long as the new model is compatible with the existing deployment node. If you want to change to a model that is too big for your current node type, you must [delete your existing deployment](/generative-apis/how-to/delete-deployment/) and [create a new one](/generative-apis/how-to/create-deployment/) with a compatible node type. + +Follow the steps below to change the model using the Scaleway console: + + + + - A Scaleway account logged into the [console](https://console.scaleway.com) + - A [dedicated Generative APIs deployment](/generative-apis/how-to/create-deployment/) + - [Owner](/iam/concepts/#owner) status or [IAM permissions](/iam/concepts/#permission) allowing you to perform actions in the intended Organization + +1. Click **Generative APIs** in the **AI** section of the side menu in the [Scaleway console](https://console.scaleway.com/) to access the dashboard. The list of models displays. +2. Select the **Deployments** tab. +3. From the drop-down menu, select the geographical region you want to manage. +4. Click a deployment name to access the deployment's dashboard. +5. 
On the **Overview** tab, click **Change** next to the model name. + A pop-up displays, showing your current model. +6. From the drop-down menu, select the model you want to change to, and click **Change model**. +

+    The change of model is initiated. Note that while the model is changed, your deployment will not be available for 15-30 minutes.
+
+Remember to update the model string in your client configuration to reflect the new model. You can find the model string in the code sample available in the **Playground** tab of your deployment's dashboard (use the **View code** button).
+
+If you have changed to a different type of model (e.g., from a chat model to an embedding model), you will also need to update the client code itself, in addition to updating the model string.
\ No newline at end of file
diff --git a/pages/generative-apis/how-to/configure-autoscaling.mdx b/pages/generative-apis/how-to/configure-autoscaling.mdx
new file mode 100644
index 0000000000..23d53c39bd
--- /dev/null
+++ b/pages/generative-apis/how-to/configure-autoscaling.mdx
@@ -0,0 +1,38 @@
+---
+title: How to scale dedicated Generative APIs deployments
+description: This page explains how to scale dedicated Generative APIs deployments in size
+tags: generative-apis-dedicated-deployment ai-data ip-address
+dates:
+  validation: 2026-04-16
+  posted: 2025-06-03
+---
+import Requirements from '@macros/iam/requirements.mdx'
+
+
+You can scale your dedicated Generative APIs deployment up or down to match the incoming load on your deployment.
+
+
+This feature is currently in [Public Beta](https://www.scaleway.com/en/betas/).
+
+
+
+
+  - A Scaleway account logged into the [console](https://console.scaleway.com)
+  - A [dedicated Generative APIs deployment](/generative-apis/how-to/create-deployment/)
+  - [Owner](/iam/concepts/#owner) status or [IAM permissions](/iam/concepts/#permission) allowing you to perform actions in the intended Organization
+
+## How to scale a dedicated Generative APIs deployment in size
+
+1. Click **Generative APIs** in the **AI** section of the side menu in the [Scaleway console](https://console.scaleway.com/) to access the dashboard. The list of models displays.
+2. Select the **Deployments** tab.
+3. From the drop-down menu, select the geographical region you want to manage.
+4. Click a deployment name to access the deployment's dashboard.
+5. Click the **Settings** tab and navigate to the **Scaling** section.
+6. Click **Update node count** and adjust the number of nodes in your deployment.
+
+    High availability is only guaranteed with two or more nodes.
+
+7. Click **Update node count** to update the number of nodes in your deployment.
+
+    Your deployment will be unavailable for 15-30 minutes while the node update is in progress.
+    
\ No newline at end of file
diff --git a/pages/managed-inference/how-to/create-deployment.mdx b/pages/generative-apis/how-to/create-deployment.mdx
similarity index 65%
rename from pages/managed-inference/how-to/create-deployment.mdx
rename to pages/generative-apis/how-to/create-deployment.mdx
index 9f8a339110..0b22591a59 100644
--- a/pages/managed-inference/how-to/create-deployment.mdx
+++ b/pages/generative-apis/how-to/create-deployment.mdx
@@ -1,9 +1,9 @@
 ---
-title: How to deploy a model on Scaleway Managed Inference
-description: This page explains how to deploy a Managed Inference model on the Scaleway console.
-tags: managed-inference ai-data
+title: How to deploy a model on a dedicated Generative APIs deployment
+description: This page explains how to deploy a Generative APIs model on the Scaleway console.
+tags: generative-apis-dedicated-deployment ai-data dates: - validation: 2025-07-21 + validation: 2026-04-16 posted: 2024-03-06 --- import Requirements from '@macros/iam/requirements.mdx' @@ -14,18 +14,19 @@ import Requirements from '@macros/iam/requirements.mdx' - A Scaleway account logged into the [console](https://console.scaleway.com) - [Owner](/iam/concepts/#owner) status or [IAM permissions](/iam/concepts/#permission) allowing you to perform actions in the intended Organization -1. Click the **AI** section of the [Scaleway console](https://console.scaleway.com/), and select **Managed Inference** from the side menu to access the Managed Inference dashboard. -2. From the drop-down menu, select the geographical region where you want to create your deployment. +1. Click **Generative APIs** in the **AI** section of the side menu in the [Scaleway console](https://console.scaleway.com/) to access the dashboard. The list of models displays. +2. Select the **Deployments** tab. 3. Click **Deploy a model** to launch the model deployment wizard. -4. Provide the necessary information: - - Select the desired model and quantization to use for your deployment [from the available options](/managed-inference/reference-content/). +4. From the drop-down menu, select the geographical region where you want to create your deployment. +5. Provide the necessary information: + - Choose the geographical **region** for the deployment. + - Select the desired model and quantization to use for your deployment [from the available options](/generative-apis/reference-content/). - Scaleway Managed Inference allows you to deploy various AI models, either from the Scaleway catalog or by importing a custom model. For detailed information about supported models, visit our [Supported models in Managed Inference](/managed-inference/reference-content/supported-models/) documentation. + Scaleway Generative APIs - Dedicated Deployment allows you to deploy various AI models, either from the Scaleway catalog or by importing a custom model. For detailed information about supported models, visit our [Supported models](/generative-apis/reference-content/supported-models/) documentation. Some models may require acceptance of an end-user license agreement (EULA). If prompted, review the terms and conditions and accept the license accordingly. - - Choose the geographical **region** for the deployment. - For custom models: Choose the model quantization. Each model comes with a default quantization. Select lower bits quantization to improve performance and enable the model to run on smaller GPU nodes, while potentially reducing precision. @@ -35,12 +36,12 @@ import Requirements from '@macros/iam/requirements.mdx' High availability is only guaranteed with two or more nodes. -5. Enter a **name** for the deployment, and optional tags. -6. Configure the **network connectivity** settings for the deployment: +6. Enter a **name** for the deployment, and optional tags. +7. Configure the **network connectivity** settings for the deployment: - Attach to a **Private Network** for secure communication and restricted availability. Choose an existing Private Network from the drop-down list, or create a new one. - Set up **Public connectivity** to access resources via the public internet. Authentication by API key is enabled by default. - Enabling both private and public connectivity will result in two distinct endpoints (public and private) for your deployment. - Deployments must have at least one endpoint, either public or private. -7. 
Click **Deploy model** to launch the deployment process. Once the model is ready, it will be listed among your deployments. \ No newline at end of file +8. Click **Deploy model** to launch the deployment process. Once the model is ready, it will be listed among your deployments. \ No newline at end of file diff --git a/pages/managed-inference/how-to/managed-inference-with-private-network.mdx b/pages/generative-apis/how-to/dedicated-deployment-with-private-network.mdx similarity index 58% rename from pages/managed-inference/how-to/managed-inference-with-private-network.mdx rename to pages/generative-apis/how-to/dedicated-deployment-with-private-network.mdx index 6a00103522..1f77b3285f 100644 --- a/pages/managed-inference/how-to/managed-inference-with-private-network.mdx +++ b/pages/generative-apis/how-to/dedicated-deployment-with-private-network.mdx @@ -1,9 +1,9 @@ --- -title: How to use your Managed Inference deployment with a Private Network -description: Learn how to deploy and manage AI models securely using Scaleway's Managed Inference with a private network configuration, ensuring optimal performance and data protection. -tags: managed-inference ai-data private-network +title: How to use your dedicated Generative APIs deployment with a Private Network +description: Learn how to deploy and manage AI models securely using Scaleway's Generative APIs - Dedicated Deployment with a private network configuration, ensuring optimal performance and data protection. +tags: generative-apis-dedicated-deployment ai-data private-network dates: - validation: 2025-07-16 + validation: 2026-04-15 posted: 2024-06-18 --- import Requirements from '@macros/iam/requirements.mdx' @@ -13,64 +13,62 @@ import image2 from './assets/scaleway-inference-tls-dl.webp' import image3 from './assets/scaleway-inference-pn-detach.webp' -In this tutorial, we guide you through the process of attaching a Private Network to your Managed Inference deployment. +In this tutorial, we guide you through the process of attaching a Private Network to your dedicated Generative APIs deployment. This can be done during the initial setup or added later to an existing deployment. -Using a Private Network for communications between the Instances hosting your applications and your Managed Inference deployment ensures secure communication between resources, with low latency. +Using a Private Network for communications between the Instances hosting your applications and your dedicated Generative APIs deployment ensures secure communication between resources, with low latency. - A Scaleway account logged into the [console](https://console.scaleway.com) - [Owner](/iam/concepts/#owner) status or [IAM permissions](/iam/concepts/#permission) allowing you to perform actions in the intended Organization - - A [Managed Inference deployment](/managed-inference/quickstart/) + - A [dedicated Generative APIs deployment](/generative-apis/how-to/create-deployment/) -## How to attach a Private Network to a Managed Inference deployment +## How to attach a Private Network to a dedicated Generative APIs deployment ### Attaching a Private Network during model deployment -1. Click **Managed Inference** in the **AI** section of the [Scaleway console](https://console.scaleway.com) side menu. A list of your deployments displays. -2. From the drop-down menu, select the geographical region where you want to deploy. -3. Navigate to the **Deployments** section and click **Deploy a model**. The setup wizard displays. -4. 
During the [setup](/managed-inference/how-to/create-deployment/), in the step **Configure network connectivity**, check the **Attach to Private Network** box. +1. Click **Generative APIs** in the **AI** section of the side menu in the [Scaleway console](https://console.scaleway.com/) to access the dashboard. The list of models displays. +2. Select the **Deployments** tab. +3. Click **Deploy a model** to launch the model deployment wizard. +4. During [setup](/generative-apis/how-to/create-deployment/), in step **Configure network connectivity**, check the **Attach to Private Network** box. 5. Choose one of the following options: - **Attach an existing Private Network**: Select from the list of available networks. - **Create a new Private Network**: Choose this option (at the bottom of the list) to create a new Private Network to attach the model to. -6. *Complete the deployment setup process and click **Deploy model**. +6. Complete the deployment setup process and click **Deploy model**. -Your Managed Inference model will be deployed, and it will be attached to the selected Private Network. +Your Generative APIs - Dedicated Deployment model will be deployed, and it will be attached to the selected Private Network. ### Attaching a Private Network to an existing deployment -1. Click **Managed Inference** in the **AI** section of the [Scaleway console](https://console.scaleway.com) side menu. A list of your deployments displays. -2. From the drop-down menu, select the geographical region you want to manage. -3. Click a deployment name or > **More info** to access the deployment dashboard. -4. Go to the **Overview** tab and locate the **Endpoints** section. -5. Click **Attach Private Network**. Two options are available: +1. Click **Generative APIs** in the **AI** section of the side menu in the [Scaleway console](https://console.scaleway.com/) to access the dashboard. The list of models displays. +2. Select the **Deployments** tab. +3. From the drop-down menu, select the geographical region you want to manage. +4. Click a deployment name to access the deployment's dashboard. +5. Go to the **Security** tab and locate the **Private Network** section. +6. Click **Attach Private Network**. Two options are available: - **Attach an existing Private Network**: Select from the list of available networks. - **Create a new Private Network**: Choose this option from the end of the list to create a new Private Network to attach the model to. - - Alternatively, you can access the **Security tab** and attach a network from the **Private Network** section. - -6. Click **Attach to Private Network** to confirm. +7. Click **Attach to Private Network** to confirm. ### Verifying the Private Network connection -1. After attaching a Private Network, go to the **Security** tab. +1. After attaching a Private Network, go to the **Security** tab of your deployment. 2. You should see the **Private Network** connected to the deployment resource and its **allocated IPs** listed. - + ## How to send inference requests in a Private Network -For more information on managing access to deployments in a Private Network, see [How to manage access to deployments](/managed-inference/how-to/manage-allowed-ips/). +For more information on managing access to deployments in a Private Network, see [How to manage access to deployments](/generative-apis/how-to/manage-allowed-ips/). 1. [Create an Instance](/instances/how-to/create-an-instance/) which will host the inference application. 
- Ensure the Instance is attached to a Private Network in the same VPC as your Managed Inference deployment. + Ensure the Instance is attached to a Private Network in the same VPC as your dedicated Generative APIs deployment. -2. Download the TLS certificate from your Managed Inference deployment, available from the **Overview** tab in the **Endpoints** section. - +2. Download the TLS certificate from your dedicated Generative APIs deployment, available from the **Overview** tab in the **Endpoints** section. + 3. Transfer the TLS certificate to the Instance. You can use the `scp` (secure copy) command to securely transfer the certificate from your local machine to the Scaleway Instance. - Example command: ```bash @@ -92,7 +90,7 @@ For more information on managing access to deployments in a Private Network, see ``` 6. Paste the following Python code sample into your `inference.py` file: - This script takes an example of a conversational task with a request sent to an [LLM through the Chat Completions API](/managed-inference/reference-content/openai-compatibility/#chat-completions-api-or-responses-api). + This script takes an example of a conversational task with a request sent to an [LLM through the Chat Completions API](/generative-apis/reference-content/openai-compatibility/#chat-completions-api-or-responses-api). ```py import requests @@ -139,17 +137,19 @@ For more information on managing access to deployments in a Private Network, see python3 inference.py ``` -## Detaching a Private Network from a Managed Inference deployment +## Detaching a Private Network from a dedicated Generative APIs deployment -1. Click **Managed Inference** in the **AI** section of the [Scaleway console](https://console.scaleway.com) side menu. A list of your deployments displays. -2. Click a deployment name or > **More info** to access the deployment dashboard. -3. Go to the **Overview** tab and locate the **Endpoints** section. -4. Click **Detach Private Network**. A pop-up displays. - -5. Click **Detach resource** to confirm the removal of the private endpoint for your deployment. +1. Click **Generative APIs** in the **AI** section of the side menu in the [Scaleway console](https://console.scaleway.com/) to access the dashboard. The list of models displays. +2. Select the **Deployments** tab. +3. From the drop-down menu, select the geographical region you want to manage. +4. Click a deployment name to access the deployment's dashboard. +5. Go to the **Overview** tab and locate the **Endpoints** section. +6. Click **Detach Private Network**. A pop-up displays. + +7. Click **Detach resource** to confirm the removal of the private endpoint for your deployment. Alternatively, you can access the **Security** tab and detach a network from the **Private Network** section. - Managed Inference deployments must have at least one endpoint, either public or private. Consequently, users cannot detach their private endpoint without having a public one. - + Dedicated Generative APIs deployments must have at least one endpoint, either public or private. Consequently, users cannot detach their private endpoint without having a public one. 
+ \ No newline at end of file diff --git a/pages/generative-apis/how-to/delete-deployment.mdx b/pages/generative-apis/how-to/delete-deployment.mdx new file mode 100644 index 0000000000..8d6fd363ec --- /dev/null +++ b/pages/generative-apis/how-to/delete-deployment.mdx @@ -0,0 +1,32 @@ +--- +title: How to delete a dedicated Generative APIs deployment +description: This page explains how to delete a dedicated Generative APIs deployment via the Scaleway console. +tags: generative-apis-dedicated-deployment ai-data delete +dates: + validation: 2026-04-15 + posted: 2024-03-06 +--- +import Requirements from '@macros/iam/requirements.mdx' + + +Once you have finished your inference tasks, you can delete your deployment. This page explains how to do so from the Scaleway console. + + + + - A Scaleway account logged into the [console](https://console.scaleway.com) + - A [dedicated Generative APIs deployment](/generative-apis/how-to/create-deployment/) + - [Owner](/iam/concepts/#owner) status or [IAM permissions](/iam/concepts/#permission) allowing you to perform actions in the intended Organization + +1. Click **Generative APIs** in the **AI** section of the side menu in the [Scaleway console](https://console.scaleway.com/) to access the dashboard. The list of models displays. +2. Select the **Deployments** tab. +3. From the drop-down menu, select the geographical region where your deployment was created. +4. Choose the deployment by clicking its name. The deployment's **Overview** page displays. +5. Navigate to the **Settings** tab. +6. Click **Delete deployment** at the bottom of the page. +7. Type **DELETE** to confirm and click **Delete deployment**. + + Alternatively, from the Deployments listing, click the icon next to the deployment name you no longer need, and click **Delete**. A pop-up appears. Type **DELETE** to confirm, then click **Delete deployment**. + + + Deleting a deployment is a permanent action that erases all its associated data and resources. + \ No newline at end of file diff --git a/pages/managed-inference/how-to/import-custom-model.mdx b/pages/generative-apis/how-to/import-custom-model.mdx similarity index 57% rename from pages/managed-inference/how-to/import-custom-model.mdx rename to pages/generative-apis/how-to/import-custom-model.mdx index b1a49b7014..da3b1d91cf 100644 --- a/pages/managed-inference/how-to/import-custom-model.mdx +++ b/pages/generative-apis/how-to/import-custom-model.mdx @@ -1,15 +1,15 @@ --- -title: How to import custom models into Managed Inference -description: Learn how to import your custom models into Scaleway's Managed Inference platform. -tags: managed-inference ai-data import custom model +title: How to import a custom model into a dedicated Generative APIs deployment +description: Learn how to import your custom models into Scaleway's Generative APIs - Dedicated Deployment platform. +tags: generative-apis-dedicated-deployment ai-data import custom model dates: - validation: 2025-10-03 + validation: 2026-04-15 posted: 2025-03-27 --- import Requirements from '@macros/iam/requirements.mdx' -Scaleway provides a selection of common models for deployment from the Scaleway console. If you need a specific model, you can import it into Managed Inference directly from Hugging Face or a Scaleway Object Storage bucket. +Scaleway provides a selection of common models for deployment from the Scaleway console. If you need a specific model, you can import it directly from Hugging Face or a Scaleway Object Storage bucket. 
This feature is currently in **beta stage** and will evolve in the future. @@ -19,34 +19,32 @@ Scaleway provides a selection of common models for deployment from the Scaleway - A Scaleway account logged into the [console](https://console.scaleway.com). - [Owner](/iam/concepts/#owner) status or [IAM permissions](/iam/concepts/#permission) to perform actions in your Organization. -1. Click **Managed Inference** in the **AI** section of the side menu in the [Scaleway console](https://console.scaleway.com/) to access the dashboard. -2. From the drop-down menu, select the geographical region you want to manage. -3. Click **Deploy a model** to launch the model deployment wizard. -4. In the **Choose a model** section, select **Custom model**. If you have no model yet, click **Import a model** to start the model import wizard. -5. Choose an upload source: +1. Click **Generative APIs** in the **AI** section of the side menu in the [Scaleway console](https://console.scaleway.com/) to access the dashboard. +2. Select the **Custom models** tab. +3. Click **Import a model** to launch the model import wizard. +4. Choose an upload source: - **Hugging Face**: Pull the model from Hugging Face. - **Object Storage**: This feature is coming soon. -6. Enter your Hugging Face access token, which must have READ access to the repository. +5. Enter the name of the Hugging Face repository to pull the model from. - [Learn how to generate a Hugging Face access token](https://huggingface.co/docs/hub/security-tokens). + Ensure you have access to gated models if applicable. Refer to the [Hugging Face documentation](https://huggingface.co/docs/hub/en/models-gated) for details. -7. Enter the name of the Hugging Face repository to pull the model from. +6. Enter your Hugging Face access token, which must have READ access to the repository. - Ensure you have access to gated models if applicable. Refer to the [Hugging Face documentation](https://huggingface.co/docs/hub/en/models-gated) for details. + [Learn how to generate a Hugging Face access token](https://huggingface.co/docs/hub/security-tokens). -8. Choose a name for your model. The name must be unique within your Organization and Project and cannot be changed later. -9. Click **Verify import** to check your Hugging Face credentials and ensure model compatibility. +7. Choose a name for your model. The name must be unique within your Organization and Project and cannot be changed later. +8. Click **Verify import** to check your Hugging Face credentials and ensure model compatibility. - For detailed information about supported models, visit our [Supported models in Managed Inference](/managed-inference/reference-content/supported-models/) documentation. + For detailed information about supported models, visit our [Supported models](/generative-apis/reference-content/model-catalog/) catalog. -10. Review the summary of your import, which includes: - - Context size by node type. - - Quantization options. - - Estimated cost. +9. Review the summary of your import, which includes: + - Context size by node type + - Quantization options + - Estimated cost Once checked, click **Begin import** to finalize the process. Importing a model may take some time, depending on its size. Once the import is complete, the model will appear in the model library, and you will be able to use it for deployment. -Your imported model will now appear in the model library and you can [deploy it on Managed Inference](/managed-inference/how-to/create-deployment/). 
-
+Your imported model will now appear in the model library and you can [deploy it on Generative APIs](/generative-apis/how-to/create-deployment/).
\ No newline at end of file
diff --git a/pages/managed-inference/how-to/manage-allowed-ips.mdx b/pages/generative-apis/how-to/manage-allowed-ips.mdx
similarity index 55%
rename from pages/managed-inference/how-to/manage-allowed-ips.mdx
rename to pages/generative-apis/how-to/manage-allowed-ips.mdx
index b7e2a5ec3a..fb5d621a90 100644
--- a/pages/managed-inference/how-to/manage-allowed-ips.mdx
+++ b/pages/generative-apis/how-to/manage-allowed-ips.mdx
@@ -1,9 +1,9 @@
 ---
-title: How to manage access to your Managed Inference deployments
-description: This page explains how to manage and restrict access and authentication for your Managed Inference deployments
-tags: managed-inference ai-data ip-address
+title: How to manage access to your dedicated Generative APIs deployments
+description: This page explains how to manage and restrict access and authentication for your dedicated Generative APIs deployments
+tags: generative-apis-dedicated-deployment ai-data ip-address
 dates:
-  validation: 2025-07-31
+  validation: 2026-04-15
   posted: 2024-03-06
 ---
 import Requirements from '@macros/iam/requirements.mdx'
@@ -11,10 +11,10 @@ import Requirements from '@macros/iam/requirements.mdx'
 import apiAuthentication from './assets/scaleway-api-authentication.webp'

-The **Allowed IPs** feature via ACLs is no longer available for Managed Inference deployments. We recommended using one of the alternative methods detailed in this document to restrict access to your Managed Inference deployments.
+The **Allowed IPs** feature via ACLs is no longer available for Generative APIs - Dedicated Deployment. We recommend using one of the alternative methods detailed in this document to restrict access to your dedicated Generative APIs deployments.

-You can manage and restrict access to your Managed Inference deployments via the following methods:
+You can manage and restrict access to your dedicated Generative APIs deployments via the following methods:

 - Enable or disable authentication by API key
 - Use [IAM](/iam/) features to control which API keys are accepted and under what conditions (including IP-based restrictions)
@@ -25,20 +25,21 @@ Read on for full details.

 - A Scaleway account logged into the [console](https://console.scaleway.com)
- - A [Managed Inference deployment](/managed-inference/quickstart/)
+ - A [dedicated Generative APIs deployment](/generative-apis/how-to/create-deployment/)
 - [Owner](/iam/concepts/#owner) status or [IAM permissions](/iam/concepts/#permission) allowing you to perform actions in the intended Organization

 ## How to enable or disable authentication by API key

-By default, when you create your Managed Inference deployment, authentication by API key is automatically enabled. This means that when the deployment is accessed via either its public or private endpoint, a valid Scaleway API key must accompany all requests.
+By default, when you create your dedicated Generative APIs deployment, authentication by API key is automatically enabled. This means that when the deployment is accessed via either its public or private endpoint, a valid Scaleway API key must accompany all requests.

 You can disable API key authentication at any time, for either the public endpoint, the private endpoint, or both.

-1. Click **Managed Inference** in the **AI** section of the [Scaleway console](https://console.scaleway.com) side menu.
A list of your deployments displays. -2. From the drop-down menu, select the geographical region containing your deployment. -3. Click the deployment whose authentication you want to manage. The deployment's dashboard displays. -4. Click the **Security** tab. -5. In the **Authentication** panel, use the toggles to enable or disable authentication by API key for the public and/or private endpoint. +1. Click **Generative APIs** in the **AI** section of the side menu in the [Scaleway console](https://console.scaleway.com/) to access the dashboard. The list of models displays. +2. Select the **Deployments** tab. +3. From the drop-down menu, select the geographical region containing your deployment. +4. Click the deployment whose authentication you want to manage. The deployment's dashboard displays. +5. Click the **Security** tab. +6. In the **Authentication** panel, use the toggles to enable or disable authentication by API key for the public and/or private endpoint. @@ -51,7 +52,13 @@ An API key is considered valid to access a deployment when: - It belongs to the [Owner](/iam/concepts/#owner) of the Organization which owns the deployment, or - It belongs to a [Member](/iam/concepts/#member) or [Application](/iam/concepts/#application) of the Organization which owns the deployment, and the Member/Application has appropriate [IAM permissions](/iam/reference-content/permission-sets/). -There are two IAM permission sets specific to Managed Inference deployments: `InferenceFullAccess` (allowing access to create, read, update, and delete a deployment) and `InferenceReadOnly` (allowing read-only access). Alternatively, wide-scoped permission sets such as `AllProductsFullAccess` will also allow access. +There are two IAM permission sets specific to Generative APIs - Dedicated Deployment (formerly known as Managed Inference): `InferenceFullAccess` (allowing access to create, read, update, and delete a deployment) and `InferenceReadOnly` (allowing read-only access). Alternatively, wide-scoped permission sets such as `AllProductsFullAccess` will also allow access. + + + Due to a product name change, the permission set names `InferenceFullAccess` and `InferenceReadOnly` are also changing. If you are automatically provisioning IAM policies (using Terraform, CLI, or APIs) with permission sets `InferenceFullAccess` and `InferenceReadOnly`, then you should edit your existing scripts and replace these permissions with `GenerativeApisFullAccess` and `GenerativeApisModelAccess`, respectively. + + For now, both `InferenceFullAccess` and `InferenceReadOnly` will remain available at least until 1 June 2026. If you may be impacted by the permission set name update, you will receive a dedicated communication including the definitive End Of Life date for these permission sets. + Permissions are attributed via [policies](/iam/concepts/#policy), which are then attached to a Member or Application. @@ -66,20 +73,20 @@ Read on if you want to manage access to your deployment for others. 1. [Invite Members](/iam/how-to/manage-members/) (other humans) to your Organization, or [create Applications](/iam/how-to/create-application/) (non-human users). -2. Create and attach a [policy](/iam/how-to/create-policy/) to the Member or Application, defining the permissions they should have in your Organization by selecting permission sets (e.g. `InferenceFullAccess`). If desired, define [conditions](/iam/concepts/#conditions) as part of the policy, to further restrict access based on user agent type, date/time or IP address. +2. 
Create and attach a [policy](/iam/how-to/create-policy/) to the Member or Application, defining the permissions they should have in your Organization by selecting permission sets (e.g., `InferenceFullAccess`). If desired, define [conditions](/iam/concepts/#conditions) as part of the policy, to further restrict access based on user agent type, date/time or IP address. {/* QUESTION: Verify if the name of the permission set is InferenceFullAccess. */} -All API keys generated by the Member, or for the Application, will automatically inherit the permissions you defined, and can be used to access a Managed Inference deployment's endpoint depending on those permissions. +All API keys generated by the Member, or for the Application, will automatically inherit the permissions you defined, and can be used to access a dedicated Generative APIs deployment's endpoint depending on those permissions. You can revoke access to a deployment at any time by [modifying or deleting the policy](/iam/how-to/manage-policies/) attached to the Member or Application in question. ### How to access a deployment as an Organization Member -Your access to Managed Inference deployments owned by an Organization in which you are a Member depends on the IAM permissions attributed to you by the Organization's Owner or administrators. +Your access to dedicated Generative APIs deployments owned by an Organization in which you are a Member depends on the IAM permissions attributed to you by the Organization's Owner or administrators. -Your permissions will be automatically applied to any API keys you generate for yourself in the Scaleway console. Check with your Organization Owner if you are unsure that you have the right permissions to access a Managed Inference deployment. +Your permissions will be automatically applied to any API keys you generate for yourself in the Scaleway console. Check with your Organization Owner if you are unsure that you have the right permissions to access a dedicated Generative APIs deployment. 1. Log into the [Scaleway console](https://console.scaleway.com) and [generate an API key for yourself](/iam/how-to/create-api-keys/). -2. Use this API key for authentication when sending requests to a Managed Inference deployment. +2. Use this API key for authentication when sending requests to a dedicated Generative APIs deployment. ## How to restrict access over Private Networks @@ -87,12 +94,12 @@ For enhanced security, you can remove your deployment's public endpoint, attach You can still require API key authentication via the private endpoint, and use the methods described above to fine-tune API key restrictions and access. In addition, you can also use VPC features such as Network ACLs for enhanced control and security. -1. [Create your deployment](/managed-inference/how-to/create-deployment/) without checking the **Allow public connections** box, or remove the public endpoint via its **Overview** screen in the console if you already created it with a public endpoint. -2. Ensure the deployment is [attached to a Private Network](/managed-inference/how-to/managed-inference-with-private-network/#how-to-attach-a-private-network-to-a-managed-inference-deployment). -3. Transfer the deployment's [TLS certificate](/managed-inference/how-to/managed-inference-with-private-network/#how-to-send-inference-requests-in-a-private-network) to the resources in the VPC that need to access the deployment. +1. 
[Create your deployment](/generative-apis/how-to/create-deployment/) without checking the **Allow public connections** box, or remove the public endpoint via its **Overview** screen in the console if you already created it with a public endpoint. +2. Ensure the deployment is [attached to a Private Network](/generative-apis/how-to/dedicated-deployment-with-private-network/#how-to-attach-a-private-network-to-a-dedicated-generative-apis-deployment). +3. Transfer the deployment's [TLS certificate](/generative-apis/how-to/dedicated-deployment-with-private-network/#how-to-send-inference-requests-in-a-private-network) to the resources in the VPC that need to access the deployment. 4. (Optional) Ensure that API key authentication is enabled, and use [policies](/iam/how-to/create-policy/) to define IAM-based rules and conditions for access. 5. (Optional) Use VPC features such as [Network ACLs](/vpc/reference-content/understanding-nacls/) to place IP-based restrictions on which resources in the VPC can access the deployment. -6. Follow the instructions in the [dedicated documentation](/managed-inference/how-to/managed-inference-with-private-network/#how-to-send-inference-requests-in-a-private-network) for sending requests to your deployment in a Private Network. +6. Follow the instructions in the [dedicated documentation](/generative-apis/how-to/dedicated-deployment-with-private-network/#how-to-send-inference-requests-in-a-private-network) for sending requests to your deployment in a Private Network. If your VPC has a Public Gateway advertising a default route, external resources can still access the deployment via the Public Gateway (with correct authentication). [Read more about Public Gateways](/public-gateways/). diff --git a/pages/generative-apis/how-to/monitor-deployment.mdx b/pages/generative-apis/how-to/monitor-deployment.mdx new file mode 100644 index 0000000000..e3b1236c2f --- /dev/null +++ b/pages/generative-apis/how-to/monitor-deployment.mdx @@ -0,0 +1,32 @@ +--- +title: How to monitor a dedicated Generative APIs deployment +description: This page explains how to monitor a dedicated Generative APIs deployment +tags: generative-apis-dedicated-deployment ai-data monitoring +dates: + validation: 2026-04-16 + posted: 2024-03-06 +--- +import Requirements from '@macros/iam/requirements.mdx' +import CockpitIamPermissions from '@macros/cockpit/iam-permissions-cockpit.mdx' + + +This documentation page shows you how to monitor your dedicated Generative APIs deployment with [Cockpit](/cockpit/quickstart/). + + + + - A Scaleway account logged into the [console](https://console.scaleway.com) + - A [dedicated Generative APIs deployment](/generative-apis/how-to/create-deployment/) + - [Owner](/iam/concepts/#owner) status or [IAM permissions](/iam/concepts/#permission) allowing you to perform actions in the intended Organization + + + +## How to monitor your LLM dashboard + +1. Click **Generative APIs** in the **AI** section of the side menu in the [Scaleway console](https://console.scaleway.com/) to access the dashboard. The list of models displays. +2. Select the **Deployments** tab. +3. From the drop-down menu, select the geographical region you want to manage. +4. Click a deployment name to access the deployment's dashboard. +5. Click the **Monitoring** tab of your deployment. The Cockpit overview displays. +6. Click **Open Grafana metrics dashboard** to open your Cockpit's Grafana interface. +7. Log in to Grafana. +8. 
Select your Generative APIs dashboard from the [list of your preconfigured dashboards](/cockpit/how-to/access-grafana-and-managed-dashboards/) to visualize your metrics. \ No newline at end of file diff --git a/pages/generative-apis/how-to/query-audio-models.mdx b/pages/generative-apis/how-to/query-audio-models.mdx index fca47fdc51..4475b6b1a2 100644 --- a/pages/generative-apis/how-to/query-audio-models.mdx +++ b/pages/generative-apis/how-to/query-audio-models.mdx @@ -11,8 +11,9 @@ import Requirements from '@macros/iam/requirements.mdx' Scaleway's Generative APIs service allows users to interact with powerful audio models hosted on the platform. There are several ways to interact with audio models: -- The Scaleway [console](https://console.scaleway.com) provides a complete [playground](/generative-apis/how-to/query-language-models/#accessing-the-playground), aiming to test models, adapt parameters, and observe how these changes affect the output in real-time. +- The Scaleway [console](https://console.scaleway.com) provides a complete [playground](/generative-apis/how-to/query-audio-models/#accessing-the-playground), aiming to test models, adapt parameters, and observe how these changes affect the output in real-time. - Via the [Chat Completions API](https://www.scaleway.com/en/developers/api/generative-apis/#path-chat-completions-create-a-chat-completion) or the [Audio Transcriptions API](https://www.scaleway.com/en/developers/api/generative-apis/#path-audio-create-an-audio-transcription) +- Via your own [dedicated deployment](/generative-apis/how-to/create-deployment/) of a chosen model @@ -26,20 +27,18 @@ There are several ways to interact with audio models: Scaleway provides a web playground for instruct-based models hosted on Generative APIs. 1. Navigate to **Generative APIs** under the **AI** section of the [Scaleway console](https://console.scaleway.com/) side menu. The list of models you can query displays. -2. Click the name of the chat model you want to try. Alternatively, click next to the chat model, and click **Try model** in the menu. +2. Click the name of the chat model you want to try. Alternatively, click **Try** next to the model's name. The web playground displays. ## Using the playground -1. Enter a prompt at the bottom of the page, or use one of the suggested prompts in the conversation area. +1. Upload an audio file to send to the selected audio model for transcription purposes. 2. Edit the hyperparameters listed on the right column, for example the default temperature for more or less randomness on the outputs. 3. Switch models at the top of the page, to observe the capabilities of chat models offered via Generative APIs. -4. Click **View code** to get code snippets configured according to your settings in the playground. - - -You can also use the upload button to send supported audio file formats, such as MP3, to audio models for transcription purposes. - +4. Click **Deploy**, then select the **Serverless** option to get code snippets configured according to your settings in the playground. + + You can also choose to deploy a model on your own dedicated Instance by selecting the **Dedicated** option. In this case, you can access the playground after completing the steps in the deployment wizard. Once in the playground of your deployment, click **View code** to get code snippets that match your settings in the playground. 
## Querying audio models via API @@ -72,6 +71,11 @@ pip install openai Initialize the OpenAI client with your base URL and API key: + + In the case of a dedicated Generative APIs deployment, the `base_url` value is the **Public Endpoint URL** displayed on the Overview tab of the deployment's dashboard. + + + ```python from openai import OpenAI diff --git a/pages/generative-apis/how-to/query-code-models.mdx b/pages/generative-apis/how-to/query-code-models.mdx index 6ee5776e2a..d8c2cb8816 100644 --- a/pages/generative-apis/how-to/query-code-models.mdx +++ b/pages/generative-apis/how-to/query-code-models.mdx @@ -16,6 +16,7 @@ Code models are inherently [language models](/generative-apis/how-to/query-langu As such, they will be available through the same interfaces as language models: - The Scaleway [console](https://console.scaleway.com) provides a complete [playground](/generative-apis/how-to/query-language-models/#accessing-the-playground), aiming to test models, adapt parameters, and observe how these changes affect the output in real time. - Via the [Chat API](/generative-apis/how-to/query-language-models/#querying-language-models-via-api) +- Via your own [dedicated deployment](/generative-apis/how-to/create-deployment/) of a chosen model For more information on how to query language models, read [our dedicated documentation](/generative-apis/how-to/query-language-models/). diff --git a/pages/generative-apis/how-to/query-embedding-models.mdx b/pages/generative-apis/how-to/query-embedding-models.mdx index 004d4b57cc..1c8f73260c 100644 --- a/pages/generative-apis/how-to/query-embedding-models.mdx +++ b/pages/generative-apis/how-to/query-embedding-models.mdx @@ -38,6 +38,10 @@ pip install openai Initialize the OpenAI client with your base URL and API key: + + In the case of a dedicated Generative APIs deployment, the `base_url` value is the **Public Endpoint URL** displayed on the Overview tab of the deployment's dashboard. + + ```python from openai import OpenAI diff --git a/pages/generative-apis/how-to/query-language-models.mdx b/pages/generative-apis/how-to/query-language-models.mdx index ede90d21e9..04e7db868a 100644 --- a/pages/generative-apis/how-to/query-language-models.mdx +++ b/pages/generative-apis/how-to/query-language-models.mdx @@ -12,8 +12,9 @@ import ChatCompVsResponsesApi from '@macros/ai/chat-comp-vs-responses-api.mdx' Scaleway's Generative APIs service allows users to interact with powerful language models hosted on the platform. There are several ways to interact with language models: -- The Scaleway [console](https://console.scaleway.com) provides complete [playground](/generative-apis/how-to/query-language-models/#accessing-the-playground), aiming to test models, adapt parameters, and observe how these changes affect the output in real-time. 
+- The Scaleway [console](https://console.scaleway.com) provides complete [playground](/generative-apis/how-to/query-language-models/#accessing-the-playground), aiming to test models, adapt parameters, and observe how these changes affect the output in real-time - Via the [Chat Completions API](https://www.scaleway.com/en/developers/api/generative-apis/#path-chat-completions-create-a-chat-completion) or the [Responses API](https://www.scaleway.com/en/developers/api/generative-apis/#path-responses-create-a-response) +- Via your own [dedicated deployment](/generative-apis/how-to/create-deployment/) of a chosen model @@ -22,12 +23,12 @@ There are several ways to interact with language models: - A valid [API key](/iam/how-to/create-api-keys/) for API authentication - Python 3.7+ installed on your system -## Accessing the Playground +## Accessing the playground Scaleway provides a web playground for instruct-based models hosted on Generative APIs. 1. Navigate to **Generative APIs** under the **AI** section of the [Scaleway console](https://console.scaleway.com/) side menu. The list of models you can query displays. -2. Click the name of the chat model you want to try. Alternatively, click next to the chat model, and click **Try model** in the menu. +2. Click the name of the chat model you want to try. Alternatively, click **Try** next to the model's name. The web playground displays. @@ -35,7 +36,9 @@ The web playground displays. 1. Enter a prompt at the bottom of the page, or use one of the suggested prompts in the conversation area. 2. Edit the hyperparameters listed on the right column, for example the default temperature for more or less randomness on the outputs. 3. Switch models at the top of the page, to observe the capabilities of chat models offered via Generative APIs. -4. Click **View code** to get code snippets configured according to your settings in the playground. +4. Click **Deploy**, then select the **Serverless** option to get code snippets configured according to your settings in the playground. + + You can also choose to deploy a model on your own dedicated Instance by selecting the **Dedicated** option. In this case, you can access the playground after completing the steps in the deployment wizard. Once in the playground of your deployment, click **View code** to get code snippets that match your settings in the playground. ## Querying language models via API @@ -58,6 +61,11 @@ pip install openai Initialize the OpenAI client with your base URL and API key: + + In the case of a dedicated Generative APIs deployment, the `base_url` value is the **Public Endpoint URL** displayed on the Overview tab of the deployment's dashboard. + + + ```python from openai import OpenAI diff --git a/pages/generative-apis/how-to/query-reasoning-models.mdx b/pages/generative-apis/how-to/query-reasoning-models.mdx index 26ca7fa821..8fc1018f06 100644 --- a/pages/generative-apis/how-to/query-reasoning-models.mdx +++ b/pages/generative-apis/how-to/query-reasoning-models.mdx @@ -12,12 +12,13 @@ Scaleway's Generative APIs service allows users to interact with language models A reasoning model is a language model that is capable of carrying out multiple inference steps and systematically verifying intermediate results before producing answers. You can specify how much effort it should put into reasoning via dedicated parameters, and access reasoning content in its outputs. 
Even with default parameters, such models are designed to perform better on reasoning tasks like maths and logic problems than non-reasoning language models. -Language models supporting the reasoning feature include `gpt-oss-120b`. See [Supported Models](/generative-apis/reference-content/supported-models/) for a full list. +Language models supporting the reasoning feature include `gpt-oss-120b`. See [Supported models](/generative-apis/reference-content/supported-models/) for a full list. You can interact with reasoning models in the following ways: -- Use the [playground](/generative-apis/how-to/query-language-models/#accessing-the-playground) in the Scaleway [console](https://console.scaleway.com) to test models, adapt parameters, and observe how your changes affect the output in real-time. +- Use the [playground](/generative-apis/how-to/query-reasoning-models/#accessing-the-playground) in the Scaleway [console](https://console.scaleway.com) to test models, adapt parameters, and observe how your changes affect the output in real-time - Use the [Chat Completions API](https://www.scaleway.com/en/developers/api/generative-apis/#path-chat-completions-create-a-chat-completion) or the [Responses API](https://www.scaleway.com/en/developers/api/generative-apis/#path-responses-create-a-response) +- Use your own [dedicated deployment](/generative-apis/how-to/create-deployment/) of a chosen model @@ -33,7 +34,7 @@ You can interact with reasoning models in the following ways: Scaleway provides a web playground for instruct-based models hosted on Generative APIs. 1. Navigate to **Generative APIs** under the **AI** section of the [Scaleway console](https://console.scaleway.com/) side menu. The list of models you can query displays. -2. Click the name of the chat model you want to try. Alternatively, click next to the chat model, and click **Try model** in the menu. Ensure that you choose a model with [reasoning capabilities](/generative-apis/reference-content/supported-models/). +2. Click the name of the reasoning model you want to try. Alternatively, click **Try** next to the model's name. Ensure that you choose a model with [reasoning capabilities](/generative-apis/reference-content/supported-models/). The web playground displays. @@ -42,7 +43,9 @@ The web playground displays. 1. Enter a prompt at the bottom of the page, or use one of the suggested prompts in the conversation area. 2. Edit the parameters listed on the right column, for example the default temperature for more or less randomness on the outputs. 3. Switch models at the top of the page, to observe the capabilities of chat models offered via Generative APIs. -4. Click **View code** to get code snippets configured according to your settings in the playground. +4. Click **Deploy**, then select the **Serverless** option to get code snippets configured according to your settings in the playground. + + You can also choose to deploy a model on your own dedicated Instance by selecting the **Dedicated** option. In this case, you can access the playground after completing the steps in the deployment wizard. Once in the playground of your deployment, click **View code** to get code snippets that match your settings in the playground. You cannot currently set values for parameters such as `reasoning_effort`, or access reasoning metadata in the model's output, via the console playground. Query the models programmatically as shown below in order to access the full reasoning feature set. 
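As an illustration of the programmatic path mentioned in the note above, here is a minimal sketch (not taken from the original pages): it assumes the serverless base URL `https://api.scaleway.ai/v1` used elsewhere in this change set and the `gpt-oss-120b` model named earlier, and passes `reasoning_effort` through `extra_body` so it also works with OpenAI client versions that do not expose it as a named argument.

```python
from openai import OpenAI

# Sketch only: the base URL and model name are assumptions taken from this
# documentation set; replace api_key with your own IAM secret key.
client = OpenAI(
    base_url="https://api.scaleway.ai/v1",  # or a dedicated deployment's Public Endpoint URL
    api_key="<SCW_SECRET_KEY>",
)

response = client.chat.completions.create(
    model="gpt-oss-120b",  # a model listed with reasoning capabilities
    messages=[{"role": "user", "content": "A train covers 60 km in 45 minutes. What is its average speed in km/h?"}],
    # reasoning_effort is passed via extra_body for compatibility with older
    # openai-python releases; newer releases also accept it as a keyword argument.
    extra_body={"reasoning_effort": "low"},
)

print(response.choices[0].message.content)
```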
@@ -71,6 +74,10 @@ pip install openai Initialize the OpenAI client with your base URL and API key: + + In the case of a dedicated Generative APIs deployment, the `base_url` value is the **Public Endpoint URL** displayed on the Overview tab of the deployment's dashboard. + + ```python from openai import OpenAI diff --git a/pages/generative-apis/how-to/query-reranking-models.mdx b/pages/generative-apis/how-to/query-reranking-models.mdx index c26f521044..024e29058a 100644 --- a/pages/generative-apis/how-to/query-reranking-models.mdx +++ b/pages/generative-apis/how-to/query-reranking-models.mdx @@ -26,7 +26,7 @@ This approach takes advantage of the strengths of each model: one that is fast b ## Using embedding models for reranking -In the case of using an embedding model such as `qwen3-embedding` for reranking, note that the [Embedding API](https://www.scaleway.com/en/developers/api/generative-apis/#path-embeddings-create-an-embedding) and [Rereanking API](https://www.scaleway.com/en/developers/api/generative-apis/#path-rerank-create-a-reranking) are functionally equivalent. This is because the generated embedding vectors are normalized - meaning that the reranking score corresponds directly to the **cosine similarity** between vectors, which, for normalized vectors, is identical to the **dot product**. +In the case of using an embedding model such as `qwen3-embedding` for reranking, note that the [Embedding API](https://www.scaleway.com/en/developers/api/generative-apis/#path-embeddings-create-an-embedding) and [Reranking API](https://www.scaleway.com/en/developers/api/generative-apis/#path-rerank-create-a-reranking) are functionally equivalent. This is because the generated embedding vectors are normalized - meaning that the reranking score corresponds directly to the **cosine similarity** between vectors, which, for normalized vectors, is identical to the **dot product**. In practical terms: diff --git a/pages/generative-apis/how-to/query-vision-models.mdx b/pages/generative-apis/how-to/query-vision-models.mdx index 3b0e548c49..bede285faf 100644 --- a/pages/generative-apis/how-to/query-vision-models.mdx +++ b/pages/generative-apis/how-to/query-vision-models.mdx @@ -18,6 +18,7 @@ There are several ways to interact with vision models: - **Scaleway console playground**: The Scaleway [console](https://console.scaleway.com) provides a complete [playground](/generative-apis/quickstart/#interacting-with-generative-apis-via-the-playground) for Generative APIs. This visual interface allows you to test models, adapt query parameters, and observe how these changes affect the output in real-time. - **[Chat Completions API](https://www.scaleway.com/en/developers/api/generative-apis/#path-chat-completions-create-a-chat-completion)**: Use the chat completions API to query vision models programmatically. +- **Your own [dedicated deployment](/generative-apis/how-to/create-deployment/)**: Deploy a model on your own Instance and interact with the model in an isolated environment @@ -30,8 +31,8 @@ There are several ways to interact with vision models: Scaleway provides a web playground for vision models hosted on Generative APIs. -1. Navigate to **Generative APIs** under the **AI** section of the [Scaleway console](https://console.scaleway.com/) side menu. The list of models you can query displays. -2. Click the name of the vision model you want to try. Alternatively, click next to the vision model, and click **Try model** in the menu. +1. 
Navigate to **Generative APIs** under the **AI** section of the [Scaleway console](https://console.scaleway.com/) side menu. The list of models you can query displays. Select the **Serverless** toggle button. +2. Click the name of the vision model you want to try. Alternatively, click **Try** next to the model's name. The web playground displays. @@ -39,7 +40,9 @@ The web playground displays. 1. Upload one or multiple images to the prompt area at the bottom of the page. Enter a prompt, for example, to describe the image(s) you attached. 2. Edit the hyperparameters listed on the right column, for example the default temperature for more or less randomness on the outputs. 3. Switch models at the top of the page, to observe the capabilities of chat and vision models offered via Generative APIs. -4. Click **View code** to get code snippets configured according to your settings in the playground. +4. Click **Deploy**, then select the **Serverless** option to get code snippets configured according to your settings in the playground. + + You can also choose to deploy a model on your own dedicated Instance by selecting the **Dedicated** option. In this case, you can access the playground after completing the steps in the deployment wizard. Once in the playground of your deployment, click **View code** to get code snippets that match your settings in the playground. ## Querying vision models via the API @@ -65,6 +68,10 @@ pip install openai Initialize the OpenAI client with your base URL and API key: + + In the case of a dedicated Generative APIs deployment, the `base_url` value is the **Public Endpoint URL** displayed on the Overview tab of the deployment's dashboard. + + ```python from openai import OpenAI @@ -247,7 +254,7 @@ asyncio.run(main()) ## Frequently Asked Questions #### Is there a limit to the size of each image? -The only limitation are the tokens context window and the maximum resolution supported by the model (images will be automatically downscaled to fit within maximum resolution). Refer to our [model catalog](/managed-inference/reference-content/model-catalog/#mistral-small-31-24b-instruct-2503) for more information about supported formats and token dimensions for each model. +The only limitation are the tokens context window and the maximum resolution supported by the model (images will be automatically downscaled to fit within maximum resolution). Refer to our [model catalog](/generative-apis/reference-content/supported-models/##model-details) for more information about supported formats and token dimensions for each model. #### What is the maximum amount of images per conversation? -Each conversation can handle up to 12 images (per request). Attempting to add a 13th image will result in a 400 Bad Request error. +Each conversation can handle up to 12 images (per request). Attempting to add a 13th image will result in a `400 Bad Request` error. diff --git a/pages/generative-apis/how-to/use-batch-processing.mdx b/pages/generative-apis/how-to/use-batch-processing.mdx index d5f041a33e..995a5756bb 100644 --- a/pages/generative-apis/how-to/use-batch-processing.mdx +++ b/pages/generative-apis/how-to/use-batch-processing.mdx @@ -1,14 +1,14 @@ --- title: How to use batch processing -description: Learn how to submit large volumes of requests to Generative APIs asynchronously. -tags: generative-apis ai-data batch-processing +description: Learn how to submit large volumes of requests to Generative APIs - Serverless asynchronously. 
+tags: generative-apis-serverless ai-data batch-processing dates: validation: 2026-02-17 posted: 2026-02-17 --- import Requirements from '@macros/iam/requirements.mdx' -Batch processing allows you to submit large volumes of requests to Generative APIs asynchronously, at a discounted price. +Batch processing allows you to submit large volumes of requests to Generative APIs - Serverless asynchronously, at a discounted price. Instead of sending individual requests, you upload an input file to Object Storage and create a batch job. The service processes the requests in the background and writes the results to an output file. diff --git a/pages/generative-apis/how-to/use-function-calling.mdx b/pages/generative-apis/how-to/use-function-calling.mdx index 5ffad636a8..b1fa90a8d4 100644 --- a/pages/generative-apis/how-to/use-function-calling.mdx +++ b/pages/generative-apis/how-to/use-function-calling.mdx @@ -23,7 +23,7 @@ Function calling allows a large language model (LLM) to interact with external t ## Supported models -All the [chat models](/generative-apis/reference-content/supported-models/#chat-models) hosted by Scaleway support function calling. +All the [chat models](/generative-apis/reference-content/supported-models/) hosted by Scaleway support function calling. ## Understanding function calling @@ -110,6 +110,10 @@ tools = [{ To implement a basic function call, add the following code: + + In the case of a dedicated Generative APIs deployment, the `base_url` value is the **Public Endpoint URL** displayed on the Overview tab of the deployment's dashboard. + + ```python # Initialize the OpenAI client client = OpenAI( diff --git a/pages/generative-apis/how-to/use-structured-outputs.mdx b/pages/generative-apis/how-to/use-structured-outputs.mdx index a4cdf1e5f1..49d8abd863 100644 --- a/pages/generative-apis/how-to/use-structured-outputs.mdx +++ b/pages/generative-apis/how-to/use-structured-outputs.mdx @@ -59,11 +59,15 @@ There are several ways to interact with language models: The following Python examples demonstrate how to use **Structured outputs** to generate structured responses. -We using the base code below to send our LLM a voice note transcript to structure: +We are using the base code below to send our LLM a voice note transcript to structure: ### Defining the voice note and transcript - ```python + + In the case of a dedicated Generative APIs deployment, the `base_url` value is the **Public Endpoint URL** displayed on the Overview tab of the deployment's dashboard. + + + ```python import json from openai import OpenAI from pydantic import BaseModel, Field diff --git a/pages/generative-apis/index.mdx b/pages/generative-apis/index.mdx index a48aad7e31..caaeee4ddd 100644 --- a/pages/generative-apis/index.mdx +++ b/pages/generative-apis/index.mdx @@ -1,12 +1,12 @@ --- title: Generative APIs Documentation -description: Dive into Scaleway Generative APIs with our quickstart guides, how-tos, tutorials and more. +description: Dive into Scaleway Generative APIs with our quickstart guides, how-tos, tutorials, and more.
--- @@ -17,7 +17,7 @@ description: Dive into Scaleway Generative APIs with our quickstart guides, how- @@ -31,14 +31,14 @@ description: Dive into Scaleway Generative APIs with our quickstart guides, how- @@ -46,10 +46,18 @@ description: Dive into Scaleway Generative APIs with our quickstart guides, how- + + ## Changelog diff --git a/pages/generative-apis/menu.ts b/pages/generative-apis/menu.ts index 4dc37eaebf..2edb20b386 100644 --- a/pages/generative-apis/menu.ts +++ b/pages/generative-apis/menu.ts @@ -42,7 +42,7 @@ export const generativeApisMenu = { label: 'Query audio models', slug: 'query-audio-models' }, - { + { label: 'Query reranking models', slug: 'query-reranking-models' }, @@ -58,6 +58,39 @@ export const generativeApisMenu = { label: 'Use batch processing', slug: 'use-batch-processing', }, + { + label: 'Deploy a model', + slug: 'create-deployment', + }, + { + label: 'Import a custom model', + slug: 'import-custom-model', + }, + { + label: 'Change the model of a deployment', + slug: 'change-model', + }, + { + label: 'Monitor a deployment', + slug: 'monitor-deployment', + }, + { + label: 'Configure autoscaling', + slug: 'configure-autoscaling', + }, + { + label: 'Manage access to a deployment', + slug: 'manage-allowed-ips', + }, + { + label: + 'Use deployment with a Private Network', + slug: 'dedicated-deployment-with-private-network', + }, + { + label: 'Delete a deployment', + slug: 'delete-deployment', + }, ], label: 'How to', slug: 'how-to', @@ -99,7 +132,11 @@ export const generativeApisMenu = { slug: 'model-lifecycle', }, { - label: 'Rate limits', + label: 'OpenAI API compatibility', + slug: 'openai-compatibility', + }, + { + label: 'Rate limits for Serverless', slug: 'rate-limits', }, { @@ -107,7 +144,7 @@ export const generativeApisMenu = { slug: 'data-privacy', }, { - label: 'Security and Reliability in Generative APIs', + label: 'Security and reliability in Generative APIs', slug: 'security-and-reliability', }, { diff --git a/pages/generative-apis/quickstart.mdx b/pages/generative-apis/quickstart.mdx index 01e764d53d..9445d89249 100644 --- a/pages/generative-apis/quickstart.mdx +++ b/pages/generative-apis/quickstart.mdx @@ -3,7 +3,7 @@ title: Generative APIs - Quickstart description: Get started with Scaleway Generative APIs for powerful AI-driven content generation. Follow this guide to set up, configure, and make your first API request. tags: generative-apis ai-data quickstart dates: - validation: 2025-11-18 + validation: 2026-04-16 posted: 2024-09-04 --- import Requirements from '@macros/iam/requirements.mdx' @@ -28,7 +28,7 @@ The Scaleway console provides a web playground for chat-based models hosted on G ### How to access the playground -1. Navigate to **Generative APIs** under the **AI** section of the [Scaleway console](https://console.scaleway.com/) side menu. The list of available models displays. +1. Navigate to **Generative APIs** under the **AI** section of the [Scaleway console](https://console.scaleway.com/) side menu. The list of available models displays. Select the **Serverless** toggle button. 2. Click the name of the chat model you want to try. Alternatively, click next to the chat model, and click **Try model** in the menu. @@ -49,7 +49,7 @@ The web playground displays. 4. Click the send button to send your prompt and view the model's response in the conversation area, for example `Why don’t scientists trust atoms anymore? 
Because they make up everything!` - You can click **View code** next to the model dropdown to get code snippets you can integrate directly into your Python, Javascript and cURL code bases when [querying a model programmatically](#interacting-with-generative-apis-programmatically). These snippets are configured according to the settings you have entered in the playground. + You can click **View code** next to the model dropdown to get code snippets you can integrate directly into your Python, Javascript, and cURL code bases when [querying a model programmatically](#interacting-with-generative-apis-programmatically). These snippets are configured according to the settings you have entered in the playground. ## Interacting with Generative APIs programmatically @@ -78,7 +78,7 @@ pip install openai from openai import OpenAI client = OpenAI( - base_url="https://api.scaleway.ai/v1", # # Scaleway's Generative APIs service URL + base_url="https://api.scaleway.ai/v1", # Scaleway's Generative APIs - Serverless service URL api_key="" # Your unique API secret key from Scaleway ) ``` @@ -121,7 +121,7 @@ Below is an example for generating a description of a futuristic city. from openai import OpenAI client = OpenAI( - base_url="https://api.scaleway.ai/v1", # # Scaleway's Generative APIs service URL + base_url="https://api.scaleway.ai/v1", # Scaleway's Generative APIs - Serverless service URL api_key="" # Your unique API secret key from Scaleway ) diff --git a/pages/generative-apis/quickstart_dedicated.mdx b/pages/generative-apis/quickstart_dedicated.mdx new file mode 100644 index 0000000000..e9637947fe --- /dev/null +++ b/pages/generative-apis/quickstart_dedicated.mdx @@ -0,0 +1,104 @@ +--- +title: Generative APIs - Quickstart +description: Get started with Scaleway Generative APIs for secure, scalable AI model deployment in Europe's premier platform. Follow this guide to set up, configure, and make your first API request. +tags: +dates: + validation: 2026-04-13 +--- +import Requirements from '@macros/iam/requirements.mdx' + + +Scaleway Generative APIs - Dedicated Deployment is the first European Managed Inference platform on the market. It is a scalable and secure inference engine for Large Language Models (LLMs). + +Scaleway Generative APIs - Dedicated Deployment is a fully managed service that allows you to serve generative AI models in a production environment. +With Scaleway Generative APIs - Dedicated Deployment, you can easily deploy, manage, and scale LLMs without worrying about the underlying infrastructure. + +Here are some of the key features of Scaleway Generative APIs: + +- **Easy deployment**: Deploy state-of-the-art open-weight LLMs with just a few clicks. Scaleway Generative APIs provides a simple and intuitive interface for generating dedicated endpoints. +- **Security**: Scaleway provides [a secure environment](/generative-apis/reference-content/security-and-reliability/) to run your models. Our platform is built on top of a secure architecture, and we use state-of-the-art cloud security. +- **Complete data privacy**: [No storage](/generative-apis/reference-content/security-and-reliability/#data-protection) or third-party access to your data (prompts or responses), to ensure it remains exclusively yours. +- **Interoperability**: Scaleway Generative APIs was designed as a drop-in [replacement for the OpenAI APIs](/generative-apis/reference-content/openai-compatibility/), for a seamless transition of your applications already using its libraries.
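To make the interoperability point above concrete, here is a hedged sketch of pointing the OpenAI Python client at a dedicated deployment. The `<deployment-id>` endpoint form follows the OpenAI-compatibility examples later in this change set; the model name and key are placeholders, not values from the original pages.

```python
from openai import OpenAI

# Placeholders: <deployment-id>, <deployed-model-name>, and <SCW_SECRET_KEY> are
# assumptions for illustration. The base_url is the deployment's Public Endpoint URL.
client = OpenAI(
    base_url="https://<deployment-id>.ifr.fr-par.scaleway.com/v1",
    api_key="<SCW_SECRET_KEY>",  # IAM API secret key generated for the deployment
)

response = client.chat.completions.create(
    model="<deployed-model-name>",  # the model served by this deployment
    messages=[{"role": "user", "content": "Say hello in one short sentence."}],
)
print(response.choices[0].message.content)
```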
+ +{/* ## Console overview + +Discover the Generative APIs Dedicated Deployment interface on the Scaleway console. + */} + + + + - A Scaleway account logged into the [console](https://console.scaleway.com) + - [Owner](/iam/concepts/#owner) status or [IAM permissions](/iam/concepts/#permission) allowing you to perform actions in the intended Organization + +## How to create a dedicated deployment + +1. Click **Generative APIs** in the **AI** section of the side menu in the [Scaleway console](https://console.scaleway.com/) to access the dashboard. The list of models displays. +2. Select the **Deployments** tab. +3. Click **Deploy a model** to launch the model deployment wizard. +4. Provide the necessary information: + - Select the desired model and the quantization to use for your deployment [from the available options](/generative-apis/reference-content/model-catalog). + + Scaleway Generative APIs allows you to deploy various AI models, either from the Scaleway catalog or by importing a custom model. For detailed information about supported models, visit our [Generative APIs model catalog](/generative-apis/reference-content/supported-models/). + + + Some models may require acceptance of an end-user license agreement (EULA). If prompted, review the terms and conditions, and accept the license accordingly. + + - Choose the geographical **region** for the deployment. + - Select a node type, the GPU Instance that will be used with your deployment. + - Choose the number of nodes for your deployment. Note that this feature is currently in [Public Beta](https://www.scaleway.com/en/betas/). + + High availability is only guaranteed with two or more nodes. + +5. Enter a **name** for the deployment, along with optional tags to aid in organization. +6. Configure **network** settings for the deployment: + - Enable **Private Network** for secure communication and restricted availability within Private Networks. Choose an existing Private Network from the drop-down list, or create a new one. + - Enable **Public Network** to access resources via the public Internet. API key protection is enabled by default. + + - Enabling both private and public networks will result in two distinct endpoints (public and private) for your deployment. + - Deployments must have at least one endpoint, either public or private. + +7. Click **Deploy model** to launch the deployment process. Once the deployment is ready, it will be listed among your deployments. + +## How to access a Generative APIs deployment + +Generative APIs deployments have authentication enabled by default. As such, your endpoints expect a secret key generated with Scaleway's Identity and Access Management service (IAM) for authentication. + +1. Click **Generative APIs** in the **AI** section of the side menu in the [Scaleway console](https://console.scaleway.com/) to access the dashboard. The list of models displays. +2. Select the **Deployments** tab. +3. From the drop-down menu, select the geographical region containing your deployment. +4. Click the name of the deployment you wish to access. The deployment's **Overview** page displays. +5. Scroll down to the **Deployment authentication** section and click the **Generate key** button. The token creation wizard displays. +6. Fill in the [required information for API key creation](/iam/how-to/create-api-keys/) and click **Generate API key**. +7. Copy and safely store your credentials before closing the window, as they will not be shown again. 
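As a quick way to confirm the generated key works against the deployment, here is a minimal sketch using `requests` and the `/v1/models` route documented in the OpenAI-compatibility section of this change set; the endpoint and key are placeholders.

```python
import requests

# Placeholders: <deployment-id> and <SCW_SECRET_KEY> are assumptions for illustration.
url = "https://<deployment-id>.ifr.fr-par.scaleway.com/v1/models"
headers = {
    "Authorization": "Bearer <SCW_SECRET_KEY>",  # key generated in the steps above
    "Content-Type": "application/json",
}

response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()  # a 401/403 here usually points to a missing or invalid key
print(response.json())       # lists the model currently served by the deployment
```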
+ + You have full control over authentication from the **Security** tab of your deployment. Authentication is enabled by default. + +## How to interact with Generative APIs + +1. Click **Generative APIs** in the **AI** section of the side menu in the [Scaleway console](https://console.scaleway.com/) to access the dashboard. The list of models displays. +2. Select the **Deployments** tab. +3. From the drop-down menu, select the geographical region where your desired deployment was created. +4. Click the name of the deployment you wish to query. The deployment's **Overview** page displays. +5. Click the **Playground** tab, then **View code** to see code examples in various environments. Copy and paste them into your code editor or terminal. + + + Prompt structure may vary from one model to another. Refer to the specific instructions for use in our [dedicated documentation](/generative-apis/reference-content/). {/* QUESTION: Which doc are we referring to? The current link is not very useful. SHOULD REDIRECT TO API DOC */} + + +## How to delete a deployment + +1. Click **Generative APIs** in the **AI** section of the side menu in the [Scaleway console](https://console.scaleway.com/) to access the dashboard. The list of models displays. +2. Select the **Deployments** tab. +3. From the drop-down menu, select the geographical region where your deployment was created. +4. Choose the deployment you wish to delete. +5. Navigate to the **Settings** tab. +6. Click **Delete deployment** at the bottom of the page. +7. Type **DELETE** to confirm and click **Delete deployment**. + + Alternatively, from the Deployments listing, click the icon next to the deployment name you no longer need, and click **Delete**. A pop-up appears. Type **DELETE** to confirm, then click **Delete deployment**. + + + Deleting a deployment is a permanent action that erases all its associated data and resources. + \ No newline at end of file diff --git a/pages/generative-apis/reference-content/data-privacy.mdx b/pages/generative-apis/reference-content/data-privacy.mdx index 024c7f1251..f36818f7ae 100644 --- a/pages/generative-apis/reference-content/data-privacy.mdx +++ b/pages/generative-apis/reference-content/data-privacy.mdx @@ -27,8 +27,8 @@ We collect and process the following categories of data: - Token counts (input/output specific to large language models (LLMs), without prompt details) - Parameters defined by the user, **excluding the actual content of the prompts (system, user, etc.)** -Only in the event of misuse harming the service functionality (such as specific customer requests generating unexpected errors or carrying out malicious activity), we may store temporarily and access full content of HTTP requests of this customer, to identify a root cause issue or any security vulnerability and ensure quality of service for all customers. -As a concrete example, these analysis happens when a request triggers a 500 HTTP error, occurring very rarely (1 out of a million request). Data is stored for up to two weeks, only to reproduce, investigate and fix the underlying issue. +It is only in the event of misuse harming the service functionality (such as specific customer requests generating unexpected errors or carrying out malicious activity) that we may temporarily store and access the full content of HTTP requests of this customer, to identify a root cause issue or any security vulnerability, and to ensure quality of service for all customers.
+As a concrete example, this analysis happens when a request triggers a 500 HTTP error, occurring very rarely (1 out of a million requests). Data is stored for up to two weeks, only to reproduce, investigate, and fix the underlying issue. ## 3. How we use your data @@ -39,9 +39,9 @@ The personal data collected is used exclusively for: - Monitoring and improving the Generative API service through anonymized data for statistical analysis. -- We do not collect, read, reuse, or analyze the content of your inputs, prompts, or outputs generated by the API. The only exception being when your traffic harms our service operations such as generating abnormal errors - like HTTP 500 errors - or may represent malicious activity. In this case, we may store temporarily the corresponding HTTP request content to identify and fix root cause issues or any security vulnerability. +- We do not collect, read, reuse, or analyze the content of your inputs, prompts, or outputs generated by the API. The only exception being when your traffic harms our service operations or your resource availability such as generating abnormal errors - like HTTP 500 errors - or may represent malicious activity. In this case, we may store temporarily the corresponding HTTP request content to identify and fix root cause issues or any security vulnerability. - Your data is not accessible to other Scaleway customers. -- Your data is not accessible to the creators of the underlying large language models (LLMs). +- Your data is not accessible to the creators of the underlying large language models (LLMs). Your data is not used for training, retraining, or improving the base models. - Your data is not accessible to third-party products, or services. @@ -69,14 +69,15 @@ We implement appropriate technical and organizational security measures to prote - Secured authentication - Complex password - 2FA (customer's choice) - - SSH Key + - SSH key - Deletion with secured erasure code - - Manage accessibility with security groups + - Accessibility management with security groups - Physical server security - AI service security measures - Hosting: Scaleway maintains full control over the Generative APIs service, hosting the models on its infrastructure in Europe without interaction with third-party services. - Encryption: All traffic between the customer and the inference service is encrypted using in-transit TLS encryption to ensure data protection during transmission. - Endpoint Security: Public-facing endpoints are secured with API key tokens. + - Virtual Private Cloud (VPC): The service can be hosted in a Virtual Private Cloud within private subnets. Access to the service can be restricted based on allowed IP ranges. ## 7. Exercising your rights diff --git a/pages/generative-apis/reference-content/integrate-with-litellm.mdx b/pages/generative-apis/reference-content/integrate-with-litellm.mdx index ca30b44e74..f856d9c8bd 100644 --- a/pages/generative-apis/reference-content/integrate-with-litellm.mdx +++ b/pages/generative-apis/reference-content/integrate-with-litellm.mdx @@ -46,20 +46,20 @@ litellm --version 1. 
Create a `main.py` file with the following content: -```python -from litellm import completion -import os -os.environ["SCW_SECRET_KEY"] = "YOUR_SCW_SECRET_KEY" + ```python + from litellm import completion + import os + os.environ["SCW_SECRET_KEY"] = "YOUR_SCW_SECRET_KEY" -messages = [{"role": "user", "content": "Write me a poem about the blue sky"}] -response = completion(model="scaleway/mistral-small-3.2-24b-instruct-2506", messages=messages) -print(response) -``` + messages = [{"role": "user", "content": "Write me a poem about the blue sky"}] + response = completion(model="scaleway/mistral-small-3.2-24b-instruct-2506", messages=messages) + print(response) + ``` 2. Run `main.py` python script: -```bash -python main.py -``` + ```bash + python main.py + ``` The model response should display. @@ -83,42 +83,42 @@ Note that for the`/v1/embeddings` endpoint, you must also add the parameter `enc ## Configure LiteLLM Proxy Server (AI Gateway) to use Scaleway’s Generative APIs 1. Create a configuration file `config.yaml` in your current directory: -```yaml -model_list: - - model_name: ai-agent ### RECEIVED MODEL NAME ### - litellm_params: # all params accepted by litellm.completion() - https://docs.litellm.ai/docs/completion/input - model: scaleway/mistral-small-3.2-24b-instruct-2506 ### MODEL NAME sent to `litellm.completion()` ### - rpm: 10 # [OPTIONAL] Rate limit for this deployment: in requests per minute (rpm) - - model_name: ai-agent - litellm_params: - model: scaleway/qwen3-235b-a22b-instruct-2507 - rpm: 10 -``` + ```yaml + model_list: + - model_name: ai-agent ### RECEIVED MODEL NAME ### + litellm_params: # all params accepted by litellm.completion() - https://docs.litellm.ai/docs/completion/input + model: scaleway/mistral-small-3.2-24b-instruct-2506 ### MODEL NAME sent to `litellm.completion()` ### + rpm: 10 # [OPTIONAL] Rate limit for this deployment: in requests per minute (rpm) + - model_name: ai-agent + litellm_params: + model: scaleway/qwen3-235b-a22b-instruct-2507 + rpm: 10 + ``` 2. Run litellm proxy server with this configuration: -```bash -SCW_SECRET_KEY="YOUR_SCW_SECRET_KEY" \ -litellm --config ./config.yaml -``` + ```bash + SCW_SECRET_KEY="YOUR_SCW_SECRET_KEY" \ + litellm --config ./config.yaml + ``` 3. Perform a query to `ai-agent` model on `localhost:4000`, asking about the model's identity: -```bash -curl http://localhost:4000/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "ai-agent", - "messages": [ - { - "role": "system", - "content": "You are a helpful assistant." - }, - { - "role": "user", - "content": "Who are you ?" - } - ] - }' -``` + ```bash + curl http://localhost:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "ai-agent", + "messages": [ + { + "role": "system", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": "Who are you ?" + } + ] + }' + ``` If you perform multiple queries, model answers should display, either stating the model is Mistral or Qwen depending on where the query was routed by LiteLLM. 
Alternatively, you can configure `config.yaml` to use `openai` namespace and environment variables: diff --git a/pages/generative-apis/reference-content/integrate-with-qwen-code.mdx b/pages/generative-apis/reference-content/integrate-with-qwen-code.mdx index 169aeff02b..b8b81f308d 100644 --- a/pages/generative-apis/reference-content/integrate-with-qwen-code.mdx +++ b/pages/generative-apis/reference-content/integrate-with-qwen-code.mdx @@ -21,21 +21,21 @@ Qwen Code is a coding agent for terminal and IDE. You can integrate Scaleway's G 1. To integrate Qwen Code with Scaleway's Generative APIs, you need to configure the following environment variables. You can add these lines to your `~/.bashrc` or `~/.zshrc` file: - ```bash - export OPENAI_API_KEY="" - export OPENAI_BASE_URL="https://api.scaleway.ai/v1" - export OPENAI_MODEL="devstral-2-123b-instruct-2512" - ``` - Alternatively to avoid conflicts with existing OpenAI API Key configuration, you can set these environment variables in a local `.env` file. + ```bash + export OPENAI_API_KEY="" + export OPENAI_BASE_URL="https://api.scaleway.ai/v1" + export OPENAI_MODEL="devstral-2-123b-instruct-2512" + ``` + Alternatively to avoid conflicts with existing OpenAI API Key configuration, you can set these environment variables in a local `.env` file. - - On Windows, configure the environment variables using the system settings instead of `.bashrc`. - + + On Windows, configure the environment variables using the system settings instead of `.bashrc`. + 2. Reload your terminal configuration: - ```bash - source ~/.bashrc - ``` + ```bash + source ~/.bashrc + ``` ## Run and configure Qwen Code diff --git a/pages/generative-apis/reference-content/integrating-generative-apis-with-popular-tools.mdx b/pages/generative-apis/reference-content/integrating-generative-apis-with-popular-tools.mdx index 337346d9ad..67b8b36ec3 100644 --- a/pages/generative-apis/reference-content/integrating-generative-apis-with-popular-tools.mdx +++ b/pages/generative-apis/reference-content/integrating-generative-apis-with-popular-tools.mdx @@ -300,7 +300,7 @@ Bolt.diy is a software enabling users to create web applications from the prompt 9. Enter your prompt in the Bolt.diy interface to see your application being generated. - Only models that have a maximum output token of at least 8000 tokens are supported. Refer to the [list of Generative APIs models](/generative-apis/reference-content/supported-models/#chat-models) for more information. + Only models that have a maximum output token of at least 8000 tokens are supported. Refer to the [list of Generative APIs models](/generative-apis/reference-content/model-catalog) for more information. Alternatively, you can also setup your Scaleway Secret Key by renaming `.env.example` to `.env`, adding corresponding environment variables values and restarting Bolt.diy: diff --git a/pages/generative-apis/reference-content/model-lifecycle.mdx b/pages/generative-apis/reference-content/model-lifecycle.mdx index b831e74320..ac1c633d62 100644 --- a/pages/generative-apis/reference-content/model-lifecycle.mdx +++ b/pages/generative-apis/reference-content/model-lifecycle.mdx @@ -3,18 +3,20 @@ title: Understand Generative APIs model lifecycle description: Learn about the lifecycle of generative AI models in Scaleway. This page explains how each status affects model usage and updates, ensuring you are informed about transitions and how to access the latest model versions. 
tags: generative-apis ai-data model-lifecycle dates: - validation: 2025-09-22 + validation: 2026-04-24 posted: 2024-09-02 --- Scaleway is dedicated to updating and offering the latest versions of generative AI models, ensuring improvements in capabilities, accuracy, and safety. +## Generative APIs - Serverless + As new versions of models are introduced, you have the opportunity to explore them through the Scaleway console. -A model provided through Scaleway Generative APIs may be classified into one of these statuses: `Preview`, `Active`, `Deprecated`, or `End-of-Life` (EOL). +A model provided through Scaleway Generative APIs - Serverless may be classified into one of these statuses: `Preview`, `Active`, `Deprecated`, or `End-of-Life` (EOL). -- **Preview**: This status indicates that the model can be tested but no service-level agreements are yet provided. At this stage, the model is not guaranteed to reach `Active` status. In most cases, `Preview` model will still be deployable in dedicated instances using [Managed Inference](https://www.scaleway.com/en/inference/) product. -- **Active**: This status indicates that the model version is under continuous development, with ongoing updates that may include bug fixes and enhancements, and provides service-level agreement. +- **Preview**: This status indicates that the model can be tested but no service level agreements are provided yet. At this stage, the model is not guaranteed to reach `Active` status. In most cases, a model in `Preview` status will still be deployable in dedicated instances using the [Generative APIs - Dedicated Deployment](https://www.scaleway.com/en/generative-apis/) product. +- **Active**: This status indicates that the model version is under continuous development, with ongoing updates that may include bug fixes and enhancements, and provides a service level agreement. - **Deprecated**: A model version is designated deprecated when a newer, more efficient version is available. Scaleway assigns an EOL date to these deprecated versions. Although deprecated versions remain usable, it's recommended to transition to an active version by the EOL date. - **EOL**: At this stage, the model version is retired and no longer accessible for use. Any attempts to utilize an End-of-Life version will not be successful. @@ -30,3 +32,248 @@ When removing a model, if an alternative model of a similar type is available in Following the EOL date, information regarding the model version remains exclusively available on our [dedicated documentation page](/generative-apis/reference-content/supported-models/#deprecated-models). + +## Generative APIs - Dedicated Deployment + +Scaleway Generative APIs - Dedicated Deployment allows you to deploy various AI models, either from: + + * [Scaleway model catalog](/generative-apis/reference-content/supported-models/): A curated set of ready-to-deploy models available through the [Scaleway console](https://console.scaleway.com/generative-api/deployments/) or the [Generative APIs - Dedicated Deployment API](https://www.scaleway.com/en/developers/api/managed-inference/#path-models-list-models) {/*COMMENT: Fix link. */} + * [Custom models](#custom-models): Models that you import, typically from sources such as Hugging Face. + +## Custom models + + + Custom model support is currently in **beta**. 
If you encounter issues or limitations, report them via our [Slack community channel](https://scaleway-community.slack.com/archives/C01SGLGRLEA) or [customer support](https://console.scaleway.com/support/tickets/create?for=product&productName=inference). + + +### Prerequisites + + + We recommend starting with a variation of a supported model from the [Scaleway catalog](/generative-apis/reference-content/supported-models/). + For example, you can deploy a [quantized (4-bit) version of Llama 3.3](https://huggingface.co/unsloth/Llama-3.3-70B-Instruct-bnb-4bit). + If deploying a fine-tuned version of Llama 3.3, make sure your file structure matches the example linked above. + Examples whose compatibility has been tested are available in the section about [tested models](#known-compatible-models). + + +To deploy a custom model via Hugging Face, ensure the following: + +#### Access requirements + + * You must have access to the model using your Hugging Face credentials. + * For gated models, request access through your Hugging Face account. + * Credentials are not stored, but we recommend using [read or fine-grained access tokens](https://huggingface.co/docs/hub/security-tokens). + +#### Required files + +Your model repository must include: + + * A `config.json` file containing: + * An `architectures` array (see [supported architectures](#supported-model-architectures) for the exact list of supported values) + * `max_position_embeddings` + * Model weights in the [`.safetensors`](https://huggingface.co/docs/safetensors/index) format + * A `tokenizer.json` file + * If you are fine-tuning an existing model, we recommend you use the same `tokenizer.json` file from the base model. + * A chat template included in either: + * `tokenizer_config.json` as a `chat_template` field, or + * the `chat_template.json` file or `chat_template.jinja` + + +If you have both a `chat_template` field in the `tokenizer_config.json` and a chat template file, the chat template file will be used. + + +#### Supported model types + +Your model must be one of the following types: + + * `chat` + * `vision` + * `multimodal` (chat + vision) + * `embedding` + + + **Security Notice**
+ Models using formats that allow arbitrary code execution, such as Python [`pickle`](https://docs.python.org/3/library/pickle.html), are **not supported**. +
+ +### Custom model lifecycle + +Currently, custom model deployments are considered to be valid for the long term, and we will ensure any updates or changes to Generative APIs - Dedicated Deployment will not impact existing deployments. +In case of breaking changes, leading to some custom models not being supported anymore, we will notify you **at least 3 months beforehand**. + +### Licensing + +When deploying custom models, **you remain responsible** for complying with any license requirements from the model provider, as you would do by running the model on a custom provisioned GPU. + +### Supported model architectures + +Custom models must conform to one of the architectures listed below. Click to expand the full list. + + + ### Supported custom model architectures + Custom model deployment currently supports the following model architectures: + * `AquilaModel` + * `AquilaForCausalLM` + * `ArcticForCausalLM` + * `BaiChuanForCausalLM` + * `BaichuanForCausalLM` + * `BloomForCausalLM` + * `CohereForCausalLM` + * `Cohere2ForCausalLM` + * `DbrxForCausalLM` + * `DeciLMForCausalLM` + * `DeepseekForCausalLM` + * `DeepseekV2ForCausalLM` + * `DeepseekV3ForCausalLM` + * `ExaoneForCausalLM` + * `FalconForCausalLM` + * `Fairseq2LlamaForCausalLM` + * `GemmaForCausalLM` + * `Gemma2ForCausalLM` + * `GlmForCausalLM` + * `GPT2LMHeadModel` + * `GPTBigCodeForCausalLM` + * `GPTJForCausalLM` + * `GPTNeoXForCausalLM` + * `GraniteForCausalLM` + * `GraniteMoeForCausalLM` + * `GritLM` + * `InternLMForCausalLM` + * `InternLM2ForCausalLM` + * `InternLM2VEForCausalLM` + * `InternLM3ForCausalLM` + * `JAISLMHeadModel` + * `JambaForCausalLM` + * `LlamaForCausalLM` + * `LLaMAForCausalLM` + * `MambaForCausalLM` + * `FalconMambaForCausalLM` + * `MiniCPMForCausalLM` + * `MiniCPM3ForCausalLM` + * `MistralForCausalLM` + * `MixtralForCausalLM` + * `QuantMixtralForCausalLM` + * `MptForCausalLM` + * `MPTForCausalLM` + * `NemotronForCausalLM` + * `OlmoForCausalLM` + * `Olmo2ForCausalLM` + * `OlmoeForCausalLM` + * `OPTForCausalLM` + * `OrionForCausalLM` + * `PersimmonForCausalLM` + * `PhiForCausalLM` + * `Phi3ForCausalLM` + * `Phi3SmallForCausalLM` + * `PhiMoEForCausalLM` + * `Qwen2ForCausalLM` + * `Qwen2MoeForCausalLM` + * `RWForCausalLM` + * `StableLMEpochForCausalLM` + * `StableLmForCausalLM` + * `Starcoder2ForCausalLM` + * `SolarForCausalLM` + * `TeleChat2ForCausalLM` + * `XverseForCausalLM` + * `BartModel` + * `BartForConditionalGeneration` + * `Florence2ForConditionalGeneration` + * `BertModel` + * `RobertaModel` + * `RobertaForMaskedLM` + * `XLMRobertaModel` + * `DeciLMForCausalLM` + * `Gemma2Model` + * `GlmForCausalLM` + * `GritLM` + * `InternLM2ForRewardModel` + * `JambaForSequenceClassification` + * `LlamaModel` + * `MistralModel` + * `Phi3ForCausalLM` + * `Qwen2Model` + * `Qwen2ForCausalLM` + * `Qwen2ForRewardModel` + * `Qwen2ForProcessRewardModel` + * `TeleChat2ForCausalLM` + * `LlavaNextForConditionalGeneration` + * `Phi3VForCausalLM` + * `Qwen2VLForConditionalGeneration` + * `Qwen2ForSequenceClassification` + * `BertForSequenceClassification` + * `RobertaForSequenceClassification` + * `XLMRobertaForSequenceClassification` + * `AriaForConditionalGeneration` + * `Blip2ForConditionalGeneration` + * `ChameleonForConditionalGeneration` + * `ChatGLMModel` + * `ChatGLMForConditionalGeneration` + * `DeepseekVLV2ForCausalLM` + * `FuyuForCausalLM` + * `H2OVLChatModel` + * `InternVLChatModel` + * `Idefics3ForConditionalGeneration` + * `LlavaForConditionalGeneration` + * `LlavaNextForConditionalGeneration` + * 
`LlavaNextVideoForConditionalGeneration` + * `LlavaOnevisionForConditionalGeneration` + * `MantisForConditionalGeneration` + * `MiniCPMO` + * `MiniCPMV` + * `MolmoForCausalLM` + * `NVLM_D` + * `PaliGemmaForConditionalGeneration` + * `Phi3VForCausalLM` + * `PixtralForConditionalGeneration` + * `QWenLMHeadModel` + * `Qwen2VLForConditionalGeneration` + * `Qwen2_5_VLForConditionalGeneration` + * `Qwen2AudioForConditionalGeneration` + * `UltravoxModel` + * `MllamaForConditionalGeneration` + * `WhisperForConditionalGeneration` + * `EAGLEModel` + * `MedusaModel` + * `MLPSpeculatorPreTrainedModel` + + +### Known compatible models + +Several models have already been verified to work on Generative APIs - Dedicated Deployment custom models. This list is not exhaustive and is updated gradually. Click to expand the full list. + + + ### Models verified for compatibility + The following models' compatibility has been verified: + * `google/medgemma-27b-it` + * `HuggingFaceTB/SmolLM2-135M-Instruct` + * `ibm-granite/granite-vision-3.2-2b` + * `ibm-granite/granite-3.3-2b-instruct` + * `Linq-AI-Research/Linq-Embed-Mistral` + * `microsoft/phi-4` + * `nanonets/Nanonets-OCR-s` + * `sentence-transformers/paraphrase-multilingual-mpnet-base-v2` + * `Qwen/Qwen3-32B` + * `Snowflake/snowflake-arctic-embed-l-v2.0` + + +## API support + +Depending on the model type, specific endpoints and features are supported. + +### Chat models + +The Chat API is exposed for chat models under the `/v1/chat/completions` endpoint. +**Structured outputs** or **Function calling** are not yet supported for custom models. + +### Vision models + +The Chat API is exposed for vision models under the `/v1/chat/completions` endpoint. +**Structured outputs** or **Function calling** are not yet supported for custom models. + +### Multimodal models + +Multimodal models are treated the same way as chat and vision models. + +### Embedding models + +The Embeddings API is exposed for embedding models under the `/v1/embeddings` endpoint. + diff --git a/pages/managed-inference/reference-content/openai-compatibility.mdx b/pages/generative-apis/reference-content/openai-compatibility.mdx similarity index 84% rename from pages/managed-inference/reference-content/openai-compatibility.mdx rename to pages/generative-apis/reference-content/openai-compatibility.mdx index 1d64f81159..93be4fee9c 100644 --- a/pages/managed-inference/reference-content/openai-compatibility.mdx +++ b/pages/generative-apis/reference-content/openai-compatibility.mdx @@ -1,24 +1,24 @@ --- title: OpenAI API compatibility -description: Scaleway Managed Inference provides compatibility with parts of the OpenAI API, to help connect existing applications. +description: Scaleway Generative APIs provide compatibility with parts of the OpenAI API, to help connect existing applications. tags: ai-data chat embeddings dates: - validation: 2025-09-03 + validation: 2026-04-24 posted: 2024-05-06 --- import ChatCompVsResponsesApi from '@macros/ai/chat-comp-vs-responses-api.mdx' -You can use any of the OpenAI [official libraries](https://platform.openai.com/docs/libraries/), for example, the [OpenAI Python client library](https://github.com/openai/openai-python) to interact with your Scaleway Managed Inference deployment. +You can use any of the OpenAI [official libraries](https://platform.openai.com/docs/libraries/), for example, the [OpenAI Python client library](https://github.com/openai/openai-python) to interact with your dedicated Generative APIs deployment. 
This feature is especially beneficial for those looking to seamlessly transition applications already utilizing OpenAI. -### Chat Completions API or Responses API? +## Chat Completions API or Responses API? -### CURL +### cURL -To invoke Scaleway Managed Inference's OpenAI-compatible Chat API, simply edit your dedicated endpoints with a suffix `/v1/chat/completions`: +To invoke the Generative APIs - Dedicated Deployment OpenAI-compatible Chat API, simply edit your dedicated endpoints with a suffix `/v1/chat/completions`: ``` https://.ifr.fr-par.scaleway.com/v1/chat/completions ``` @@ -52,7 +52,7 @@ print(chat_completion.choices[0].message.content) ``` - More OpenAI-like APIs (e.. audio) will be released step by step once related models are supported. + More OpenAI-like APIs (e.g., audio) will be released step-by-step once related models are supported. ### Supported parameters @@ -87,7 +87,7 @@ If you have a use case requiring one of these unsupported features, please [cont The Embeddings API is designed to get a vector representation of an input that can be easily consumed by other machine learning models. -### CURL +### cURL Use your dedicated endpoints as follows: ``` @@ -137,7 +137,7 @@ print(embedding) The Models API returns the model(s) available for inferencing. -In the context of a Scaleway Managed Inference deployment, it returns the name of the current model being served. +In the context of a dedicated Generative API deployment, it returns the name of the current model being served. ``` https://.ifr.fr-par.scaleway.com/v1/models @@ -148,13 +148,14 @@ curl https://.ifr.fr-par.scaleway.com/v1/models \ -H "Authorization: Bearer $SCW_API_KEY" \ -H "Content-Type: application/json" ``` + ## Differences ### Token usage stats OpenAI API doesn't return usage stats (number of tokens in prompt and completion) for streaming responses. -Scaleway Managed Inference endpoints return usage stats for both streaming and non-streaming responses. +Scaleway Generative APIs - Dedicated Deployment endpoints return usage stats for both streaming and non-streaming responses. For streaming responses, the usage field is incremented in each chunk, and completed in the very last chunk of the response. For example: @@ -180,4 +181,4 @@ Gradually, we plan to introduce additional APIs such as: We will progressively roll out more OpenAI-like APIs as we expand model support.
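Returning to the token usage behavior described above (usage stats returned on both streaming and non-streaming responses), here is a hedged sketch of reading usage from a streamed response; the endpoint, model name, and key are placeholders, and the exact chunk layout may vary by client version.

```python
from openai import OpenAI

# Placeholders: <deployment-id>, <deployed-model-name>, and <SCW_SECRET_KEY>.
client = OpenAI(
    base_url="https://<deployment-id>.ifr.fr-par.scaleway.com/v1",
    api_key="<SCW_SECRET_KEY>",
)

stream = client.chat.completions.create(
    model="<deployed-model-name>",
    messages=[{"role": "user", "content": "Give me a one-line fun fact."}],
    stream=True,
)

usage = None
for chunk in stream:
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)
    # Per the section above, usage is filled in progressively and completed in
    # the last chunk, so keep the most recent non-empty value.
    if getattr(chunk, "usage", None):
        usage = chunk.usage

print("\n", usage)
```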
-If you have a use case requiring one of these unsupported APIs, please [contact us via Slack](https://slack.scaleway.com/). +If you have a use case requiring one of these unsupported APIs, [contact us via Slack](https://slack.scaleway.com/). diff --git a/pages/generative-apis/reference-content/rate-limits.mdx b/pages/generative-apis/reference-content/rate-limits.mdx index aee36ee4d4..dfcee54064 100644 --- a/pages/generative-apis/reference-content/rate-limits.mdx +++ b/pages/generative-apis/reference-content/rate-limits.mdx @@ -1,20 +1,20 @@ --- -title: What are Rate limits with Scaleway Generative APIs +title: What rate limits apply to Scaleway Generative APIs - Serverless? description: Find our service limits in tokens per minute and queries per minute -tags: generative-apis ai-data rate-limits +tags: generative-apis-serverless ai-data rate-limits dates: - validation: 2025-06-20 + validation: 2026-04-24 posted: 2024-08-27 --- ## What are the limits? -Any model served through Scaleway Generative APIs gets rate limited based on: +Any model served through Scaleway Generative APIs - Serverless gets rate limited based on: - Tokens per minute (total input and output tokens) - Queries per minute (HTTP requests) - Concurrent requests (total simultaneous HTTP sessions) -Base limits apply if you registered a valid payment method, and are increased automatically if you also [verify your identity](/account/how-to/verify-identity/). +Base limits apply if you registered a valid payment method, and they are increased automatically if you also [verify your identity](/account/how-to/verify-identity/). Exact limit values are detailed in [Organization quotas for Generative APIs](/organizations-and-projects/additional-content/organization-quotas/#generative-apis). These values apply to your Organization, and are shared by all Projects within your Organization. @@ -23,14 +23,14 @@ These values apply to your Organization, and are shared by all Projects within y We actively monitor usage and will improve rates based on feedback. If you need to increase your rate limits: -- [Verify your identity](/account/how-to/verify-identity/) to automatically increase your rate limit as described below -- Use [Batches API](https://console.scaleway.com/generative-api/batches) for non-real time workloads. Requests performed through Batches API do not have a rate limit and are billed with a [-50% discount compared to standard model prices](https://www.scaleway.com/en/pricing/model-as-a-service/#generative-apis). -- Use [Managed Inference](https://console.scaleway.com/inference/deployments), which provides dedicated capacity and does not enforce rate limits (you remain limited by the total provisioned capacity) +- [Verify your identity](/account/how-to/verify-identity/) to automatically increase your rate limit as described below. +- Use the [Batches API](https://console.scaleway.com/generative-api/batches) for non-real-time workloads. Requests performed through the Batches API do not have a rate limit and are billed with a [-50% discount compared to standard model prices](https://www.scaleway.com/en/pricing/model-as-a-service/#generative-apis). +- Use [Generative APIs - Dedicated Deployment](https://console.scaleway.com/generative-api/deployments), which provides dedicated capacity and does not enforce rate limits (you remain limited by the total provisioned capacity).
- Contact your existing Scaleway account manager or our Sales team to discuss volume commitment for specific models that will allow us to increase your quota proportionally. ## Why do we set rate limits? -These limits safeguard against abuse or misuse of Scaleway Generative APIs, helping to ensure fair access to the API with consistent performance. +These limits safeguard against abuse or misuse of Scaleway Generative APIs - Serverless, helping to ensure fair access to the API with consistent performance. ## How can I monitor rate limits? @@ -40,10 +40,10 @@ Rate limit information is provided in HTTP response headers: |--------------------------|:-------------------------:|:-------------------------------------------------------------:| | x-ratelimit-limit-requests | 600 | Maximum number of requests allowed over a minute. | | x-ratelimit-remaining-requests | 599 | Remaining number of requests allowed before reaching rate limit over a minute. | -| x-ratelimit-reset-requests | 250ms | Time until rate limit requests usage resets to its initial value. | +| x-ratelimit-reset-requests | 250ms | Time until rate limit request usage resets to its initial value. | | x-ratelimit-limit-tokens | 1000000 | Maximum number of tokens (input and output) allowed over a minute. | | x-ratelimit-remaining-tokens | 999976 | Remaining number of tokens allowed before reaching rate limit over a minute. | -| x-ratelimit-reset-tokens | 35ms | Time until rate limit tokens usage resets to its initial value. | +| x-ratelimit-reset-tokens | 35ms | Time until rate limit token usage resets to its initial value. | You can see these headers by performing the following HTTP request, using the `curl -i` option: diff --git a/pages/generative-apis/reference-content/security-and-reliability.mdx b/pages/generative-apis/reference-content/security-and-reliability.mdx index fe9a79e044..ee451740ad 100644 --- a/pages/generative-apis/reference-content/security-and-reliability.mdx +++ b/pages/generative-apis/reference-content/security-and-reliability.mdx @@ -11,9 +11,9 @@ This page outlines key principles and best practices to help you ensure your app ## Resilience -Resilience ensures the continuity and availability of your applications and data, even in the face of disruptions or failures. In Generative APIs, you can promote resilience through three pillars: **availability**, **durability** and **performance**. +Resilience ensures the continuity and availability of your applications and data, even in the face of disruptions or failures. In Generative APIs, you can promote resilience through three pillars: **availability**, **durability**, and **performance**. -### Availability and durability +### Availability and durability {/* QUESTION: This section is written from the point of view of Serverless. Updated it. Needs to be reviewed. */} Generative APIs Service Level Agreements (SLAs) target the following Service Level Objectives (SLOs): @@ -22,20 +22,22 @@ Generative APIs Service Level Agreements (SLAs) target the following Service Lev | Standard | Standard synchronous HTTP calls to Generative APIs providing the generated content directly in HTTP response. These calls include stream and non-stream requests. | 99.9% | | Batch | Asynchronous processing of files sent to Generative APIs providing the generated content through files. | 99.9% | -The detailed SLA measurements and guarantees can be found on the [Service Level Agreement for Generative APIs](https://www.scaleway.com/en/generative-apis/sla/) page. 
+The detailed SLA measurements and guarantees can be found on the following pages: +- [Service Level Agreement for Generative APIs - Serverless](https://www.scaleway.com/en/generative-apis/sla/) +- [Service Level Agreement for Generative APIs - Dedicated Deployment](https://www.scaleway.com/en/inference/sla/) As we do not store any data with standard processing, durability requirements do not apply. When processing data using batch processing, your input data is stored only during processing time (24 hours): - As input data storage is only temporary, no specific durability guarantee applies. -- Output data (processing results) durability depends on the target storage system used. The storage system used by default is the [Object Storage Standard Class](/object-storage/concepts/#storage-class) +- Output data (processing results) durability depends on the target storage system used. The storage system used by default is the [Object Storage Standard Class](/object-storage/concepts/#storage-class). -## Performance +## Performance {/* QUESTION: This section is written from the point of view of Serverless. Updated it. Needs to be reviewed. */} Standard processing (synchronous HTTP calls): -- Generative APIs run models on mutualized infrastructure, and therefore ensure good performance in average usage. We monitor and respond quickly to any drops in token generation throughput, but cannot strictly guarantee performance, especially during customer peak loads. As a consequence, [rate limits](/generative-apis/reference-content/rate-limits/) apply, to ensure fair use of synchronous HTTP calls. Bigger volumes of requests should be treated through batch processing. -- Guaranteed performance can be provided using dedicated resources on the [Managed Inference](/managed-inference/) product. +- The Generative APIs - Serverless offering runs models on mutualized infrastructure, and therefore ensures good performance in average usage. We monitor and respond quickly to any drops in token generation throughput, but cannot strictly guarantee performance, especially during customer peak loads. As a consequence, [rate limits](/generative-apis/reference-content/rate-limits/) apply, to ensure fair use of synchronous HTTP calls. Bigger volumes of requests should be treated through batch processing. +- Guaranteed performance can be provided using the dedicated resources of the [Generative APIs - Dedicated Deployment](/generative-apis/faq/) product. Batch processing (asynchronous file processing): - When using batch processing, we handle scheduling of batch jobs to optimize both processing resource allocation and processing time. Processing time is therefore only guaranteed to be lower than 24 hours and [rate limits](/generative-apis/reference-content/rate-limits/) (larger than Standard processing) still apply. @@ -49,24 +51,24 @@ Monitoring is an essential pillar to ensure the security and reliability of your Generative APIs metrics and logs are stored inside [Scaleway Cockpit](/cockpit/). This includes: -- **Metrics**: Input and output tokens and API requests. Metrics are refreshed every minute (some dashboards may aggregate data by the hour for accuracy reasons, but metrics can be queried at a finer rate using Cockpit custom dashboards) +- **Metrics**: Input and output tokens and API requests. Metrics are refreshed every minute (some dashboards may aggregate data by the hour for accuracy reasons, but metrics can be queried at a finer rate using Cockpit custom dashboards). 
{/* QUESTION: What are metrics specific to Dedicated Deployment? */} - **Logs**: No logs are currently stored inside Cockpit. ## Configuration and version management Configuration and version management are critical for maintaining reliability and performance across your services. -### Configuration +### Configuration {/* QUESTION: This section is written from the point of view of Serverless. Updated it. Needs to be reviewed. */} -Currently, Generative APIs do not provide specific configuration properties stored within your account. All configuration parameters are the ones you send through each API HTTP call (such as `temperature`, `top_p` or `seed`) and you remain responsible for any change in outputs based on these parameters. +Currently, the Generative APIs - Serverless product does not provide specific configuration properties stored within your account. All configuration parameters are the ones you send through each API HTTP call (such as `temperature`, `top_p`, or `seed`) and you remain responsible for any change in outputs based on these parameters. -Since Generative AI models are by definition non-deterministic, we cannot guarantee the same input will provide the same output over time (for example when using two different HTTP calls). If you want deterministic processing, we encourage you to use [Managed Inference](/managed-inference) with a specific model and set all randomness parameters to deterministic levels (for example using for instance `temperature`:`0` and a specific `seed` value). +Since Generative AI models are by definition non-deterministic, we cannot guarantee the same input will provide the same output over time (for example, when using two different HTTP calls). If you want deterministic processing, we encourage you to use the [Dedicated Deployment](/generative-apis/faq/) product with a specific model and set all randomness parameters to deterministic levels (for example, using `temperature`:`0` and a specific `seed` value). ### Version management -#### Supported models +#### Supported models {/* COMMENT: Add link to model catalog + and Dedicated Deployment lifecycle doc. */} -Any changes to supported models and associated guarantees are detailed in our [model lifecycle policy page](/generative-apis/reference-content/model-lifecycle/). +Any changes to supported Generative APIs - Serverless models and their associated guarantees are detailed on our [model lifecycle policy page](/generative-apis/reference-content/model-lifecycle/). #### API versions @@ -77,26 +79,28 @@ Two types of API version updates may be performed: | Minor | These updates do not change the API's current fields format and are backward compatible (no action is required on your side). New fields and features can however be added. | | Major | These updates change the API's current fields or paths. They may require action from your side. In this case, we will notify you with at least 3 months' warning before deprecating significant features that might break your application. | -#### Compatibility with third party tools +#### Compatibility with third-party tools -By following industry standards (such as targeting OpenAI API compatibility), we aim to provide compatibility with most AI ecosystems and tools by default. However, as ecosystems evolve quickly, we cannot always guarantee compatibility with third party tools. 
We do provide extensive documentation: -- Current API supported features are available in our [API Documentation](/generative-apis/api-cli/) -- Advanced errors and edge case workarounds are provided in our [Troubleshooting section](/generative-apis/troubleshooting/fixing-common-issues/). -- Information about integration with third party tools is available in our [dedicated documentation page](/generative-apis/reference-content/integrating-generative-apis-with-popular-tools/#openai-client-libraries) +By following industry standards (such as targeting OpenAI API compatibility), we aim to provide compatibility with most AI ecosystems and tools by default. However, as ecosystems evolve quickly, we cannot always guarantee compatibility with third-party tools. We do provide extensive documentation: +- Current API supported features are available in our API documentation: + - [Generative APIs - Serverless: API/CLI](/generative-apis/api-cli/) {/* COMMENT: Fix title and relative link. */} + - [Generative APIs - Dedicated Deployment: API](https://www.scaleway.com/en/developers/api/managed-inference/) {/* COMMENT: Fix title and relative link. */} +- Advanced errors and edge case workarounds are provided in section [Troubleshooting](/generative-apis/troubleshooting/fixing-common-issues/). {/* COMMENT: Fix title and relative link. */} +- Information about integration with third-party tools is available on our [dedicated documentation page](/generative-apis/reference-content/integrating-generative-apis-with-popular-tools/#openai-client-libraries). {/* COMMENT: Fix relative link. */} ## Data protection Our data protection measures are detailed on our [privacy policy page](/generative-apis/reference-content/data-privacy/). - Scaleway does not store sensitive data (such as the content of your prompt), unless we need it to provide the service (such as when using batch processing) -- When data is stored, it is protected using a state of the art method (such as encryption at rest) +- When data is stored, it is protected using a state-of-the-art method (such as encryption at rest) - During transit, your data is encrypted via the HTTPS protocol -### Scaleway access +### Scaleway access {/* QUESTION: This section is written from the point of view of Generative APIs. Is there any info that could be relevant for Dedicated Deployments and should be added here? */} In order to perform maintenance operations and guarantee the reliability of Generative APIs, or comply with local regulations, we need to access servers hosting the Generative APIs service. -Most of the time, any actions Scaleway carries out are automatic, e.g. updating configuration or upgrading software versions. +Most of the time, any actions Scaleway carries out are automatic, e.g., updating configuration or upgrading software versions. Manual interventions might be required occasionally for troubleshooting reasons (such as specific customer requests generating errors or carrying out malicious activity). In this event, we may temporarily store and access the full content of HTTP requests to identify a root cause issue or any security vulnerability. All Scaleway access is authenticated and traced according to industry security standards. @@ -111,11 +115,12 @@ You are responsible for attributing these permissions to the relevant users or a ## Compliance Several regulations apply to us (Scaleway) directly, whereas others apply to your usage. 
Even in this case, we help you ease your compliance process by providing you with the information you need from your cloud provider. + ### AI Act We (Scaleway) ensure our compliance with the [AI Act](https://artificialintelligenceact.eu/) within our scope of responsibilities. We also ensure that you have the information needed to comply with the requirements that apply to you. This means concretely: - Gathering information from AI Model Providers about their models (such as whether their training capacity is above 10²⁵ FLOPs, and falls into a specific category), and providing you with a link to these documents when they are made available. -- Providing you with links towards licensing required by the AI Model Providers. +- Providing you with links toward licensing required by the AI Model Providers. Scaleway has no access to, nor knowledge of any inputs and outputs generated by the models. By using our AI products, you agree and acknowledge that you are (a) responsible for this use, including any content integrated into the models, and (b) required to use the AI products in compliance with our General Terms of Services. diff --git a/pages/generative-apis/reference-content/supported-models.mdx b/pages/generative-apis/reference-content/supported-models.mdx index 05f029d4ca..af37bc5f5c 100644 --- a/pages/generative-apis/reference-content/supported-models.mdx +++ b/pages/generative-apis/reference-content/supported-models.mdx @@ -1,75 +1,751 @@ --- -title: Supported models -description: This page lists which open-source chat or embedding models Scaleway is currently hosting -tags: generative-apis ai-data supported-models +title: Generative APIs supported models +description: This page lists the open-source large language models supported by Scaleway. +tags: dates: - validation: 2025-09-12 - posted: 2024-09-02 + validation: 2026-04-24 + posted: 2024-04-18 --- +This page provides a quick overview of available models in Scaleway's catalog and their core attributes. Expand any model below to see usage examples and detailed capabilities. -Our API supports the most popular models for [Chat](/generative-apis/how-to/query-language-models), [Vision](/generative-apis/how-to/query-vision-models/), [Audio](/generative-apis/how-to/query-audio-models/) and [Embeddings](/generative-apis/how-to/query-embedding-models/). 
+ + For further information, see the following documentation: + + - [What rate limits apply with Scaleway Generative APIs - Serverless?](/generative-apis/reference-content/rate-limits/) + - [Understand Generative APIs model lifecycle](/generative-apis/reference-content/model-lifecycle/) + - [OpenAI API compatibility](/generative-apis/reference-content/openai-compatibility/) + -## Multimodal models -### Chat and Vision models +## Models technical summary -| Provider | Model string | Context window (Tokens) | Maximum output (Tokens)| License \* | Model card | -|-----------------|-----------------|-----------------|-----------------|-----------------|-----------------| -| Qwen | `qwen3.5-397b-a17b` | 250k | 16k | [Apache-2.0](https://www.apache.org/licenses/LICENSE-2.0) | [HF](https://huggingface.co/Qwen/Qwen3.5-397B-A17B) | -| Mistral | `mistral-small-3.2-24b-instruct-2506` | 128k | 32k | [Apache-2.0](https://www.apache.org/licenses/LICENSE-2.0) | [HF](https://huggingface.co/mistralai/Mistral-Small-3.2-24B-Instruct-2506) | -| Google (Preview) | `gemma-3-27b-it` | 40k | 8k | [Gemma](https://ai.google.dev/gemma/terms) | [HF](https://huggingface.co/google/gemma-3-27b-it) | -| H | `holo2-30b-a3b` | 22k | 32k | [CC-BY-NC-4.0](https://spdx.org/licenses/CC-BY-NC-4.0) | [HF](https://huggingface.co/Hcompany/Holo2-30B-A3B) | +| Model name | Available in Serverless? | Maximum context window (tokens) | Maximum output (tokens) - Serverless | Modalities | License \* | +|------------|--------------|--------------|------------|-----------|-----------| +| [`gpt-oss-120b`](#gpt-oss-120b)| Yes | 128k | 32k | Text | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) | +| [`whisper-large-v3`](#whisper-large-v3) | Yes | - | - | Audio transcription | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) | +| [`qwen3.5-397b-a17b`](#qwen35-397b-a17b)| Yes | 250k | 16k | Text, Code, Vision | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) | +| [`qwen3-235b-a22b-instruct-2507`](#qwen3-235b-a22b-instruct-2507) | Yes | 250k | 16k | Text | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) | +| [`gemma-3-27b-it`](#gemma-3-27b-it) | Yes | 40k | 8k | Text, Vision | [Gemma](https://ai.google.dev/gemma/terms) | +| [`llama-3.3-70b-instruct`](#llama-33-70b-instruct) | Yes | 100k (Serverless)/ 128k (Dedicated)| 16k | Text | [Llama 3.3 Community](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct) | +| [`llama-3.1-70b-instruct`](#llama-31-70b-instruct) | [EOL for Serverless](#end-of-life-eol-models-for-serverless) | 128k | [EOL for Serverless](#end-of-life-eol-models-for-serverless) | Text | [Llama 3.1 Community](https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct/blob/main/LICENSE) | +| [`llama-3.1-8b-instruct`](#llama-31-8b-instruct) | [EOL for Serverless](#end-of-life-eol-models-for-serverless) | 128k | [EOL for Serverless](#end-of-life-eol-models-for-serverless) | Text | [Llama 3.1 Community](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct/blob/main/LICENSE) | +| [`llama-3-70b-instruct`](#llama-3-70b-instruct) | No | 8k | N/A | Text | [Llama 3 Community](https://huggingface.co/meta-llama/Meta-Llama-3-8B/blob/main/LICENSE) | +| [`llama-3.1-nemotron-70b-instruct`](#llama-31-nemotron-70b-instruct) | No | 128k | N/A | Text | [Llama 3.1 Community](https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct/blob/main/LICENSE) | +| [`deepseek-r1-distill-llama-70b`](#deepseek-r1-distill-llama-70b) | [EOL for Serverless](#end-of-life-eol-models-for-serverless) | 16k (Serverless) / 128k 
(Dedicated) | 4k | Text | [MIT](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B/blob/main/LICENSE) and [Llama 3.3 Community](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct/blob/main/LICENSE) | +| [`deepseek-r1-distill-llama-8b`](#deepseek-r1-distill-llama-8b) | No | 128k | N/A | Text | [MIT](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B/blob/main/LICENSE) and [Llama 3.1 Community](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct/blob/main/LICENSE) | +| [`mistral-7b-instruct-v0.3`](#mistral-7b-instruct-v03) | No | 32k | N/A | Text | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) | +| [`mistral-large-3-675b-instruct-2512`](#mistral-large-3-675b-instruct-2512) | No | 250k | N/A | Text, Vision | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) | +| [`mistral-small-3.2-24b-instruct-2506`](#mistral-small-32-24b-instruct-2506) | Yes | 128k | 32k | Text, Vision | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) | +| [`mistral-small-3.1-24b-instruct-2503`](#mistral-small-31-24b-instruct-2503) | [EOL for Serverless](#end-of-life-eol-models-for-serverless) | 128k | [EOL for Serverless](#end-of-life-eol-models-for-serverless) | Text, Vision | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) | +| [`mistral-small-24b-instruct-2501`](#mistral-small-24b-instruct-2501) | No | 32k | N/A | Text | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) | +| [`voxtral-small-24b-2507`](#voxtral-small-24b-2507) | Yes | 32k | 16k | Text, Audio | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) | +| [`mistral-nemo-instruct-2407`](#mistral-nemo-instruct-2407) | [EOL for Serverless](#end-of-life-eol-models-for-serverless) | 128k | 8k | Text | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) | +| [`mixtral-8x7b-instruct-v0.1`](#mixtral-8x7b-instruct-v01) | No | 32k | N/A | Text | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) | +| [`magistral-small-2506`](#magistral-small-2506) | No | 32k | N/A | Text | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) | +| [`devstral-2-123b-instruct-2512`](#devstral-2-123b-instruct-2512) | Yes | 200k (Serverless)/ 260k (Dedicated) | 16k | Text, Code | [Modified MIT](https://huggingface.co/mistralai/Devstral-2-123B-Instruct-2512/blob/main/LICENSE) | +| [`devstral-small-2505`](#devstral-small-2505) | [EOL for Serverless](#end-of-life-eol-models-for-serverless) | 128k | [EOL for Serverless](#end-of-life-eol-models-for-serverless) | Text | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) | +| [`pixtral-12b-2409`](#pixtral-12b-2409) | Yes | 128k | 4k | Text, Vision | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) | +| [`molmo-72b-0924`](#molmo-72b-0924) | No | 50k | N/A | Text, Vision | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) and [Twonyi Qianwen license](https://huggingface.co/Qwen/Qwen2-72B/blob/main/LICENSE)| +| [`holo2-30b-a3b`](#holo2-30b-a3b)| Yes | 22k | 32k | Text, Vision | [CC-BY-NC-4.0](https://spdx.org/licenses/CC-BY-NC-4.0)| +| [`qwen3-embedding-8b`](#qwen3-embedding-8b) | Yes | 32k | N/A | Embeddings | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) | +| [`qwen3-coder-30b-a3b-instruct`](#qwen3-coder-30b-a3b-instruct) | Yes | 128k | 32k | Code | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) | +| [`qwen2.5-coder-32b-instruct`](#qwen25-coder-32b-instruct) | [EOL for Serverless](#end-of-life-eol-models-for-serverless) | 32k | [EOL for Serverless](#end-of-life-eol-models-for-serverless) | Code 
| [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) |
+| [`bge-multilingual-gemma2`](#bge-multilingual-gemma2) | Yes | 8k | N/A | Embeddings | [Gemma](https://ai.google.dev/gemma/terms) |
+| [`sentence-t5-xxl`](#sentence-t5-xxl) | [EOL for Serverless](#end-of-life-eol-models-for-serverless) | 512 | [EOL for Serverless](#end-of-life-eol-models-for-serverless) | Embeddings | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) |

\*Licences which are not open-weight and may restrict commercial usage (such as `CC-BY-NC-4.0`), do not apply to usage through Scaleway Products due to existing partnerships between Scaleway and the corresponding providers. Original licences are provided for transparency only.

-### Chat and Audio models
+## Model details
+
+  Despite efforts for accuracy, the possibility of generated text containing inaccuracies or [hallucinations](/generative-apis/concepts/#hallucinations) exists. Always verify the content generated independently.
+

-| Provider | Model string | Context window (Tokens) | Maximum output (Tokens)| License | Model card |
-|-----------------|-----------------|-----------------|-----------------|-----------------|-----------------|
-| Mistral | `voxtral-small-24b-2507` | 32k | 16k | [Apache-2.0](https://www.apache.org/licenses/LICENSE-2.0) | [HF](https://huggingface.co/mistralai/Voxtral-Small-24B-2507) |
+## Multimodal models (Text and Vision)

-### Audio transcription models
+
+  Vision models can understand and analyze images, not generate them. You will use vision models through the `/v1/chat/completions` endpoint.
+

-| Provider | Model string | Maximum audio duration (Minutes) | Chunk size (Seconds) | Maximum file size (MB) | License | Model card |
-|-----------------|-----------------|-----------------|-----------------|-----------------|-----------------|-----------------|
-| Mistral | `voxtral-small-24b-2507` | 30 | 30 | 25 | [Apache-2.0](https://www.apache.org/licenses/LICENSE-2.0) | [HF](https://huggingface.co/mistralai/Voxtral-Small-24B-2507) |
-| OpenAI | `whisper-large-v3` | - | 30 | 25 | [Apache-2.0](https://www.apache.org/licenses/LICENSE-2.0) | [HF](https://huggingface.co/openai/whisper-large-v3) |

+### Gemma-3-27b-it
+Gemma-3-27b-it is a model developed by Google to perform text processing and image analysis in many languages.
+The model was not trained specifically to output function/tool call tokens. Hence, function calling is currently supported, but reliability remains limited.
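+
+Below is a minimal sketch of sending an image to this model through the OpenAI-compatible Chat Completions endpoint. The base URL, placeholder API key, and image URL are illustrative assumptions, not values taken from this page.
+
+```python
+from openai import OpenAI
+
+# Assumption: Scaleway's OpenAI-compatible endpoint and an IAM API secret key.
+client = OpenAI(
+    base_url="https://api.scaleway.ai/v1",
+    api_key="<SCW_SECRET_KEY>",
+)
+
+response = client.chat.completions.create(
+    model="gemma-3-27b-it",
+    messages=[
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "Describe the content of this image."},
+                # Hypothetical image URL: replace it with a publicly accessible image.
+                {"type": "image_url", "image_url": {"url": "https://example.com/image.png"}},
+            ],
+        }
+    ],
+)
+print(response.choices[0].message.content)
+```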
-## Chat models +| Attribute | Value | +|-----------|-------| +| Provider | Google | +| Supports structured output | Yes | +| Supports function calling | Partial | +| Supports parallel tool-calling | No | +| Supported image formats | PNG, JPEG, WEBP, and non-animated GIFs | +| Maximum image resolution (pixels) | 896x896 | +| Token dimension (pixels)| 56x56 | +| Supported languages | English, Chinese, Japanese, Korean, and 31 additional languages | +| Compatible Instances (max context in tokens\*) - Dedicated Deployment | H100, H100-2 | +| Hugging Face model card | [gemma-3-27b-it](https://huggingface.co/google/gemma-3-27b-it) | -| Provider | Model string | Context window (Tokens) | Maximum output (Tokens)| License | Model card | -|-----------------|-----------------|-----------------|-----------------|-----------------|-----------------| -| OpenAI | `gpt-oss-120b` | 128k | 32k | [Apache-2.0](https://www.apache.org/licenses/LICENSE-2.0) | [HF](https://huggingface.co/openai/gpt-oss-120b) | -| Meta | `llama-3.3-70b-instruct` | 100k | 16k | [Llama 3.3 Community](https://www.llama.com/llama3_3/license/) | [HF](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct) | -| Meta | `llama-3.1-8b-instruct` | 128k | 16k | [Llama 3.1 Community](https://llama.meta.com/llama3_1/license/) | [HF](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) | -| Mistral | `mistral-nemo-instruct-2407` | 128k | 8k | [Apache-2.0](https://www.apache.org/licenses/LICENSE-2.0) | [HF](https://huggingface.co/mistralai/Mistral-Nemo-Instruct-2407) | -| Mistral | `devstral-2-123b-instruct-2512` | 200k | 16k | [Modified MIT](https://huggingface.co/mistralai/Devstral-2-123B-Instruct-2512/blob/main/LICENSE) | [HF](https://huggingface.co/mistralai/Devstral-2-123B-Instruct-2512) | -| Qwen | `qwen3-235b-a22b-instruct-2507` | 250k | 16k | [Apache-2.0](https://www.apache.org/licenses/LICENSE-2.0) | [HF](https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507) | -| Qwen | `qwen3-coder-30b-a3b-instruct` | 128k | 32k | [Apache-2.0](https://www.apache.org/licenses/LICENSE-2.0) | [HF](https://huggingface.co/Qwen/Qwen3-Coder-30B-A3B-Instruct) | -| DeepSeek | `deepseek-r1-distill-llama-70b` | 16k | 4k | [MIT](https://huggingface.co/datasets/choosealicense/licenses/blob/main/markdown/mit.md) | [HF](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B) | +\*Maximum context length is only mentioned when an instance's VRAM size limits context length. Otherwise, maximum context length is the one defined by the model. - - If you are unsure which chat model to use, we currently recommend `mistral-small-3.2-24b-instruct-2506` to get started, and `qwen3.5-397b-a17b` for best accuracy or coding tasks. - +Pan & Scan is not yet supported for Gemma 3 images. This means that high-resolution images are currently resized to 896x896 resolution, which may generate artifacts and lead to a lower accuracy. + +#### Model names +``` +google/gemma-3-27b-it:bf16 +``` + +### Mistral-large-3-675b-instruct-2512 +Mistral-large-3-675b-instruct-2512 is a frontier model, performing among the best open-weight models as of December 2025. It is ideal for agentic workflows and image understanding. 
+ +| Attribute | Value | +|-----------|-------| +| Provider | Mistral | +| Supports structured output | Yes | +| Supports function calling | Yes | +| Supports parallel tool-calling | Yes | +| Supported image formats | PNG, JPEG, WEBP, and non-animated GIFs | +| Maximum image resolution (pixels) | 1540x1540 | +| Token dimension (pixels)| 28x28 | +| Supported languages | English, French, German, Spanish, Italian, Portuguese, Dutch, Chinese, Japanese, Korean, Arabic | +| Compatible Instances (max context in tokens\*) - Dedicated Deployment | H100-SXM-8 (180k) | + +\*Maximum context length is only mentioned when an instance's VRAM size limits context length. Otherwise, maximum context length is the one defined by the model. + +#### Model names +``` +mistral/mistral-large-3-675b-instruct-2512:fp4 +``` + +### Mistral-small-3.2-24b-instruct-2506 +Mistral-small-3.2-24b-instruct-2506 is an improved version of Mistral-small-3.1, which performs better on tool-calling. +This model was optimized to have a dense knowledge and faster token throughput compared to its size. + +| Attribute | Value | +|-----------|-------| +| Provider | Mistral | +| Supports structured output | Yes | +| Supports function calling | Yes | +| Supports parallel tool-calling | Yes | +| Supported image formats | PNG, JPEG, WEBP, and non-animated GIFs | +| Maximum image resolution (pixels) | 1540x1540 | +| Token dimension (pixels)| 28x28 | +| Supported languages | English, French, German, Greek, Hindi, Indonesian, Italian, Japanese, Korean, Malay, Nepali, Polish, Portuguese, Romanian, Russian, Serbian, Spanish, Swedish, Turkish, Ukrainian, Vietnamese, Arabic, Bengali, Chinese, Farsi | +| Compatible Instances (max context in tokens\*) - Dedicated Deployment | H100, H100-2 | +| Hugging Face model card | [mistral-small-3.2-24b-instruct-2506](https://huggingface.co/mistralai/Mistral-Small-3.2-24B-Instruct-2506) | + +\*Maximum context length is only mentioned when an instance's VRAM size limits context length. Otherwise, maximum context length is the one defined by the model. + +#### Model names +``` +mistral/mistral-small-3.2-24b-instruct-2506:fp8 +``` + +### Mistral-small-3.1-24b-instruct-2503 +Mistral-small-3.1-24b-instruct-2503 is a model developed by Mistral to perform text processing and image analysis on many languages. +This model was optimized to have a dense knowledge and faster token throughput compared to its size. + +| Attribute | Value | +|-----------|-------| +| Provider | Mistral | +| Supports structured output | Yes | +| Supports function calling | Yes | +| Supports parallel tool-calling | Yes | +| Supported image formats | PNG, JPEG, WEBP, and non-animated GIFs | +| Maximum image resolution (pixels) | 1540x1540 | +| Token dimension (pixels)| 28x28 | +| Supported languages | English, French, German, Greek, Hindi, Indonesian, Italian, Japanese, Korean, Malay, Nepali, Polish, Portuguese, Romanian, Russian, Serbian, Spanish, Swedish, Turkish, Ukrainian, Vietnamese, Arabic, Bengali, Chinese, Farsi | +| Compatible Instances (max context in tokens\*) - Dedicated Deployment | H100, H100-2 | + +\*Maximum context length is only mentioned when an instance's VRAM size limits context length. Otherwise, maximum context length is the one defined by the model. + +- Bitmap (or raster) image formats, meaning storing images as grids of individual pixels, are supported. Vector image formats (SVG, PSD) are not supported, neither PDFs nor videos. +- Image size is limited in the following ways: + - Directly by the maximum context window. 
As an example, since tokens are squares of 28x28 pixels, the maximum context window taken by a single image is `3025` tokens (i.e., `(1540*1540)/(28*28)`) + - Indirectly by the model accuracy: resolution above 1540x1540 will not increase model output accuracy. Indeed, images above a width or height of 1540 pixels will be automatically downscaled to fit within the 1540x1540 dimension. Note that image ratio and overall aspect is preserved (images are not cropped, only additionally compressed). + +#### Model names +``` +mistral/mistral-small-3.1-24b-instruct-2503:bf16 +mistral/mistral-small-3.1-24b-instruct-2503:fp8 +``` + +### Qwen3.5-397b-a17b +Qwen3.5-397b-a17b is a model developed by Qwen to perform text processing, agentic coding, image, and video analysis in several languages. +This model was released as a frontier reasoning model on 16 February 2026. + +| Attribute | Value | +|-----------|-------| +| Provider | Qwen | +| Supports structured output | Yes | +| Supports function calling | Yes | +| Supports parallel tool-calling | Yes | +| Supported image formats | PNG, JPEG, WEBP, and non-animated GIFs | +| Supported video formats | MP4, MPEG, MOV, OGG and WEBM | +| Maximum image resolution (pixels) | 4096x4096 | +| Token dimension (pixels)| 32x32 | +| Supported languages | English, French, German, Chinese, Japanese, Korean, and 113 additional languages and dialects | +| Compatible Instances (max context in tokens\*) - Dedicated Deployment | H100-SXM-8 | +| Hugging Face model card | [qwen3.5-397b-a17b](https://huggingface.co/Qwen/Qwen3.5-397B-A17B) | + +\*Maximum context length is only mentioned when an instance's VRAM size limits context length. Otherwise, maximum context length is the one defined by the model. + +#### Model names +``` +qwen/qwen3.5-397b-a17b:int4 +``` + +### Pixtral-12b-2409 +Pixtral is a vision language model introducing a novel architecture: 12B parameter multimodal decoder plus 400M parameter vision encoder. +It can analyze images and offer insights from visual content alongside text. + +| Attribute | Value | +|-----------|-------| +| Provider | Mistral | +| Supports structured output | Yes | +| Supports function calling | Yes | +| Supports parallel tool-calling | Yes | +| Supported image formats | PNG, JPEG, WEBP, and non-animated GIFs | +| Maximum image resolution (pixels) | 1024x1024 | +| Token dimension (pixels)| 16x16 | +| Maximum images per request | 12 | +| Supported languages | English | +| Compatible Instances (max context in tokens\*) - Dedicated Deployment | L40S (50k), H100, H100-2 | +| Hugging Face model card | [pixtral-12b-2409](https://huggingface.co/mistralai/Pixtral-12B-2409) | + +\*Maximum context length is only mentioned when an instance's VRAM size limits context length. Otherwise, maximum context length is the one defined by the model. + +#### Model name +``` +mistral/pixtral-12b-2409:bf16 +``` +### Holo2-30b-a3b +Holo2 30B is a text and vision model optimized to analyze a Graphical User Interface, such as a web browser or software, and take actions. 
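+
+As a hedged sketch only: the example below asks the model for a GUI action as JSON, assuming the OpenAI-compatible `response_format` structured output option is available for this model. The base URL, JSON schema, and screenshot URL are illustrative assumptions.
+
+```python
+import json
+
+from openai import OpenAI
+
+# Assumption: Scaleway's OpenAI-compatible endpoint and an IAM API secret key.
+client = OpenAI(base_url="https://api.scaleway.ai/v1", api_key="<SCW_SECRET_KEY>")
+
+# Hypothetical schema describing a single UI action to perform.
+action_schema = {
+    "name": "ui_action",
+    "schema": {
+        "type": "object",
+        "properties": {
+            "action": {"type": "string", "enum": ["click", "type", "scroll"]},
+            "x": {"type": "integer"},
+            "y": {"type": "integer"},
+        },
+        "required": ["action", "x", "y"],
+        "additionalProperties": False,
+    },
+}
+
+response = client.chat.completions.create(
+    model="holo2-30b-a3b",
+    messages=[
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "Return the next action needed to open the settings menu."},
+                # Hypothetical screenshot URL: replace it with your own capture.
+                {"type": "image_url", "image_url": {"url": "https://example.com/screenshot.png"}},
+            ],
+        }
+    ],
+    response_format={"type": "json_schema", "json_schema": action_schema},
+)
+print(json.loads(response.choices[0].message.content))
+```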
+ +| Attribute | Value | +|-----------|-------| +| Provider | H | +| Supports structured output | Yes | +| Supports function calling | No | +| Supports parallel tool-calling | Yes | +| Supported image formats | PNG, JPEG, WEBP, and non-animated GIFs | +| Token dimension (pixels)| 16x16 | +| Supported languages | English | +| Compatible Instances (max context in tokens\*) - Dedicated Deployment | H100-SXM-2 | +| Hugging Face model card | [holo2-30b-a3b](https://huggingface.co/Hcompany/Holo2-30B-A3B) | + +\*Maximum context length is only mentioned when an instance's VRAM size limits context length. Otherwise, maximum context length is the one defined by the model. + +#### Model name +``` +hcompany/holo2-30b-a3b:bf16 +``` + +### Molmo-72b-0924 +Molmo 72B is the powerhouse of the Molmo family of multimodal models developed by the renowned research lab Allen Institute for AI. +Vision-language models like Molmo can analyze an image and offer insights from visual content alongside text. This multimodal functionality creates new opportunities for applications that need both visual and textual comprehension. + +| Attribute | Value | +|-----------|-------| +| Provider | Allen Institute for AI | +| Supports structured output | Yes | +| Supports function calling | No | +| Supported languages | English | +| Compatible Instances (max context in tokens\*) - Dedicated Deployment | H100-2 | + +\*Maximum context length is only mentioned when an instance's VRAM size limits context length. Otherwise, maximum context length is the one defined by the model. + + +#### Model name +``` +allenai/molmo-72b-0924:fp8 +``` + +## Multimodal models (Text and Audio) + +### Voxtral-small-24b-2507 +Voxtral-small-24b-2507 is a model developed by Mistral to perform text processing and audio analysis on many languages. +This model was optimized to enable transcription in many languages while keeping conversational capabilities (translations, classification, etc.) + +| Attribute | Value | +|-----------|-------| +| Provider | Mistral | +| Supports structured output | Yes | +| Supports function calling | Yes | +| Supports parallel tool-calling | Yes | +| Supported audio formats | WAV and MP3 | +| Audio chunk duration | 30 seconds | +| Token duration (audio)| 80ms | +| Maximum transcription duration| 30 minutes | +| Maximum understanding duration| 40 minutes | +| Maximum file size - Serverless | 25 MB | +| Supported languages | English, French, German, Dutch, Spanish, Italian, Portuguese, Hindi | +| Compatible Instances (max context in tokens\*) - Dedicated Deployment | H100, H100-2 | +| Hugging Face model card | [voxtral-small-24b-2507](https://huggingface.co/mistralai/Voxtral-Small-24B-2507) | + +\*Maximum context length is only mentioned when an instance's VRAM size limits context length. Otherwise, maximum context length is the one defined by the model. + +- Mono and stereo audio formats are supported. For stereo formats, both left and right channels are merged before being processed. +- Audio files are processed in 30-second chunks: + - If audio sent is less than 30 seconds, the rest of the chunk will be considered silent. + - 80ms is equal to 1 input token. + +#### Model names +``` +mistral/voxtral-small-24b-2507:bf16 +mistral/voxtral-small-24b-2507:fp8 +``` + +## Audio transcription models + +### Whisper-large-v3 +Whisper-large-v3 is a model developed by OpenAI to transcribe audio in many languages. +This model is optimized for audio transcription tasks. 
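+
+Below is a minimal transcription sketch using the OpenAI-compatible Audio API. The base URL and the local `meeting.mp3` file name are illustrative assumptions.
+
+```python
+from openai import OpenAI
+
+# Assumption: Scaleway's OpenAI-compatible endpoint and an IAM API secret key.
+client = OpenAI(base_url="https://api.scaleway.ai/v1", api_key="<SCW_SECRET_KEY>")
+
+# Hypothetical local audio file: WAV and MP3 are supported, up to 25 MB in Serverless.
+with open("meeting.mp3", "rb") as audio_file:
+    transcription = client.audio.transcriptions.create(
+        model="whisper-large-v3",
+        file=audio_file,
+    )
+
+print(transcription.text)
+```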
+ +| Attribute | Value | +|-----------|-------| +| Provider | OpenAI | +| Supports structured output | - | +| Supports function calling | - | +| Supports parallel tool-calling | - | +| Supported audio formats | WAV and MP3 | +| Audio chunk duration | 30 seconds | +| Maximum file size - Serverless | 25 MB | +| Supported languages | English, French, German, Chinese, Japanese, Korean, and 81 additional languages | +| Compatible Instances (max context in tokens\*) - Dedicated Deployment | L4, L40S, H100, H100-SXM-2 | +| Hugging Face model card | [whisper-large-v3](https://huggingface.co/openai/whisper-large-v3) | + +\*Maximum context length is only mentioned when an instance's VRAM size limits context length. Otherwise, maximum context length is the one defined by the model. + +- Mono and stereo audio formats are supported. For stereo formats, left and right channels are merged before being processed. +- Audio files are processed in 30-second chunks: + - If audio sent is less than 30 seconds, the rest of the chunk will be considered silent. + +#### Model names +``` +openai/whisper-large-v3:bf16 +``` + +## Text models + +### Qwen3-235b-a22b-instruct-2507 +Released 23 July 2025, Qwen 3 235B A22B is an open-weight model, competitive in multiple benchmarks (such as [LM Arena for text use cases](https://lmarena.ai/leaderboard)) compared to Gemini 2.5 Pro and GPT4.5. + +| Attribute | Value | +|-----------|-------| +| Provider | Qwen | +| Supports structured output | Yes | +| Supports function calling | Yes | +| Supports parallel tool-calling | Yes | +| Supported languages | English, French, German, Chinese, Japanese, Korean, and 113 additional languages and dialects | +| Compatible Instances (max context in tokens\*) - Dedicated Deployment | H100-SXM-2 (40k), H100-SXM-4 | +| Hugging Face model card | [qwen3-235b-a22b-instruct-2507](https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507) | + +\*Maximum context length is only mentioned when an instance's VRAM size limits context length. Otherwise, maximum context length is the one defined by the model. + + +#### Model name +``` +qwen/qwen3-235b-a22b-instruct-2507 +``` + +### Gpt-oss-120b +Released 5 August 2025, GPT OSS 120B is an open-weight model providing significant throughput performance and reasoning capabilities. +Currently, this model should be used through Responses API, as Chat Completion does not yet support tool-calling for this model. + +| Attribute | Value | +|-----------|-------| +| Provider | OpenAI | +| Supports structured output | Yes | +| Supports function calling | Yes | +| Supports parallel tool-calling | Yes | +| Supported languages | English | +| Compatible Instances (max context in tokens\*) - Dedicated Deployment | H100 | +| Hugging Face model card | [gpt-oss-120b](https://huggingface.co/openai/gpt-oss-120b) | + +\*Maximum context length is only mentioned when an instance's VRAM size limits context length. Otherwise, maximum context length is the one defined by the model. + +#### Model name +``` +openai/gpt-oss-120b:fp4 +``` + +### Llama-3.3-70b-instruct +Released 6 December 2024, Meta’s Llama 3.3 70b is a fine-tune of the [Llama 3.1 70b](/generative-apis/reference-content/supported-models/#llama-31-70b-instruct) model. +This model is still text-only (text in/text out). However, Llama 3.3 was designed to approach the performance of Llama 3.1 405B on some applications. 
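+
+Since this model supports function calling, below is a minimal tool-calling sketch through the OpenAI-compatible Chat Completions endpoint. The base URL and the `get_weather` tool are illustrative assumptions; executing any returned tool call remains your responsibility.
+
+```python
+from openai import OpenAI
+
+# Assumption: Scaleway's OpenAI-compatible endpoint and an IAM API secret key.
+client = OpenAI(base_url="https://api.scaleway.ai/v1", api_key="<SCW_SECRET_KEY>")
+
+# Hypothetical tool definition: the model can request it, but never executes it itself.
+tools = [
+    {
+        "type": "function",
+        "function": {
+            "name": "get_weather",
+            "description": "Get the current weather for a given city.",
+            "parameters": {
+                "type": "object",
+                "properties": {"city": {"type": "string"}},
+                "required": ["city"],
+            },
+        },
+    }
+]
+
+response = client.chat.completions.create(
+    model="llama-3.3-70b-instruct",
+    messages=[{"role": "user", "content": "What is the weather like in Paris?"}],
+    tools=tools,
+)
+
+# The model either answers directly or returns one or more tool calls to run.
+print(response.choices[0].message.tool_calls)
+```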
+| Attribute | Value | +|-----------|-------| +| Provider | Meta | +| Supports structured output | Yes | +| Supports function calling | Yes | +| Supports parallel tool-calling | Yes | +| Supported languages | English, German, French, Italian, Portuguese, Hindi, Spanish, and Thai | +| Compatible Instances (max context in tokens\*) - Dedicated Deployment | H100 (15k), H100-2 | +| Hugging Face model card | [llama-3.3-70b-instruct](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct) | + +\*Maximum context length is only mentioned when an instance's VRAM size limits context length. Otherwise, maximum context length is the one defined by the model. -## Vision models +#### Model name +``` +meta/llama-3.3-70b-instruct:fp8 +meta/llama-3.3-70b-instruct:bf16 +``` -| Provider | Model string | Context window (Tokens) | Maximum output (Tokens)| License | Model card | -|-----------------|-----------------|-----------------|-----------------|-----------------|-----------------| -| Mistral | `pixtral-12b-2409` | 128k | 4k | [Apache-2.0](https://www.apache.org/licenses/LICENSE-2.0) | [HF](https://huggingface.co/mistralai/Pixtral-12B-2409) | +### Llama-3.1-70b-instruct +Released 23 July 2024, Meta’s Llama 3.1 is an iteration of the open-access Llama family. +Llama 3.1 was designed to match the best proprietary models and outperform many of the available open-source common industry benchmarks. + +| Attribute | Value | +|-----------|-------| +| Provider | Meta | +| Supports structured output | Yes | +| Supports function calling | Yes | +| Supports parallel tool-calling | Yes | +| Supported languages | English, German, French, Italian, Portuguese, Hindi, Spanish, and Thai | +| Compatible Instances (max context in tokens\*) - Dedicated Deployment | H100 (15k), H100-2 | + +\*Maximum context length is only mentioned when an instance's VRAM size limits context length. Otherwise, maximum context length is the one defined by the model. + +#### Model names +``` +meta/llama-3.1-70b-instruct:fp8 +meta/llama-3.1-70b-instruct:bf16 +``` + +### Llama-3.1-8b-instruct +Released 23 July 2024, Meta’s Llama 3.1 is an iteration of the open-access Llama family. +Llama 3.1 was designed to match the best proprietary models and outperform many of the available open-source common industry benchmarks. + +| Attribute | Value | +|-----------|-------| +| Provider | Meta | +| Supports structured output | Yes | +| Supports function calling | Yes | +| Supports parallel tool-calling | Yes | +| Supported languages | English, German, French, Italian, Portuguese, Hindi, Spanish, and Thai | +| Compatible Instances (max context in tokens\*) - Dedicated Deployment | L4 (90k), L40S, H100, H100-2 | +| Hugging Face model card | [llama-3.1-8b-instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) | + +\*Maximum context length is only mentioned when an instance's VRAM size limits context length. Otherwise, maximum context length is the one defined by the model. + +#### Model names +``` +meta/llama-3.1-8b-instruct:fp8 +meta/llama-3.1-8b-instruct:bf16 +``` + +### Llama-3-70b-instruct +Meta’s Llama 3 is an iteration of the open-access Llama family. +Llama 3 was designed to match the best proprietary models, enhanced by community feedback for greater utility and responsibly spearheading the deployment of LLMs. +With a commitment to open-source principles, this release marks the beginning of a multilingual, multimodal future for Llama 3, pushing the boundaries in reasoning and coding capabilities. 
+ +| Attribute | Value | +|-----------|-------| +| Provider | Meta | +| Supports structured output | Yes | +| Supports function calling | No | +| Supported languages | English | +| Compatible Instances (max context in tokens\*) - Dedicated Deployment | H100, H100-2 | + +#### Model name +``` +meta/llama-3-70b-instruct:fp8 +``` + +### Llama-3.1-Nemotron-70b-instruct +Introduced 14 October 2024, NVIDIA's Nemotron 70B Instruct is a specialized version of the Llama 3.1 model designed to follow complex instructions. +NVIDIA employed Reinforcement Learning from Human Feedback (RLHF) to fine-tune the model’s ability to generate relevant and informative responses. + +| Attribute | Value | +|-----------|-------| +| Provider | Nvidia | +| Supports structured output | Yes | +| Supports function calling | Yes | +| Supported languages | English | +| Compatible Instances (max context in tokens\*) - Dedicated Deployment | H100 (15k), H100-2 | + +#### Model name +``` +nvidia/llama-3.1-nemotron-70b-instruct:fp8 +``` + +### DeepSeek-R1-Distill-Llama-70B +Released 21 January 2025, Deepseek’s R1 Distilled Llama 70B is a distilled version of the Llama model family based on Deepseek R1. +DeepSeek R1 Distill Llama 70B is designed to improve the performance of Llama models on reasoning use cases, such as mathematics and coding tasks. + +| Attribute | Value | +|-----------|-------| +| Provider | DeepSeek | +| Supports structured output | Yes | +| Supports function calling | Yes | +| Supported languages | English, Chinese | +| Compatible Instances (max context in tokens\*) - Dedicated Deployment | H100 (13k), H100-2 | +| Hugging Face model card | [deepseek-r1-distill-llama-70b](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B) | + +\*Maximum context length is only mentioned when an instance's VRAM size limits context length. Otherwise, maximum context length is the one defined by the model. + +#### Model name +``` +deepseek/deepseek-r1-distill-llama-70b:fp8 +deepseek/deepseek-r1-distill-llama-70b:bf16 +``` + +### DeepSeek-R1-Distill-Llama-8B +Released 21 January 2025, Deepseek’s R1 Distilled Llama 8B is a distilled version of the Llama model family based on Deepseek R1. +DeepSeek R1 Distill Llama 8B is designed to improve the performance of Llama models on reasoning use cases, such as mathematics and coding tasks. + +| Attribute | Value | +|-----------|-------| +| Provider | DeepSeek | +| Supports structured output | Yes | +| Supports function calling | Yes | +| Supported languages | English, Chinese | +| Compatible Instances (max context in tokens\*) - Dedicated Deployment | L4 (90k), L40S, H100, H100-2 | + +#### Model names +``` +deepseek/deepseek-r1-distill-llama-8b:fp8 +deepseek/deepseek-r1-distill-llama-8b:bf16 +``` + +### Mixtral-8x7b-instruct-v0.1 +Mixtral-8x7b-instruct-v0.1, developed by Mistral, is tailored for instructional platforms and virtual assistants. +Trained on vast instructional datasets, it provides clear and concise instructions across various domains, enhancing user learning experiences. 
+
+| Attribute | Value |
+|-----------|-------|
+| Provider | Mistral |
+| Supports structured output | Yes |
+| Supports function calling | No |
+| Supported languages | English, French, German, Italian, Spanish |
+| Compatible Instances (max context in tokens\*) - Dedicated Deployment | H100-2 |
+
+#### Model names
+```
+mistral/mixtral-8x7b-instruct-v0.1:fp8
+mistral/mixtral-8x7b-instruct-v0.1:bf16
+```
+
+### Mistral-7b-instruct-v0.3
+The first dense model released by Mistral AI, perfect for experimentation, customization, and quick iteration. At the time of the release, it matched the capabilities of models up to 30B parameters.
+This model is open-weight and distributed under the Apache 2.0 license.
+
+| Attribute | Value |
+|-----------|-------|
+| Provider | Mistral |
+| Supports structured output | Yes |
+| Supports function calling | Yes |
+| Supported languages | English |
+| Compatible Instances (max context in tokens\*) - Dedicated Deployment | L4, L40S, H100, H100-2 |
+
+#### Model name
+```
+mistral/mistral-7b-instruct-v0.3:bf16
+```
+
+### Mistral-small-24b-instruct-2501
+Mistral Small 24B Instruct is a state-of-the-art transformer model of 24B parameters, built by Mistral.
+This model is open-weight and distributed under the Apache 2.0 license.
+
+| Attribute | Value |
+|-----------|-------|
+| Provider | Mistral |
+| Supports structured output | Yes |
+| Supports function calling | Yes |
+| Supported languages | English, French, German, Dutch, Spanish, Italian, Polish, Portuguese, Chinese, Japanese, Korean |
+| Compatible Instances (max context in tokens\*) - Dedicated Deployment | L40S (20k), H100, H100-2 |
+
+#### Model name
+```
+mistral/mistral-small-24b-instruct-2501:fp8
+mistral/mistral-small-24b-instruct-2501:bf16
+```
+
+### Mistral-nemo-instruct-2407
+Mistral Nemo is a state-of-the-art transformer model of 12B parameters, built by Mistral in collaboration with NVIDIA.
+This model is open-weight and distributed under the Apache 2.0 license.
+It was trained on a large proportion of multilingual and code data.
+
+| Attribute | Value |
+|-----------|-------|
+| Provider | Mistral |
+| Supports structured output | Yes |
+| Supports function calling | Yes |
+| Supports parallel tool-calling | Yes |
+| Supported languages | English, French, German, Spanish, Italian, Portuguese, Russian, Chinese, Japanese |
+| Compatible Instances (max context in tokens\*) - Dedicated Deployment | L40S, H100, H100-2 |
+| Hugging Face model card | [mistral-nemo-instruct-2407](https://huggingface.co/mistralai/Mistral-Nemo-Instruct-2407) |
+
+\*Maximum context length is only mentioned when an instance's VRAM size limits context length. Otherwise, maximum context length is the one defined by the model.
+
+#### Model name
+```
+mistral/mistral-nemo-instruct-2407:fp8
+```
+
+### Magistral-small-2506
+Magistral Small is a reasoning model optimized to perform well on reasoning tasks, such as academic or scientific questions.
+It is well suited for complex tasks requiring multiple reasoning steps.
+ +| Attribute | Value | +|-----------|-------| +| Provider | Mistral | +| Supports structured output | Yes | +| Supports function calling | Yes | +| Supported languages | English, French, German, Spanish, Portuguese, Italian, Japanese, Korean, Russian, Chinese, Arabic, Persian, Indonesian, Malay, Nepali, Polish, Romanian, Serbian, Swedish, Turkish, Ukrainian, Vietnamese, Hindi, Bengali | +| Compatible Instances (max context in tokens\*) - Dedicated Deployment | L40S, H100, H100-2 | + + +#### Model name +``` +mistral/magistral-small-2506:fp8 +mistral/magistral-small-2506:bf16 +``` + +## Code models + +### Devstral-2-123b-instruct-2512 +Devstral 2 is a state-of-the-art coding model released in December 2025, which excels at using tools to explore codebases, editing multiple files and powering software engineering agents. + +| Attribute | Value | +|-----------|-------| +| Provider | Mistral | +| Supports structured output | Yes | +| Supports function calling | Yes | +| Supported languages | English | +| Compatible Instances (max context in tokens\*) - Dedicated Deployment | H100-SXM-2 (75k), H100-SXM-4, H100-SXM-8 | +| Hugging Face model card | [devstral-2-123b-instruct-2512](https://huggingface.co/mistralai/Devstral-2-123B-Instruct-2512) | + +#### Model name +``` +mistral/devstral-2-123b-instruct-2512:fp8 +``` + +### Devstral-small-2505 +Devstral Small is a fine-tune of Mistral Small 3.1, optimized to perform software engineering tasks. +It is a good fit to be used as a coding agent, for instance in an IDE. + +| Attribute | Value | +|-----------|-------| +| Provider | Mistral | +| Supports structured output | Yes | +| Supports function calling | Yes | +| Supported languages | English, French, German, Spanish, Portuguese, Italian, Japanese, Korean, Russian, Chinese, Arabic, Persian, Indonesian, Malay, Nepali, Polish, Romanian, Serbian, Swedish, Turkish, Ukrainian, Vietnamese, Hindi, Bengali | +| Compatible Instances (max context in tokens\*) - Dedicated Deployment | H100, H100-2| + +#### Model name +``` +mistral/devstral-small-2505:fp8 +mistral/devstral-small-2505:bf16 +``` + +### Qwen3-coder-30b-a3b-instruct +Qwen3-coder is an improved version of Qwen2.5 with better accuracy and throughput. +Thanks to its a3b architecture, only a subset of its weights is activated for a given generation, leading to much faster input and output token processing, ideal for code completion. + +| Attribute | Value | +|-----------|-------| +| Provider | Qwen | +| Supports structured output | Yes | +| Supports function calling | Yes | +| Supports parallel tool-calling | Yes | +| Supported languages | English, French, German, Chinese, Japanese, Korean, and 113 additional languages and dialects | +| Compatible Instances (max context in tokens\*) - Dedicated Deployment | L40S, H100, H100-2 | +| Hugging Face model card | [qwen3-coder-30b-a3b-instruct](https://huggingface.co/Qwen/Qwen3-Coder-30B-A3B-Instruct) | + +\*Maximum context length is only mentioned when an instance's VRAM size limits context length. Otherwise, maximum context length is the one defined by the model. + +#### Model name +``` +qwen/qwen3-coder-30b-a3b-instruct:fp8 +``` + +### Qwen2.5-coder-32b-instruct +Qwen2.5-coder is your intelligent programming assistant familiar with more than 40 programming languages. +With Qwen2.5-coder deployed at Scaleway, your company can benefit from code generation, AI-assisted code repair, and code reasoning. 
+
+| Attribute | Value |
+|-----------|-------|
+| Provider | Qwen |
+| Supports structured output | Yes |
+| Supports function calling | Yes |
+| Supports parallel tool-calling | No |
+| Supported languages | English, French, Spanish, Portuguese, German, Italian, Russian, Chinese, Japanese, Korean, Vietnamese, Thai, Arabic, and 16 additional languages |
+| Compatible Instances (max context in tokens\*) - Dedicated Deployment | H100, H100-2 |
+
+\*Maximum context length is only mentioned when an instance's VRAM size limits context length. Otherwise, maximum context length is the one defined by the model.
+
+#### Model name
+```
+qwen/qwen2.5-coder-32b-instruct:int8
+```
+
+## Embeddings models
+
+### Qwen3-embedding-8b
+Qwen/Qwen3-Embedding-8B is a state-of-the-art embedding model ranking 3rd on the MTEB leaderboard as of November 2025, supporting custom dimensions between 32 and 4096.
+
+| Attribute | Value |
+|-----------|-------|
+| Provider | Qwen |
+| Supports structured output | No |
+| Supports function calling | No |
+| Embedding dimensions (maximum) | 4096 |
+| Embedding dimensions (minimum) | 32 |
+| Matryoshka embedding | Yes |
+| Supported languages | English, French, German, Chinese, Japanese, Korean, and 113 additional languages and dialects |
+| Compatible Instances (max context in tokens\*) - Dedicated Deployment | L4, L40S, H100, H100-2 |
+| Hugging Face model card | [qwen3-embedding-8b](https://huggingface.co/Qwen/Qwen3-Embedding-8B) |

\*Maximum context length is only mentioned when an instance's VRAM size limits context length. Otherwise, maximum context length is the one defined by the model.

-  Image sizes are limited to 32 million pixels (e.g., a resolution of about 8096 x 4048). Images with a resolution higher than 1024 x 1024 are supported, but automatically downscaled to fit these limitations (image ratio and proportions will be preserved).
+  [Matryoshka embeddings](https://huggingface.co/blog/matryoshka) refers to embeddings trained on multiple dimension numbers. Consequently, resulting vector dimensions will be sorted by most meaningful first. For example, a 4096-dimension vector can be truncated to its 768 first dimensions and used directly.


-## Embedding models
-Our [Embeddings API](/generative-apis/how-to/query-embedding-models) provides built-in support for the following models, hosted in Scaleway data centers, available via serverless endpoints.
+### Bge-multilingual-gemma2
+BGE-Multilingual-Gemma2 tops the [MTEB leaderboard](https://huggingface.co/spaces/mteb/leaderboard), scoring the number one spot in French and Polish, and number seven in English (as of Q4 2024).
+As its name suggests, the model’s training data spans a broad range of languages, including English, Chinese, Polish, French, and more.
+
+| Attribute | Value |
+|-----------|-------|
+| Provider | BAAI |
+| Supports structured output | No |
+| Supports function calling | No |
+| Embedding dimensions (maximum) | 3584 |
+| Embedding dimensions (minimum) | 3584 |
+| Matryoshka embedding | No |
+| Supported languages | English, French, Chinese, Japanese, Korean |
+| Compatible Instances (max context in tokens\*) - Dedicated Deployment | L4, L40S, H100, H100-2 |
+| Hugging Face model card | [bge-multilingual-gemma2](https://huggingface.co/BAAI/bge-multilingual-gemma2) |
+
+\*Maximum context length is only mentioned when an instance's VRAM size limits context length. Otherwise, maximum context length is the one defined by the model.
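+
+Below is a minimal embedding sketch using the OpenAI-compatible Embeddings endpoint. The base URL and the input sentence are illustrative assumptions.
+
+```python
+from openai import OpenAI
+
+# Assumption: Scaleway's OpenAI-compatible endpoint and an IAM API secret key.
+client = OpenAI(base_url="https://api.scaleway.ai/v1", api_key="<SCW_SECRET_KEY>")
+
+response = client.embeddings.create(
+    model="bge-multilingual-gemma2",
+    input="Generative APIs provide serverless access to embedding models.",
+)
+
+vector = response.data[0].embedding
+print(len(vector))  # Expected to match the model's 3584 embedding dimensions
+```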
+ +#### Model name +``` +baai/bge-multilingual-gemma2:fp32 +``` + +### Sentence-t5-xxl +The Sentence-T5-XXL model represents a significant evolution in sentence embeddings, building on the robust foundation of the Text-To-Text Transfer Transformer (T5) architecture. +Designed for performance in various language processing tasks, Sentence-T5-XXL leverages the strengths of T5's encoder-decoder structure to generate high-dimensional vectors that encapsulate rich semantic information. +This model has been meticulously tuned for tasks such as text classification, semantic similarity, and clustering, making it a useful tool in the Retrieval-Augmented Generation (RAG) framework. It excels in sentence similarity tasks, but its performance in semantic search tasks is less optimal. + + +| Attribute | Value | +|-----------|-------| +| Provider | SBERT | +| Supports structured output | No | +| Supports function calling | No | +| Embedding dimensions | 768 | +| Matryoshka embedding | No | +| Supported languages | English | +| Compatible Instances (max context in tokens\*) - Dedicated Deployment | L4 | + +\*Maximum context length is only mentioned when an instance's VRAM size limits context length. Otherwise, maximum context length is the one defined by the model. -| Provider | Model string | Embedding dimension (Maximum) | Embedding dimensions (Minimum) | Context window | License | Model card | -|-----------------|-----------------|-----------------|-----------------|-----------------|-----------------|-----------------| -| Qwen | `qwen3-embedding-8b` | 4096 | 32 | 32 000 | [Apache-2.0](https://www.apache.org/licenses/LICENSE-2.0) | [HF](https://huggingface.co/Qwen/Qwen3-Embedding-8B) | -| BAAI | `bge-multilingual-gemma2` | 3584 | 3584 | 8192 | [Gemma](https://ai.google.dev/gemma/terms) | [HF](https://huggingface.co/BAAI/bge-multilingual-gemma2) | +#### Model name +``` +sentence-transformers/sentence-t5-xxl:fp32 +``` ## Request a model @@ -82,11 +758,11 @@ Our [Embeddings API](/generative-apis/how-to/query-embedding-models) provides bu | Provider | Model string | Deprecation date | EOL date | Requests routed to model |-----------------|-----------------|-----------------|-----------------|-----------------| -| Mistral | `mistral-small-3.1-24b-instruct-2503` | 14th August, 2025 | 14th November, 2025 | `mistral-small-3.2-24b-instruct-2506` | -| Mistral | `devstral-small-2505` | 14th August, 2025 | 14th November, 2025 | `qwen3-coder-30b-a3b-instruct` | -| Qwen | `qwen2.5-coder-32b-instruct` | 14th August, 2025 | 14th November, 2025 | `qwen3-coder-30b-a3b-instruct` | -| Meta | `llama-3.1-70b-instruct` | 25th February, 2025 | 25th May, 2025 | `llama-3.3-70b-instruct` | -| SBERT | `sentence-t5-xxl` | 26th November, 2024 | 26 February, 2025 | None | -| Deepseek | `deepseek-r1-distill-llama-70b` | 16th January, 2026 | 16th April, 2026 | `llama-3.3-70b-instruct` | -| Mistral | `mistral-nemo-instruct-2407` | 16th January, 2026 | 16th April, 2026 | `mistral-small-3.2-24b-instruct-2506` | -| Meta | `llama-3.1-8b-instruct` | 16th January, 2026 | 16th April, 2026 | `mistral-small-3.2-24b-instruct-2506` | +| Mistral | `mistral-small-3.1-24b-instruct-2503` | 14 August 2025 | 14 November 2025 | `mistral-small-3.2-24b-instruct-2506` | +| Mistral | `devstral-small-2505` | 14 August 2025 | 14 November 2025 | `qwen3-coder-30b-a3b-instruct` | +| Qwen | `qwen2.5-coder-32b-instruct` | 14 August 2025 | 14 November 2025 | `qwen3-coder-30b-a3b-instruct` | +| Meta | `llama-3.1-70b-instruct` | 25 February 2025 | 25 May 2025 | 
`llama-3.3-70b-instruct` | +| SBERT | `sentence-t5-xxl` | 26 November 2024 | 26 February 2025 | None | +| Deepseek | `deepseek-r1-distill-llama-70b` | 16 January 2026 | 16 April 2026 | `llama-3.3-70b-instruct` | +| Mistral | `mistral-nemo-instruct-2407` | 16 January 2026 | 16 April 2026 | `mistral-small-3.2-24b-instruct-2506` | +| Meta | `llama-3.1-8b-instruct` | 16 January 2026 | 16 April 2026 | `mistral-small-3.2-24b-instruct-2506` | diff --git a/pages/generative-apis/reference-content/supported-models_backup.mdx b/pages/generative-apis/reference-content/supported-models_backup.mdx new file mode 100644 index 0000000000..1245ff39cf --- /dev/null +++ b/pages/generative-apis/reference-content/supported-models_backup.mdx @@ -0,0 +1,568 @@ +--- +title: Generative APIs supported models +description: This page lists the open-source large language models supported by Scaleway. +tags: +dates: + validation: 2026-04-24 + posted: 2024-04-18 +--- +This page provides a quick overview of available models in Scaleway's catalog and their core attributes. Expand any model below to see usage examples and detailed capabilities. + + + For further information, specific to the deployment modes (Serverless versus Dedicated), see the following documentation: + + - [What rate limits apply with Scaleway Generative APIs - Serverless?](/generative-apis/reference-content/rate-limits/) + - [Understand Generative APIs model lifecycle for Serverless](/generative-apis/reference-content/model-lifecycle/) + - [Model integration, model lifecycle, licensing for Dedicated Deployments](/generative-apis/reference-content/supported-models/) {/*COMMENT: Fix relative link. */} + - [OpenAI API compatibility](/generative-apis/reference-content/openai-compatibility/) + + +## Models technical summary + +| Model name | Provider | Available in Serverless? 
| Maximum context window (tokens) | Modalities | License \* | +|------------|----------|--------------|--------------|------------|-----------| +| [`gpt-oss-120b`](#gpt-oss-120b) | OpenAI | Yes | 128k | Text | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) | +| [`whisper-large-v3`](#whisper-large-v3) | OpenAI | Yes | - | Audio transcription | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) | +| [`qwen3.5-397b-a17b`](#qwen35-397b-a17b) | Qwen | Yes | 250k | Text, Code, Vision | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) | +| [`qwen3-235b-a22b-instruct-2507`](#qwen3-235b-a22b-instruct-2507) | Qwen | Yes | 250k | Text | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) | +| [`gemma-3-27b-it`](#gemma-3-27b-it) | Google | Yes | 40k | Text, Vision | [Gemma](https://ai.google.dev/gemma/terms) | +| [`llama-3.3-70b-instruct`](#llama-33-70b-instruct) | Meta | Yes | 128k | Text | [Llama 3.3 Community](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct) | +| [`llama-3.1-70b-instruct`](#llama-31-70b-instruct) | Meta | [EOL for Serverless](#end-of-life-eol-models-for-serverless) | 128k | Text | [Llama 3.1 Community](https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct/blob/main/LICENSE) | +| [`llama-3.1-8b-instruct`](#llama-31-8b-instruct) | Meta | [EOL for Serverless](#end-of-life-eol-models-for-serverless) | 128k | Text | [Llama 3.1 Community](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct/blob/main/LICENSE) | +| [`llama-3-70b-instruct`](#llama-3-70b-instruct) | Meta | No | 8k | Text | [Llama 3 Community](https://huggingface.co/meta-llama/Meta-Llama-3-8B/blob/main/LICENSE) | +| [`llama-3.1-nemotron-70b-instruct`](#llama-31-nemotron-70b-instruct) | Nvidia | No | 128k | Text | [Llama 3.1 Community](https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct/blob/main/LICENSE) | +| [`deepseek-r1-distill-70b`](#deepseek-r1-distill-llama-70b) | Deepseek | Yes | 128k | Text | [MIT](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B/blob/main/LICENSE) and [Llama 3.3 Community](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct/blob/main/LICENSE) | +| [`deepseek-r1-distill-8b`](#deepseek-r1-distill-llama-8b) | Deepseek | No | 128k | Text | [MIT](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B/blob/main/LICENSE) and [Llama 3.1 Community](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct/blob/main/LICENSE) | +| [`mistral-7b-instruct-v0.3`](#mistral-7b-instruct-v03) | Mistral | No | 32k | Text | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) | +| [`mistral-large-3-675b-instruct-2512`](#mistral-large-3-675b-instruct-2512) | Mistral | No | 250k | Text, Vision | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) | +| [`mistral-small-3.2-24b-instruct-2506`](#mistral-small-32-24b-instruct-2506) | Mistral | Yes | 128k | Text, Vision | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) | +| [`mistral-small-3.1-24b-instruct-2503`](#mistral-small-31-24b-instruct-2503) | Mistral | [EOL for Serverless](#end-of-life-eol-models-for-serverless) | 128k | Text, Vision | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) | +| [`mistral-small-24b-instruct-2501`](#mistral-small-24b-instruct-2501) | Mistral | No | 32k | Text | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) | +| [`voxtral-small-24b-2507`](#voxtral-small-24b-2507) | Mistral | Yes | 32k | Text | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) | +| [`mistral-nemo-instruct-2407`](#mistral-nemo-instruct-2407) | 
Mistral | [EOL for Serverless](#end-of-life-eol-models-for-serverless) | 128k | Text | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) |
+| [`mixtral-8x7b-instruct-v0.1`](#mixtral-8x7b-instruct-v01) | Mistral | No | 32k | Text | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) |
+| [`magistral-small-2506`](#magistral-small-2506) | Mistral | No | 32k | Text | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) |
+| [`devstral-2-123b-instruct-2512`](#devstral-2-123b-instruct-2512) | Mistral | Yes | 260k | Text, Code | [Modified MIT](https://huggingface.co/mistralai/Devstral-2-123B-Instruct-2512/blob/main/LICENSE) |
+| [`devstral-small-2505`](#devstral-small-2505) | Mistral | [EOL for Serverless](#end-of-life-eol-models-for-serverless) | 128k | Text | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) |
+| [`pixtral-12b-2409`](#pixtral-12b-2409) | Mistral | Yes | 128k | Text, Vision | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) |
+| [`molmo-72b-0924`](#molmo-72b-0924) | Allen AI | No | 50k | Text, Vision | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) and [Tongyi Qianwen license](https://huggingface.co/Qwen/Qwen2-72B/blob/main/LICENSE)|
+| [`holo2-30b-a3b`](#holo2-30b-a3b) | H | Yes | 22k | Text, Vision | [CC-BY-NC-4.0](https://spdx.org/licenses/CC-BY-NC-4.0)|
+| [`qwen3-embedding-8b`](#qwen3-embedding-8b) | Qwen | Yes | 32k | Embeddings | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) |
+| [`qwen3-coder-30b-a3b-instruct`](#qwen3-coder-30b-a3b-instruct) | Qwen | Yes | 128k | Code | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) |
+| [`qwen2.5-coder-32b-instruct`](#qwen25-coder-32b-instruct) | Qwen | [EOL for Serverless](#end-of-life-eol-models-for-serverless) | 32k | Code | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) |
+| [`bge-multilingual-gemma2`](#bge-multilingual-gemma2) | BAAI | Yes | 8k | Embeddings | [Gemma](https://ai.google.dev/gemma/terms) |
+| [`sentence-t5-xxl`](#sentence-t5-xxl) | Sentence transformers | [EOL for Serverless](#end-of-life-eol-models-for-serverless) | 512 | Embeddings | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) |
+
+\*Licences that are not open-weight and may restrict commercial usage (such as `CC-BY-NC-4.0`) do not apply to usage through Scaleway Products due to existing partnerships between Scaleway and the corresponding providers. Original licences are provided for transparency only.
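+
+To check which of these model strings are actually served on your endpoint, a minimal sketch using the OpenAI-compatible Python client is shown below (it assumes the `/v1/models` listing route is exposed and that your API key is available as `SCW_SECRET_KEY`):
+
+```python
+# Minimal sketch: list the model identifiers currently available on the endpoint.
+import os
+from openai import OpenAI
+
+client = OpenAI(
+    base_url="https://api.scaleway.ai/v1",   # Serverless endpoint; use your deployment URL for Dedicated
+    api_key=os.environ["SCW_SECRET_KEY"],    # assumption: API key exported in this variable
+)
+
+for model in client.models.list():
+    print(model.id)                          # e.g. mistral-small-3.2-24b-instruct-2506
+```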
+ +## Models feature summary +| Model name | Serverless: Maximum output (tokens) | Dedicated: Compatible Instances (max context in tokens\*) |Structured output supported | Function calling | Supported languages | +| --- | --- | --- | --- | --- | --- | +| `gpt-oss-120b` | 32k | H100 |Yes | Yes | English | +| `whisper-large-v3` | | L4, L40S, H100, H100-SXM-2 | - | - | English, French, German, Chinese, Japanese, Korean and 81 additional languages | +| `qwen3.5-397b-a17b` | 16k | H100-SXM-8 | Yes | Yes | English, French, German, Chinese, Japanese, Korean and 113 additional languages and dialects | +| `qwen3-235b-a22b-instruct-2507` | 16k | H100-SXM-2 (40k), H100-SXM-4 | Yes | Yes | English, French, German, Chinese, Japanese, Korean and 113 additional languages and dialects | +| `gemma-3-27b-it` | 8k | H100, H100-2 | Yes | Partial | English, Chinese, Japanese, Korean and 31 additional languages | +| `llama-3.3-70b-instruct` | 16k | H100 (15k), H100-2 | Yes | Yes | English, German, French, Italian, Portuguese, Hindi, Spanish, and Thai | +| `llama-3.1-70b-instruct` | [EOL for Serverless](#end-of-life-eol-models-for-serverless) | H100 (15k), H100-2 | Yes | Yes | English, German, French, Italian, Portuguese, Hindi, Spanish, and Thai | +| `llama-3.1-8b-instruct` | [EOL for Serverless](#end-of-life-eol-models-for-serverless) | L4 (90k), L40S, H100, H100-2 | Yes | Yes | English, German, French, Italian, Portuguese, Hindi, Spanish, and Thai | +| `llama-3-70b-instruct` | N/A | H100, H100-2 | Yes | No | English | +| `llama-3.1-nemotron-70b-instruct` | N/A | H100 (15k), H100-2 | Yes | Yes | English | +| `deepseek-r1-distill-llama-70B` | 4k | H100 (13k), H100-2 | Yes | Yes | English, Chinese | {/* COMMENT: Is this the same as deepseek-r1-distill-70b above? */} +| `deepseek-r1-distill-llama-8B` | N/A | L4 (90k), L40S, H100, H100-2 | Yes | Yes | English, Chinese | {/* COMMENT: Is this the same as deepseek-r1-distill-8b above? 
*/} +| `mistral-7b-instruct-v0.3` | N/A | L4, L40S, H100, H100-2 | Yes | Yes | English | +| `mistral-large-3-675b-instruct-2512` | N/A | H100-SXM-8 (180k) | Yes | Yes | English, French, German, Spanish, Italian, Portuguese, Dutch, Chinese, Japanese, Korean, Arabic | +| `mistral-small-3.2-24b-instruct-2506` | 32k | H100, H100-2 | Yes | Yes | English, French, German, Greek, Hindi, Indonesian, Italian, Japanese, Korean, Malay, Nepali, Polish, Portuguese, Romanian, Russian, Serbian, Spanish, Swedish, Turkish, Ukrainian, Vietnamese, Arabic, Bengali, Chinese, Farsi | +| `mistral-small-3.1-24b-instruct-2503` | [EOL for Serverless](#end-of-life-eol-models-for-serverless) | H100, H100-2 | Yes | Yes | English, French, German, Greek, Hindi, Indonesian, Italian, Japanese, Korean, Malay, Nepali, Polish, Portuguese, Romanian, Russian, Serbian, Spanish, Swedish, Turkish, Ukrainian, Vietnamese, Arabic, Bengali, Chinese, Farsi | +| `mistral-small-24b-instruct-2501` | N/A | L40S (20k), H100, H100-2 | Yes | Yes | English, French, German, Dutch, Spanish, Italian, Polish, Portuguese, Chinese, Japanese, Korean | +| `voxtral-small-24b-2507` | 16k | H100, H100-2 | Yes | Yes | English, French, German, Dutch, Spanish, Italian, Portuguese, Hindi | +| `mistral-nemo-instruct-2407` | [EOL for Serverless](#end-of-life-eol-models-for-serverless) | L40S, H100, H100-2 | Yes | Yes | English, French, German, Spanish, Italian, Portuguese, Russian, Chinese, Japanese | +| `mixtral-8x7b-instruct-v0.1` | N/A | H100-2 | Yes | No | English, French, German, Italian, Spanish | +| `magistral-small-2506` | N/A | L40S, H100, H100-2 | Yes | Yes | English, French, German, Spanish, Portuguese, Italian, Japanese, Korean, Russian, Chinese, Arabic, Persian, Indonesian, Malay, Nepali, Polish, Romanian, Serbian, Swedish, Turkish, Ukrainian, Vietnamese, Hindi, Bengali | +| `devstral-2-123b-instruct-2512` | 16k | H100-SXM-2 (75k), H100-SXM-4, H100-SXM-8 | Yes | Yes | English | +| `devstral-small-2505` | [EOL for Serverless](#end-of-life-eol-models-for-serverless) | H100, H100-2 | Yes | Yes | English, French, German, Spanish, Portuguese, Italian, Japanese, Korean, Russian, Chinese, Arabic, Persian, Indonesian, Malay, Nepali, Polish, Romanian, Serbian, Swedish, Turkish, Ukrainian, Vietnamese, Hindi, Bengali | +| `pixtral-12b-2409` | 4k | L40S (50k), H100, H100-2 | Yes | Yes | English | +| `molmo-72b-0924` | N/A | H100-2 | Yes | No | English | +| `holo2-30b-a3b` | 32k | H100-SXM-2 | Yes | No | English | +| `qwen3-embedding-8b` | N/A | L4, L40S, H100, H100-2 | No | No | English, French, German, Chinese, Japanese, Korean and 113 additional languages and dialects | +| `qwen3-coder-30b-a3b-instruct` | 32k | L40S, H100, H100-2 | Yes | Yes | English, French, German, Chinese, Japanese, Korean and 113 additional languages and dialects | +| `qwen2.5-coder-32b-instruct` | [EOL for Serverless](#end-of-life-eol-models-for-serverless) | H100, H100-2 | Yes | Yes | English, French, Spanish, Portuguese, German, Italian, Russian, Chinese, Japanese, Korean, Vietnamese, Thai, Arabic and 16 additional languages. | +| `bge-multilingual-gemma2` | N/A | L4, L40S, H100, H100-2 | No | No | English, French, Chinese, Japanese, Korean | +| `sentence-t5-xxl` | [EOL for Serverless](#end-of-life-eol-models-for-serverless) | L4 | No | No | English | + +\*Maximum context length is only mentioned when an instance's VRAM size limits context length. Otherwise, maximum context length is the one defined by the model. 
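+
+Structured output and function calling are only available on the models marked accordingly above. As a minimal structured-output sketch (assuming the OpenAI-compatible `response_format` parameter, the Serverless endpoint, and an API key exported as `SCW_SECRET_KEY`):
+
+```python
+# Minimal sketch: ask for a JSON answer that follows a fixed schema.
+# Only models listed with "Structured output supported: Yes" accept response_format.
+import os
+from openai import OpenAI
+
+client = OpenAI(base_url="https://api.scaleway.ai/v1", api_key=os.environ["SCW_SECRET_KEY"])
+
+city_schema = {
+    "name": "city_info",
+    "schema": {
+        "type": "object",
+        "properties": {
+            "city": {"type": "string"},
+            "country": {"type": "string"},
+        },
+        "required": ["city", "country"],
+    },
+}
+
+response = client.chat.completions.create(
+    model="mistral-small-3.2-24b-instruct-2506",
+    messages=[{"role": "user", "content": "Where are Scaleway's headquarters located?"}],
+    response_format={"type": "json_schema", "json_schema": city_schema},
+)
+print(response.choices[0].message.content)   # e.g. {"city": "Paris", "country": "France"}
+```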
+ +## Model details + + Despite efforts for accuracy, the possibility of generated text containing inaccuracies or [hallucinations](/generative-apis/concepts/#hallucinations) exists. Always verify the content generated independently. + + +## Multimodal models (Text and Vision) + + + Vision models can understand and analyze images, not generate them. You will use vision models through the `/v1/chat/completions` endpoint. + + +### Gemma-3-27b-it +Gemma-3-27b-it is a model developed by Google to perform text processing and image analysis on many languages. +The model was not trained specifically to output function / tool call tokens. Hence function calling is currently supported, but reliability remains limited. + +#### Model names +``` +google/gemma-3-27b-it:bf16 +``` +| Attribute | Value | +|-----------|-------| +| Supports parallel tool-calling | No | +| Supported image formats | PNG, JPEG, WEBP, and non-animated GIFs | +| Maximum image resolution (pixels) | 896x896 | +| Token dimension (pixels)| 56x56 | + +- Pan & Scan is not yet supported for Gemma 3 images. This means that high-resolution images are currently resized to 896x896 resolution, which may generate artifacts and lead to a lower accuracy. + +### Mistral-large-3-675b-instruct-2512 +Mistral-large-3-675b-instruct-2512 is a frontier model, performing among the best open-weight models as of December 2025. It is ideal for agentic workflows and image understanding. + +| Attribute | Value | +|-----------|-------| +| Supports parallel tool-calling | Yes | +| Supported image formats | PNG, JPEG, WEBP, and non-animated GIFs | +| Maximum image resolution (pixels) | 1540x1540 | +| Token dimension (pixels)| 28x28 | + +#### Model names +``` +mistral/mistral-large-3-675b-instruct-2512:fp4 +``` + +### Mistral-small-3.2-24b-instruct-2506 +Mistral-small-3.2-24b-instruct-2506 is an improved version of Mistral-small-3.1, which performs better on tool-calling. +This model was optimized to have a dense knowledge and faster token throughput compared to its size. + +| Attribute | Value | +|-----------|-------| +| Supports parallel tool-calling | Yes | +| Supported image formats | PNG, JPEG, WEBP, and non-animated GIFs | +| Maximum image resolution (pixels) | 1540x1540 | +| Token dimension (pixels)| 28x28 | + +#### Model names +``` +mistral/mistral-small-3.2-24b-instruct-2506:fp8 +``` + +### Mistral-small-3.1-24b-instruct-2503 +Mistral-small-3.1-24b-instruct-2503 is a model developed by Mistral to perform text processing and image analysis on many languages. +This model was optimized to have a dense knowledge and faster token throughput compared to its size. + +| Attribute | Value | +|-----------|-------| +| Supports parallel tool-calling | Yes | +| Supported image formats | PNG, JPEG, WEBP, and non-animated GIFs | +| Maximum image resolution (pixels) | 1540x1540 | +| Token dimension (pixels)| 28x28 | + +#### Model names +``` +mistral/mistral-small-3.1-24b-instruct-2503:bf16 +mistral/mistral-small-3.1-24b-instruct-2503:fp8 +``` + +- Bitmap (or raster) image formats, meaning storing images as grids of individual pixels, are supported. Vector image formats (SVG, PSD) are not supported, neither PDFs nor videos. +- Image size is limited in the following ways: + - Directly by the maximum context window. 
As an example, since tokens are squares of 28x28 pixels, the maximum context window taken by a single image is `3025` tokens (i.e., `(1540*1540)/(28*28)`)
+  - Indirectly by the model accuracy: resolution above 1540x1540 will not increase model output accuracy. Indeed, images above a width or height of 1540 pixels will be automatically downscaled to fit within the 1540x1540 dimension. Note that image ratio and overall aspect are preserved (images are not cropped, only additionally compressed).
+
+### Qwen3.5-397b-a17b
+Qwen3.5-397b-a17b is a model developed by Qwen to perform text processing, agentic coding, image, and video analysis in several languages.
+This model was released as a frontier reasoning model on 16 February 2026.
+
+| Attribute | Value |
+|-----------|-------|
+| Supports parallel tool-calling | Yes |
+| Supported image formats | PNG, JPEG, WEBP, and non-animated GIFs |
+| Supported video formats | MP4, MPEG, MOV, OGG and WEBM |
+| Maximum image resolution (pixels) | 4096x4096 |
+| Token dimension (pixels)| 32x32 |
+
+#### Model names
+```
+qwen/qwen3.5-397b-a17b:int4
+```
+
+### Pixtral-12b-2409
+Pixtral is a vision language model introducing a novel architecture: 12B parameter multimodal decoder plus 400M parameter vision encoder.
+It can analyze images and offer insights from visual content alongside text.
+
+| Attribute | Value |
+|-----------|-------|
+| Supports parallel tool-calling | Yes |
+| Supported image formats | PNG, JPEG, WEBP, and non-animated GIFs |
+| Maximum image resolution (pixels) | 1024x1024 |
+| Token dimension (pixels)| 16x16 |
+| Maximum images per request | 12 |
+
+#### Model name
+```
+mistral/pixtral-12b-2409:bf16
+```
+### Holo2-30b-a3b
+Holo2 30B is a text and vision model optimized to analyze a Graphical User Interface, such as a web browser or software, and take actions.
+
+| Attribute | Value |
+|-----------|-------|
+| Supports parallel tool-calling | Yes |
+| Supported image formats | PNG, JPEG, WEBP, and non-animated GIFs |
+| Token dimension (pixels)| 16x16 |
+
+#### Model name
+```
+hcompany/holo2-30b-a3b:bf16
+```
+
+### Molmo-72b-0924
+Molmo 72B is the powerhouse of the Molmo family of multimodal models developed by the renowned research lab Allen Institute for AI.
+Vision-language models like Molmo can analyze an image and offer insights from visual content alongside text. This multimodal functionality creates new opportunities for applications that need both visual and textual comprehension.
+
+#### Model name
+```
+allenai/molmo-72b-0924:fp8
+```
+
+## Multimodal models (Text and Audio)
+
+### Voxtral-small-24b-2507
+Voxtral-small-24b-2507 is a model developed by Mistral to perform text processing and audio analysis on many languages.
+This model was optimized to enable transcription in many languages while keeping conversational capabilities (translations, classification, etc.).
+
+| Attribute | Value |
+|-----------|-------|
+| Supports parallel tool-calling | Yes |
+| Supported audio formats | WAV and MP3 |
+| Audio chunk duration | 30 seconds |
+| Token duration (audio)| 80ms |
+| Maximum transcription duration| 30 minutes |
+| Maximum understanding duration| 40 minutes |
+| Serverless - Maximum file size | 25 MB |
+
+
+#### Model names
+```
+mistral/voxtral-small-24b-2507:bf16
+mistral/voxtral-small-24b-2507:fp8
+```
+
+- Mono and stereo audio formats are supported. For stereo formats, both left and right channels are merged before being processed.
+- Audio files are processed in 30-second chunks: + - If audio sent is less than 30 seconds, the rest of the chunk will be considered silent. + - 80ms is equal to 1 input token. + +## Audio transcription models + +### Whisper-large-v3 +Whisper-large-v3 is a model developed by OpenAI to transcribe audio in many languages. +This model is optimized for audio transcription tasks. + +| Attribute | Value | +|-----------|-------| +| Supported audio formats | WAV and MP3 | +| Audio chunk duration | 30 seconds | +| Serverless - Maximum file size | 25 MB | + +#### Model names +``` +openai/whisper-large-v3:bf16 +``` + +- Mono and stereo audio formats are supported. For stereo formats, left and right channels are merged before being processed. +- Audio files are processed in 30-second chunks: + - If audio sent is less than 30 seconds, the rest of the chunk will be considered silent. + +## Text models + +### Qwen3-235b-a22b-instruct-2507 +Released 23 July 2025, Qwen 3 235B A22B is an open-weight model, competitive in multiple benchmarks (such as [LM Arena for text use cases](https://lmarena.ai/leaderboard)) compared to Gemini 2.5 Pro and GPT4.5. + +| Attribute | Value | +|-----------|-------| +| Supports parallel tool-calling | Yes | + + + +#### Model name +``` +qwen/qwen3-235b-a22b-instruct-2507 +``` + +### Gpt-oss-120b +Released 5 August 2025, GPT OSS 120B is an open-weight model providing significant throughput performance and reasoning capabilities. +Currently, this model should be used through Responses API, as Chat Completion does not yet support tool-calling for this model. + +| Attribute | Value | +|-----------|-------| +| Supports parallel tool-calling | Yes | + +#### Model name +``` +openai/gpt-oss-120b:fp4 +``` + +### Llama-3.3-70b-instruct +Released 6 December 2024, Meta’s Llama 3.3 70b is a fine-tune of the [Llama 3.1 70b](/managed-inference/reference-content/model-catalog/#llama-31-70b-instruct) model. +This model is still text-only (text in/text out). However, Llama 3.3 was designed to approach the performance of Llama 3.1 405B on some applications. {/*COMMENT: Fix relative link.*/} + +| Attribute | Value | +|-----------|-------| +| Supports parallel tool-calling | Yes | + +#### Model name +``` +meta/llama-3.3-70b-instruct:fp8 +meta/llama-3.3-70b-instruct:bf16 +``` + +### Llama-3.1-70b-instruct +Released 23 July 2024, Meta’s Llama 3.1 is an iteration of the open-access Llama family. +Llama 3.1 was designed to match the best proprietary models and outperform many of the available open-source common industry benchmarks. + +| Attribute | Value | +|-----------|-------| +| Supports parallel tool-calling | Yes | + +#### Model names +``` +meta/llama-3.1-70b-instruct:fp8 +meta/llama-3.1-70b-instruct:bf16 +``` + +### Llama-3.1-8b-instruct +Released 23 July 2024, Meta’s Llama 3.1 is an iteration of the open-access Llama family. +Llama 3.1 was designed to match the best proprietary models and outperform many of the available open-source common industry benchmarks. + +| Attribute | Value | +|-----------|-------| +| Supports parallel tool-calling | Yes | + +#### Model names +``` +meta/llama-3.1-8b-instruct:fp8 +meta/llama-3.1-8b-instruct:bf16 +``` + +### Llama-3-70b-instruct +Meta’s Llama 3 is an iteration of the open-access Llama family. +Llama 3 was designed to match the best proprietary models, enhanced by community feedback for greater utility and responsibly spearheading the deployment of LLMs. 
+With a commitment to open-source principles, this release marks the beginning of a multilingual, multimodal future for Llama 3, pushing the boundaries in reasoning and coding capabilities. + +#### Model name +``` +meta/llama-3-70b-instruct:fp8 +``` + +### Llama-3.1-Nemotron-70b-instruct +Introduced 14 October 2024, NVIDIA's Nemotron 70B Instruct is a specialized version of the Llama 3.1 model designed to follow complex instructions. +NVIDIA employed Reinforcement Learning from Human Feedback (RLHF) to fine-tune the model’s ability to generate relevant and informative responses. + +#### Model name +``` +nvidia/llama-3.1-nemotron-70b-instruct:fp8 +``` + +### DeepSeek-R1-Distill-Llama-70B +Released 21 January 2025, Deepseek’s R1 Distilled Llama 70B is a distilled version of the Llama model family based on Deepseek R1. +DeepSeek R1 Distill Llama 70B is designed to improve the performance of Llama models on reasoning use cases, such as mathematics and coding tasks. + +| Attribute | Value | +|-----------|-------| +| Supports parallel tool-calling | No | + +#### Model name +``` +deepseek/deepseek-r1-distill-llama-70b:fp8 +deepseek/deepseek-r1-distill-llama-70b:bf16 +``` + +### DeepSeek-R1-Distill-Llama-8B +Released 21 January 2025, Deepseek’s R1 Distilled Llama 8B is a distilled version of the Llama model family based on Deepseek R1. +DeepSeek R1 Distill Llama 8B is designed to improve the performance of Llama models on reasoning use cases, such as mathematics and coding tasks. + +#### Model names +``` +deepseek/deepseek-r1-distill-llama-8b:fp8 +deepseek/deepseek-r1-distill-llama-8b:bf16 +``` + +### Mixtral-8x7b-instruct-v0.1 +Mixtral-8x7b-instruct-v0.1, developed by Mistral, is tailored for instructional platforms and virtual assistants. +Trained on vast instructional datasets, it provides clear and concise instructions across various domains, enhancing user learning experiences. + +#### Model names +``` +mistral/mixtral-8x7b-instruct-v0.1:fp8 +mistral/mixtral-8x7b-instruct-v0.1:bf16 +``` + +### Mistral-7b-instruct-v0.3 +The first dense model released by Mistral AI, perfect for experimentation, customization, and quick iteration. At the time of the release, it matched the capabilities of models up to 30B parameters. +This model is open-weight and distributed under the Apache 2.0 license. + +#### Model name +``` +mistral/mistral-7b-instruct-v0.3:bf16 +``` + +### Mistral-small-24b-instruct-2501 +Mistral Small 24B Instruct is a state-of-the-art transformer model of 24B parameters, built by Mistral. +This model is open-weight and distributed under the Apache 2.0 license. + +#### Model name +``` +mistral/mistral-small-24b-instruct-2501:fp8 +mistral/mistral-small-24b-instruct-2501:bf16 +``` + +### Mistral-nemo-instruct-2407 +Mistral Nemo is a state-of-the-art transformer model of 12B parameters, built by Mistral in collaboration with NVIDIA. +This model is open-weight and distributed under the Apache 2.0 license. +It was trained on a large proportion of multilingual and code data. + +| Attribute | Value | +|-----------|-------| +| Supports parallel tool-calling | Yes | + +#### Model name +``` +mistral/mistral-nemo-instruct-2407:fp8 +``` + +### Magistral-small-2506 +Magistral Small is a reasoning model optimized to perform well on reasoning tasks, such as academic or scientific questions. +It is well suited for complex tasks requiring multiple reasoning steps. 
+ +#### Model name +``` +mistral/magistral-small-2506:fp8 +mistral/magistral-small-2506:bf16 +``` + +## Code models + +### Devstral-2-123b-instruct-2512 +Devstral 2 is a state-of-the-art coding model released in December 2025, which excels at using tools to explore codebases, editing multiple files and powering software engineering agents. + +#### Model name +``` +mistral/devstral-2-123b-instruct-2512:fp8 +``` + +### Devstral-small-2505 +Devstral Small is a fine-tune of Mistral Small 3.1, optimized to perform software engineering tasks. +It is a good fit to be used as a coding agent, for instance in an IDE. + +#### Model name +``` +mistral/devstral-small-2505:fp8 +mistral/devstral-small-2505:bf16 +``` + +### Qwen3-coder-30b-a3b-instruct +Qwen3-coder is an improved version of Qwen2.5 with better accuracy and throughput. +Thanks to its a3b architecture, only a subset of its weights is activated for a given generation, leading to much faster input and output token processing, ideal for code completion. + +| Attribute | Value | +|-----------|-------| +| Supports parallel tool-calling | Yes | + +#### Model name +``` +qwen/qwen3-coder-30b-a3b-instruct:fp8 +``` + +### Qwen2.5-coder-32b-instruct +Qwen2.5-coder is your intelligent programming assistant familiar with more than 40 programming languages. +With Qwen2.5-coder deployed at Scaleway, your company can benefit from code generation, AI-assisted code repair, and code reasoning. + +| Attribute | Value | +|-----------|-------| +| Supports parallel tool-calling | No | + +#### Model name +``` +qwen/qwen2.5-coder-32b-instruct:int8 +``` + +## Embeddings models + +### Qwen3-embedding-8b +Qwen/Qwen3-Embedding-8B is a state-of-the-art embedding model ranking 3rd on the METB leaderboard as of November 2025, supporting custom dimensions between 32 and 4096. + +| Attribute | Value | +|-----------|-------| +| Embedding dimensions (maximum) | 4096 | +| Embedding dimensions (minimum) | 32 | +| Matryoshka embedding | Yes | +| Serverless - Context window | 32,000 | + + + [Matryoshka embeddings](https://huggingface.co/blog/matryoshka) refers to embeddings trained on multiple dimension numbers. Consequently, resulting vector dimensions will be sorted by most meaningful first. For example, a 4096-dimension vector can be truncated to its 768 first dimensions and used directly. + + + +### Bge-multilingual-gemma2 +BGE-Multilingual-Gemma2 tops the [MTEB leaderboard](https://huggingface.co/spaces/mteb/leaderboard), scoring the number one spot in French and Polish, and number seven in English (as of Q4 2024). +As its name suggests, the model’s training data spans a broad range of languages, including English, Chinese, Polish, French, and more. + +| Attribute | Value | +|-----------|-------| +| Embedding dimensions (maximum) | 3584 | +| Embedding dimensions (minimum) | 3584 | +| Matryoshka embedding | No | +| Serverless - Context window | 32,000 | + +#### Model name +``` +baai/bge-multilingual-gemma2:fp32 +``` + +### Sentence-t5-xxl +The Sentence-T5-XXL model represents a significant evolution in sentence embeddings, building on the robust foundation of the Text-To-Text Transfer Transformer (T5) architecture. +Designed for performance in various language processing tasks, Sentence-T5-XXL leverages the strengths of T5's encoder-decoder structure to generate high-dimensional vectors that encapsulate rich semantic information. 
+This model has been meticulously tuned for tasks such as text classification, semantic similarity, and clustering, making it a useful tool in the Retrieval-Augmented Generation (RAG) framework. It excels in sentence similarity tasks, but its performance in semantic search tasks is less optimal. + + +| Attribute | Value | +|-----------|-------| +| Embedding dimensions | 768 | +| Matryoshka embedding | No | + +#### Model name +``` +sentence-transformers/sentence-t5-xxl:fp32 +``` + +## Request a model + +**Do not see a model you want to use?** [Tell us or vote for what you would like to add here.](https://feature-request.scaleway.com/?tags=ai-services) + +## Deprecated models for Serverless + +The models below can still be accessed in Generative APIs - Serverless, but their End of Life (EOL) is planned according to our [model lifecycle policy](/generative-apis/reference-content/model-lifecycle/). +Deprecated models should not be queried anymore. We recommend to use newer models available in Generative APIs - Serverless or to deploy these models in [dedicated Generative APIs deployments](https://console.scaleway.com/inference/deployments). + +| Provider | Model string | Deprecation date | End of Life (EOL) date | After End of Life date, requests routed to model +|-----------------|-----------------|-----------------|-----------------|-----------------| +| Deepseek | `deepseek-r1-distill-llama-70b` | 16 January 2026 | 16 April 2026 | `llama-3.3-70b-instruct` | +| Mistral | `mistral-nemo-instruct-2407` | 16 January 2026 | 16 April 2026 | `mistral-small-3.2-24b-instruct-2506` | +| Meta | `llama-3.1-8b-instruct` | 16 January 2026 | 16 April 2026 | `mistral-small-3.2-24b-instruct-2506` | + +## End of Life (EOL) models for Serverless + +These models are not accessible anymore from Generative APIs - Serverless. They can still however be deployed on [dedicated Generative APIs deployments](https://console.scaleway.com/inference/deployments). 
{/* COMMENT: Fix relative link */} + +| Provider | Model string | EOL date | Requests routed to model +|-----------------|-----------------|-----------------|-----------------| +| Mistral | `mistral-small-3.1-24b-instruct-2503` | 14 November 2025 | `mistral-small-3.2-24b-instruct-2506` | +| Mistral | `devstral-small-2505` | 14 November 2025 | `qwen3-coder-30b-a3b-instruct` | +| Qwen | `qwen2.5-coder-32b-instruct` | 14 November 2025 | `qwen3-coder-30b-a3b-instruct` | +| Meta | `llama-3.1-70b-instruct` | 25 May 2025 | `llama-3.3-70b-instruct` | +| SBERT | `sentence-t5-xxl` | 26 February 2025 | None | diff --git a/pages/managed-inference/reference-content/supported-models.mdx b/pages/generative-apis/reference-content/supported-models_old.mdx similarity index 79% rename from pages/managed-inference/reference-content/supported-models.mdx rename to pages/generative-apis/reference-content/supported-models_old.mdx index 46db2a6a95..31a6bf0b7d 100644 --- a/pages/managed-inference/reference-content/supported-models.mdx +++ b/pages/generative-apis/reference-content/supported-models_old.mdx @@ -1,34 +1,34 @@ --- -title: Supported models in Managed Inference -description: Explore all AI models supported by Managed Inference +title: Model integration, model lifecycle, licensing for Dedicated Deployments +description: Explore all AI models supported by Generative APIs - Dedicated Deployment tags: support models custom catalog dates: - validation: 2025-10-16 + validation: 2026-04-24 posted: 2025-04-08 --- -Scaleway Managed Inference allows you to deploy various AI models, either from: +Scaleway Generative APIs - Dedicated Deployment allows you to deploy various AI models, either from: - * [Scaleway model catalog](#scaleway-model-catalog): A curated set of ready-to-deploy models available through the [Scaleway console](https://console.scaleway.com/inference/deployments/) or the [Managed Inference models API](https://www.scaleway.com/en/developers/api/managed-inference/#path-models-list-models) - * [Custom models](#custom-models): Models that you import, typically from sources like Hugging Face. + * [Scaleway model catalog](#scaleway-model-catalog): A curated set of ready-to-deploy models available through the [Scaleway console](https://console.scaleway.com/inference/deployments/) or the [Dedicated Deployment models API](https://www.scaleway.com/en/developers/api/managed-inference/#path-models-list-models) {/*COMMENT: Fix relative link. */} + * [Custom models](#custom-models): Models that you import, typically from sources such as Hugging Face. ## Scaleway model catalog -You can find a complete list of all models available in Scaleway's catalog on the [Managed Inference model catalog page](/managed-inference/reference-content/model-catalog/). +You can find a complete list of all models available in Scaleway's catalog on the [Generative APIs model catalog page](/generative-apis/reference-content/model-catalog/). ## Custom models - Custom model support is currently in **beta**. If you encounter issues or limitations, please report them via our [Slack community channel](https://scaleway-community.slack.com/archives/C01SGLGRLEA) or [customer support](https://console.scaleway.com/support/tickets/create?for=product&productName=inference). + Custom model support is currently in **beta**. 
If you encounter issues or limitations, report them via our [Slack community channel](https://scaleway-community.slack.com/archives/C01SGLGRLEA) or [customer support](https://console.scaleway.com/support/tickets/create?for=product&productName=inference). ### Prerequisites - We recommend starting with a variation of a supported model from the Scaleway catalog. + We recommend starting with a variation of a supported model from the [Scaleway catalog](/generative-apis/reference-content/model-catalog/). For example, you can deploy a [quantized (4-bit) version of Llama 3.3](https://huggingface.co/unsloth/Llama-3.3-70B-Instruct-bnb-4bit). If deploying a fine-tuned version of Llama 3.3, make sure your file structure matches the example linked above. - Examples whose compatibility has been tested are available in [tested models](#known-compatible-models). + Examples whose compatibility has been tested are available in the section about [tested models](#known-compatible-models). To deploy a custom model via Hugging Face, ensure the following: @@ -44,7 +44,7 @@ To deploy a custom model via Hugging Face, ensure the following: Your model repository must include: * A `config.json` file containing: - * An `architectures` array (see [supported architectures](#supported-model-architectures) for the exact list of supported values). + * An `architectures` array (See [supported architectures](#supported-model-architectures) for the exact list of supported values.) * `max_position_embeddings` * Model weights in the [`.safetensors`](https://huggingface.co/docs/safetensors/index) format * A `tokenizer.json` file @@ -77,35 +77,35 @@ Depending on the model type, specific endpoints and features will be supported. ### Chat models -The Chat API will be exposed for this model under `/v1/chat/completions` endpoint. +The Chat API is exposed for chat models under the `/v1/chat/completions` endpoint. **Structured outputs** or **Function calling** are not yet supported for custom models. ### Vision models -Chat API will be exposed for this model under `/v1/chat/completions` endpoint. +The Chat API is exposed for vision models under the `/v1/chat/completions` endpoint. **Structured outputs** or **Function calling** are not yet supported for custom models. ### Multimodal models -These models will be treated similarly to both Chat and Vision models. +Multimodal models are treated the same way as chat and vision models. ### Embedding models -Embeddings API will be exposed for this model under `/v1/embeddings` endpoint. +The Embeddings API is exposed for embedding models under the `/v1/embeddings` endpoint. ## Custom model lifecycle -Currently, custom model deployments are considered to be valid for the long term, and we will ensure any updates or changes to Managed Inference will not impact existing deployments. +Currently, custom model deployments are considered to be valid for the long term, and we will ensure any updates or changes to Generative APIs - Dedicated Deployment will not impact existing deployments. In case of breaking changes, leading to some custom models not being supported anymore, we will notify you **at least 3 months beforehand**. ## Licensing -When deploying custom models, **you remain responsible** for complying with any License requirements from the model provider, as you would do by running the model on a custom provisioned GPU. 
+When deploying custom models, **you remain responsible** for complying with any license requirements from the model provider, as you would do by running the model on a custom provisioned GPU. ## Supported model architectures -Custom models must conform to one of the architectures listed below. Click to expand full list. +Custom models must conform to one of the architectures listed below. Click to expand the full list. ## Supported custom model architectures @@ -237,11 +237,11 @@ Custom models must conform to one of the architectures listed below. Click to ex ## Known compatible models -Several models have already been verified to work on Managed Inference custom models. This list is not exhaustive and is updated gradually. Click to expand the full list. +Several models have already been verified to work on Generative APIs - Dedicated Deployment custom models. This list is not exhaustive and is updated gradually. Click to expand the full list. ## Models verified for compatibility - The following model compatibility has been verified: + The following models' compatibility has been verified: * `google/medgemma-27b-it` * `HuggingFaceTB/SmolLM2-135M-Instruct` * `ibm-granite/granite-vision-3.2-2b` diff --git a/pages/generative-apis/reference-content/supported-models_serverless.mdx b/pages/generative-apis/reference-content/supported-models_serverless.mdx new file mode 100644 index 0000000000..3e9fd25481 --- /dev/null +++ b/pages/generative-apis/reference-content/supported-models_serverless.mdx @@ -0,0 +1,99 @@ +--- +title: Supported models +description: This page lists which open-source chat or embedding models Scaleway is currently hosting +tags: generative-apis ai-data supported-models +dates: + validation: 2025-09-12 + posted: 2024-09-02 +--- + +Our API supports the most popular models for [Chat](/generative-apis/how-to/query-language-models), [Vision](/generative-apis/how-to/query-vision-models/), [Audio](/generative-apis/how-to/query-audio-models/) and [Embeddings](/generative-apis/how-to/query-embedding-models/). + +## Multimodal models + +### Chat and Vision models + +| Provider | Model string | Context window (Tokens) | Maximum output (Tokens)| License \* | Model card | +|-----------------|-----------------|-----------------|-----------------|-----------------|-----------------| +| Qwen | `qwen3.5-397b-a17b` | 250k | 16k | [Apache-2.0](https://www.apache.org/licenses/LICENSE-2.0) | [HF](https://huggingface.co/Qwen/Qwen3.5-397B-A17B) | +| Mistral | `mistral-small-3.2-24b-instruct-2506` | 128k | 32k | [Apache-2.0](https://www.apache.org/licenses/LICENSE-2.0) | [HF](https://huggingface.co/mistralai/Mistral-Small-3.2-24B-Instruct-2506) | +| Google (Preview) | `gemma-3-27b-it` | 40k | 8k | [Gemma](https://ai.google.dev/gemma/terms) | [HF](https://huggingface.co/google/gemma-3-27b-it) | +| H | `holo2-30b-a3b` | 22k | 32k | [CC-BY-NC-4.0](https://spdx.org/licenses/CC-BY-NC-4.0) | [HF](https://huggingface.co/Hcompany/Holo2-30B-A3B) | + +\*Licences which are not open-weight and may restrict commercial usage (such as `CC-BY-NC-4.0`), do not apply to usage through Scaleway Products due to existing partnerships between Scaleway and the corresponding providers. Original licences are provided for transparency only. 
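+
+As a minimal sketch of how the vision-capable chat models above can be queried, an image can be passed as an `image_url` content part in the OpenAI-compatible format described in the vision how-to (the image URL below is purely illustrative, and `SCW_SECRET_KEY` is assumed to hold your API key):
+
+```python
+# Minimal sketch: ask a vision-capable chat model a question about an image.
+import os
+from openai import OpenAI
+
+client = OpenAI(base_url="https://api.scaleway.ai/v1", api_key=os.environ["SCW_SECRET_KEY"])
+
+response = client.chat.completions.create(
+    model="mistral-small-3.2-24b-instruct-2506",   # any model from the Chat and Vision table above
+    messages=[
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "Describe this image in one sentence."},
+                {"type": "image_url", "image_url": {"url": "https://example.com/photo.jpg"}},  # illustrative URL
+            ],
+        }
+    ],
+)
+print(response.choices[0].message.content)
+```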
+ +### Chat and Audio models + +| Provider | Model string | Context window (Tokens) | Maximum output (Tokens)| License | Model card | +|-----------------|-----------------|-----------------|-----------------|-----------------|-----------------| +| Mistral | `voxtral-small-24b-2507` | 32k | 16k | [Apache-2.0](https://www.apache.org/licenses/LICENSE-2.0) | [HF](https://huggingface.co/mistralai/Voxtral-Small-24B-2507) | + +### Audio transcription models + +| Provider | Model string | Maximum audio duration (Minutes) | Chunk size (Seconds) | Maximum file size (MB) | License | Model card | +|-----------------|-----------------|-----------------|-----------------|-----------------|-----------------|-----------------| +| Mistral | `voxtral-small-24b-2507` | 30 | 30 | 25 | [Apache-2.0](https://www.apache.org/licenses/LICENSE-2.0) | [HF](https://huggingface.co/mistralai/Voxtral-Small-24B-2507) | +| OpenAI | `whisper-large-v3` | - | 30 | 25 | [Apache-2.0](https://www.apache.org/licenses/LICENSE-2.0) | [HF](https://huggingface.co/openai/whisper-large-v3) | + +## Chat models + +| Provider | Model string | Context window (Tokens) | Maximum output (Tokens)| License | Model card | +|-----------------|-----------------|-----------------|-----------------|-----------------|-----------------| +| OpenAI | `gpt-oss-120b` | 128k | 32k | [Apache-2.0](https://www.apache.org/licenses/LICENSE-2.0) | [HF](https://huggingface.co/openai/gpt-oss-120b) | +| Meta | `llama-3.3-70b-instruct` | 100k | 16k | [Llama 3.3 Community](https://www.llama.com/llama3_3/license/) | [HF](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct) | +| Meta | `llama-3.1-8b-instruct` | 128k | 16k | [Llama 3.1 Community](https://llama.meta.com/llama3_1/license/) | [HF](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) | +| Mistral | `mistral-nemo-instruct-2407` | 128k | 8k | [Apache-2.0](https://www.apache.org/licenses/LICENSE-2.0) | [HF](https://huggingface.co/mistralai/Mistral-Nemo-Instruct-2407) | +| Mistral | `devstral-2-123b-instruct-2512` | 200k | 16k | [Modified MIT](https://huggingface.co/mistralai/Devstral-2-123B-Instruct-2512/blob/main/LICENSE) | [HF](https://huggingface.co/mistralai/Devstral-2-123B-Instruct-2512) | +| Qwen | `qwen3-235b-a22b-instruct-2507` | 250k | 16k | [Apache-2.0](https://www.apache.org/licenses/LICENSE-2.0) | [HF](https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507) | +| Qwen | `qwen3-coder-30b-a3b-instruct` | 128k | 32k | [Apache-2.0](https://www.apache.org/licenses/LICENSE-2.0) | [HF](https://huggingface.co/Qwen/Qwen3-Coder-30B-A3B-Instruct) | +| DeepSeek | `deepseek-r1-distill-llama-70b` | 16k | 4k | [MIT](https://huggingface.co/datasets/choosealicense/licenses/blob/main/markdown/mit.md) | [HF](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B) | + + + If you are unsure which chat model to use, we currently recommend `mistral-small-3.2-24b-instruct-2506` to get started, and `qwen3.5-397b-a17b` for best accuracy or coding tasks. + + +## Vision models + +| Provider | Model string | Context window (Tokens) | Maximum output (Tokens)| License | Model card | +|-----------------|-----------------|-----------------|-----------------|-----------------|-----------------| +| Mistral | `pixtral-12b-2409` | 128k | 4k | [Apache-2.0](https://www.apache.org/licenses/LICENSE-2.0) | [HF](https://huggingface.co/mistralai/Pixtral-12B-2409) | + + + Image sizes are limited to 32 million pixels (e.g., a resolution of about 8096 x 4048). 
Images with a resolution higher than 1024 x 1024 are supported, but automatically downscaled to fit these limitations (image ratio and proportions will be preserved). + + +## Embedding models + +Our [Embeddings API](/generative-apis/how-to/query-embedding-models) provides built-in support for the following models, hosted in Scaleway data centers, available via serverless endpoints. + +| Provider | Model string | Embedding dimension (Maximum) | Embedding dimensions (Minimum) | Context window | License | Model card | +|-----------------|-----------------|-----------------|-----------------|-----------------|-----------------|-----------------| +| Qwen | `qwen3-embedding-8b` | 4096 | 32 | 32 000 | [Apache-2.0](https://www.apache.org/licenses/LICENSE-2.0) | [HF](https://huggingface.co/Qwen/Qwen3-Embedding-8B) | +| BAAI | `bge-multilingual-gemma2` | 3584 | 3584 | 8192 | [Gemma](https://ai.google.dev/gemma/terms) | [HF](https://huggingface.co/BAAI/bge-multilingual-gemma2) | + +## Request a model + +**Do not see a model you want to use?** [Tell us or vote for what you would like to add here.](https://feature-request.scaleway.com/?tags=ai-services) + +## Deprecated models + +These models can still be accessed in Generative APIs, but their End of Life (EOL) is planned according to our [model lifecycle policy](/generative-apis/reference-content/model-lifecycle/). +Deprecated models should not be queried anymore. We recommend to use newer models available in Generative APIs or to deploy these models in dedicated [Managed Inference](https://console.scaleway.com/inference/deployments) deployments. + +| Provider | Model string | Deprecation date | End of Life (EOL) date | After End of Life date, requests routed to model +|-----------------|-----------------|-----------------|-----------------|-----------------| +| Deepseek | `deepseek-r1-distill-llama-70b` | 16th January, 2026 | 16th April, 2026 | `llama-3.3-70b-instruct` | +| Mistral | `mistral-nemo-instruct-2407` | 16th January, 2026 | 16th April, 2026 | `mistral-small-3.2-24b-instruct-2506` | +| Meta | `llama-3.1-8b-instruct` | 16th January, 2026 | 16th April, 2026 | `mistral-small-3.2-24b-instruct-2506` | + +## End of Life (EOL) models + +These models are not accessible anymore from Generative APIs. They can still however be deployed on dedicated [Managed Inference](https://console.scaleway.com/inference/deployments) deployments. + +| Provider | Model string | EOL date | Requests routed to model +|-----------------|-----------------|-----------------|-----------------| +| Mistral | `mistral-small-3.1-24b-instruct-2503` | 14th November, 2025 | `mistral-small-3.2-24b-instruct-2506` | +| Mistral | `devstral-small-2505` | 14th November, 2025 | `qwen3-coder-30b-a3b-instruct` | +| Qwen | `qwen2.5-coder-32b-instruct` | 14th November, 2025 | `qwen3-coder-30b-a3b-instruct` | +| Meta | `llama-3.1-70b-instruct` | 25th May, 2025 | `llama-3.3-70b-instruct` | +| SBERT | `sentence-t5-xxl` | 26 February, 2025 | None | diff --git a/pages/generative-apis/troubleshooting/fixing-common-issues.mdx b/pages/generative-apis/troubleshooting/fixing-common-issues.mdx index 406b777411..a1b8932433 100644 --- a/pages/generative-apis/troubleshooting/fixing-common-issues.mdx +++ b/pages/generative-apis/troubleshooting/fixing-common-issues.mdx @@ -19,7 +19,7 @@ Below are common issues that you may encounter when using Generative APIs, their - Reduce your input size below what is [supported by the model](/generative-apis/reference-content/supported-models/). 
- If you are using a third party tool such as IDEs, you should edit their configuration to set an appropriate maximum context window for the model. More information for [VS Code (Continue)](/generative-apis/reference-content/adding-ai-to-vscode-using-continue/#configure-continue-through-a-configuration-file), [IntelliJ (Continue)](/generative-apis/reference-content/adding-ai-to-intellij-using-continue/#configure-continue-through-configuration-file) and [Zed](/generative-apis/reference-content/adding-ai-to-zed-ide/). - Use a model supporting longer context window values. -- Use [Managed Inference](/managed-inference/), where the context window can be increased for [several configurations with additional GPU vRAM](/managed-inference/reference-content/supported-models/). For instance, `llama-3.3-70b-instruct` model in `fp8` quantization can be served with: +- Use [Generative APIs - Dedicated Deployment](/generative-apis/how-to/create-deployment/), where the context window can be increased for [several configurations with additional GPU vRAM](/generative-apis/reference-content/supported-models/). For instance, `llama-3.3-70b-instruct` model in `fp8` quantization can be served with: - `15k` tokens context window on `H100` Instances. - `128k` tokens context window on `H100-2` Instances. @@ -62,7 +62,7 @@ Below are common issues that you may encounter when using Generative APIs, their llm = init_chat_model("llama-3.3-70b-instruct", max_tokens="8000", model_provider="openai", base_url="https://api.scaleway.ai/v1", temperature=0.7) ``` - Use a model supporting a higher `max_completion_tokens` value. -- Use [Managed Inference](/managed-inference/), where these limits on completion tokens do not apply (your completion tokens amount will still be limited by the maximum context window supported by the model). +- Use [Generative APIs - Dedicated Deployment](/generative-apis/how-to/create-deployment/), where these limits on completion tokens do not apply (your completion tokens amount will still be limited by the maximum context window supported by the model). ## 429: Too Many Requests - You exceeded your current quota of requests/tokens per minute @@ -71,11 +71,11 @@ Below are common issues that you may encounter when using Generative APIs, their - You consumed too many tokens (input and output) with your API requests over a given minute ### Solution -- [Add a payment method](/billing/how-to/add-payment-method/#how-to-add-a-credit-card) and [validate your identity](/account/how-to/verify-identity/) to increase automatically your quotas [based on standard limits](/organizations-and-projects/additional-content/organization-quotas/#generative-apis). +- [Add a payment method](/billing/how-to/add-payment-method/#how-to-add-a-credit-card) and [validate your identity](/account/how-to/verify-identity/) to automatically increase your quotas [based on standard limits](/organizations-and-projects/additional-content/organization-quotas/#generative-apis). - Smooth out your API request rate by limiting the number of API requests you perform over a given minute, so that you remain below your [Organization quotas for Generative APIs](/organizations-and-projects/additional-content/organization-quotas/#generative-apis). You can use rate limit information provided in the [HTTP response header](/generative-apis/reference-content/rate-limits/#how-can-i-monitor-rate-limits) for this purpose. - Use [Batches API](https://console.scaleway.com/generative-api/batches) for non-real time workloads. 
Requests performed through Batches API do not have a rate limit and are billed with a [-50% discount compared to standard model prices](https://www.scaleway.com/en/pricing/model-as-a-service/#generative-apis). - Reduce the size of the input or output tokens processed by your API requests. -- Use [Managed Inference](/managed-inference/), where these quotas do not apply (your throughput will be limited only by the amount of Inference Deployment you provision). +- Use [Generative APIs - Dedicated Deployment](/generative-apis/faq/#how-is-generative-apis---dedicated-deployment-billed-), where these quotas do not apply (your throughput will be limited only by the amount of Inference Deployment you provision). - Contact your assigned Scaleway account manager or [our Sales team](https://www.scaleway.com/en/contact-sales/) to discuss volume commitments for specific models, which will enable us to increase your quota proportionally. ## 429: Too Many Requests - You exceeded your current threshold of concurrent requests @@ -86,7 +86,7 @@ Below are common issues that you may encounter when using Generative APIs, their ### Solution - Smooth out your API requests rate by limiting the number of API requests you perform at the same time (e.g., requests that did not receive a complete response and are still open) so that you remain below your [Organization quotas for Generative APIs](/organizations-and-projects/additional-content/organization-quotas/#generative-apis). You can use rate limit information provided in [HTTP response header](/generative-apis/reference-content/rate-limits/#how-can-i-monitor-rate-limits) for this purpose. - Use [Batches API](https://console.scaleway.com/generative-api/batches) for non-real time workloads. Requests performed through Batches API do not have a rate limit and are billed with a [-50% discount compared to standard model prices](https://www.scaleway.com/en/pricing/model-as-a-service/#generative-apis). -- Use [Managed Inference](/managed-inference/), where concurrent request limits do not apply. Note that exceeding the number of concurrent requests your Inference deployment can handle may impact performance metrics. +- Use [Generative APIs - Dedicated Deployment](/generative-apis/how-to/create-deployment/), where concurrent request limits do not apply. Note that exceeding the number of concurrent requests your Inference deployment can handle may impact performance metrics. ## 504: Gateway Timeout @@ -99,7 +99,7 @@ Below are common issues that you may encounter when using Generative APIs, their For queries that are too long to process: - Set a stricter **maximum token limit** to prevent overly long responses. - Reduce the size of the input tokens, or split the input into multiple API requests. -- Use [Managed Inference](/managed-inference/), where no query timeout is enforced. +- Use [Generative APIs - Dedicated Deployment](/generative-apis/how-to/create-deployment/), where no query timeout is enforced. For queries where the model enters an infinite loop (more frequent when using **structured output**): - Set the `temperature` to the default value recommended for the model. These values can be found in the [Generative APIs Playground](https://console.scaleway.com/generative-api/models/fr-par/playground) when selecting the model. Avoid using temperature `0`, as this can lock the model into outputting only the next (and same) most probable token repeatedly. 
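+
+As a minimal sketch combining both recommendations (the endpoint and model string follow the examples above; the temperature value is only a placeholder, use the default recommended in the Playground for your model, and an API key is assumed to be exported as `SCW_SECRET_KEY`):
+
+```python
+# Minimal sketch: cap the output length and avoid temperature 0 when requesting JSON output.
+import os
+from openai import OpenAI
+
+client = OpenAI(base_url="https://api.scaleway.ai/v1", api_key=os.environ["SCW_SECRET_KEY"])
+
+response = client.chat.completions.create(
+    model="llama-3.3-70b-instruct",
+    messages=[{"role": "user", "content": "List three French cities as a JSON array."}],
+    response_format={"type": "json_object"},  # JSON mode
+    temperature=0.3,                          # placeholder: use the model's recommended default, never 0
+    max_tokens=512,                           # stricter limit prevents overly long or looping responses
+)
+print(response.choices[0].message.content)
+```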
@@ -144,7 +144,7 @@ For queries where the model enters an infinite loop (more frequent when using ** - The model will therefore not output reasoning tokens, and the field `reasoning` will be empty, leading to a faster answer. - Note that in some edge cases, such as asking mathematical or logical questions, some models can still produce verbose responses, because of their default behavior, even without activating "reasoning"/"thinking". As a workaround, we recommend editing your prompts, by asking the model to be brief. Example: `What is the solution to x⁴+x³+1=0 ? Be brief.` - Optimize your request to reduce the number of input tokens in your payload. -- Use [Managed Inference](/managed-inference/), where capacity is dedicated to your usage only. +- Use [Generative APIs - Dedicated Deployment](/generative-apis/how-to/create-deployment/), where capacity is dedicated to your usage only. ## Multiple "role": "user" successive messages diff --git a/pages/managed-inference/concepts.mdx b/pages/managed-inference/concepts.mdx deleted file mode 100644 index 16346f1f42..0000000000 --- a/pages/managed-inference/concepts.mdx +++ /dev/null @@ -1,92 +0,0 @@ ---- -title: Managed Inference - Concepts -description: This page explains all the concepts related to Managed Inference -tags: -dates: - validation: 2025-06-02 ---- -## Allowed IPs - -The **Allowed IPs** feature is no longer available for Managed Inference deployments. Use one of the alternative methods detailed in our [dedicated documentation](/managed-inference/how-to/manage-allowed-ips/) to restrict access to your Managed Inference deployments. - -## Context size - -The context size refers to the length or size of the input text used to generate predictions or responses from a Large Language Model (LLM). It is crucial in determining the model's understanding of the given prompt or query. - -## Deployment - -A deployment makes a trained language model available for real-world applications. It encompasses tasks such as integrating the model into existing systems, optimizing its performance, and ensuring scalability and reliability. - -## Embedding models - -Embedding models are a representation-learning technique that converts textual data into numerical vectors. These vectors capture semantic information about the text and are often used as input to downstream machine-learning models, or algorithms. - -## Endpoint - -In the context of LLMs, an endpoint refers to a network-accessible URL or interface through which clients can interact with the model for inference tasks. It exposes methods for sending input data and receiving model predictions or responses. - -## Fine-tuning - -Fine-tuning involves further training a pre-trained language model on domain-specific or task-specific data to improve performance on a particular task. This process often includes updating the model's parameters using a smaller, task-specific dataset. - -## Few-shot prompting - -Few-shot prompting uses the power of language models to generate responses with minimal input, relying on just a handful of examples or prompts. -It demonstrates the model's ability to generalize from limited training data to produce coherent and contextually relevant outputs. - -## Function calling - -Function calling allows a large language model (LLM) to interact with external tools or APIs, executing specific tasks based on user requests. The LLM identifies the appropriate function, extracts the required parameters, and returns the results as structured data, typically in JSON format. 
- -Refer to [Support for function calling in Scaleway Managed Inference](/managed-inference/reference-content/function-calling-support/) for more information. - -## Hallucinations - -Hallucinations in LLMs refer to instances where generative AI models generate responses that, while grammatically coherent, contain inaccuracies or nonsensical information. These inaccuracies are termed "hallucinations" because the models create false or misleading content. Hallucinations can occur because of constraints in the training data, biases embedded within the models, or the complex nature of language itself. - -## Inference - -Inference is the process of deriving logical conclusions or predictions from available data. This concept involves using statistical methods, machine learning algorithms, and reasoning techniques to make decisions or draw insights based on observed patterns or evidence. -Inference is fundamental in various AI applications, including natural language processing, image recognition, and autonomous systems. - -## JSON mode - -JSON mode allows you to guide the language model in outputting well-structured JSON data. -To activate JSON mode, provide the `response_format` parameter with `{"type": "json_object"}`. -JSON mode is useful for applications like chatbots or APIs, where a machine-readable format is essential for easy processing. - -## Large Language Model Applications - -LLM Applications are applications or software tools that leverage the capabilities of LLMs for various tasks, such as text generation, summarization, or translation. These apps provide user-friendly interfaces for interacting with the models and accessing their functionalities. - -## Large Language Models - -LLMs are advanced artificial intelligence systems capable of understanding and generating human-like text on various topics. -These models, such as Llama-3, are trained on vast amounts of data to learn the patterns and structures of language, enabling them to generate coherent and contextually relevant responses to queries or prompts. -LLMs have applications in natural language processing, text generation, translation, and other tasks requiring sophisticated language understanding and production. - -## Node number - -The node number (or node count) defines the number of nodes, or Instances, that are running your Managed Inference deployment. [Increasing the node number](/managed-inference/how-to/configure-autoscaling/) scales your deployment, so that it can handle more load. - -## Prompt - -In the context of generative AI models, a prompt refers to the input provided to the model to generate a desired response. -It typically consists of a sentence, paragraph, or series of keywords or instructions that guide the model in producing text relevant to the given context or task. -The quality and specificity of the prompt greatly influence the generated output, as the model uses it to understand the user's intent and create responses accordingly. - -## Quantization - -Quantization is a technique used to reduce the precision of numerical values in a model's parameters or activations to improve efficiency and reduce memory footprint during inference. It involves representing floating-point values with fewer bits while minimizing the loss of accuracy. -AI models provided for deployment are named with suffixes that denote their quantization levels, such as `:int8`, `:fp8`, and `:fp16`. 
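As a concrete illustration of the JSON mode concept defined above, the sketch below only adds the `response_format` argument to an otherwise ordinary chat request; the endpoint, key, and model string are placeholders.

```python
from openai import OpenAI

client = OpenAI(base_url="https://<your-endpoint>/v1", api_key="<SCW_SECRET_KEY>")  # placeholders

response = client.chat.completions.create(
    model="meta/llama-3.1-8b-instruct:fp8",  # illustrative model string
    messages=[
        {"role": "system", "content": "Reply with a JSON object containing a 'summary' key."},
        {"role": "user", "content": "Summarize: Scaleway hosts AI models in Paris."},
    ],
    response_format={"type": "json_object"},  # activates JSON mode
)

print(response.choices[0].message.content)  # a well-formed JSON string
```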
- -## Retrieval Augmented Generation (RAG) - -RAG is an architecture combining information retrieval elements with language generation to enhance the capabilities of LLMs. It involves retrieving relevant context or knowledge from external sources and incorporating it into the generation process to produce more informative and contextually grounded outputs. - -## Structured outputs - -Structured outputs enable you to format the model's responses to suit specific use cases. To activate structured outputs, provide the `response_format` parameter with `"type": "json_schema"` and define its `"json_schema": {}`. -By customizing the structure, such as using lists, tables, or key-value pairs, you ensure that the data returned is in a form that is easy to extract and process. -By specifying the expected response format through the API, you can make the model consistently deliver the output your system requires. - diff --git a/pages/managed-inference/faq.mdx b/pages/managed-inference/faq.mdx deleted file mode 100644 index 11b5a04dc6..0000000000 --- a/pages/managed-inference/faq.mdx +++ /dev/null @@ -1,93 +0,0 @@ ---- -title: Managed Inference FAQ -description: Get answers to the most frequently asked questions about Scaleway Managed Inference. -dates: - validation: 2025-12-19 -productIcon: InferenceProductIcon ---- - -## Overview - -### What is Scaleway Managed Inference? -Scaleway's Managed Inference is a fully managed service that allows you to deploy, run, and scale AI models in a dedicated environment. -It provides optimized infrastructure, customizable deployment options, and secure access controls to meet the needs of enterprises and developers looking for high-performance inference solutions. - -### Where are the inference servers located? -All models are currently hosted in a secure data center located in Paris, France, operated by [OPCORE](https://www.opcore.com/). This ensures low latency for European users and compliance with European data privacy regulations. - -### What is the difference between Managed Inference and Generative APIs? -- **Managed Inference**: Allows deployment of curated or custom models with chosen quantization and instances, offering predictable throughput and enhanced security features like private network isolation and access control. Managed Inference is billed by hourly usage, whether provisioned capacity is receiving traffic or not. -- **Generative APIs**: A serverless service providing access to pre-configured AI models via API, billed per token usage. - -### Is Managed Inference suitable for real-time applications? -Yes, Managed Inference is designed for low-latency, high-throughput applications, making it suitable for real-time use cases such as chatbots, recommendation systems, fraud detection, and live video processing. - -### Can I fine-tune or retrain my models within Managed Inference? -Managed Inference is primarily designed for deploying and running inference workloads. If you need to fine-tune or retrain models, you may need to use a separate training environment, such as [Scaleway’s GPU Instances](/gpu/quickstart/), and then deploy the trained model in Managed Inference. - -## Getting started - -### How do I deploy a model using Managed Inference? -Deployment is done through Scaleway's [console](https://console.scaleway.com/inference/deployments) or [API](https://www.scaleway.com/en/developers/api/managed-inference/). 
You can choose a model from Scaleway’s selection or import your own directly from Hugging Face's repositories, configure [Instance types](/gpu/reference-content/choosing-gpu-instance-type/), set up networking options, and start inference with minimal setup. - -### Can I run inference on private models? -Yes, Managed Inference allows you to deploy private models with access control settings. You can restrict access to specific users, teams, or networks. - -## Offering and availability - -### What Instance types are available for inference? -Managed Inference offers different Instance types optimized for various workloads from Scaleway's [GPU Instances](/gpu/reference-content/choosing-gpu-instance-type/) range. -You can select the Instance type based on your model’s computational needs and compatibility. - -### What types of models can I deploy with Managed Inference? -You can deploy a variety of models, including: -* Large language models (LLMs) -* Image processing models -* Audio recognition models -* Custom AI models (through API only yet) -Managed Inference supports both open-source models and your own uploaded proprietary models. - -### What are the performance guarantees (vs. Generative APIs)? -Managed Inference provides dedicated resources, ensuring predictable performance and lower latency compared to Generative APIs, which are a shared, serverless offering optimized for infrequent traffic with moderate peak loads. Managed Inference is ideal for workloads that require consistent response times, high availability, custom hardware configurations, or generate extreme peak loads during a narrow period. -Compared to Generative APIs, no usage quota is applied to the number of tokens per second generated, since the output is limited by the GPU Instance size and the number of your Managed Inference deployments. - -## Pricing and billing - -### How is Managed Inference billed? -Billing is based on the Instance type and usage duration (in minutes). Unlike [Generative APIs](/generative-apis/quickstart/), which are billed per token, Managed Inference provides predictable costs based on the allocated infrastructure. Billing only starts when a deployment is ready and can be queried. -Pricing details can be found on the [Scaleway pricing page](https://www.scaleway.com/en/pricing/model-as-a-service/#managed-inference). - -### Can I pause Managed Inference billing when the Instance is not in use? -When a Managed Inference deployment is running, corresponding resources are provisioned and thus billed. Resources can therefore not be paused. -However, you can still optimize your Managed Inference deployment to fit within specific time ranges (such as during working hours). To do so, you can automate deployment creation and deletion using the [Managed Inference API](https://www.scaleway.com/en/developers/api/managed-inference/), [Terraform](https://registry.terraform.io/providers/scaleway/scaleway/latest/docs/resources/inference_deployment), or [Scaleway SDKs](/scaleway-sdk/). These actions can be programmed using [Serverless Jobs](/serverless-jobs/) to be automatically carried out periodically. - -## Quotas and limitations - -### Do model licenses apply when using Managed Inference? -Yes, model licenses need to be complied with when using Managed Inference. Applicable licenses are available for [each model in our documentation](/managed-inference/reference-content/). 
-- For models provided in the Scaleway catalog, you need to accept licenses (including potential EULA) before creating any Managed Inference deployment. -- For custom models you choose to import on Scaleway, you are responsible for complying with model licenses (as with any software you choose to install on a GPU Instance, for example). - -## Compatibility and integration - -### Can I use Managed Inference with other Scaleway services? -Absolutely. Managed Inference integrates seamlessly with other Scaleway services, such as [Object Storage](/object-storage/quickstart/) for model hosting, [Kubernetes](/kubernetes/quickstart/) for containerized applications, and [Scaleway IAM](/iam/quickstart/) for access management. - -### Does Managed Inference support model quantization? -Yes, Scaleway Managed Inference supports model [quantization](/managed-inference/concepts/#quantization) to optimize performance and reduce inference latency. You can select different quantization options depending on your accuracy and efficiency requirements. - -### Is Managed Inference compatible with OpenAI APIs? -Managed Inference aims to achieve seamless compatibility with OpenAI APIs. Find detailed information in the [Scaleway Managed Inference as drop-in replacement for the OpenAI APIs](/managed-inference/reference-content/openai-compatibility/) documentation. - -## Usage and management - -### How can I monitor performance? -Managed Inference metrics and logs are available in [Scaleway Cockpit](https://console.scaleway.com/cockpit/overview). You can follow your deployment metrics in real-time, such as token throughput, request latency, GPU power usage, and GPU VRAM usage. - -## Privacy and safety - -### Where can I find information regarding the data, privacy, and security policies applied to Scaleway's AI services? -You can find detailed information regarding the policies applied to Scaleway's AI services in our [Data, privacy, and security for Scaleway's AI services](/managed-inference/reference-content/data-privacy-security-scaleway-ai-services/) documentation. - -### What are the SLAs applicable to Managed Inference? -We are currently working on defining our SLAs for Managed Inference. We will provide more information on this topic soon. \ No newline at end of file diff --git a/pages/managed-inference/how-to/change-model.mdx b/pages/managed-inference/how-to/change-model.mdx deleted file mode 100644 index 876cdd9135..0000000000 --- a/pages/managed-inference/how-to/change-model.mdx +++ /dev/null @@ -1,36 +0,0 @@ ---- -title: How to change the model of a Managed Inference deployment -description: Learn how to change the model of your Scaleway Managed Inference deployment in just a few easy clicks. -tags: managed-inference ai-data change model -dates: - posted: 2025-07-18 - validation: 2025-07-18 -categories: - - ai-data ---- -import Requirements from '@macros/iam/requirements.mdx' - - -You can change the model used by your Managed Inference deployment at any time, as long as the new model is compatible with the existing deployment node. If you want to change to a model that is too big for your current node type, you must [delete your existing deployment](/managed-inference/how-to/delete-deployment/) and [create a new one](/managed-inference/how-to/create-deployment/) with a compatible node type. 
- -Follow the steps below to change the model using the Scaleway console: - - - - - A Scaleway account logged into the [console](https://console.scaleway.com) - - A [Managed Inference deployment](/managed-inference/quickstart/) - - [Owner](/iam/concepts/#owner) status or [IAM permissions](/iam/concepts/#permission) allowing you to perform actions in the intended Organization - -1. Click **Managed Inference** in the **AI** section of the [Scaleway console](https://console.scaleway.com) side menu. A list of your deployments displays. -2. From the drop-down menu, select the geographical region you want to manage. -3. Choose a deployment either by clicking its name, or selecting **More info** from the drop-down menu represented by the icon to access the deployment dashboard. -4. Click the **Settings** tab of your deployment to display additional settings. -5. In the **Model** panel, click **Change model**. - A pop-up displays, showing your current model. -6. From the drop-down menu, select the model you want to change to, and click **Change model**. -

- The change of model is initiated. Note that while the model is changed, your deployment will not be available for 15 - 30 minutes. - -Remember to update the model string in your client configuration, to reflect the new model. You can find the model string in the code sample available in the **Playground** tab of your deployment's dashboard (use the **View code** button). - -If you have also changed to a different type of model (e.g. from a chat model to an embedding model), you will also need to update the client code itself, as well as the model string. \ No newline at end of file diff --git a/pages/managed-inference/how-to/configure-autoscaling.mdx b/pages/managed-inference/how-to/configure-autoscaling.mdx deleted file mode 100644 index caf94b19e1..0000000000 --- a/pages/managed-inference/how-to/configure-autoscaling.mdx +++ /dev/null @@ -1,36 +0,0 @@ ---- -title: How to scale Managed Inference deployments -description: This page explains how to scale Managed Inference deployments in size -tags: managed-inference ai-data ip-address -dates: - validation: 2025-06-03 - posted: 2025-06-03 ---- -import Requirements from '@macros/iam/requirements.mdx' - - -You can scale your Managed Inference deployment up or down to match it to the incoming load of your deployment. - - -This feature is currently in [Public Beta](https://www.scaleway.com/en/betas/). - - - - - - A Scaleway account logged into the [console](https://console.scaleway.com) - - A [Managed Inference deployment](/managed-inference/quickstart/) - - [Owner](/iam/concepts/#owner) status or [IAM permissions](/iam/concepts/#permission) allowing you to perform actions in the intended Organization - -## How to scale a Managed Inference deployment in size - -1. Click **Managed Inference** in the **AI** section of the [Scaleway console](https://console.scaleway.com) side menu. A list of your deployments displays. -2. Click a deployment name or > **More info** to access the deployment dashboard. -3. Click the **Settings** tab and navigate to the **Scaling** section. -4. Click **Update node count** and adjust the number of nodes in your deployment. - - High availability is only guaranteed with two or more nodes. - -5. Click **Update node count** to update the number of nodes in your deployment. - - Your deployment will be unavailable for 15-30 minutes while the node update is in progress. - \ No newline at end of file diff --git a/pages/managed-inference/how-to/delete-deployment.mdx b/pages/managed-inference/how-to/delete-deployment.mdx deleted file mode 100644 index a2092c2cf1..0000000000 --- a/pages/managed-inference/how-to/delete-deployment.mdx +++ /dev/null @@ -1,31 +0,0 @@ ---- -title: How to delete a Managed Inference deployment -description: This page explains how to delete a Managed Inference deployment via the Scaleway console. -tags: managed-inference ai-data delete -dates: - validation: 2025-07-21 - posted: 2024-03-06 ---- -import Requirements from '@macros/iam/requirements.mdx' - - -Once you have finished your inference tasks, you can delete your deployment. This page explains how to do so from the Scaleway console. - - - - - A Scaleway account logged into the [console](https://console.scaleway.com) - - A [Managed Inference deployment](/managed-inference/quickstart/) - - [Owner](/iam/concepts/#owner) status or [IAM permissions](/iam/concepts/#permission) allowing you to perform actions in the intended Organization - -1. 
Click **Managed Inference** in the **AI** section of the [Scaleway console](https://console.scaleway.com) side menu. A list of your deployments displays. -2. From the drop-down menu, select the geographical region you want to manage. -3. Choose a deployment by clicking its name. The deployment's **Overview** page displays. -4. Navigate to the **Settings** tab. -5. Click **Delete deployment** at the bottom of the page. -6. Type **DELETE** to confirm and click **Delete deployment**. - - Alternatively, from the Deployments listing, click the icon next to the deployment name you no longer need, and click **Delete**. A pop-up appears. Type **DELETE** to confirm, then click **Delete deployment**. - - - Deleting a deployment is a permanent action that erases all its associated data and resources. - \ No newline at end of file diff --git a/pages/managed-inference/how-to/index.mdx b/pages/managed-inference/how-to/index.mdx deleted file mode 100644 index d14f393696..0000000000 --- a/pages/managed-inference/how-to/index.mdx +++ /dev/null @@ -1,4 +0,0 @@ ---- -title: Managed Inference - How Tos -description: Managed Inference How Tos ---- \ No newline at end of file diff --git a/pages/managed-inference/how-to/monitor-deployment.mdx b/pages/managed-inference/how-to/monitor-deployment.mdx deleted file mode 100644 index ad66317a84..0000000000 --- a/pages/managed-inference/how-to/monitor-deployment.mdx +++ /dev/null @@ -1,31 +0,0 @@ ---- -title: How to monitor a Managed Inference deployment -description: This page explains how to monitor a Managed Inference deployment -tags: managed-inference ai-data monitoring -dates: - validation: 2026-01-21 - posted: 2024-03-06 ---- -import Requirements from '@macros/iam/requirements.mdx' -import CockpitIamPermissions from '@macros/cockpit/iam-permissions-cockpit.mdx' - - -This documentation page shows you how to monitor your Managed Inference deployment with [Cockpit](/cockpit/quickstart/). - - - - - A Scaleway account logged into the [console](https://console.scaleway.com) - - A [Managed Inference deployment](/managed-inference/quickstart/) - - [Owner](/iam/concepts/#owner) status or [IAM permissions](/iam/concepts/#permission) allowing you to perform actions in the intended Organization - - - -## How to monitor your LLM dashboard - -1. Click **Managed Inference** in the **AI** section of the [Scaleway console](https://console.scaleway.com) side menu. A list of your deployments displays. -2. From the drop-down menu, select the geographical region you want to manage. -3. Click a deployment name or > **More info** to access the deployment dashboard. -4. Click the **Monitoring** tab of your deployment. The Cockpit overview displays. -5. Click **Open Grafana metrics dashboard** to open your Cockpit's Grafana interface. -6. Log in to Grafana. -7. Select your Managed Inference dashboard from the [list of your preconfigured dashboards](/cockpit/how-to/access-grafana-and-managed-dashboards/) to visualize your metrics. \ No newline at end of file diff --git a/pages/managed-inference/index.mdx b/pages/managed-inference/index.mdx deleted file mode 100644 index 5fa1485387..0000000000 --- a/pages/managed-inference/index.mdx +++ /dev/null @@ -1,60 +0,0 @@ ---- -title: Managed Inference Documentation -description: Dive into Scaleway Managed Inference with our quickstart guides, how-tos, tutorials and more. 
---- - - - -## Getting Started - - - - - - - - - - -## Changelog - - diff --git a/pages/managed-inference/menu.ts b/pages/managed-inference/menu.ts deleted file mode 100644 index c94ddd3bf6..0000000000 --- a/pages/managed-inference/menu.ts +++ /dev/null @@ -1,99 +0,0 @@ -export const managedInferenceMenu = { - items: [ - { - label: 'Overview', - slug: '../managed-inference', - }, - { - label: 'Concepts', - slug: 'concepts', - }, - { - label: 'Quickstart', - slug: 'quickstart', - }, - { - label: 'FAQ', - slug: 'faq', - }, - { - items: [ - { - label: 'Deploy a model', - slug: 'create-deployment', - }, - { - label: 'Import a custom model', - slug: 'import-custom-model', - }, - { - label: 'Change the model of a deployment', - slug: 'change-model', - }, - { - label: 'Monitor a deployment', - slug: 'monitor-deployment', - }, - { - label: 'Configure autoscaling', - slug: 'configure-autoscaling', - }, - { - label: 'Manage access to a deployment', - slug: 'manage-allowed-ips', - }, - { - label: - 'Use your Managed Inference deployment with a Private Network', - slug: 'managed-inference-with-private-network', - }, - { - label: 'Delete a deployment', - slug: 'delete-deployment', - }, - ], - label: 'How to', - slug: 'how-to', - }, - { - items: [ - { - label: 'Managed Inference API Reference', - slug: 'https://www.scaleway.com/en/developers/api/managed-inference/', - }, - ], - label: 'API/CLI', - slug: 'api-cli', - }, - { - items: [ - { - label: - "Data, privacy, and security for Scaleway's AI services", - slug: 'data-privacy-security-scaleway-ai-services', - }, - { - label: 'OpenAI API compatibility', - slug: 'openai-compatibility', - }, - { - label: 'Supported models in Managed Inference', - slug: 'supported-models', - }, - { - label: - 'Support for function calling in Scaleway Managed Inference', - slug: 'function-calling-support', - }, - { - label: 'Managed Inference model catalog', - slug: 'model-catalog', - }, - ], - label: 'Additional Content', - slug: 'reference-content', - }, - ], - label: 'Managed Inference', - slug: 'managed-inference', -} diff --git a/pages/managed-inference/quickstart.mdx b/pages/managed-inference/quickstart.mdx deleted file mode 100644 index dfcf4d7349..0000000000 --- a/pages/managed-inference/quickstart.mdx +++ /dev/null @@ -1,101 +0,0 @@ ---- -title: Managed Inference - Quickstart -description: Start with Scaleway Managed Inference for secure, scalable AI model deployment in Europe's premier platform. Privacy-focused, fully managed. -tags: -dates: - validation: 2025-07-21 ---- -import Requirements from '@macros/iam/requirements.mdx' - - -Scaleway Managed Inference is the first European Managed Inference platform on the market. It is a scalable and secure inference engine for Large Language Models (LLMs). - -Scaleway Managed Inference is a fully managed service that allows you to serve generative AI models in a production environment. -With Scaleway Managed Inference, you can easily deploy, manage, and scale LLMs without worrying about the underlying infrastructure. - -Here are some of the key features of Scaleway Managed Inference: - -* **Easy deployment**: Deploy state-of-the-art open weights LLMs with just a few clicks. Scaleway Managed Inference provides a simple and intuitive interface for generating dedicated endpoints. -* **Security**: Scaleway provides [a secure environment](/managed-inference/reference-content/data-privacy-security-scaleway-ai-services/) to run your models. 
Our platform is built on top of a secure architecture, and we use state-of-the-art cloud security. -* **Complete data privacy**: [No storage](/managed-inference/reference-content/data-privacy-security-scaleway-ai-services/#data-storage-policies) or third-party access to your data (prompt or responses), to ensure it remains exclusively yours. -* **Interoperability**: Scaleway Managed Inference was designed as a drop-in [replacement for the OpenAI APIs](/managed-inference/reference-content/openai-compatibility/), for a seamless transition on your applications already using its libraries. - -## Console overview - -Discover the Managed Inference interface on the Scaleway console. - - - - - - A Scaleway account logged into the [console](https://console.scaleway.com) - - [Owner](/iam/concepts/#owner) status or [IAM permissions](/iam/concepts/#permission) allowing you to perform actions in the intended Organization - -## How to create a Managed Inference deployment - -1. Navigate to the **AI** section of the [Scaleway console](https://console.scaleway.com/), and select **Managed Inference** from the side menu to access the Managed Inference dashboard. -2. From the drop-down menu, select the geographical region where you want to create your deployment. -3. Click **Create deployment** to launch the deployment creation wizard. -4. Provide the necessary information: - - Select the desired model and the quantization to use for your deployment [from the available options](/managed-inference/reference-content/). - - Scaleway Managed Inference allows you to deploy various AI models, either from the Scaleway catalog or by importing a custom model. For detailed information about supported models, visit our [Supported models in Managed Inference](/managed-inference/reference-content/supported-models/) documentation. - - - Some models may require acceptance of an end-user license agreement (EULA). If prompted, review the terms and conditions and accept the license accordingly. - - - Choose the geographical **region** for the deployment. - - Select a node type, the GPU Instance that will be used with your deployment. - - Choose the number of nodes for your deployment. Note that this feature is currently in [Public Beta](https://www.scaleway.com/en/betas/). - - High availability is only guaranteed with two or more nodes. - -5. Enter a **name** for the deployment, along with optional tags to aid in organization. -6. Configure the **network** settings for the deployment: - - Enable **Private Network** for secure communication and restricted availability within Private Networks. Choose an existing Private Network from the drop-down list, or create a new one. - - Enable **Public Network** to access resources via the public Internet. API key protection is enabled by default. - - - Enabling both private and public networks will result in two distinct endpoints (public and private) for your deployment. - - Deployments must have at least one endpoint, either public or private. - -7. Click **Create deployment** to launch the deployment process. Once the deployment is ready, it will be listed among your deployments. - -## How to access a Managed Inference deployment - -Managed Inference deployments have authentication enabled by default. As such, your endpoints expect a secret key generated with Scaleway's Identity and Access Management service (IAM) for authentication. - -1. Click **Managed Inference** in the **AI** section of the side menu. The Managed Inference dashboard displays. -2. 
From the drop-down menu, select the geographical region you want to manage. -3. Click the name of the deployment you wish to access. The deployment's **Overview** page displays. -4. Scroll down to the **Deployment authentication** section and click the **Generate key** button. The token creation wizard displays. -5. Fill in the [required information for API key creation](/iam/how-to/create-api-keys/) and click **Generate API key**. -6. Copy and safely store your credentials before closing the window, as they will not be shown again. - - - You have full control over authentication from the **Security** tab of your deployment. Authentication is enabled by default. - - -## How to interact with Managed Inference - -1. Click **Managed Inference** in the **AI** section of the side menu. The Managed Inference dashboard displays. -2. From the drop-down menu, select the geographical region where your desired deployment was created. -3. Click the name of the deployment you wish to edit. The deployment's **Overview** page displays. -4. Click the **Playground** tab, then **View code** to see code examples in various environments. Copy and paste them into your code editor or terminal. - - - Prompt structure may vary from one model to another. Refer to the specific instructions for use in our [dedicated documentation](/managed-inference/reference-content/). - - -## How to delete a deployment - -1. Click **Managed Inference** in the **AI** section of the [Scaleway console](https://console.scaleway.com) side menu. A list of your deployments displays. -2. From the drop-down menu, select the geographical region where your deployment was created. -3. Click the name of the deployment you wish to delete. -4. Navigate to the **Settings** tab. -5. Click **Delete deployment** at the bottom of the page. -6. Type **DELETE** to confirm and click **Delete deployment**. - - Alternatively, from the Deployments listing, click the icon next to the deployment name you no longer need, and click **Delete**. A pop-up appears. Type **DELETE** to confirm, then click **Delete deployment**. - - - Deleting a deployment is a permanent action that erases all its associated data and resources. - \ No newline at end of file diff --git a/pages/managed-inference/reference-content/data-privacy-security-scaleway-ai-services.mdx b/pages/managed-inference/reference-content/data-privacy-security-scaleway-ai-services.mdx deleted file mode 100644 index 69f1ddef73..0000000000 --- a/pages/managed-inference/reference-content/data-privacy-security-scaleway-ai-services.mdx +++ /dev/null @@ -1,47 +0,0 @@ ---- -title: Data, privacy, and security for Scaleway's AI services -description: Data, privacy, and security for Scaleway's AI services -tags: -dates: - validation: 2025-06-20 - posted: 2024-06-07 ---- - -This page sets out information about **data**, **privacy**, and **security** concerns for Scaleway's AI services. - -## Data security measures by Scaleway AI Services - -Scaleway's AI services implement robust security measures to ensure customer data privacy and integrity. Below is an overview of these practices. - -### Data usage policies - -Scaleway's Managed Inference services adhere to the following data usage policies: - -- **Stateless models**: Scaleway's models do not retain any prompts or generated content after processing. - - Prompts or completions are not stored within the model. - - Data is not used for training, retraining, or improving the base models.
- -- **Data isolation**: Customer data, including prompts, completions, embeddings, and training data, remains isolated and secure: - - Data is not accessible to other Scaleway customers. - - Data is not accessible to the creators of the underlying large language models (LLMs). - - Data is not used to improve Scaleway or third-party products, services, or models. - -- **Service control and hosting**: Scaleway maintains full control over the Managed Inference service, hosting the LLMs on its infrastructure in Europe without interaction with third-party services. - -### Data storage policies - -- **Customer Data**: Inputs and outputs processed during inference are considered customer data. Scaleway does not log this data unless explicitly permitted by the customer through an opt-in feature, which is not yet available. The only exception is when your traffic harms your resource availability (e.g., by generating abnormal HTTP 500 errors), or could represent malicious activity. In this case, we may temporarily store the corresponding HTTP request content to identify and fix root cause issues or any security vulnerability. - -### Data security measures - -- **Hosting:** Models deployed or consumed for inference are hosted in Europe within the data center region specified by the customer. -- **Encryption**: All traffic between the customer and the inference service is encrypted using in-transit TLS encryption to ensure data protection during transmission. -- **Endpoint Security**: Public-facing endpoints are secured with API key tokens. -- **Virtual Private Cloud (VPC)**: The service can be hosted in a Virtual Private Cloud within private subnets. Access to the service can be restricted based on allowed IP ranges. - -### Legal and compliance - -- **GDPR compliance**: Scaleway's AI services comply with the General Data Protection Regulation (GDPR), ensuring that all personal data is processed in accordance with European Union laws. This includes implementing strong data protection measures, maintaining transparency in data processing activities, and ensuring customers' rights are upheld. -- **Extraterritorial law protection**: As a European company, Scaleway's AI services are not subject to extraterritorial laws such as the American Cloud Act. This means that customer data hosted in Europe is protected by European laws, which provide a high level of data sovereignty and security. - -By integrating these compliance measures, Scaleway reinforces its commitment to providing a secure and legally compliant environment for AI services, aligning with the stringent data protection standards expected by our customers. diff --git a/pages/managed-inference/reference-content/function-calling-support.mdx b/pages/managed-inference/reference-content/function-calling-support.mdx deleted file mode 100644 index a00e061bc6..0000000000 --- a/pages/managed-inference/reference-content/function-calling-support.mdx +++ /dev/null @@ -1,53 +0,0 @@ ---- -title: Support for function calling in Scaleway Managed Inference -description: Function calling allows models to connect to external tools. -tags: -dates: - validation: 2025-06-20 - posted: 2024-10-25 ---- - -This page describes the concept of **function calling** to allow models to connect to external tools, and explains how to implement it with Scaleway Managed Inference. - -## What is function calling? - -Function calling allows a Large Language Model (LLM) to interact with external tools or APIs, executing specific tasks based on user requests. 
The LLM identifies the appropriate function, extracts the required parameters, and returns the results as structured data, typically in JSON format. While errors can occur, custom parsers or tools like LlamaIndex and LangChain can help ensure valid results. - -## How to implement function calling in Scaleway Managed Inference - -[This tutorial](/tutorials/building-ai-application-function-calling/) will guide you through the steps of creating a simple flight schedule assistant that can understand natural language queries about flights and return structured information. - -## Models with function calling capabilities - -The following models in Scaleway's Managed Inference library can call tools as per the OpenAI method: - -* meta/llama-3.1-8b-instruct -* meta/llama-3.1-70b-instruct -* meta/llama-3.3-70b-instruct -* mistral/mistral-7b-instruct-v0.3 -* mistral/mistral-nemo-instruct-2407 -* mistral/pixtral-12b-2409 -* nvidia/llama-3.1-nemotron-70b-instruct -* deepseek/deepseek-r1-distill-llama-70b -* deepseek/deepseek-r1-distill-llama-8b - -## Understanding function calling - -Function calling consists of three main components: -- **Tool definitions**: JSON schemas that describe available functions and their parameters -- **Tool selection**: Automatic or manual selection of appropriate functions based on user queries -- **Tool execution**: Processing function calls and handling their responses - -The workflow typically follows these steps: -1. Define available tools using JSON schema -2. Send system and user query along with tool definitions -3. Process model's function selection -4. Execute selected functions -5. Return results to model for final response - -## Further resources - -For more information about function calling and advanced implementations, refer to these resources: - -- [OpenAI Function Calling Guide](https://platform.openai.com/docs/guides/function-calling) -- [JSON Schema Specification](https://json-schema.org/specification) diff --git a/pages/managed-inference/reference-content/index.mdx b/pages/managed-inference/reference-content/index.mdx deleted file mode 100644 index fee56936c0..0000000000 --- a/pages/managed-inference/reference-content/index.mdx +++ /dev/null @@ -1,4 +0,0 @@ ---- -title: Managed Inference - Additional content -description: Managed Inference - Additional content ---- \ No newline at end of file diff --git a/pages/managed-inference/reference-content/model-catalog.mdx b/pages/managed-inference/reference-content/model-catalog.mdx deleted file mode 100644 index 238f7e6801..0000000000 --- a/pages/managed-inference/reference-content/model-catalog.mdx +++ /dev/null @@ -1,529 +0,0 @@ ---- -title: Managed Inference model catalog -description: Deploy your own model with Scaleway Managed Inference. Privacy-focused, fully managed. -tags: -dates: - validation: 2025-10-27 - posted: 2024-04-18 ---- -A quick overview of available models in Scaleway's catalog and their core attributes. Expand any model below to see usage examples, curl commands, and detailed capabilities. - - - For more information about all the models supported in Managed Inference, refer to the [Supported Models in Managed Inference](/managed-inference/reference-content/supported-models/) page. 
- - -## Models technical summary - -| Model name | Provider | Maximum Context length (tokens) | Modalities | Compatible Instances (Max Context in tokens\*) | License \** | -|------------|----------|--------------|------------|-----------|---------| -| [`gpt-oss-120b`](#gpt-oss-120b) | OpenAI | 128k | Text | H100 | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) | -| [`whisper-large-v3`](#whisper-large-v3) | OpenAI | - | Audio transcription | L4, L40S, H100, H100-SXM-2 | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) | -| [`qwen3.5-397b-a17b`](#qwen35-397b-a17b) | Qwen | 250k | Text, Code, Vision | H100-SXM-8 | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) | -| [`qwen3-235b-a22b-instruct-2507`](#qwen3-235b-a22b-instruct-2507) | Qwen | 250k | Text | H100-SXM-2 (40k), H100-SXM-4 | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) | -| [`gemma-3-27b-it`](#gemma-3-27b-it) | Google | 40k | Text, Vision | H100, H100-2 | [Gemma](https://ai.google.dev/gemma/terms) | -| [`llama-3.3-70b-instruct`](#llama-33-70b-instruct) | Meta | 128k | Text | H100 (15k), H100-2 | [Llama 3.3 Community](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct) | -| [`llama-3.1-70b-instruct`](#llama-31-70b-instruct) | Meta | 128k | Text | H100 (15k), H100-2 | [Llama 3.1 Community](https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct/blob/main/LICENSE) | -| [`llama-3.1-8b-instruct`](#llama-31-8b-instruct) | Meta | 128k | Text | L4 (90k), L40S, H100, H100-2 | [Llama 3.1 Community](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct/blob/main/LICENSE) | -| [`llama-3-70b-instruct`](#llama-3-70b-instruct) | Meta | 8k | Text | H100, H100-2 | [Llama 3 Community](https://huggingface.co/meta-llama/Meta-Llama-3-8B/blob/main/LICENSE) | -| [`llama-3.1-nemotron-70b-instruct`](#llama-31-nemotron-70b-instruct) | Nvidia | 128k | Text | H100 (15k), H100-2 | [Llama 3.1 Community](https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct/blob/main/LICENSE) | -| [`deepseek-r1-distill-70b`](#deepseek-r1-distill-llama-70b) | Deepseek | 128k | Text | H100 (13k), H100-2 | [MIT](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B/blob/main/LICENSE) and [Llama 3.3 Community](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct/blob/main/LICENSE) | -| [`deepseek-r1-distill-8b`](#deepseek-r1-distill-llama-8b) | Deepseek | 128k | Text | L4 (90k), L40S, H100, H100-2 | [MIT](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B/blob/main/LICENSE) and [Llama 3.1 Community](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct/blob/main/LICENSE) | -| [`mistral-7b-instruct-v0.3`](#mistral-7b-instruct-v03) | Mistral | 32k | Text | L4, L40S, H100, H100-2 | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) | -| [`mistral-large-3-675b-instruct-2512`](#mistral-large-3-675b-instruct-2512) | Mistral | 250k | Text, Vision | H100-SXM-8 (180k) | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) | -| [`mistral-small-3.2-24b-instruct-2506`](#mistral-small-32-24b-instruct-2506) | Mistral | 128k | Text, Vision | H100, H100-2 | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) | -| [`mistral-small-3.1-24b-instruct-2503`](#mistral-small-31-24b-instruct-2503) | Mistral | 128k | Text, Vision | H100, H100-2 | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) | -| [`mistral-small-24b-instruct-2501`](#mistral-small-24b-instruct-2501) | Mistral | 32k | Text | L40S (20k), H100, H100-2 | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) | -| 
[`voxtral-small-24b-2507`](#voxtral-small-24b-2507) | Mistral | 32k | Text | H100, H100-2 | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) | -| [`mistral-nemo-instruct-2407`](#mistral-nemo-instruct-2407) | Mistral | 128k | Text | L40S, H100, H100-2 | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) | -| [`mixtral-8x7b-instruct-v0.1`](#mixtral-8x7b-instruct-v01) | Mistral | 32k | Text | H100-2 | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) | -| [`magistral-small-2506`](#magistral-small-2506) | Mistral | 32k | Text | L40S, H100, H100-2 | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) | -| [`devstral-2-123b-instruct-2512`](#devstral-2-123b-instruct-2512) | Mistral | 260k | Text, Code | H100-SXM-2 (75k), H100-SXM-4, H100-SXM-8 | [Modified MIT](https://huggingface.co/mistralai/Devstral-2-123B-Instruct-2512/blob/main/LICENSE) | -| [`devstral-small-2505`](#devstral-small-2505) | Mistral | 128k | Text | H100, H100-2 | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) | -| [`pixtral-12b-2409`](#pixtral-12b-2409) | Mistral | 128k | Text, Vision | L40S (50k), H100, H100-2 | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) | -| [`molmo-72b-0924`](#molmo-72b-0924) | Allen AI | 50k | Text, Vision | H100-2 | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) and [Twonyi Qianwen license](https://huggingface.co/Qwen/Qwen2-72B/blob/main/LICENSE)| -| [`holo2-30b-a3b`](#holo2-30b-a3b) | H | 22k | Text, Vision | H100-SXM-2 | [CC-BY-NC-4.0](https://spdx.org/licenses/CC-BY-NC-4.0)| -| [`qwen3-embedding-8b`](#qwen3-embedding-8b) | Qwen | 32k | Embeddings | L4, L40S, H100, H100-2 | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) | -| [`qwen3-coder-30b-a3b-instruct`](#qwen3-coder-30b-a3b-instruct) | Qwen | 128k | Code | L40S, H100, H100-2 | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) | -| [`qwen2.5-coder-32b-instruct`](#qwen25-coder-32b-instruct) | Qwen | 32k | Code | H100, H100-2 | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) | -| [`bge-multilingual-gemma2`](#bge-multilingual-gemma2) | BAAI | 8k | Embeddings | L4, L40S, H100, H100-2 | [Gemma](https://ai.google.dev/gemma/terms) | -| [`sentence-t5-xxl`](#sentence-t5-xxl) | Sentence transformers | 512 | Embeddings | L4 | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) | - -\*Maximum context length is only mentioned when instances VRAM size limits context length. Otherwise, maximum context length is the one defined by the model. -\**Licences which are not open-weight and may restrict commercial usage (such as `CC-BY-NC-4.0`), do not apply to usage through Scaleway Products due to existing partnerships between Scaleway and the corresponding providers. Original licences are provided for transparency only. 
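The model strings listed in the detail sections below (for example `meta/llama-3.3-70b-instruct:fp8`) are what a client typically passes as the `model` parameter when querying a deployment through its OpenAI-compatible endpoint. A brief sketch, with a placeholder endpoint and key and streaming enabled; the exact model string to use is the one shown for your own deployment.

```python
from openai import OpenAI

# Placeholders: use your deployment endpoint and an IAM secret key.
client = OpenAI(base_url="https://<your-deployment-endpoint>/v1", api_key="<SCW_SECRET_KEY>")

stream = client.chat.completions.create(
    model="meta/llama-3.3-70b-instruct:fp8",  # illustrative model string from the catalog
    messages=[{"role": "user", "content": "Give me one fact about Paris."}],
    stream=True,  # tokens arrive as they are generated
)

for chunk in stream:
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)
```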
- -## Models feature summary -| Model name | Structured output supported | Function calling | Supported languages | -| --- | --- | --- | --- | -| `gpt-oss-120b` | Yes | Yes | English | -| `whisper-large-v3` | - | - | English, French, German, Chinese, Japanese, Korean and 81 additional languages | -| `qwen3.5-397b-a17b` | Yes | Yes | English, French, German, Chinese, Japanese, Korean and 113 additional languages and dialects | -| `qwen3-235b-a22b-instruct-2507` | Yes | Yes | English, French, German, Chinese, Japanese, Korean and 113 additional languages and dialects | -| `gemma-3-27b-it` | Yes | Partial | English, Chinese, Japanese, Korean and 31 additional languages | -| `llama-3.3-70b-instruct` | Yes | Yes | English, German, French, Italian, Portuguese, Hindi, Spanish, and Thai | -| `llama-3.1-70b-instruct` | Yes | Yes | English, German, French, Italian, Portuguese, Hindi, Spanish, and Thai | -| `llama-3.1-8b-instruct` | Yes | Yes | English, German, French, Italian, Portuguese, Hindi, Spanish, and Thai | -| `llama-3-70b-instruct` | Yes | No | English | -| `llama-3.1-nemotron-70b-instruct` | Yes | Yes | English | -| `deepseek-r1-distill-llama-70B` | Yes | Yes | English, Chinese | -| `deepseek-r1-distill-llama-8B` | Yes | Yes | English, Chinese | -| `mistral-7b-instruct-v0.3` | Yes | Yes | English | -| `mistral-large-3-675b-instruct-2512` | Yes | Yes | English, French, German, Spanish, Italian, Portuguese, Dutch, Chinese, Japanese, Korean, Arabic | -| `mistral-small-3.2-24b-instruct-2506` | Yes | Yes | English, French, German, Greek, Hindi, Indonesian, Italian, Japanese, Korean, Malay, Nepali, Polish, Portuguese, Romanian, Russian, Serbian, Spanish, Swedish, Turkish, Ukrainian, Vietnamese, Arabic, Bengali, Chinese, Farsi | -| `mistral-small-3.1-24b-instruct-2503` | Yes | Yes | English, French, German, Greek, Hindi, Indonesian, Italian, Japanese, Korean, Malay, Nepali, Polish, Portuguese, Romanian, Russian, Serbian, Spanish, Swedish, Turkish, Ukrainian, Vietnamese, Arabic, Bengali, Chinese, Farsi | -| `mistral-small-24b-instruct-2501` | Yes | Yes | English, French, German, Dutch, Spanish, Italian, Polish, Portuguese, Chinese, Japanese, Korean | -| `voxtral-small-24b-2507` | Yes | Yes | English, French, German, Dutch, Spanish, Italian, Portuguese, Hindi | -| `mistral-nemo-instruct-2407` | Yes | Yes | English, French, German, Spanish, Italian, Portuguese, Russian, Chinese, Japanese | -| `mixtral-8x7b-instruct-v0.1` | Yes | No | English, French, German, Italian, Spanish | -| `magistral-small-2506` | Yes | Yes | English, French, German, Spanish, Portuguese, Italian, Japanese, Korean, Russian, Chinese, Arabic, Persian, Indonesian, Malay, Nepali, Polish, Romanian, Serbian, Swedish, Turkish, Ukrainian, Vietnamese, Hindi, Bengali | -| `devstral-2-123b-instruct-2512` | Yes | Yes | English | -| `devstral-small-2505` | Yes | Yes | English, French, German, Spanish, Portuguese, Italian, Japanese, Korean, Russian, Chinese, Arabic, Persian, Indonesian, Malay, Nepali, Polish, Romanian, Serbian, Swedish, Turkish, Ukrainian, Vietnamese, Hindi, Bengali | -| `pixtral-12b-2409` | Yes | Yes | English | -| `molmo-72b-0924` | Yes | No | English | -| `holo2-30b-a3b` | Yes | No | English | -| `qwen3-embedding-8b` | No | No | English, French, German, Chinese, Japanese, Korean and 113 additional languages and dialects | -| `qwen3-coder-30b-a3b-instruct` | Yes | Yes | English, French, German, Chinese, Japanese, Korean and 113 additional languages and dialects | -| `qwen2.5-coder-32b-instruct` | Yes | Yes | English, 
French, Spanish, Portuguese, German, Italian, Russian, Chinese, Japanese, Korean, Vietnamese, Thai, Arabic and 16 additional languages. | -| `bge-multilingual-gemma2` | No | No | English, French, Chinese, Japanese, Korean | -| `sentence-t5-xxl` | No | No | English | - - -## Model details - - Despite efforts for accuracy, the possibility of generated text containing inaccuracies or [hallucinations](/managed-inference/concepts/#hallucinations) exists. Always verify the content generated independently. - - -## Multimodal models (Text and Vision) - - - Vision models can understand and analyze images, not generate them. You will use it through the /v1/chat/completions endpoint. - - -### Gemma-3-27b-it -Gemma-3-27b-it is a model developed by Google to perform text processing and image analysis on many languages. -The model was not trained specifically to output function / tool call tokens. Hence function calling is currently supported, but reliability remains limited. - -#### Model names -``` -google/gemma-3-27b-it:bf16 -``` -| Attribute | Value | -|-----------|-------| -| Supports parallel tool calling | No | -| Supported image formats | PNG, JPEG, WEBP, and non-animated GIFs | -| Maximum image resolution (pixels) | 896x896 | -| Token dimension (pixels)| 56x56 | - -- Pan & Scan is not yet supported for Gemma 3 images. This means that high resolution images are currently resized to 896x896 resolution that may generate artifacts and lead to a lower accuracy. - -### Mistral-large-3-675b-instruct-2512 -Mistral-large-3-675b-instruct-2512 is a frontier model, performing among the best open-weight models as of December 2025. It is ideal for agentic workflows and image understanding. - -| Attribute | Value | -|-----------|-------| -| Supports parallel tool calling | Yes | -| Supported images formats | PNG, JPEG, WEBP, and non-animated GIFs | -| Maximum image resolution (pixels) | 1540x1540 | -| Token dimension (pixels)| 28x28 | - -#### Model names -``` -mistral/mistral-large-3-675b-instruct-2512:fp4 -``` - -### Mistral-small-3.2-24b-instruct-2506 -Mistral-small-3.2-24b-instruct-2506 is an improved version of Mistral-small-3.1 which performs better on tool calling. -This model was optimized to have a dense knowledge and faster tokens throughput compared to its size. - -| Attribute | Value | -|-----------|-------| -| Supports parallel tool calling | Yes | -| Supported images formats | PNG, JPEG, WEBP, and non-animated GIFs | -| Maximum image resolution (pixels) | 1540x1540 | -| Token dimension (pixels)| 28x28 | - -#### Model names -``` -mistral/mistral-small-3.2-24b-instruct-2506:fp8 -``` - -### Mistral-small-3.1-24b-instruct-2503 -Mistral-small-3.1-24b-instruct-2503 is a model developed by Mistral to perform text processing and image analysis on many languages. -This model was optimized to have a dense knowledge and faster tokens throughput compared to its size. - -| Attribute | Value | -|-----------|-------| -| Supports parallel tool calling | Yes | -| Supported images formats | PNG, JPEG, WEBP, and non-animated GIFs | -| Maximum image resolution (pixels) | 1540x1540 | -| Token dimension (pixels)| 28x28 | - -#### Model names -``` -mistral/mistral-small-3.1-24b-instruct-2503:bf16 -mistral/mistral-small-3.1-24b-instruct-2503:fp8 -``` - -- Bitmap (or raster) image formats, meaning storing images as grids of individual pixels, are supported. Vector image formats (SVG, PSD) are not supported, neither PDFs nor videos. 
-- Image sizes are limited in the following ways: - - Directly by the maximum context window. As an example, since tokens are squares of 28x28 pixels, the maximum context window taken by a single image is `3025` tokens (i.e. `(1540*1540)/(28*28)`) - - Indirectly by the model accuracy: resolution above 1540x1540 will not increase model output accuracy. Indeed, images above 1540 pixels width or height will be automatically downscaled to fit within 1540x1540 dimensions. Note that image ratio and overall aspect are preserved (images are not cropped, only additionally compressed). - -### Qwen3.5-397b-a17b -Qwen3.5-397b-a17b is a model developed by Qwen to perform text processing, agentic coding, image, and video analysis in several languages. -This model was released as a frontier reasoning model on 16th February 2026. - -| Attribute | Value | -|-----------|-------| -| Supports parallel tool calling | Yes | -| Supported images formats | PNG, JPEG, WEBP, and non-animated GIFs | -| Supported video formats | MP4, MPEG, MOV, OGG and WEBM | -| Maximum image resolution (pixels) | 4096x4096 | -| Token dimension (pixels)| 32x32 | - -#### Model names -``` -qwen/qwen3.5-397b-a17b:int4 -``` - -### Pixtral-12b-2409 -Pixtral is a vision language model introducing a novel architecture: 12B parameter multimodal decoder plus 400M parameter vision encoder. -It can analyze images and offer insights from visual content alongside text. - -| Attribute | Value | -|-----------|-------| -| Supports parallel tool calling | Yes | -| Supported images formats | PNG, JPEG, WEBP, and non-animated GIFs | -| Maximum image resolution (pixels) | 1024x1024 | -| Token dimension (pixels)| 16x16 | -| Maximum images per request | 12 | - -#### Model name -``` -mistral/pixtral-12b-2409:bf16 -``` -### Holo2-30b-a3b -Holo2 30B is a text and vision model optimized to analyze graphical user interfaces, such as web browsers or software, and take actions. - -| Attribute | Value | -|-----------|-------| -| Supports parallel tool calling | Yes | -| Supported images formats | PNG, JPEG, WEBP, and non-animated GIFs | -| Token dimension (pixels)| 16x16 | - -#### Model name -``` -hcompany/holo2-30b-a3b:bf16 -``` - -### Molmo-72b-0924 -Molmo 72B is the powerhouse of the Molmo family of multimodal models developed by the renowned research lab Allen Institute for AI. -Vision-language models like Molmo can analyze an image and offer insights from visual content alongside text. This multimodal functionality creates new opportunities for applications that need both visual and textual comprehension. - -#### Model name -``` -allenai/molmo-72b-0924:fp8 -``` - -## Multimodal models (Text and Audio) - -### Voxtral-small-24b-2507 -Voxtral-small-24b-2507 is a model developed by Mistral to perform text processing and audio analysis in many languages. -This model was optimized to enable transcription in many languages while keeping conversational capabilities (translations, classification, etc.) - -| Attribute | Value | -|-----------|-------| -| Supports parallel tool calling | Yes | -| Supported audio formats | WAV and MP3 | -| Audio chunk duration | 30 seconds | -| Token duration (audio)| 80ms | -| Maximum transcription duration| 30 minutes | -| Maximum understanding duration| 40 minutes | - -#### Model names -``` -mistral/voxtral-small-24b-2507:bf16 -mistral/voxtral-small-24b-2507:fp8 -``` - -- Mono and stereo audio formats are supported. For stereo formats, both left and right channels are merged before being processed.
-- Audio files are processed in 30 seconds chunks: - - If audio sent is less than 30 seconds, the rest of the chunk will be considered silent. - - 80ms is equal to 1 input token - -## Audio transcription models - -### Whisper-large-v3 -Whisper-large-v3 is a model developed by OpenAI to transcribe audio in many languages. -This model is optimized for audio transcription tasks. - -| Attribute | Value | -|-----------|-------| -| Supported audio formats | WAV and MP3 | -| Audio chunk duration | 30 seconds | - -#### Model names -``` -openai/whisper-large-v3:bf16 -``` - -- Mono and stereo audio formats are supported. For stereo formats, left and right channels are merged before being processed. -- Audio files are processed in 30-second chunks: - - If audio sent is less than 30 seconds, the rest of the chunk will be considered silent. - -## Text models - -### Qwen3-235b-a22b-instruct-2507 -Released July 23, 2025, Qwen 3 235B A22B is an open-weight model, competitive in multiple benchmarks (such as [LM Arena for text use cases](https://lmarena.ai/leaderboard)) compared to Gemini 2.5 Pro and GPT4.5. - -| Attribute | Value | -|-----------|-------| -| Supports parallel tool calling | Yes | - - - -#### Model name -``` -openai/gpt-oss-120b:fp4 -``` - -### Gpt-oss-120b -Released August 5, 2025, GPT OSS 120B is an open-weight model providing significant throughput performance and reasoning capabilities. -Currently, this model should be used through Responses API, as Chat Completion does not yet support tool calling for this model. - -| Attribute | Value | -|-----------|-------| -| Supports parallel tool calling | Yes | - -#### Model name -``` -openai/gpt-oss-120b:fp4 -``` - -### Llama-3.3-70b-instruct -Released December 6, 2024, Meta’s Llama 3.3 70b is a fine-tune of the [Llama 3.1 70b](/managed-inference/reference-content/model-catalog/#llama-31-70b-instruct) model. -This model is still text-only (text in/text out). However, Llama 3.3 was designed to approach the performance of Llama 3.1 405B on some applications. - -| Attribute | Value | -|-----------|-------| -| Supports parallel tool calling | Yes | - -#### Model name -``` -meta/llama-3.3-70b-instruct:fp8 -meta/llama-3.3-70b-instruct:bf16 -``` - -### Llama-3.1-70b-instruct -Released July 23, 2024, Meta’s Llama 3.1 is an iteration of the open-access Llama family. -Llama 3.1 was designed to match the best proprietary models and outperform many of the available open source on common industry benchmarks. - -| Attribute | Value | -|-----------|-------| -| Supports parallel tool calling | Yes | - -#### Model names -``` -meta/llama-3.1-70b-instruct:fp8 -meta/llama-3.1-70b-instruct:bf16 -``` - -### Llama-3.1-8b-instruct -Released July 23, 2024, Meta’s Llama 3.1 is an iteration of the open-access Llama family. -Llama 3.1 was designed to match the best proprietary models and outperform many of the available open source on common industry benchmarks. - -| Attribute | Value | -|-----------|-------| -| Supports parallel tool calling | Yes | - -#### Model names -``` -meta/llama-3.1-8b-instruct:fp8 -meta/llama-3.1-8b-instruct:bf16 -``` - -### Llama-3-70b-instruct -Meta’s Llama 3 is an iteration of the open-access Llama family. -Llama 3 was designed to match the best proprietary models, enhanced by community feedback for greater utility and responsibly spearheading the deployment of LLMs. 
-
-### Llama-3.3-70b-instruct
-Released December 6, 2024, Meta’s Llama 3.3 70b is a fine-tune of the [Llama 3.1 70b](/managed-inference/reference-content/model-catalog/#llama-31-70b-instruct) model.
-This model is still text-only (text in/text out). However, Llama 3.3 was designed to approach the performance of Llama 3.1 405B in some applications.
-
-| Attribute | Value |
-|-----------|-------|
-| Supports parallel tool calling | Yes |
-
-#### Model names
-```
-meta/llama-3.3-70b-instruct:fp8
-meta/llama-3.3-70b-instruct:bf16
-```
-
-### Llama-3.1-70b-instruct
-Released July 23, 2024, Meta’s Llama 3.1 is an iteration of the open-access Llama family.
-Llama 3.1 was designed to match the best proprietary models and outperform many of the available open-source models on common industry benchmarks.
-
-| Attribute | Value |
-|-----------|-------|
-| Supports parallel tool calling | Yes |
-
-#### Model names
-```
-meta/llama-3.1-70b-instruct:fp8
-meta/llama-3.1-70b-instruct:bf16
-```
-
-### Llama-3.1-8b-instruct
-Released July 23, 2024, Meta’s Llama 3.1 is an iteration of the open-access Llama family.
-Llama 3.1 was designed to match the best proprietary models and outperform many of the available open-source models on common industry benchmarks.
-
-| Attribute | Value |
-|-----------|-------|
-| Supports parallel tool calling | Yes |
-
-#### Model names
-```
-meta/llama-3.1-8b-instruct:fp8
-meta/llama-3.1-8b-instruct:bf16
-```
-
-### Llama-3-70b-instruct
-Meta’s Llama 3 is an iteration of the open-access Llama family.
-Llama 3 was designed to match the best proprietary models, enhanced by community feedback for greater utility, and to responsibly spearhead the deployment of LLMs.
-With a commitment to open-source principles, this release marks the beginning of a multilingual, multimodal future for Llama 3, pushing the boundaries of reasoning and coding capabilities.
-
-#### Model name
-```
-meta/llama-3-70b-instruct:fp8
-```
-
-### Llama-3.1-Nemotron-70b-instruct
-Introduced October 14, 2024, NVIDIA's Nemotron 70B Instruct is a specialized version of the Llama 3.1 model designed to follow complex instructions.
-NVIDIA employed Reinforcement Learning from Human Feedback (RLHF) to fine-tune the model’s ability to generate relevant and informative responses.
-
-#### Model name
-```
-nvidia/llama-3.1-nemotron-70b-instruct:fp8
-```
-
-### DeepSeek-R1-Distill-Llama-70B
-Released January 21, 2025, DeepSeek’s R1 Distilled Llama 70B is a distilled version of the Llama model family based on DeepSeek R1.
-DeepSeek R1 Distill Llama 70B is designed to improve the performance of Llama models on reasoning use cases such as mathematics and coding tasks.
-
-| Attribute | Value |
-|-----------|-------|
-| Supports parallel tool calling | No |
-
-#### Model names
-```
-deepseek/deepseek-r1-distill-llama-70b:fp8
-deepseek/deepseek-r1-distill-llama-70b:bf16
-```
-
-### DeepSeek-R1-Distill-Llama-8B
-Released January 21, 2025, DeepSeek’s R1 Distilled Llama 8B is a distilled version of the Llama model family based on DeepSeek R1.
-DeepSeek R1 Distill Llama 8B is designed to improve the performance of Llama models on reasoning use cases such as mathematics and coding tasks.
-
-#### Model names
-```
-deepseek/deepseek-r1-distill-llama-8b:fp8
-deepseek/deepseek-r1-distill-llama-8b:bf16
-```
-
-### Mixtral-8x7b-instruct-v0.1
-Mixtral-8x7b-instruct-v0.1, developed by Mistral, is tailored for instructional platforms and virtual assistants.
-Trained on vast instructional datasets, it provides clear and concise instructions across various domains, enhancing user learning experiences.
-
-#### Model names
-```
-mistral/mixtral-8x7b-instruct-v0.1:fp8
-mistral/mixtral-8x7b-instruct-v0.1:bf16
-```
-
-### Mistral-7b-instruct-v0.3
-The first dense model released by Mistral AI, perfect for experimentation, customization, and quick iteration. At the time of its release, it matched the capabilities of models up to 30B parameters.
-This model is open-weight and distributed under the Apache 2.0 license.
-
-#### Model name
-```
-mistral/mistral-7b-instruct-v0.3:bf16
-```
-
-### Mistral-small-24b-instruct-2501
-Mistral Small 24B Instruct is a state-of-the-art transformer model of 24B parameters, built by Mistral.
-This model is open-weight and distributed under the Apache 2.0 license.
-
-#### Model names
-```
-mistral/mistral-small-24b-instruct-2501:fp8
-mistral/mistral-small-24b-instruct-2501:bf16
-```
-
-### Mistral-nemo-instruct-2407
-Mistral Nemo is a state-of-the-art transformer model of 12B parameters, built by Mistral in collaboration with NVIDIA.
-This model is open-weight and distributed under the Apache 2.0 license.
-It was trained on a large proportion of multilingual and code data.
-
-| Attribute | Value |
-|-----------|-------|
-| Supports parallel tool calling | Yes |
-
-#### Model name
-```
-mistral/mistral-nemo-instruct-2407:fp8
-```
-
-### Magistral-small-2506
-Magistral Small is a reasoning model, optimized to perform well on reasoning tasks such as academic or scientific questions.
-It is well suited for complex tasks requiring multiple reasoning steps.
-
-#### Model names
-```
-mistral/magistral-small-2506:fp8
-mistral/magistral-small-2506:bf16
-```
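Several of the chat models above list support for parallel tool calling, meaning a single response may contain more than one tool call. The sketch below shows what such a request can look like with an OpenAI-compatible client; the base URL, API key placeholder, model identifier, and the `get_weather` tool are illustrative assumptions, not values taken from this catalog.

```python
from openai import OpenAI

# Endpoint, API key, and model name are illustrative assumptions.
client = OpenAI(base_url="https://api.scaleway.ai/v1", api_key="SCW_SECRET_KEY")

tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Get the current weather for a city.",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
    },
}]

completion = client.chat.completions.create(
    model="llama-3.3-70b-instruct",
    messages=[{"role": "user", "content": "Compare the weather in Paris and Lille."}],
    tools=tools,
)

# With parallel tool calling, this list can contain several calls at once.
for call in completion.choices[0].message.tool_calls or []:
    print(call.function.name, call.function.arguments)
```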
-
-## Code models
-
-### Devstral-2-123b-instruct-2512
-Devstral 2 is a state-of-the-art coding model as of December 2025. It excels at using tools to explore codebases, edit multiple files, and power software engineering agents.
-
-#### Model name
-```
-mistral/devstral-2-123b-instruct-2512:fp8
-```
-
-### Devstral-small-2505
-Devstral Small is a fine-tune of Mistral Small 3.1, optimized to perform software engineering tasks.
-It is a good fit to be used as a coding agent, for instance in an IDE.
-
-#### Model names
-```
-mistral/devstral-small-2505:fp8
-mistral/devstral-small-2505:bf16
-```
-
-### Qwen3-coder-30b-a3b-instruct
-Qwen3-coder is an improved version of Qwen2.5 with better accuracy and throughput.
-Thanks to its a3b architecture, only a subset of its weights is activated for a given generation, leading to much faster input and output token processing, which is ideal for code completion.
-
-| Attribute | Value |
-|-----------|-------|
-| Supports parallel tool calling | Yes |
-
-#### Model name
-```
-qwen/qwen3-coder-30b-a3b-instruct:fp8
-```
-
-### Qwen2.5-coder-32b-instruct
-Qwen2.5-coder is your intelligent programming assistant, familiar with more than 40 programming languages.
-With Qwen2.5-coder deployed at Scaleway, your company can benefit from code generation, AI-assisted code repair, and code reasoning.
-
-| Attribute | Value |
-|-----------|-------|
-| Supports parallel tool calling | No |
-
-#### Model name
-```
-qwen/qwen2.5-coder-32b-instruct:int8
-```
-
-## Embeddings models
-
-### Qwen3-embedding-8b
-Qwen/Qwen3-Embedding-8B is a state-of-the-art embedding model, ranking 3rd on the MTEB leaderboard as of November 2025 and supporting a custom number of dimensions between 32 and 4096.
-
-| Attribute | Value |
-|-----------|-------|
-| Embedding dimensions (maximum) | 4096 |
-| Embedding dimensions (minimum) | 32 |
-| Matryoshka embedding | Yes |
-
-The term [Matryoshka embeddings](https://huggingface.co/blog/matryoshka) refers to embeddings trained on multiple numbers of dimensions. As a result, the dimensions of the resulting vector are sorted by most meaningful first. For example, a 4096-dimension vector can be truncated to its first 768 dimensions and used directly.
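Truncating a Matryoshka embedding simply means keeping the first N components and re-normalizing. The sketch below is illustrative; it assumes you already have a full-size embedding (for example, 4096 dimensions from the model above) as a Python list of floats.

```python
import math

def truncate_embedding(vector: list[float], dims: int) -> list[float]:
    """Keep the first `dims` components of a Matryoshka embedding
    and re-normalize to unit length (illustrative sketch)."""
    truncated = vector[:dims]
    norm = math.sqrt(sum(x * x for x in truncated))
    return [x / norm for x in truncated]

full = [0.12, -0.03, 0.44, 0.08]          # stand-in for a 4096-dim embedding
small = truncate_embedding(full, dims=2)   # e.g. keep the first 768 in practice
print(small)
```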
-
-### Bge-multilingual-gemma2
-BGE-Multilingual-Gemma2 tops the [MTEB leaderboard](https://huggingface.co/spaces/mteb/leaderboard), scoring the number one spot in French and Polish, and number seven in English (as of Q4 2024).
-As its name suggests, the model’s training data spans a broad range of languages, including English, Chinese, Polish, French, and more.
-
-| Attribute | Value |
-|-----------|-------|
-| Embedding dimensions | 3584 |
-| Matryoshka embedding | No |
-
-#### Model name
-```
-baai/bge-multilingual-gemma2:fp32
-```
-
-### Sentence-t5-xxl
-The Sentence-T5-XXL model represents a significant evolution in sentence embeddings, building on the robust foundation of the Text-To-Text Transfer Transformer (T5) architecture.
-Designed for performance in various language processing tasks, Sentence-T5-XXL leverages the strengths of T5's encoder-decoder structure to generate high-dimensional vectors that encapsulate rich semantic information.
-This model has been meticulously tuned for tasks such as text classification, semantic similarity, and clustering, making it a useful tool in the RAG (Retrieval-Augmented Generation) framework. It excels in sentence similarity tasks, but its performance in semantic search tasks is less optimal.
-
-| Attribute | Value |
-|-----------|-------|
-| Embedding dimensions | 768 |
-| Matryoshka embedding | No |
-
-#### Model name
-```
-sentence-transformers/sentence-t5-xxl:fp32
-```
diff --git a/pages/organizations-and-projects/additional-content/organization-quotas.mdx b/pages/organizations-and-projects/additional-content/organization-quotas.mdx
index 380a14ddd5..0f940b44f3 100644
--- a/pages/organizations-and-projects/additional-content/organization-quotas.mdx
+++ b/pages/organizations-and-projects/additional-content/organization-quotas.mdx
@@ -181,11 +181,11 @@ At Scaleway, quotas are applicable per [Organization](/iam/concepts/#organizatio
 | B300-SXM-4-288G | To use this product, you must [contact our Support team](https://console.scaleway.com/support/create). | To use this product, you must [contact our Support team](https://console.scaleway.com/support/create). |
 | B300-SXM-8-288G | To use this product, you must [contact our Support team](https://console.scaleway.com/support/create). | To use this product, you must [contact our Support team](https://console.scaleway.com/support/create). |

-## Managed Inference
+## Generative APIs - Dedicated Deployment

-Managed Inference Deployments are limited to a maximum number of nodes, depending on the node types provisioned.
+Generative APIs - Dedicated Deployments are limited to a maximum number of nodes, depending on the node types provisioned.

 [Contact our Support team](https://console.scaleway.com/support/create) if you want to increase your quotas further.

@@ -196,16 +196,16 @@ Managed Inference Deployments are limited to a maximum number of nodes, dependin
 | Number of H100/H100-2 Nodes | To use this product, you must [validate your identity](/account/how-to/verify-identity/). | 3 |
 | Number of L4/L40S Nodes | To use this product, you must [validate your identity](/account/how-to/verify-identity/). | 5 |

-## Generative APIs
+## Generative APIs - Serverless

-Generative APIs are rate limited based on:
+Generative APIs - Serverless are rate limited based on:
 - Tokens per minute (total input and output tokens)
 - Requests per minute
 - Concurrent requests (total active HTTP sessions at the same time)

-Note that requests performed through [Batches API](https://console.scaleway.com/generative-api/batches) do not have a rate limit and are billed with a [-50% discount compared to standard model prices](https://www.scaleway.com/en/pricing/model-as-a-service/#generative-apis).
+Note that requests performed through [Batches API](https://console.scaleway.com/generative-api/batches) do not have a rate limit and are billed with a [-50% discount compared to standard model prices](https://www.scaleway.com/en/pricing/model-as-a-service/).

 [Contact our Support team](https://console.scaleway.com/support/create) if you want to increase your quotas above these limits.
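When a model's requests-per-minute or tokens-per-minute quota is exceeded, the API answers with HTTP 429. A common client-side pattern is to retry with exponential backoff; the sketch below is illustrative and assumes the OpenAI Python client pointed at a Generative APIs endpoint, with the base URL, API key placeholder, and model name as assumptions.

```python
import time
from openai import OpenAI, RateLimitError

# Endpoint, API key, and model name are placeholders.
client = OpenAI(base_url="https://api.scaleway.ai/v1", api_key="SCW_SECRET_KEY")

def chat_with_backoff(prompt: str, max_retries: int = 5) -> str:
    delay = 1.0
    for attempt in range(max_retries):
        try:
            completion = client.chat.completions.create(
                model="llama-3.1-8b-instruct",
                messages=[{"role": "user", "content": prompt}],
            )
            return completion.choices[0].message.content
        except RateLimitError:
            # HTTP 429: quota exceeded, wait and retry with exponential backoff
            time.sleep(delay)
            delay *= 2
    raise RuntimeError("Still rate limited after retries")

print(chat_with_backoff("Hello!"))
```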
diff --git a/pages/use-cases/build-and-run-ai/index.mdx b/pages/use-cases/build-and-run-ai/index.mdx
index 521a3831c0..c162ac2fcc 100644
--- a/pages/use-cases/build-and-run-ai/index.mdx
+++ b/pages/use-cases/build-and-run-ai/index.mdx
@@ -15,11 +15,11 @@ Scaleway supports AI development and deployment with powerful compute resources
     url="/generative-apis/reference-content/integrate-with-n8n/"
   />
diff --git a/pages/vpc/reference-content/getting-most-private-networks.mdx b/pages/vpc/reference-content/getting-most-private-networks.mdx
index 05be1309e5..4ee5363028 100644
--- a/pages/vpc/reference-content/getting-most-private-networks.mdx
+++ b/pages/vpc/reference-content/getting-most-private-networks.mdx
@@ -117,7 +117,7 @@ Different types of Scaleway resources may have different requirements and possib
 | Compatible with private IPv6 | Yes | Yes | Yes | Yes | No |
 | Compatible with reserved IPs | Yes | Yes | No | No | No |
 | Additional information | -- | [Paid-for feature](https://www.scaleway.com/en/pricing/elastic-metal/#network) | [Paid-for feature](https://www.scaleway.com/en/pricing/apple-silicon/) | PN cannot be changed after cluster creation | Must have at least one of private and/or public endpoint |
-| Documentation | [Go](/instances/how-to/use-private-networks/) | [Go](/elastic-metal/how-to/use-private-networks/) | [Go](/apple-silicon/how-to/use-private-networks/) | [Go](/kubernetes/reference-content/secure-cluster-with-private-network/#why-have-a-private-network-for-your-kubernetes-kapsule-cluster) | [Go](/managed-inference/how-to/managed-inference-with-private-network/) |
+| Documentation | [Go](/instances/how-to/use-private-networks/) | [Go](/elastic-metal/how-to/use-private-networks/) | [Go](/apple-silicon/how-to/use-private-networks/) | [Go](/kubernetes/reference-content/secure-cluster-with-private-network/#why-have-a-private-network-for-your-kubernetes-kapsule-cluster) | [Go](/generative-apis/how-to/dedicated-deployment-with-private-network/) |

 | | Managed Database | Managed Database for Redis™ | Public Gateways | Load Balancer | Serverless Functions & Containers |
diff --git a/tutorials/building-ai-application-function-calling/index.mdx b/tutorials/building-ai-application-function-calling/index.mdx
index a96e67a1a8..52980ad33c 100644
--- a/tutorials/building-ai-application-function-calling/index.mdx
+++ b/tutorials/building-ai-application-function-calling/index.mdx
@@ -3,7 +3,6 @@ title: Get started with agentic AI - building a flight assistant with function c
 description: Learn how to implement function calling in your applications using a practical flight schedule example.
 tags: AI function-calling LLM python structured-data
 products:
-  - managed-inference
   - generative-apis
 hero: assets/function-calling.webp
 dates:
@@ -28,7 +27,7 @@ This tutorial will guide you through creating a simple flight schedule assistant
 - A Scaleway account logged into the [console](https://console.scaleway.com)
 - Python 3.7 or higher
 - An API key from Scaleway [Identity and Access Management](/iam/)
-- Access to Scaleway [Generative APIs](/generative-apis/quickstart/) or to Scaleway [Managed Inference](/managed-inference/quickstart/)
+- Access to Scaleway [Generative APIs](/generative-apis/quickstart/)
 - The `openai` Python library installed

 ## Understanding function calling
diff --git a/tutorials/how-to-implement-rag/index.mdx b/tutorials/how-to-implement-rag/index.mdx
index 929bc5836f..005df9669d 100644
--- a/tutorials/how-to-implement-rag/index.mdx
+++ b/tutorials/how-to-implement-rag/index.mdx
@@ -3,7 +3,7 @@ title: Implementing Retrieval-Augmented Generation (RAG) with LangChain and Scal
 description: Master Retrieval-Augmented Generation (RAG) with LangChain and Scaleway Managed Inference
 tags: inference managed postgresql pgvector object storage RAG langchain machine learning AI language models
 products:
-  - managed-inference
+  - generative-apis
 dates:
   validation_frequency: 12
 difficulty: beginner
@@ -32,8 +32,8 @@ LangChain simplifies the process of enhancing language models with retrieval cap
 - A Scaleway account logged into the [console](https://console.scaleway.com)
 - [Owner](/iam/concepts/#owner) status or [IAM permissions](/iam/concepts/#permission) allowing you to perform actions in the intended Organization
 - A valid [API key](/iam/how-to/create-api-keys/)
-- An [Inference Deployment](/managed-inference/how-to/create-deployment/): set it up using [sentence-transformers/sentence-t5-xxl](/managed-inference/reference-content/model-catalog/#sentence-t5-xxl) on an L4 instance to efficiently process embeddings.
-- An [Inference Deployment](/managed-inference/how-to/create-deployment/) with the large language model of your choice.
+- An [Inference Deployment](/generative-apis/how-to/create-deployment/): set it up using [sentence-transformers/sentence-t5-xxl](/generative-apis/reference-content/supported-models/#sentence-t5-xxl) on an L4 instance to efficiently process embeddings.
+- An [Inference Deployment](/generative-apis/how-to/create-deployment/) with the large language model of your choice.
 - An [Object Storage Bucket](/object-storage/how-to/create-a-bucket/) to store all the data you want to inject into your LLM model.
 - A [Managed Database](/managed-databases-for-postgresql-and-mysql/how-to/create-a-database/) to securely store all your embeddings.
diff --git a/tutorials/processing-images-structured-outputs-pixtral/index.mdx b/tutorials/processing-images-structured-outputs-pixtral/index.mdx
index 6e5ca3d601..0b818d5f14 100644
--- a/tutorials/processing-images-structured-outputs-pixtral/index.mdx
+++ b/tutorials/processing-images-structured-outputs-pixtral/index.mdx
@@ -30,7 +30,7 @@ This tutorial will guide you through the process of using the Pixtral vision mod
 - A Scaleway account logged into the [console](https://console.scaleway.com)
 - A Python environment (version 3.7 or higher)
 - An API key from Scaleway [Identity and Access Management](/iam/)
-- Access to a Scaleway [Managed Inference](/managed-inference/reference-content/model-catalog/#pixtral-12b-2409) endpoint with Pixtral deployed or to Scaleway [Generative APIs](/generative-apis/quickstart/) service
+- Access to a Scaleway [Generative APIs - Dedicated Deployment](/generative-apis/reference-content/supported-models/#pixtral-12b-2409) endpoint with Pixtral deployed or to Scaleway [Generative APIs](/generative-apis/quickstart/) service
 - The `openai` and `pydantic` Python libraries installed

 ## Setting up the environment