diff --git a/tensorrt_llm/serve/openai_protocol.py b/tensorrt_llm/serve/openai_protocol.py index 8730976898fd..3c4b13ab53ff 100644 --- a/tensorrt_llm/serve/openai_protocol.py +++ b/tensorrt_llm/serve/openai_protocol.py @@ -43,11 +43,18 @@ from tensorrt_llm.llmapi.reasoning_parser import ReasoningParserFactory -def _logit_bias_to_embedding_bias(logit_bias: Optional[Dict[str, float]], - vocab_size: int) -> Optional[torch.Tensor]: +def _logit_bias_to_embedding_bias( + logit_bias: Optional[Dict[str, float]], + vocab_size: Optional[int]) -> Optional[torch.Tensor]: """Convert OpenAI logit_bias dict to embedding_bias tensor for sampling.""" if logit_bias is None: return None + if vocab_size is None: + raise ValueError( + "logit_bias requires a tokenizer, but the server was started " + "without one (e.g. num_postprocess_workers > 0). " + "Remove logit_bias from your request or set num_postprocess_workers=0." + ) # Create 1D zeros tensor as expected by executor API (will be unsqueezed to [1, vocab_size] internally) embedding_bias = torch.zeros(vocab_size, dtype=torch.float32) @@ -390,7 +397,7 @@ class CompletionRequest(OpenAIBaseModel): # doc: end-completion-extra-params def to_sampling_params(self, - vocab_size: int = 32000, + vocab_size: Optional[int] = None, gather_generation_logits: bool = False, backend: Optional[str] = None) -> SamplingParams: sampling_params = SamplingParams( @@ -752,7 +759,7 @@ class ChatCompletionRequest(OpenAIBaseModel): # doc: end-chat-completion-extra-params def to_sampling_params(self, - vocab_size: int = 32000, + vocab_size: Optional[int] = None, gather_generation_logits: bool = False, reasoning_parser: Optional[str] = None, backend: Optional[str] = None) -> SamplingParams: diff --git a/tensorrt_llm/serve/openai_server.py b/tensorrt_llm/serve/openai_server.py index e248ac96b1fa..278d9ef9ae29 100644 --- a/tensorrt_llm/serve/openai_server.py +++ b/tensorrt_llm/serve/openai_server.py @@ -516,6 +516,12 @@ def postproc_worker_enabled(self) -> bool: 
return True if self.generator.args.num_postprocess_workers > 0 else False + @property + def _vocab_size(self) -> Optional[int]: + if self.tokenizer is not None and self.tokenizer.tokenizer is not None: + return self.tokenizer.tokenizer.vocab_size + return None + @staticmethod def create_error_response( message: str, @@ -1043,7 +1049,7 @@ async def chat_stream_generator( # Pass the tokenizer vocabulary size so ``logit_bias`` can be # expanded into an embedding bias tensor in the sampler. sampling_params = request.to_sampling_params( - vocab_size=self.tokenizer.tokenizer.vocab_size, + vocab_size=self._vocab_size, gather_generation_logits=self.generator.args. gather_generation_logits, reasoning_parser=self.generator.args.reasoning_parser, @@ -1375,7 +1381,7 @@ async def generator_wrapper(generator: AsyncIterator[Any]): # Pass the tokenizer vocabulary size so ``logit_bias`` can be # expanded into an embedding bias tensor in the sampler. sampling_params = request.to_sampling_params( - vocab_size=self.tokenizer.tokenizer.vocab_size, + vocab_size=self._vocab_size, gather_generation_logits=self.generator.args. 
gather_generation_logits, backend=self.generator.args.backend) @@ -1510,8 +1516,7 @@ async def create_streaming_generator(promise: RequestOutput, request.stop_token_ids = harmony_stop_tokens sampling_params = request.to_sampling_params( - vocab_size=self.tokenizer.tokenizer.vocab_size, - reasoning_parser="gpt_oss") + vocab_size=self._vocab_size, reasoning_parser="gpt_oss") sampling_params.detokenize = False # Harmony adapter handles detokenization disaggregated_params = to_llm_disaggregated_params( request.disaggregated_params) diff --git a/tests/unittest/llmapi/apps/test_harmony_parsing.py b/tests/unittest/llmapi/apps/test_harmony_parsing.py index acf51f73c1d6..7636e0edb90d 100644 --- a/tests/unittest/llmapi/apps/test_harmony_parsing.py +++ b/tests/unittest/llmapi/apps/test_harmony_parsing.py @@ -36,7 +36,8 @@ get_harmony_adapter, handle_streaming_response, ) - from tensorrt_llm.serve.openai_protocol import StreamOptions + from tensorrt_llm.serve.openai_protocol import StreamOptions, _logit_bias_to_embedding_bias + from tensorrt_llm.serve.openai_server import OpenAIServer _harmony_available = True except (ImportError, ModuleNotFoundError): @@ -1192,5 +1193,14 @@ def test_stream_options_none_defaults_to_include(self): harmony_adapter.cleanup_stream_state(request_id) +@pytest.mark.skipif(not _harmony_available, reason="harmony-dependent imports unavailable") +def test_none_tokenizer_num_postprocess_workers(): + server = object.__new__(OpenAIServer) + server.tokenizer = None + assert server._vocab_size is None + with pytest.raises(ValueError, match="logit_bias requires a tokenizer"): + _logit_bias_to_embedding_bias({"0": 1.0}, vocab_size=None) + + if __name__ == "__main__": pytest.main([__file__, "-v"])