Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
bf22a9f
feat: expressiveness mode, stateless Instructions, structured LLM output
theomonnom May 4, 2026
211f76d
fix: strip TTS markup from streamed transcripts when expressiveness i…
theomonnom May 4, 2026
43c1736
feat: XML-aware sentence tokenizer + regression tests
theomonnom May 4, 2026
8e18ca3
remove unused get_tags from _provider_format
theomonnom May 4, 2026
ce9aa9f
test: expand XML tokenizer coverage to 46 tests
theomonnom May 4, 2026
59f13ad
test: use full sentences inside wrapping tags to ensure merge is exer…
theomonnom May 4, 2026
cf133ce
fix: restore original pipeline instruction behavior, fix all CI test …
theomonnom May 4, 2026
06ef12a
refactor: ExpressivenessOptions as TypedDict, stateless Instructions …
theomonnom May 5, 2026
d4adc31
feat: expression tag for bracket-based TTS providers (Inworld, Eleven…
theomonnom May 5, 2026
14fbf85
feat: add Markup support to Inworld TTS plugin
theomonnom May 5, 2026
b43e505
refactor: plugins override Markup inner class instead of manually set…
theomonnom May 5, 2026
dd62cc8
refactor: audio_recognition property raises instead of returning None
theomonnom May 5, 2026
f6b6f12
feat: comprehensive Inworld TTS 2 support
theomonnom May 7, 2026
d640e3c
refactor: trim provider prompts — keep syntax/values, cut redundant e…
theomonnom May 7, 2026
b9517ec
feat: better Inworld delivery examples showing full range of expressi…
theomonnom May 7, 2026
d17b907
example: switch drive-thru to Inworld TTS 2 with expressiveness
theomonnom May 7, 2026
98be0a6
feat: max_token_len on sentence tokenizer for TTS input limits
theomonnom May 7, 2026
d61018a
fix: move markup conversion from token-level to sentence-level
theomonnom May 7, 2026
c8319d5
switch drive-thru to gpt-4.1-mini, don't truncate TTS debug logs
theomonnom May 7, 2026
b40c4ad
example: soften drive-thru guardrails to not block expressiveness
theomonnom May 7, 2026
b5b4aa9
fix: richer Inworld delivery examples — short labels sound flat
theomonnom May 7, 2026
ade74ae
fix: XML regex was greedy — never detected self-closing tags with att…
theomonnom May 7, 2026
aada98f
test: regression tests for self-closing XML tag regex bug
theomonnom May 7, 2026
4962032
feat(tts): XML-aware sentence tokenizer, provider prompts, markup pip…
theomonnom May 8, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions examples/drive-thru/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -428,8 +428,9 @@ async def drive_thru_agent(ctx: JobContext) -> None:
],
},
),
llm=inference.LLM("openai/gpt-5-mini"),
tts=inference.TTS("cartesia/sonic-3", voice="f786b574-daa5-4673-aa0c-cbe3e8534c02"),
llm=inference.LLM("openai/gpt-4.1-mini"),
tts=inference.TTS("inworld/inworld-tts-2", voice="Sarah"),
expressiveness=True,
turn_detection=MultilingualModel(),
vad=silero.VAD.load(),
max_tool_steps=10,
Expand Down
14 changes: 9 additions & 5 deletions examples/drive-thru/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,22 +7,26 @@

COMMON_INSTRUCTIONS = (
"You are Mac, a quick and friendly McDonald’s drive-thru attendant. \n"
"Your output is directly synthesized to speech. Produce a natural verbatim transcript, "
"not polished text. Real people start responses with reactions (oh, hmm, ah) and "
"fillers (um, uh, like), not \"Absolutely\" or \"Certainly\". Include mid-sentence "
"fillers (like, you know, I mean) the way they appear in real transcripts. "
"Use informal phrasing: yeah, gonna, kinda, gotcha, lemme. \n"
"\n\n"
"Your job is to guide the customer smoothly through their order, speaking in short, natural voice responses. \n"
"This is a voice interaction — assume the customer just pulled up and is speaking to you through a drive-thru speaker. \n"
"Respond like you're hearing them, not reading text. \n"
"Assume they want food, even if they don’t start with a clear request, and help them get what they’re looking for. \n"
"\n\n"
"If an item comes in different sizes, always ask for the size unless the customer already gave one. \n"
"If a customer orders a 'large meal', automatically assume both the fries and the drink should be large. \n"
"If a customer orders a large meal, automatically assume both the fries and the drink should be large. \n"
"Do not ask again to confirm the size of the drink or fries. This inference is meant to streamline the interaction. \n"
"If the customer clearly indicates a different size for the fries or drink, respect their preference. \n"
"\n\n"
"Never infer or assume any detail the customer has not explicitly stated — especially the drink for a combo meal. \n"
"If a required detail is missing, always ask the customer before calling any tool. \n"
"\n\n"
"Be fast — keep responses short and snappy. \n"
"Sound human — sprinkle in light vocal pauses like 'Mmh…', 'Let me see…', or 'Alright…' at natural moments — but not too often. \n"
"Keep everything upbeat and easy to follow. Never overwhelm the customer, don't ask multiple questions at the same time. \n"
"Keep everything upbeat and easy to follow. Never overwhelm the customer, don’t ask multiple questions at the same time. \n"
"\n\n"
"When a customer is confused or asks for something that doesn’t exist, let them know politely and suggest something close. \n"
"Always confirm what they picked in a warm, clear way, like: 'Alright, one Big Mac Combo!' \n"
Expand All @@ -35,7 +39,7 @@
"Instead treat each user input as a rough draft of what was said. \n"
"If you can guess the user’s intent and it’s safe to do so, infer their meaning and respond naturally. \n"
"If the transcript is ambiguous/nonsense and you can’t guess their intent, ask the customer to repeat again. \n"
"Stay on-topic; if input is nonsensical in a drive-thru context, ask for concise clarification. \n"
"If input is nonsensical, ask for concise clarification. \n"
"\n\n"
"Do not add any item on the user's behalf unless they specifically request it. If the user hasn't asked for an item, NEVER add it. \n"
"\n\n"
Expand Down
11 changes: 8 additions & 3 deletions examples/frontdesk/frontdesk_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,11 @@ def __init__(self, *, timezone: str) -> None:
instructions=(
f"You are Front-Desk, a helpful and efficient voice assistant. "
f"Today is {today}. Your main goal is to schedule an appointment for the user. "
"This is a voice conversation — speak naturally, clearly, and concisely. "
"Your output is directly synthesized to speech. Produce a natural verbatim transcript, "
"not polished text. Real people start responses with reactions (oh, hmm, ah) and "
"fillers (um, uh, like), not \"Absolutely\" or \"Certainly\". Include mid-sentence "
"fillers (like, you know, I mean) the way they appear in real transcripts. "
"Use informal phrasing: yeah, gonna, kinda, gotcha, lemme. "
"When the user says hello or greets you, don’t just respond with a greeting — use it as an opportunity to move things forward. "
"For example, follow up with a helpful question like: 'Would you like to book a time?' "
"When asked for availability, call list_available_slots and offer a few clear, simple options. "
Expand Down Expand Up @@ -242,8 +246,9 @@ async def frontdesk_agent(ctx: JobContext):
session = AgentSession[Userdata](
userdata=Userdata(cal=cal),
stt=inference.STT("deepgram/nova-3"),
llm=inference.LLM("google/gemini-2.5-flash"),
tts=inference.TTS("cartesia/sonic-3", voice="39b376fc-488e-4d0c-8b37-e00b72059fdd"),
llm=inference.LLM("openai/gpt-4.1-mini"),
tts=inference.TTS("inworld/inworld-tts-2", voice="Sarah"),
expressiveness=True,
turn_detection=MultilingualModel(),
vad=silero.VAD.load(),
max_tool_steps=1,
Expand Down
2 changes: 1 addition & 1 deletion examples/voice_agents/email_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ async def register_for_event(self, context: RunContext):
"Start the registration process for the event."

email_result = await beta.workflows.GetEmailTask(
instructions=beta.workflows.InstructionParts(
instructions=beta.workflows.WorkflowInstructions(
persona=(
"You are capturing the email address of the user for the event registration. "
"You are only a single step in a broader system responsible solely for capturing an email address."
Expand Down
2 changes: 2 additions & 0 deletions livekit-agents/livekit/agents/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@
AgentSession,
AgentStateChangedEvent,
AgentTask,
AudioRecognition,
CloseEvent,
CloseReason,
ConversationItemAddedEvent,
Expand Down Expand Up @@ -183,6 +184,7 @@ def __getattr__(name: str) -> typing.Any:
"RunContext",
"Plugin",
"AgentSession",
"AudioRecognition",
"RecordingOptions",
"text_transforms",
"AgentEvent",
Expand Down
4 changes: 2 additions & 2 deletions livekit-agents/livekit/agents/beta/workflows/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from .name import GetNameResult, GetNameTask
from .phone_number import GetPhoneNumberResult, GetPhoneNumberTask
from .task_group import TaskCompletedEvent, TaskGroup, TaskGroupResult
from .utils import InstructionParts
from .utils import WorkflowInstructions
from .warm_transfer import WarmTransferResult, WarmTransferTask

__all__ = [
Expand All @@ -18,7 +18,7 @@
"GetDOBResult",
"GetDOBTask",
"GetDtmfResult",
"InstructionParts",
"WorkflowInstructions",
"GetCreditCardResult",
"GetCreditCardTask",
"GetNameTask",
Expand Down
17 changes: 8 additions & 9 deletions livekit-agents/livekit/agents/beta/workflows/address.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from ...utils import is_given
from ...voice.agent import AgentTask
from ...voice.events import RunContext
from .utils import InstructionParts
from .utils import WorkflowInstructions

if TYPE_CHECKING:
from ...voice.turn import TurnDetectionMode
Expand All @@ -26,7 +26,7 @@ class GetAddressTask(AgentTask[GetAddressResult]):
def __init__(
self,
*,
instructions: NotGivenOr[InstructionParts | Instructions | str] = NOT_GIVEN,
instructions: NotGivenOr[WorkflowInstructions | Instructions | str] = NOT_GIVEN,
chat_ctx: NotGivenOr[llm.ChatContext] = NOT_GIVEN,
turn_detection: NotGivenOr[TurnDetectionMode | None] = NOT_GIVEN,
tools: NotGivenOr[list[llm.Tool | llm.Toolset]] = NOT_GIVEN,
Expand All @@ -40,23 +40,22 @@ def __init__(
extra_instructions: str = "",
) -> None:
if not is_given(instructions):
instructions = InstructionParts(persona=PERSONA, extra=extra_instructions)
instructions = WorkflowInstructions(persona=PERSONA, extra=extra_instructions)
elif extra_instructions:
logger.warning("`extra_instructions` will be ignored when `instructions` is provided")

if isinstance(instructions, InstructionParts):
instructions = Instructions(INSTRUCTIONS_TEMPLATE).format(
persona=instructions.persona if is_given(instructions.persona) else PERSONA,
extra=instructions.extra,
if isinstance(instructions, WorkflowInstructions):
instructions = instructions.resolve(
template=INSTRUCTIONS_TEMPLATE,
default_persona=PERSONA,
_modality_specific=Instructions(audio=AUDIO_SPECIFIC, text=TEXT_SPECIFIC),
_confirmation=Instructions(
# confirmation is enabled by default for audio, disabled by default for text
audio=CONFIRMATION_INSTRUCTION if require_confirmation is not False else "",
text=CONFIRMATION_INSTRUCTION if require_confirmation is True else "",
),
)

assert is_given(instructions) # for type checking
assert isinstance(instructions, (str, Instructions)) # for type checking
super().__init__(
instructions=instructions,
chat_ctx=chat_ctx,
Expand Down
17 changes: 8 additions & 9 deletions livekit-agents/livekit/agents/beta/workflows/email_address.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from ...utils import is_given
from ...voice.agent import AgentTask
from ...voice.events import RunContext
from .utils import InstructionParts
from .utils import WorkflowInstructions

if TYPE_CHECKING:
from ...voice.turn import TurnDetectionMode
Expand All @@ -27,7 +27,7 @@ class GetEmailTask(AgentTask[GetEmailResult]):
def __init__(
self,
*,
instructions: NotGivenOr[InstructionParts | Instructions | str] = NOT_GIVEN,
instructions: NotGivenOr[WorkflowInstructions | Instructions | str] = NOT_GIVEN,
chat_ctx: NotGivenOr[llm.ChatContext] = NOT_GIVEN,
turn_detection: NotGivenOr[TurnDetectionMode | None] = NOT_GIVEN,
tools: NotGivenOr[list[llm.Tool | llm.Toolset]] = NOT_GIVEN,
Expand All @@ -41,23 +41,22 @@ def __init__(
extra_instructions: str = "",
) -> None:
if not is_given(instructions):
instructions = InstructionParts(persona=PERSONA, extra=extra_instructions)
instructions = WorkflowInstructions(persona=PERSONA, extra=extra_instructions)
elif extra_instructions:
logger.warning("`extra_instructions` will be ignored when `instructions` is provided")

if isinstance(instructions, InstructionParts):
instructions = Instructions(INSTRUCTIONS_TEMPLATE).format(
persona=instructions.persona if is_given(instructions.persona) else PERSONA,
extra=instructions.extra,
if isinstance(instructions, WorkflowInstructions):
instructions = instructions.resolve(
template=INSTRUCTIONS_TEMPLATE,
default_persona=PERSONA,
_modality_specific=Instructions(audio=AUDIO_SPECIFIC, text=TEXT_SPECIFIC),
_confirmation=Instructions(
# confirmation is enabled by default for audio, disabled by default for text
audio=CONFIRMATION_INSTRUCTION if require_confirmation is not False else "",
text=CONFIRMATION_INSTRUCTION if require_confirmation is True else "",
),
)

assert is_given(instructions) # for type checking
assert isinstance(instructions, (str, Instructions)) # for type checking
super().__init__(
instructions=instructions,
chat_ctx=chat_ctx,
Expand Down
42 changes: 32 additions & 10 deletions livekit-agents/livekit/agents/beta/workflows/utils.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
from __future__ import annotations

from dataclasses import dataclass
from enum import Enum
from typing import Any

from ...llm.chat_context import Instructions
from ...types import NOT_GIVEN, NotGivenOr
from ...utils import is_given


class DtmfEvent(str, Enum):
Expand Down Expand Up @@ -44,19 +45,40 @@ def format_dtmf(events: list[DtmfEvent]) -> str:
return " ".join(event.value for event in events)


@dataclass
class InstructionParts:
class WorkflowInstructions(Instructions):
"""Customizable instruction sections for built-in workflow tasks.

Extends :class:`Instructions` with ``persona`` and ``extra`` fields
that workflow tasks resolve against their own templates and defaults.

Each field overrides that section when set; leave as ``NOT_GIVEN`` to
preserve the workflow's built-in default. Set to ``""`` to remove a
section entirely.

Args:
persona: Agent persona/identity — who the agent is and how it behaves.
extra: Extra instructions appended to the prompt. The simplest hook for
adding domain context without touching defaults.
"""

persona: NotGivenOr[Instructions | str] = NOT_GIVEN
extra: Instructions | str = ""
def __init__(
self,
audio: str = "",
*,
text: str | None = None,
persona: NotGivenOr[Instructions | str] = NOT_GIVEN,
extra: Instructions | str = "",
) -> None:
super().__init__(audio, text=text)
self.persona: NotGivenOr[Instructions | str] = persona
self.extra: Instructions | str = extra

def resolve(
self,
*,
template: str,
default_persona: str,
**format_kwargs: Any,
) -> Instructions:
"""Resolve into a final :class:`Instructions` by formatting the template."""
return Instructions.resolve_template(
template,
persona=self.persona if is_given(self.persona) else default_persona,
extra=self.extra,
**format_kwargs,
)
16 changes: 8 additions & 8 deletions livekit-agents/livekit/agents/beta/workflows/warm_transfer.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
BuiltinAudioClip,
PlayHandle,
)
from .utils import InstructionParts
from .utils import WorkflowInstructions

if TYPE_CHECKING:
from ...voice.turn import TurnDetectionMode
Expand All @@ -46,7 +46,7 @@ def __init__(
sip_number: NotGivenOr[str] = NOT_GIVEN,
sip_headers: NotGivenOr[dict[str, str]] = NOT_GIVEN,
hold_audio: NotGivenOr[AudioSource | AudioConfig | list[AudioConfig] | None] = NOT_GIVEN,
instructions: NotGivenOr[InstructionParts | Instructions | str] = NOT_GIVEN,
instructions: NotGivenOr[WorkflowInstructions | Instructions | str] = NOT_GIVEN,
chat_ctx: NotGivenOr[llm.ChatContext] = NOT_GIVEN,
turn_detection: NotGivenOr[TurnDetectionMode | None] = NOT_GIVEN,
tools: NotGivenOr[list[llm.Tool | llm.Toolset]] = NOT_GIVEN,
Expand Down Expand Up @@ -79,19 +79,19 @@ def __init__(
"""

if not is_given(instructions):
instructions = InstructionParts(persona=PERSONA, extra=extra_instructions)
instructions = WorkflowInstructions(persona=PERSONA, extra=extra_instructions)
elif extra_instructions:
logger.warning("`extra_instructions` will be ignored when `instructions` is provided")

if isinstance(instructions, InstructionParts):
if isinstance(instructions, WorkflowInstructions):
conversation_history = self._format_conversation_history(chat_ctx)
instructions = Instructions(INSTRUCTIONS_TEMPLATE).format(
persona=instructions.persona if is_given(instructions.persona) else PERSONA,
extra=instructions.extra,
instructions = instructions.resolve(
template=INSTRUCTIONS_TEMPLATE,
default_persona=PERSONA,
_conversation_history=conversation_history,
)

assert is_given(instructions) # for type checking
assert isinstance(instructions, (str, Instructions)) # for type checking
super().__init__(
instructions=instructions,
chat_ctx=NOT_GIVEN, # don't pass the chat_ctx
Expand Down
2 changes: 1 addition & 1 deletion livekit-agents/livekit/agents/evals/judge.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def _get_latest_instructions(chat_ctx: ChatContext) -> str | None:
"""
for item in reversed(chat_ctx.items):
if item.type == "agent_config_update" and item.instructions:
return item.instructions
return str(item.instructions)
return None


Expand Down
Loading
Loading