Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
bf22a9f
feat: expressiveness mode, stateless Instructions, structured LLM output
theomonnom May 4, 2026
211f76d
fix: strip TTS markup from streamed transcripts when expressiveness i…
theomonnom May 4, 2026
43c1736
feat: XML-aware sentence tokenizer + regression tests
theomonnom May 4, 2026
8e18ca3
remove unused get_tags from _provider_format
theomonnom May 4, 2026
ce9aa9f
test: expand XML tokenizer coverage to 46 tests
theomonnom May 4, 2026
59f13ad
test: use full sentences inside wrapping tags to ensure merge is exer…
theomonnom May 4, 2026
cf133ce
fix: restore original pipeline instruction behavior, fix all CI test …
theomonnom May 4, 2026
06ef12a
refactor: ExpressivenessOptions as TypedDict, stateless Instructions …
theomonnom May 5, 2026
d4adc31
feat: expression tag for bracket-based TTS providers (Inworld, Eleven…
theomonnom May 5, 2026
14fbf85
feat: add Markup support to Inworld TTS plugin
theomonnom May 5, 2026
b43e505
refactor: plugins override Markup inner class instead of manually set…
theomonnom May 5, 2026
dd62cc8
refactor: audio_recognition property raises instead of returning None
theomonnom May 5, 2026
f6b6f12
feat: comprehensive Inworld TTS 2 support
theomonnom May 7, 2026
d640e3c
refactor: trim provider prompts — keep syntax/values, cut redundant e…
theomonnom May 7, 2026
b9517ec
feat: better Inworld delivery examples showing full range of expressi…
theomonnom May 7, 2026
d17b907
example: switch drive-thru to Inworld TTS 2 with expressiveness
theomonnom May 7, 2026
98be0a6
feat: max_token_len on sentence tokenizer for TTS input limits
theomonnom May 7, 2026
d61018a
fix: move markup conversion from token-level to sentence-level
theomonnom May 7, 2026
c8319d5
switch drive-thru to gpt-4.1-mini, don't truncate TTS debug logs
theomonnom May 7, 2026
b40c4ad
example: soften drive-thru guardrails to not block expressiveness
theomonnom May 7, 2026
b5b4aa9
fix: richer Inworld delivery examples — short labels sound flat
theomonnom May 7, 2026
ade74ae
fix: XML regex was greedy — never detected self-closing tags with att…
theomonnom May 7, 2026
aada98f
test: regression tests for self-closing XML tag regex bug
theomonnom May 7, 2026
4962032
feat(tts): XML-aware sentence tokenizer, provider prompts, markup pip…
theomonnom May 8, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions examples/drive-thru/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -428,8 +428,9 @@ async def drive_thru_agent(ctx: JobContext) -> None:
],
},
),
llm=inference.LLM("openai/gpt-5-mini"),
tts=inference.TTS("cartesia/sonic-3", voice="f786b574-daa5-4673-aa0c-cbe3e8534c02"),
llm=inference.LLM("openai/gpt-4.1-mini"),
tts=inference.TTS("inworld/inworld-tts-2", voice="Sarah"),
expressiveness=True,
turn_detection=MultilingualModel(),
vad=silero.VAD.load(),
max_tool_steps=10,
Expand Down
14 changes: 9 additions & 5 deletions examples/drive-thru/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,22 +7,26 @@

COMMON_INSTRUCTIONS = (
"You are Mac, a quick and friendly McDonald’s drive-thru attendant. \n"
"Your output is directly synthesized to speech. Produce a natural verbatim transcript, "
"not polished text. Real people start responses with reactions (oh, hmm, ah) and "
"fillers (um, uh, like), not \"Absolutely\" or \"Certainly\". Include mid-sentence "
"fillers (like, you know, I mean) the way they appear in real transcripts. "
"Use informal phrasing: yeah, gonna, kinda, gotcha, lemme. \n"
"\n\n"
"Your job is to guide the customer smoothly through their order, speaking in short, natural voice responses. \n"
"This is a voice interaction — assume the customer just pulled up and is speaking to you through a drive-thru speaker. \n"
"Respond like you're hearing them, not reading text. \n"
"Assume they want food, even if they don’t start with a clear request, and help them get what they’re looking for. \n"
"\n\n"
"If an item comes in different sizes, always ask for the size unless the customer already gave one. \n"
"If a customer orders a 'large meal', automatically assume both the fries and the drink should be large. \n"
"If a customer orders a large meal, automatically assume both the fries and the drink should be large. \n"
"Do not ask again to confirm the size of the drink or fries. This inference is meant to streamline the interaction. \n"
"If the customer clearly indicates a different size for the fries or drink, respect their preference. \n"
"\n\n"
"Never infer or assume any detail the customer has not explicitly stated — especially the drink for a combo meal. \n"
"If a required detail is missing, always ask the customer before calling any tool. \n"
"\n\n"
"Be fast — keep responses short and snappy. \n"
"Sound human — sprinkle in light vocal pauses like 'Mmh…', 'Let me see…', or 'Alright…' at natural moments — but not too often. \n"
"Keep everything upbeat and easy to follow. Never overwhelm the customer, don't ask multiple questions at the same time. \n"
"Keep everything upbeat and easy to follow. Never overwhelm the customer, don’t ask multiple questions at the same time. \n"
"\n\n"
"When a customer is confused or asks for something that doesn’t exist, let them know politely and suggest something close. \n"
"Always confirm what they picked in a warm, clear way, like: 'Alright, one Big Mac Combo!' \n"
Expand All @@ -35,7 +39,7 @@
"Instead treat each user input as a rough draft of what was said. \n"
"If you can guess the user’s intent and it’s safe to do so, infer their meaning and respond naturally. \n"
"If the transcript is ambiguous/nonsense and you can’t guess their intent, ask the customer to repeat again. \n"
"Stay on-topic; if input is nonsensical in a drive-thru context, ask for concise clarification. \n"
"If input is nonsensical, ask for concise clarification. \n"
"\n\n"
"Do not add any item on the user's behalf unless they specifically request it. If the user hasn't asked for an item, NEVER add it. \n"
"\n\n"
Expand Down
11 changes: 8 additions & 3 deletions examples/frontdesk/frontdesk_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,11 @@ def __init__(self, *, timezone: str) -> None:
instructions=(
f"You are Front-Desk, a helpful and efficient voice assistant. "
f"Today is {today}. Your main goal is to schedule an appointment for the user. "
"This is a voice conversation — speak naturally, clearly, and concisely. "
"Your output is directly synthesized to speech. Produce a natural verbatim transcript, "
"not polished text. Real people start responses with reactions (oh, hmm, ah) and "
"fillers (um, uh, like), not \"Absolutely\" or \"Certainly\". Include mid-sentence "
"fillers (like, you know, I mean) the way they appear in real transcripts. "
"Use informal phrasing: yeah, gonna, kinda, gotcha, lemme. "
"When the user says hello or greets you, don’t just respond with a greeting — use it as an opportunity to move things forward. "
"For example, follow up with a helpful question like: 'Would you like to book a time?' "
"When asked for availability, call list_available_slots and offer a few clear, simple options. "
Expand Down Expand Up @@ -242,8 +246,9 @@ async def frontdesk_agent(ctx: JobContext):
session = AgentSession[Userdata](
userdata=Userdata(cal=cal),
stt=inference.STT("deepgram/nova-3"),
llm=inference.LLM("google/gemini-2.5-flash"),
tts=inference.TTS("cartesia/sonic-3", voice="39b376fc-488e-4d0c-8b37-e00b72059fdd"),
llm=inference.LLM("openai/gpt-4.1-mini"),
tts=inference.TTS("inworld/inworld-tts-2", voice="Sarah"),
expressiveness=True,
turn_detection=MultilingualModel(),
vad=silero.VAD.load(),
max_tool_steps=1,
Expand Down
2 changes: 1 addition & 1 deletion examples/voice_agents/email_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ async def register_for_event(self, context: RunContext):
"Start the registration process for the event."

email_result = await beta.workflows.GetEmailTask(
instructions=beta.workflows.InstructionParts(
instructions=beta.workflows.WorkflowInstructions(
persona=(
"You are capturing the email address of the user for the event registration. "
"You are only a single step in a broader system responsible solely for capturing an email address."
Expand Down
2 changes: 2 additions & 0 deletions livekit-agents/livekit/agents/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@
AgentSession,
AgentStateChangedEvent,
AgentTask,
AudioRecognition,
CloseEvent,
CloseReason,
ConversationItemAddedEvent,
Expand Down Expand Up @@ -183,6 +184,7 @@ def __getattr__(name: str) -> typing.Any:
"RunContext",
"Plugin",
"AgentSession",
"AudioRecognition",
"RecordingOptions",
"text_transforms",
"AgentEvent",
Expand Down
4 changes: 2 additions & 2 deletions livekit-agents/livekit/agents/beta/workflows/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from .name import GetNameResult, GetNameTask
from .phone_number import GetPhoneNumberResult, GetPhoneNumberTask
from .task_group import TaskCompletedEvent, TaskGroup, TaskGroupResult
from .utils import InstructionParts
from .utils import WorkflowInstructions
from .warm_transfer import WarmTransferResult, WarmTransferTask

__all__ = [
Expand All @@ -18,7 +18,7 @@
"GetDOBResult",
"GetDOBTask",
"GetDtmfResult",
"InstructionParts",
"WorkflowInstructions",
"GetCreditCardResult",
"GetCreditCardTask",
"GetNameTask",
Expand Down
17 changes: 8 additions & 9 deletions livekit-agents/livekit/agents/beta/workflows/address.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from ...utils import is_given
from ...voice.agent import AgentTask
from ...voice.events import RunContext
from .utils import InstructionParts
from .utils import WorkflowInstructions

if TYPE_CHECKING:
from ...voice.turn import TurnDetectionMode
Expand All @@ -26,7 +26,7 @@ class GetAddressTask(AgentTask[GetAddressResult]):
def __init__(
self,
*,
instructions: NotGivenOr[InstructionParts | Instructions | str] = NOT_GIVEN,
instructions: NotGivenOr[WorkflowInstructions | Instructions | str] = NOT_GIVEN,
chat_ctx: NotGivenOr[llm.ChatContext] = NOT_GIVEN,
turn_detection: NotGivenOr[TurnDetectionMode | None] = NOT_GIVEN,
tools: NotGivenOr[list[llm.Tool | llm.Toolset]] = NOT_GIVEN,
Expand All @@ -40,23 +40,22 @@ def __init__(
extra_instructions: str = "",
) -> None:
if not is_given(instructions):
instructions = InstructionParts(persona=PERSONA, extra=extra_instructions)
instructions = WorkflowInstructions(persona=PERSONA, extra=extra_instructions)
elif extra_instructions:
logger.warning("`extra_instructions` will be ignored when `instructions` is provided")

if isinstance(instructions, InstructionParts):
instructions = Instructions(INSTRUCTIONS_TEMPLATE).format(
persona=instructions.persona if is_given(instructions.persona) else PERSONA,
extra=instructions.extra,
if isinstance(instructions, WorkflowInstructions):
instructions = instructions.resolve(
template=INSTRUCTIONS_TEMPLATE,
default_persona=PERSONA,
_modality_specific=Instructions(audio=AUDIO_SPECIFIC, text=TEXT_SPECIFIC),
_confirmation=Instructions(
# confirmation is enabled by default for audio, disabled by default for text
audio=CONFIRMATION_INSTRUCTION if require_confirmation is not False else "",
text=CONFIRMATION_INSTRUCTION if require_confirmation is True else "",
),
)

assert is_given(instructions) # for type checking
assert isinstance(instructions, (str, Instructions)) # for type checking
super().__init__(
instructions=instructions,
chat_ctx=chat_ctx,
Expand Down
17 changes: 8 additions & 9 deletions livekit-agents/livekit/agents/beta/workflows/email_address.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from ...utils import is_given
from ...voice.agent import AgentTask
from ...voice.events import RunContext
from .utils import InstructionParts
from .utils import WorkflowInstructions

if TYPE_CHECKING:
from ...voice.turn import TurnDetectionMode
Expand All @@ -27,7 +27,7 @@ class GetEmailTask(AgentTask[GetEmailResult]):
def __init__(
self,
*,
instructions: NotGivenOr[InstructionParts | Instructions | str] = NOT_GIVEN,
instructions: NotGivenOr[WorkflowInstructions | Instructions | str] = NOT_GIVEN,
chat_ctx: NotGivenOr[llm.ChatContext] = NOT_GIVEN,
turn_detection: NotGivenOr[TurnDetectionMode | None] = NOT_GIVEN,
tools: NotGivenOr[list[llm.Tool | llm.Toolset]] = NOT_GIVEN,
Expand All @@ -41,23 +41,22 @@ def __init__(
extra_instructions: str = "",
) -> None:
if not is_given(instructions):
instructions = InstructionParts(persona=PERSONA, extra=extra_instructions)
instructions = WorkflowInstructions(persona=PERSONA, extra=extra_instructions)
elif extra_instructions:
logger.warning("`extra_instructions` will be ignored when `instructions` is provided")

if isinstance(instructions, InstructionParts):
instructions = Instructions(INSTRUCTIONS_TEMPLATE).format(
persona=instructions.persona if is_given(instructions.persona) else PERSONA,
extra=instructions.extra,
if isinstance(instructions, WorkflowInstructions):
instructions = instructions.resolve(
template=INSTRUCTIONS_TEMPLATE,
default_persona=PERSONA,
_modality_specific=Instructions(audio=AUDIO_SPECIFIC, text=TEXT_SPECIFIC),
_confirmation=Instructions(
# confirmation is enabled by default for audio, disabled by default for text
audio=CONFIRMATION_INSTRUCTION if require_confirmation is not False else "",
text=CONFIRMATION_INSTRUCTION if require_confirmation is True else "",
),
)

assert is_given(instructions) # for type checking
assert isinstance(instructions, (str, Instructions)) # for type checking
super().__init__(
instructions=instructions,
chat_ctx=chat_ctx,
Expand Down
42 changes: 32 additions & 10 deletions livekit-agents/livekit/agents/beta/workflows/utils.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
from __future__ import annotations

from dataclasses import dataclass
from enum import Enum
from typing import Any

from ...llm.chat_context import Instructions
from ...types import NOT_GIVEN, NotGivenOr
from ...utils import is_given


class DtmfEvent(str, Enum):
Expand Down Expand Up @@ -44,19 +45,40 @@ def format_dtmf(events: list[DtmfEvent]) -> str:
return " ".join(event.value for event in events)


@dataclass
class InstructionParts:
class WorkflowInstructions(Instructions):
"""Customizable instruction sections for built-in workflow tasks.

Extends :class:`Instructions` with ``persona`` and ``extra`` fields
that workflow tasks resolve against their own templates and defaults.

Each field overrides that section when set; leave as ``NOT_GIVEN`` to
preserve the workflow's built-in default. Set to ``""`` to remove a
section entirely.

Args:
persona: Agent persona/identity — who the agent is and how it behaves.
extra: Extra instructions appended to the prompt. The simplest hook for
adding domain context without touching defaults.
"""

persona: NotGivenOr[Instructions | str] = NOT_GIVEN
extra: Instructions | str = ""
def __init__(
self,
audio: str = "",
*,
text: str | None = None,
persona: NotGivenOr[Instructions | str] = NOT_GIVEN,
extra: Instructions | str = "",
) -> None:
super().__init__(audio, text=text)
self.persona: NotGivenOr[Instructions | str] = persona
self.extra: Instructions | str = extra

def resolve(
self,
*,
template: str,
default_persona: str,
**format_kwargs: Any,
) -> Instructions:
"""Resolve into a final :class:`Instructions` by formatting the template."""
return Instructions.resolve_template(
template,
persona=self.persona if is_given(self.persona) else default_persona,
extra=self.extra,
**format_kwargs,
)
16 changes: 8 additions & 8 deletions livekit-agents/livekit/agents/beta/workflows/warm_transfer.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
BuiltinAudioClip,
PlayHandle,
)
from .utils import InstructionParts
from .utils import WorkflowInstructions

if TYPE_CHECKING:
from ...voice.turn import TurnDetectionMode
Expand All @@ -46,7 +46,7 @@ def __init__(
sip_number: NotGivenOr[str] = NOT_GIVEN,
sip_headers: NotGivenOr[dict[str, str]] = NOT_GIVEN,
hold_audio: NotGivenOr[AudioSource | AudioConfig | list[AudioConfig] | None] = NOT_GIVEN,
instructions: NotGivenOr[InstructionParts | Instructions | str] = NOT_GIVEN,
instructions: NotGivenOr[WorkflowInstructions | Instructions | str] = NOT_GIVEN,
chat_ctx: NotGivenOr[llm.ChatContext] = NOT_GIVEN,
turn_detection: NotGivenOr[TurnDetectionMode | None] = NOT_GIVEN,
tools: NotGivenOr[list[llm.Tool | llm.Toolset]] = NOT_GIVEN,
Expand Down Expand Up @@ -79,19 +79,19 @@ def __init__(
"""

if not is_given(instructions):
instructions = InstructionParts(persona=PERSONA, extra=extra_instructions)
instructions = WorkflowInstructions(persona=PERSONA, extra=extra_instructions)
elif extra_instructions:
logger.warning("`extra_instructions` will be ignored when `instructions` is provided")

if isinstance(instructions, InstructionParts):
if isinstance(instructions, WorkflowInstructions):
conversation_history = self._format_conversation_history(chat_ctx)
instructions = Instructions(INSTRUCTIONS_TEMPLATE).format(
persona=instructions.persona if is_given(instructions.persona) else PERSONA,
extra=instructions.extra,
instructions = instructions.resolve(
template=INSTRUCTIONS_TEMPLATE,
default_persona=PERSONA,
_conversation_history=conversation_history,
)

assert is_given(instructions) # for type checking
assert isinstance(instructions, (str, Instructions)) # for type checking
super().__init__(
instructions=instructions,
chat_ctx=NOT_GIVEN, # don't pass the chat_ctx
Expand Down
2 changes: 1 addition & 1 deletion livekit-agents/livekit/agents/evals/judge.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def _get_latest_instructions(chat_ctx: ChatContext) -> str | None:
"""
for item in reversed(chat_ctx.items):
if item.type == "agent_config_update" and item.instructions:
return item.instructions
return str(item.instructions)
return None


Expand Down
Loading
Loading