Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions livekit-agents/livekit/agents/voice/audio_recognition.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,7 @@ def __init__(
self._stt_request_ids: list[str] = []
self._closing = asyncio.Event()

self._vad_speech_started: bool = False
self._turn_speech_started: bool = False

def update_options(
self,
Expand Down Expand Up @@ -653,6 +653,7 @@ def clear_user_turn(self) -> None:
self._audio_preflight_transcript = ""
self._final_transcript_confidence = []
self._user_turn_committed = False
self._turn_speech_started = False

# reset stt to clear the buffer from previous user turn
stt = self._stt
Expand Down Expand Up @@ -966,9 +967,9 @@ async def _on_stt_event(self, ev: stt.SpeechEvent) -> None:
async def _on_vad_event(self, ev: vad.VADEvent) -> None:
if ev.type == vad.VADEventType.START_OF_SPEECH:
speech_start_time = time.time() - ev.speech_duration - ev.inference_duration
if not self._vad_speech_started:
if not self._turn_speech_started:
self._speech_start_time = speech_start_time
self._vad_speech_started = True
self._turn_speech_started = True

with trace.use_span(self._ensure_user_turn_span(start_time=speech_start_time)):
self._hooks.on_start_of_speech(ev, speech_start_time=speech_start_time)
Expand All @@ -995,7 +996,6 @@ async def _on_vad_event(self, ev: vad.VADEvent) -> None:
with trace.use_span(self._ensure_user_turn_span()):
self._hooks.on_end_of_speech(ev)

self._vad_speech_started = False
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 `clear_user_turn` doesn't reset `_turn_speech_started`, causing a stale `_speech_start_time` in subsequent turns

Because `_turn_speech_started` is now turn-scoped (it is no longer reset on VAD END_OF_SPEECH at the old line 998), `clear_user_turn()` at audio_recognition.py:650-660 must now explicitly reset `_turn_speech_started` (and `_speech_start_time`). Previously, even though `clear_user_turn` didn't reset `_vad_speech_started`, the next VAD END_OF_SPEECH event would reset it, allowing the subsequent START_OF_SPEECH to correctly set `_speech_start_time`. Now, if a user cancels a turn mid-speech (e.g., push-to-talk `cancel_turn`), `_turn_speech_started` remains `True` indefinitely (VAD EOS no longer resets it), so the next turn's first VAD START_OF_SPEECH skips updating `_speech_start_time`, leaving it stale from the previous turn. This causes incorrect `started_speaking_at` / `stopped_speaking_at` / `end_of_turn_delay` metrics in the `_EndOfTurnInfo` passed to `on_end_of_turn`.

Prompt for agents
The removal of `self._vad_speech_started = False` from the VAD END_OF_SPEECH handler (old line 998) makes the flag turn-scoped. This is correct for the intended fix (preventing subsequent speech bursts within a turn from overwriting `_speech_start_time`). However, `clear_user_turn()` at line 650 now also needs to reset `_turn_speech_started` and `_speech_start_time` to ensure a clean slate for the next turn. Without this, cancelling a turn (e.g., push-to-talk `cancel_turn`) while the user is speaking leaves `_turn_speech_started = True`, so the next turn's VAD START_OF_SPEECH won't update `_speech_start_time`. Add `self._turn_speech_started = False` and `self._speech_start_time = None` to the `clear_user_turn` method body.
Open in Devin Review

Was this helpful? React with 👍 or 👎 to provide feedback.

self._speaking = False

if self._vad_base_turn_detection or (
Expand Down Expand Up @@ -1156,7 +1156,7 @@ async def _bounce_eou_task(
# only reset if there is no new speech
if self._last_speaking_time == last_speaking_time:
self._speech_start_time = None
self._vad_speech_started = False
self._turn_speech_started = False
self._last_speaking_time = None

self._user_turn_committed = False
Expand Down Expand Up @@ -1223,7 +1223,7 @@ async def _forward() -> None:
with trace.use_span(self._ensure_user_turn_span()):
self._hooks.on_end_of_speech(None)
self._speaking = False
self._vad_speech_started = False
self._turn_speech_started = False

@utils.log_exceptions(logger=logger)
async def _interruption_task(
Expand Down
Loading