From b723a0fbd0c056a8867599b393ea2bda9f2ea162 Mon Sep 17 00:00:00 2001 From: Katrina Date: Thu, 4 Jun 2026 15:12:00 -0400 Subject: [PATCH 1/8] add configurable setting to evaluate records that timed out --- src/eva/metrics/diagnostic/__init__.py | 2 + src/eva/metrics/runner.py | 2 +- .../validation/user_behavioral_fidelity.py | 14 ++- src/eva/models/config.py | 6 +- src/eva/models/results.py | 4 + src/eva/orchestrator/runner.py | 118 +++++++++++++++++- src/eva/orchestrator/validation_runner.py | 38 +++--- tests/fixtures/metric_signatures.json | 8 +- 8 files changed, 166 insertions(+), 26 deletions(-) diff --git a/src/eva/metrics/diagnostic/__init__.py b/src/eva/metrics/diagnostic/__init__.py index 687c1ef9..eb3d5e17 100644 --- a/src/eva/metrics/diagnostic/__init__.py +++ b/src/eva/metrics/diagnostic/__init__.py @@ -2,6 +2,7 @@ from . import authentication_success # noqa from . import conversation_correctly_finished # noqa +from . import conversation_timeout # noqa from . import response_speed # noqa from . import speakability # noqa from . import stt_wer # noqa @@ -11,6 +12,7 @@ __all__ = [ "authentication_success", "conversation_correctly_finished", + "conversation_timeout", "response_speed", "speakability", "stt_wer", diff --git a/src/eva/metrics/runner.py b/src/eva/metrics/runner.py index 64c7aec2..22b87d15 100644 --- a/src/eva/metrics/runner.py +++ b/src/eva/metrics/runner.py @@ -912,7 +912,7 @@ def _compute_latency_summary(self) -> dict[str, Any]: Returns a dict with the mean of mean_ms values for each latency type that has at least one non-null entry. 
""" - latency_keys = ["llm_latency", "stt_latency", "tts_latency"] + latency_keys = ["llm_latency", "stt_latency", "tts_latency", "model_response_latency"] collected: dict[str, list[float]] = {k: [] for k in latency_keys} for _record_id, record_dir in self._discover_record_dirs(self.run_dir, self.record_ids): diff --git a/src/eva/metrics/validation/user_behavioral_fidelity.py b/src/eva/metrics/validation/user_behavioral_fidelity.py index 0af13816..de0d2e4d 100644 --- a/src/eva/metrics/validation/user_behavioral_fidelity.py +++ b/src/eva/metrics/validation/user_behavioral_fidelity.py @@ -87,11 +87,15 @@ def get_prompt_variables(self, context: MetricContext, transcript_text: str) -> context.audio_timestamps_user_turns, context.audio_timestamps_assistant_turns, ) - conversation_end = ( - "the agent's failure to respond to the final user turn." - if agent_timeout - else "the user calling the end_call tool." - ) + if context.conversation_ended_reason == "timeout": + conversation_end = ( + "a system timeout — the conversation exceeded the allowed time limit. " + "The user did NOT end the call; the system terminated the conversation." + ) + elif agent_timeout: + conversation_end = "the agent's failure to respond to the final user turn." + else: + conversation_end = "the user calling the end_call tool." return { "conversation_evidence": conversation_evidence, diff --git a/src/eva/models/config.py b/src/eva/models/config.py index 6752d54b..d705e021 100644 --- a/src/eva/models/config.py +++ b/src/eva/models/config.py @@ -429,12 +429,14 @@ class ModelDeployment(DeploymentTypedDict): init=False, ) - validation_thresholds: dict[str, float] = Field( + validation_thresholds: dict[str, float | int] = Field( { "conversation_valid_end": 1.0, "user_behavioral_fidelity": 1.0, + "max_timeout_attempts": 1, }, - description="Validation metric thresholds for rerun decisions (JSON)", + description="Validation metric thresholds and settings for rerun decisions (JSON). 
" + "max_timeout_attempts controls how many timeout attempts before accepting the last one for evaluation.", ) # Multi-attempt (for pass@k evaluation) diff --git a/src/eva/models/results.py b/src/eva/models/results.py index 0caa7ab6..c242398e 100644 --- a/src/eva/models/results.py +++ b/src/eva/models/results.py @@ -80,6 +80,10 @@ class ConversationResult(BaseModel): ) initial_scenario_db_hash: str | None = Field(None, description="SHA-256 hash of initial scenario database") final_scenario_db_hash: str | None = Field(None, description="SHA-256 hash of final scenario database") + timeout_accepted: bool = Field( + False, + description="Whether this record was accepted after exhausting timeout attempts (gate bypass)", + ) class MetricScore(BaseModel): diff --git a/src/eva/orchestrator/runner.py b/src/eva/orchestrator/runner.py index 17a285bb..072cfc6b 100644 --- a/src/eva/orchestrator/runner.py +++ b/src/eva/orchestrator/runner.py @@ -159,6 +159,10 @@ async def run(self, records: list[EvaluationRecord]) -> RunResult: all_output_ids = list(output_id_to_record.keys()) pending_output_ids = list(all_output_ids) rerun_history: dict[str, list[dict]] = {} + timeout_attempt_counts: dict[str, int] = {} + timeout_validation_cache: dict[str, dict[int, ValidationResult]] = {} + max_timeout_attempts = int(self.config.validation_thresholds.get("max_timeout_attempts", 3)) + timeout_accepted_ids: set[str] = set() started_at = datetime.now() # Initialize port pool once before the attempt loop. 
@@ -272,8 +276,12 @@ async def _run_and_pipeline( not_finished_ids: list[str] = [] failed_validation_ids: list[str] = [] validation_results: dict[str, ValidationResult] = {} + # Map output_id → ConversationResult for timeout detection + result_map: dict[str, ConversationResult] = {} for output_id, _result, passed, vr in pipeline_results: + if isinstance(_result, ConversationResult): + result_map[output_id] = _result if vr is None or (not vr.passed and not vr.failed_metrics): not_finished_ids.append(output_id) else: @@ -290,10 +298,20 @@ async def _run_and_pipeline( failed_this_attempt = not_finished_ids + failed_validation_ids for oid in not_finished_ids: + # Distinguish timeout from other not_finished reasons + cr = result_map.get(oid) + is_timeout = cr is not None and cr.conversation_ended_reason == "timeout" + reason = "timeout" if is_timeout else "not_finished" + if is_timeout: + timeout_attempt_counts[oid] = timeout_attempt_counts.get(oid, 0) + 1 + # Eagerly run LLM validation (skip gate) on every timeout attempt + # and cache the result for later lookup. + vr = await pipeline_validation_runner.validate_one(oid, skip_gate=True) + timeout_validation_cache.setdefault(oid, {})[attempt_number] = vr rerun_history.setdefault(oid, []).append( { "attempt": attempt_number, - "reason": "not_finished", + "reason": reason, } ) for oid in failed_validation_ids: @@ -310,6 +328,76 @@ async def _run_and_pipeline( entry["failure_details"] = failure_details rerun_history.setdefault(oid, []).append(entry) + # Check for timeout-accepted records: records that have timed out + # max_timeout_attempts times get evaluated with gate bypass. + # The current attempt was already validated eagerly above; if it passes, + # accept immediately. Otherwise, scan cached results from previous + # timeout attempts and restore the archived directory if one passes. 
+ newly_timeout_accepted: list[str] = [] + for oid in list(failed_this_attempt): + if timeout_attempt_counts.get(oid, 0) < max_timeout_attempts: + continue + + cached = timeout_validation_cache.get(oid, {}) + # Check current attempt first + current_vr = cached.get(attempt_number) + if current_vr and current_vr.passed: + logger.info( + f"Record {oid} timed out {timeout_attempt_counts[oid]} times, " + f"current attempt passed LLM validation — accepting" + ) + self._accept_timeout_record( + oid, + failed_this_attempt, + finished_ids, + newly_timeout_accepted, + metrics_runner, + metrics_background_tasks, + ) + continue + + # Scan previous timeout attempts for one that passed + accepted_from_archive = False + for prev_attempt, prev_vr in cached.items(): + if prev_attempt == attempt_number: + continue + if prev_vr.passed: + logger.info( + f"Record {oid} timed out {timeout_attempt_counts[oid]} times, " + f"restoring attempt {prev_attempt} which passed LLM validation" + ) + # Restore the archived attempt directory + archive_dir = self.output_dir / "records" / f"{oid}_failed_attempt_{prev_attempt}" + record_dir = self.output_dir / "records" / oid + if archive_dir.exists(): + # Move current attempt out of the way, restore the passing one + if record_dir.exists(): + shutil.move( + str(record_dir), + str(self.output_dir / "records" / f"{oid}_failed_attempt_{attempt_number}"), + ) + shutil.move(str(archive_dir), str(record_dir)) + self._accept_timeout_record( + oid, + failed_this_attempt, + finished_ids, + newly_timeout_accepted, + metrics_runner, + metrics_background_tasks, + ) + accepted_from_archive = True + break + + if not accepted_from_archive: + logger.info( + f"Record {oid} timed out {timeout_attempt_counts[oid]} times, " + f"no attempt passed LLM validation — staying pending" + ) + + if newly_timeout_accepted: + timeout_accepted_ids.update(newly_timeout_accepted) + logger.info(f"{len(newly_timeout_accepted)} timeout records accepted via gate bypass") + 
pending_output_ids = failed_this_attempt if not pending_output_ids: @@ -415,6 +503,7 @@ async def _run_and_pipeline( "total_attempts": attempt_number, "failed_record_ids": sorted(final_failed_ids), "successful_record_ids": sorted(successful_ids), + "timeout_accepted_record_ids": sorted(timeout_accepted_ids), }, "rerun_history": rerun_history, "final_failures": final_failures, @@ -1028,6 +1117,33 @@ def settings_customise_sources(cls, settings_cls, init_settings, **kwargs): runner.output_dir = run_dir # Use existing output dir, don't create new return runner + def _accept_timeout_record( + self, + oid: str, + failed_this_attempt: list[str], + finished_ids: list[str], + newly_timeout_accepted: list[str], + metrics_runner: MetricsRunner | None, + metrics_background_tasks: list[asyncio.Task], + ) -> None: + """Accept a timeout record by updating result.json and scheduling metrics.""" + failed_this_attempt.remove(oid) + finished_ids.append(oid) + newly_timeout_accepted.append(oid) + # Update result.json with timeout_accepted flag + result_path = self.output_dir / "records" / oid / "result.json" + if result_path.exists(): + with open(result_path) as f: + result_data = json.load(f) + result_data["timeout_accepted"] = True + with open(result_path, "w") as f: + json.dump(result_data, f, indent=2) + # Fire metrics if runner available + if metrics_runner is not None: + rdir = self.output_dir / "records" / oid + task = asyncio.create_task(metrics_runner.run_and_save_record(oid, rdir)) + metrics_background_tasks.append(task) + def _archive_failed_attempt(self, record_id: str, attempt_number: int) -> None: """Archive a failed attempt before rerunning. 
diff --git a/src/eva/orchestrator/validation_runner.py b/src/eva/orchestrator/validation_runner.py index 259b2232..88096433 100644 --- a/src/eva/orchestrator/validation_runner.py +++ b/src/eva/orchestrator/validation_runner.py @@ -39,7 +39,7 @@ def __init__( self, run_dir: Path, dataset: list[EvaluationRecord], - thresholds: dict[str, float], + thresholds: dict[str, float | int], metric_configs: dict[str, dict] | None = None, output_ids: list[str] | None = None, ): @@ -103,7 +103,7 @@ async def run_validation(self) -> dict[str, ValidationResult]: return validation_results - async def validate_one(self, output_id: str) -> ValidationResult: + async def validate_one(self, output_id: str, *, skip_gate: bool = False) -> ValidationResult: """Validate a single record inline for per-record pipelining. Runs a two-phase check matching run_validation(): @@ -118,6 +118,10 @@ async def validate_one(self, output_id: str) -> ValidationResult: Args: output_id: Record directory name (e.g. "1.2.1" or "1.2.1/trial_0"). + skip_gate: If True, bypass the conversation_valid_end gate and run only + LLM metrics. Used for timeout-accepted records where the conversation + timed out (no goodbye event) but we still want to evaluate against + thresholds. Returns: ValidationResult with pass/fail details. 
@@ -141,23 +145,25 @@ async def validate_one(self, output_id: str) -> ValidationResult: record_dir = self.run_dir / "records" / output_id - # Phase 1: gate metric - gate_metrics = await self._shared_gate_runner.run_and_save_record(output_id, record_dir) - rm = gate_metrics - ms = rm.metrics.get(GATE_METRIC) if rm else None - if ms is None or ms.error: - return ValidationResult(passed=False) # empty failed_metrics = "not_finished" - score = ms.normalized_score if ms.normalized_score is not None else ms.score - if score != 1.0: - return ValidationResult(passed=False) # empty failed_metrics = "not_finished" - - # Phase 2: LLM metrics (gate passed) + if not skip_gate: + # Phase 1: gate metric + gate_metrics = await self._shared_gate_runner.run_and_save_record(output_id, record_dir) + rm = gate_metrics + ms = rm.metrics.get(GATE_METRIC) if rm else None + if ms is None or ms.error: + return ValidationResult(passed=False) # empty failed_metrics = "not_finished" + score = ms.normalized_score if ms.normalized_score is not None else ms.score + if score != 1.0: + return ValidationResult(passed=False) # empty failed_metrics = "not_finished" + + # Phase 2: LLM metrics (gate passed or skipped) llm_metrics = await self._shared_llm_runner.run_and_save_record(output_id, record_dir) if llm_metrics is None: return ValidationResult(passed=False, failed_metrics=list(LLM_METRICS)) vr = self._evaluate_record(output_id, llm_metrics, LLM_METRICS) - vr.scores[GATE_METRIC] = 1.0 + if not skip_gate: + vr.scores[GATE_METRIC] = 1.0 return vr @staticmethod @@ -226,8 +232,8 @@ def _evaluate_record( details[metric_name] = metric_score.details continue - threshold = self.thresholds.get(metric_name, 1.0) - if score < threshold: + threshold = float(self.thresholds.get(metric_name, 1.0)) + if score is None or score < threshold: logger.debug( f"Record {record_id}: Metric '{metric_name}' score {score:.2f} < threshold {threshold:.2f}" ) diff --git a/tests/fixtures/metric_signatures.json 
b/tests/fixtures/metric_signatures.json index fc79f392..25090759 100644 --- a/tests/fixtures/metric_signatures.json +++ b/tests/fixtures/metric_signatures.json @@ -35,6 +35,12 @@ "source_hash": "91b71c803d77", "version": "v0.1" }, + "ConversationTimeoutMetric": { + "name": "conversation_timed_out_rate", + "prompt_hash": null, + "source_hash": "a3d355be900f", + "version": "v0.1" + }, "ConversationValidEndMetric": { "name": "conversation_valid_end", "prompt_hash": null, @@ -92,7 +98,7 @@ "UserBehavioralFidelityMetric": { "name": "user_behavioral_fidelity", "prompt_hash": "06477144c28e", - "source_hash": "af8144bd7731", + "source_hash": "4f5d1f8e0b21", "version": "v0.1" }, "UserSpeechFidelityMetric": { From 7edbc44422fae28f5db2d454a6af4884327cbe77 Mon Sep 17 00:00:00 2001 From: Katrina Date: Thu, 4 Jun 2026 15:31:31 -0400 Subject: [PATCH 2/8] add ConversationTimeoutMetric to show the rate that conversations finished in time --- .../diagnostic/conversation_timeout.py | 35 +++++++++++++++++++ src/eva/models/config.py | 2 +- src/eva/orchestrator/runner.py | 1 - tests/fixtures/metric_signatures.json | 4 +-- 4 files changed, 38 insertions(+), 4 deletions(-) create mode 100644 src/eva/metrics/diagnostic/conversation_timeout.py diff --git a/src/eva/metrics/diagnostic/conversation_timeout.py b/src/eva/metrics/diagnostic/conversation_timeout.py new file mode 100644 index 00000000..75118316 --- /dev/null +++ b/src/eva/metrics/diagnostic/conversation_timeout.py @@ -0,0 +1,35 @@ +"""Conversation-timeout diagnostic metric.""" + +from eva.metrics.base import CodeMetric, MetricContext +from eva.metrics.registry import register_metric +from eva.models.results import MetricScore + + +@register_metric +class ConversationTimeoutMetric(CodeMetric): + """1.0 when the conversation finished within the time limit; 0.0 when it timed out.""" + + name = "conversation_finished_on_time" + version = "v0.1" + description = "Diagnostic metric: 1.0 when conversation finished within time limit, 0.0 on 
timeout" + category = "diagnostic" + exclude_from_pass_at_k = True + + async def compute(self, context: MetricContext) -> MetricScore: + try: + reason = context.conversation_ended_reason + timed_out = reason == "timeout" + score = 0.0 if timed_out else 1.0 + + return MetricScore( + name=self.name, + score=score, + normalized_score=score, + details={ + "conversation_ended_reason": reason, + "timed_out": timed_out, + }, + ) + + except Exception as e: + return self._handle_error(e, context) diff --git a/src/eva/models/config.py b/src/eva/models/config.py index d705e021..b0d83ba4 100644 --- a/src/eva/models/config.py +++ b/src/eva/models/config.py @@ -436,7 +436,7 @@ class ModelDeployment(DeploymentTypedDict): "max_timeout_attempts": 1, }, description="Validation metric thresholds and settings for rerun decisions (JSON). " - "max_timeout_attempts controls how many timeout attempts before accepting the last one for evaluation.", + "max_timeout_attempts sets the max number of attempts that timeout before accepting a run for evaluation. 
Default is 1.", ) # Multi-attempt (for pass@k evaluation) diff --git a/src/eva/orchestrator/runner.py b/src/eva/orchestrator/runner.py index 072cfc6b..9051d221 100644 --- a/src/eva/orchestrator/runner.py +++ b/src/eva/orchestrator/runner.py @@ -276,7 +276,6 @@ async def _run_and_pipeline( not_finished_ids: list[str] = [] failed_validation_ids: list[str] = [] validation_results: dict[str, ValidationResult] = {} - # Map output_id → ConversationResult for timeout detection result_map: dict[str, ConversationResult] = {} for output_id, _result, passed, vr in pipeline_results: diff --git a/tests/fixtures/metric_signatures.json b/tests/fixtures/metric_signatures.json index 25090759..7947fbbe 100644 --- a/tests/fixtures/metric_signatures.json +++ b/tests/fixtures/metric_signatures.json @@ -36,9 +36,9 @@ "version": "v0.1" }, "ConversationTimeoutMetric": { - "name": "conversation_timed_out_rate", + "name": "conversation_finished_on_time", "prompt_hash": null, - "source_hash": "a3d355be900f", + "source_hash": "5fcfcc42ae78", "version": "v0.1" }, "ConversationValidEndMetric": { From a9a0f7e9c8c8919272b538aeab2f6c9c1bde75ce Mon Sep 17 00:00:00 2001 From: Katrina Date: Thu, 4 Jun 2026 15:45:49 -0400 Subject: [PATCH 3/8] update conversation end reason if there was an inactivity timeout --- src/eva/metrics/validation/user_behavioral_fidelity.py | 5 +++++ tests/fixtures/metric_signatures.json | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/eva/metrics/validation/user_behavioral_fidelity.py b/src/eva/metrics/validation/user_behavioral_fidelity.py index de0d2e4d..32b96676 100644 --- a/src/eva/metrics/validation/user_behavioral_fidelity.py +++ b/src/eva/metrics/validation/user_behavioral_fidelity.py @@ -94,6 +94,11 @@ def get_prompt_variables(self, context: MetricContext, transcript_text: str) -> ) elif agent_timeout: conversation_end = "the agent's failure to respond to the final user turn." 
+ elif context.conversation_ended_reason == "inactivity_timeout": + conversation_end = ( + "an inactivity timeout — neither the user nor the agent spoke for an extended period. " + "The user did NOT end the call; the system terminated the conversation due to silence." + ) else: conversation_end = "the user calling the end_call tool." diff --git a/tests/fixtures/metric_signatures.json b/tests/fixtures/metric_signatures.json index 7947fbbe..35ef0564 100644 --- a/tests/fixtures/metric_signatures.json +++ b/tests/fixtures/metric_signatures.json @@ -98,7 +98,7 @@ "UserBehavioralFidelityMetric": { "name": "user_behavioral_fidelity", "prompt_hash": "06477144c28e", - "source_hash": "4f5d1f8e0b21", + "source_hash": "214ede84da72", "version": "v0.1" }, "UserSpeechFidelityMetric": { From aa1e254dc03d65a4437e5745eef7c7f4cbc9db50 Mon Sep 17 00:00:00 2001 From: Katrina Date: Thu, 4 Jun 2026 15:19:38 -0400 Subject: [PATCH 4/8] update default for conversation timeout --- .env.example | 2 +- src/eva/models/config.py | 2 +- tests/unit/models/test_config_models.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.env.example b/.env.example index dd78e6ef..62b9f072 100644 --- a/.env.example +++ b/.env.example @@ -172,7 +172,7 @@ EVA_MODEL__LLM=gpt-5.2 #i Conversation timeout in seconds. #d int #r 30,10000,10 -#v EVA_CONVERSATION_TIMEOUT_SECONDS=360 +#v EVA_CONVERSATION_TIMEOUT_SECONDS=600 #i Maximum rerun attempts for failed records. 
#d int diff --git a/src/eva/models/config.py b/src/eva/models/config.py index b0d83ba4..2b6c5b2b 100644 --- a/src/eva/models/config.py +++ b/src/eva/models/config.py @@ -487,7 +487,7 @@ class ModelDeployment(DeploymentTypedDict): description="Maximum number of concurrent conversations", ) conversation_timeout_seconds: int = Field( - 360, + 600, ge=30, le=10000, description="Timeout for each conversation in seconds", diff --git a/tests/unit/models/test_config_models.py b/tests/unit/models/test_config_models.py index 0500240f..f60d3e64 100644 --- a/tests/unit/models/test_config_models.py +++ b/tests/unit/models/test_config_models.py @@ -93,7 +93,7 @@ def test_create_minimal_config(self): # run_id = timestamp + model suffix (e.g. "2024-01-15_14-30-45.123456_nova-2_gpt-5.2_sonic") assert config.run_id.endswith("nova-2_gpt-5.2_sonic") assert config.max_concurrent_conversations == 1 - assert config.conversation_timeout_seconds == 360 + assert config.conversation_timeout_seconds == 600 def test_create_full_config(self, temp_dir: Path): """Test creating a RunConfig with all options.""" @@ -519,7 +519,7 @@ def test_defaults(self): assert c.model.stt == "deepgram" assert c.model.tts == "cartesia" assert c.max_concurrent_conversations == 1 - assert c.conversation_timeout_seconds == 360 + assert c.conversation_timeout_seconds == 600 assert c.base_port == 10000 assert c.port_pool_size == 150 assert c.max_rerun_attempts == 3 From c639d35c085c1ca0398fc3bc63b67c618d5a81b8 Mon Sep 17 00:00:00 2001 From: Katrina Date: Mon, 8 Jun 2026 11:32:54 -0400 Subject: [PATCH 5/8] pr improvements for readability --- .../diagnostic/conversation_timeout.py | 3 ++ .../validation/user_behavioral_fidelity.py | 4 +-- src/eva/orchestrator/runner.py | 31 +++++++++++-------- src/eva/orchestrator/validation_runner.py | 3 +- tests/fixtures/metric_signatures.json | 4 +-- 5 files changed, 27 insertions(+), 18 deletions(-) diff --git a/src/eva/metrics/diagnostic/conversation_timeout.py 
b/src/eva/metrics/diagnostic/conversation_timeout.py index 75118316..065b3d05 100644 --- a/src/eva/metrics/diagnostic/conversation_timeout.py +++ b/src/eva/metrics/diagnostic/conversation_timeout.py @@ -18,6 +18,9 @@ class ConversationTimeoutMetric(CodeMetric): async def compute(self, context: MetricContext) -> MetricScore: try: reason = context.conversation_ended_reason + # Note that `timeout` is treated differently from `inactivity_timeout`. `inactivity_timeout` + # indicates that there was a problem with the simulation whereas `timeout` indicates + # that the model could not complete the conversation on time. timed_out = reason == "timeout" score = 0.0 if timed_out else 1.0 diff --git a/src/eva/metrics/validation/user_behavioral_fidelity.py b/src/eva/metrics/validation/user_behavioral_fidelity.py index 32b96676..251058a9 100644 --- a/src/eva/metrics/validation/user_behavioral_fidelity.py +++ b/src/eva/metrics/validation/user_behavioral_fidelity.py @@ -89,14 +89,14 @@ def get_prompt_variables(self, context: MetricContext, transcript_text: str) -> ) if context.conversation_ended_reason == "timeout": conversation_end = ( - "a system timeout — the conversation exceeded the allowed time limit. " + "a system timeout - the conversation exceeded the allowed time limit. " "The user did NOT end the call; the system terminated the conversation." ) elif agent_timeout: conversation_end = "the agent's failure to respond to the final user turn." elif context.conversation_ended_reason == "inactivity_timeout": conversation_end = ( - "an inactivity timeout — neither the user nor the agent spoke for an extended period. " + "an inactivity timeout - neither the user nor the agent spoke for an extended period. " "The user did NOT end the call; the system terminated the conversation due to silence." 
) else: diff --git a/src/eva/orchestrator/runner.py b/src/eva/orchestrator/runner.py index 9051d221..0aa5b346 100644 --- a/src/eva/orchestrator/runner.py +++ b/src/eva/orchestrator/runner.py @@ -161,7 +161,7 @@ async def run(self, records: list[EvaluationRecord]) -> RunResult: rerun_history: dict[str, list[dict]] = {} timeout_attempt_counts: dict[str, int] = {} timeout_validation_cache: dict[str, dict[int, ValidationResult]] = {} - max_timeout_attempts = int(self.config.validation_thresholds.get("max_timeout_attempts", 3)) + max_timeout_attempts = int(self.config.validation_thresholds.get("max_timeout_attempts", 1)) timeout_accepted_ids: set[str] = set() started_at = datetime.now() @@ -343,7 +343,7 @@ async def _run_and_pipeline( if current_vr and current_vr.passed: logger.info( f"Record {oid} timed out {timeout_attempt_counts[oid]} times, " - f"current attempt passed LLM validation — accepting" + f"current attempt passed LLM validation - accepting" ) self._accept_timeout_record( oid, @@ -376,21 +376,26 @@ async def _run_and_pipeline( str(self.output_dir / "records" / f"{oid}_failed_attempt_{attempt_number}"), ) shutil.move(str(archive_dir), str(record_dir)) - self._accept_timeout_record( - oid, - failed_this_attempt, - finished_ids, - newly_timeout_accepted, - metrics_runner, - metrics_background_tasks, - ) - accepted_from_archive = True - break + self._accept_timeout_record( + oid, + failed_this_attempt, + finished_ids, + newly_timeout_accepted, + metrics_runner, + metrics_background_tasks, + ) + accepted_from_archive = True + break + else: + logger.warning( + f"Record {oid}: archive dir for attempt {prev_attempt} not found at " + f"{archive_dir} - cannot restore, skipping" + ) if not accepted_from_archive: logger.info( f"Record {oid} timed out {timeout_attempt_counts[oid]} times, " - f"no attempt passed LLM validation — staying pending" + f"no attempt passed LLM validation - staying pending" ) if newly_timeout_accepted: diff --git 
a/src/eva/orchestrator/validation_runner.py b/src/eva/orchestrator/validation_runner.py index 88096433..d727b365 100644 --- a/src/eva/orchestrator/validation_runner.py +++ b/src/eva/orchestrator/validation_runner.py @@ -234,8 +234,9 @@ def _evaluate_record( threshold = float(self.thresholds.get(metric_name, 1.0)) if score is None or score < threshold: + score_str = f"{score:.2f}" if score is not None else "None" logger.debug( - f"Record {record_id}: Metric '{metric_name}' score {score:.2f} < threshold {threshold:.2f}" + f"Record {record_id}: Metric '{metric_name}' score {score_str} < threshold {threshold:.2f}" ) failed_metrics.append(metric_name) if metric_score.details: diff --git a/tests/fixtures/metric_signatures.json b/tests/fixtures/metric_signatures.json index 35ef0564..039cc951 100644 --- a/tests/fixtures/metric_signatures.json +++ b/tests/fixtures/metric_signatures.json @@ -38,7 +38,7 @@ "ConversationTimeoutMetric": { "name": "conversation_finished_on_time", "prompt_hash": null, - "source_hash": "5fcfcc42ae78", + "source_hash": "16f8d343506c", "version": "v0.1" }, "ConversationValidEndMetric": { @@ -98,7 +98,7 @@ "UserBehavioralFidelityMetric": { "name": "user_behavioral_fidelity", "prompt_hash": "06477144c28e", - "source_hash": "214ede84da72", + "source_hash": "471117585c48", "version": "v0.1" }, "UserSpeechFidelityMetric": { From 1a4278677458581d4510d408e7a1b63caab78142 Mon Sep 17 00:00:00 2001 From: Katrina Date: Mon, 8 Jun 2026 15:32:20 -0400 Subject: [PATCH 6/8] update metric name to time limit instead of timeout --- .env.example | 4 +-- src/eva/metrics/diagnostic/__init__.py | 4 +-- ..._timeout.py => conversation_time_limit.py} | 10 +++---- src/eva/models/config.py | 8 ++--- src/eva/models/results.py | 4 +-- src/eva/orchestrator/runner.py | 30 +++++++++---------- src/eva/orchestrator/worker.py | 2 +- src/eva/run_benchmark.py | 2 +- tests/fixtures/metric_signatures.json | 6 ++-- tests/unit/models/test_config_models.py | 16 +++++----- 
tests/unit/orchestrator/test_worker.py | 2 +- 11 files changed, 44 insertions(+), 44 deletions(-) rename src/eva/metrics/diagnostic/{conversation_timeout.py => conversation_time_limit.py} (78%) diff --git a/.env.example b/.env.example index 62b9f072..b8a15245 100644 --- a/.env.example +++ b/.env.example @@ -169,10 +169,10 @@ EVA_MODEL__LLM=gpt-5.2 #r 1,100,1 #v EVA_MAX_CONCURRENT_CONVERSATIONS=1 -#i Conversation timeout in seconds. +#i Conversation time limit in seconds. #d int #r 30,10000,10 -#v EVA_CONVERSATION_TIMEOUT_SECONDS=600 +#v EVA_CONVERSATION_TIME_LIMIT_SECONDS=600 #i Maximum rerun attempts for failed records. #d int diff --git a/src/eva/metrics/diagnostic/__init__.py b/src/eva/metrics/diagnostic/__init__.py index eb3d5e17..02fced4a 100644 --- a/src/eva/metrics/diagnostic/__init__.py +++ b/src/eva/metrics/diagnostic/__init__.py @@ -2,7 +2,7 @@ from . import authentication_success # noqa from . import conversation_correctly_finished # noqa -from . import conversation_timeout # noqa +from . import conversation_time_limit # noqa from . import response_speed # noqa from . import speakability # noqa from . 
import stt_wer # noqa @@ -12,7 +12,7 @@ __all__ = [ "authentication_success", "conversation_correctly_finished", - "conversation_timeout", + "conversation_time_limit", "response_speed", "speakability", "stt_wer", diff --git a/src/eva/metrics/diagnostic/conversation_timeout.py b/src/eva/metrics/diagnostic/conversation_time_limit.py similarity index 78% rename from src/eva/metrics/diagnostic/conversation_timeout.py rename to src/eva/metrics/diagnostic/conversation_time_limit.py index 065b3d05..9b9da8aa 100644 --- a/src/eva/metrics/diagnostic/conversation_timeout.py +++ b/src/eva/metrics/diagnostic/conversation_time_limit.py @@ -6,10 +6,10 @@ @register_metric -class ConversationTimeoutMetric(CodeMetric): +class ConversationTimeLimitExceededMetric(CodeMetric): """1.0 when the conversation finished within the time limit; 0.0 when it timed out.""" - name = "conversation_finished_on_time" + name = "conversation_completed_on_time" version = "v0.1" description = "Diagnostic metric: 1.0 when conversation finished within time limit, 0.0 on timeout" category = "diagnostic" @@ -18,10 +18,10 @@ class ConversationTimeoutMetric(CodeMetric): async def compute(self, context: MetricContext) -> MetricScore: try: reason = context.conversation_ended_reason - # Note that `timeout` is treated differently from `inactivity_timeout`. `inactivity_timeout` - # indicates that there was a problem with the simulation whereas `timeout` indicates + # Note that `time_limit_exceeded` is treated differently from `inactivity_timeout`. `inactivity_timeout` + # indicates that there was a problem with the simulation whereas `time_limit_exceeded` indicates # that the model could not complete the conversation on time. 
- timed_out = reason == "timeout" + timed_out = reason == "time_limit_exceeded" score = 0.0 if timed_out else 1.0 return MetricScore( diff --git a/src/eva/models/config.py b/src/eva/models/config.py index b53d8317..7b1e5b6a 100644 --- a/src/eva/models/config.py +++ b/src/eva/models/config.py @@ -433,10 +433,10 @@ class ModelDeployment(DeploymentTypedDict): { "conversation_valid_end": 1.0, "user_behavioral_fidelity": 1.0, - "max_timeout_attempts": 1, + "max_time_limit_attempts": 1, }, description="Validation metric thresholds and settings for rerun decisions (JSON). " - "max_timeout_attempts sets the max number of attempts that timeout before accepting a run for evaluation. Default is 1.", + "max_time_limit_attempts sets the max number of attempts that timeout before accepting a run for evaluation. Default is 1.", ) # Multi-attempt (for pass@k evaluation) @@ -486,11 +486,11 @@ class ModelDeployment(DeploymentTypedDict): le=100, description="Maximum number of concurrent conversations", ) - conversation_timeout_seconds: int = Field( + conversation_time_limit_seconds: int = Field( 600, ge=30, le=10000, - description="Timeout for each conversation in seconds", + description="Max conversation duration in seconds", ) # Output diff --git a/src/eva/models/results.py b/src/eva/models/results.py index c242398e..6e024e7e 100644 --- a/src/eva/models/results.py +++ b/src/eva/models/results.py @@ -80,9 +80,9 @@ class ConversationResult(BaseModel): ) initial_scenario_db_hash: str | None = Field(None, description="SHA-256 hash of initial scenario database") final_scenario_db_hash: str | None = Field(None, description="SHA-256 hash of final scenario database") - timeout_accepted: bool = Field( + time_limit_accepted: bool = Field( False, - description="Whether this record was accepted after exhausting timeout attempts (gate bypass)", + description="Whether this record was accepted after exhausting time limit attempts (gate bypass)", ) diff --git a/src/eva/orchestrator/runner.py 
b/src/eva/orchestrator/runner.py index 0aa5b346..59e33a03 100644 --- a/src/eva/orchestrator/runner.py +++ b/src/eva/orchestrator/runner.py @@ -161,8 +161,8 @@ async def run(self, records: list[EvaluationRecord]) -> RunResult: rerun_history: dict[str, list[dict]] = {} timeout_attempt_counts: dict[str, int] = {} timeout_validation_cache: dict[str, dict[int, ValidationResult]] = {} - max_timeout_attempts = int(self.config.validation_thresholds.get("max_timeout_attempts", 1)) - timeout_accepted_ids: set[str] = set() + max_time_limit_attempts = int(self.config.validation_thresholds.get("max_time_limit_attempts", 1)) + time_limit_accepted_ids: set[str] = set() started_at = datetime.now() # Initialize port pool once before the attempt loop. @@ -328,13 +328,13 @@ async def _run_and_pipeline( rerun_history.setdefault(oid, []).append(entry) # Check for timeout-accepted records: records that have timed out - # max_timeout_attempts times get evaluated with gate bypass. + # max_time_limit_attempts times get evaluated with gate bypass. # The current attempt was already validated eagerly above; if it passes, # accept immediately. Otherwise, scan cached results from previous # timeout attempts and restore the archived directory if one passes. 
- newly_timeout_accepted: list[str] = [] + newly_time_limit_accepted: list[str] = [] for oid in list(failed_this_attempt): - if timeout_attempt_counts.get(oid, 0) < max_timeout_attempts: + if timeout_attempt_counts.get(oid, 0) < max_time_limit_attempts: continue cached = timeout_validation_cache.get(oid, {}) @@ -349,7 +349,7 @@ async def _run_and_pipeline( oid, failed_this_attempt, finished_ids, - newly_timeout_accepted, + newly_time_limit_accepted, metrics_runner, metrics_background_tasks, ) @@ -380,7 +380,7 @@ async def _run_and_pipeline( oid, failed_this_attempt, finished_ids, - newly_timeout_accepted, + newly_time_limit_accepted, metrics_runner, metrics_background_tasks, ) @@ -398,9 +398,9 @@ async def _run_and_pipeline( f"no attempt passed LLM validation - staying pending" ) - if newly_timeout_accepted: - timeout_accepted_ids.update(newly_timeout_accepted) - logger.info(f"{len(newly_timeout_accepted)} timeout records accepted via gate bypass") + if newly_time_limit_accepted: + time_limit_accepted_ids.update(newly_time_limit_accepted) + logger.info(f"{len(newly_time_limit_accepted)} timeout records accepted via gate bypass") pending_output_ids = failed_this_attempt @@ -507,7 +507,7 @@ async def _run_and_pipeline( "total_attempts": attempt_number, "failed_record_ids": sorted(final_failed_ids), "successful_record_ids": sorted(successful_ids), - "timeout_accepted_record_ids": sorted(timeout_accepted_ids), + "time_limit_accepted_record_ids": sorted(time_limit_accepted_ids), }, "rerun_history": rerun_history, "final_failures": final_failures, @@ -1126,20 +1126,20 @@ def _accept_timeout_record( oid: str, failed_this_attempt: list[str], finished_ids: list[str], - newly_timeout_accepted: list[str], + newly_time_limit_accepted: list[str], metrics_runner: MetricsRunner | None, metrics_background_tasks: list[asyncio.Task], ) -> None: """Accept a timeout record by updating result.json and scheduling metrics.""" failed_this_attempt.remove(oid) finished_ids.append(oid) - 
newly_timeout_accepted.append(oid) - # Update result.json with timeout_accepted flag + newly_time_limit_accepted.append(oid) + # Update result.json with time_limit_accepted flag result_path = self.output_dir / "records" / oid / "result.json" if result_path.exists(): with open(result_path) as f: result_data = json.load(f) - result_data["timeout_accepted"] = True + result_data["time_limit_accepted"] = True with open(result_path, "w") as f: json.dump(result_data, f, indent=2) # Fire metrics if runner available diff --git a/src/eva/orchestrator/worker.py b/src/eva/orchestrator/worker.py index 7cb055bc..e2fb6a57 100644 --- a/src/eva/orchestrator/worker.py +++ b/src/eva/orchestrator/worker.py @@ -157,7 +157,7 @@ async def run(self) -> ConversationResult: try: conversation_ended_reason = await asyncio.wait_for( self._run_conversation(), - timeout=self.config.conversation_timeout_seconds, + timeout=self.config.conversation_time_limit_seconds, ) logger.info(f"Conversation {self.record.id} ended: {conversation_ended_reason}") except TimeoutError: diff --git a/src/eva/run_benchmark.py b/src/eva/run_benchmark.py index 0fc3a7e7..71c446ac 100644 --- a/src/eva/run_benchmark.py +++ b/src/eva/run_benchmark.py @@ -113,7 +113,7 @@ async def run_benchmark(config: RunConfig) -> int: else: logger.info(f" S2S model: {config.model.s2s}") logger.info(f" Max concurrent: {config.max_concurrent_conversations}") - logger.info(f" Timeout: {config.conversation_timeout_seconds}s") + logger.info(f" Time limit: {config.conversation_time_limit_seconds}s") return 0 # Create and run benchmark diff --git a/tests/fixtures/metric_signatures.json b/tests/fixtures/metric_signatures.json index 039cc951..9a535613 100644 --- a/tests/fixtures/metric_signatures.json +++ b/tests/fixtures/metric_signatures.json @@ -35,10 +35,10 @@ "source_hash": "91b71c803d77", "version": "v0.1" }, - "ConversationTimeoutMetric": { - "name": "conversation_finished_on_time", + "ConversationTimeLimitExceededMetric": { + "name": 
"conversation_completed_on_time", "prompt_hash": null, - "source_hash": "16f8d343506c", + "source_hash": "85c95f1e1c13", "version": "v0.1" }, "ConversationValidEndMetric": { diff --git a/tests/unit/models/test_config_models.py b/tests/unit/models/test_config_models.py index 3ee45421..74417de9 100644 --- a/tests/unit/models/test_config_models.py +++ b/tests/unit/models/test_config_models.py @@ -93,7 +93,7 @@ def test_create_minimal_config(self): # run_id = timestamp + model suffix (e.g. "2024-01-15_14-30-45.123456_nova-2_gpt-5.2_sonic") assert config.run_id.endswith("nova-2_gpt-5.2_sonic") assert config.max_concurrent_conversations == 1 - assert config.conversation_timeout_seconds == 600 + assert config.conversation_time_limit_seconds == 600 def test_create_full_config(self, temp_dir: Path): """Test creating a RunConfig with all options.""" @@ -107,7 +107,7 @@ def test_create_full_config(self, temp_dir: Path): "EVA_MODEL__TTS_PARAMS": json.dumps({"api_key": "test_key", "model": "sonic"}), "EVA_RUN_ID": "test_run_001", "EVA_MAX_CONCURRENT_CONVERSATIONS": "50", - "EVA_CONVERSATION_TIMEOUT_SECONDS": "180", + "EVA_CONVERSATION_TIME_LIMIT_SECONDS": "180", "EVA_OUTPUT_DIR": str(temp_dir / "output"), "EVA_BASE_PORT": "8000", "EVA_PORT_POOL_SIZE": "200", @@ -149,9 +149,9 @@ def test_validation_bounds(self): with pytest.raises(ValueError): _config(env_vars=_BASE_ENV | {"EVA_MAX_CONCURRENT_CONVERSATIONS": "0"}) - # conversation_timeout_seconds too low + # conversation_time_limit_seconds too low with pytest.raises(ValueError): - _config(env_vars=_BASE_ENV | {"EVA_CONVERSATION_TIMEOUT_SECONDS": "10"}) + _config(env_vars=_BASE_ENV | {"EVA_CONVERSATION_TIME_LIMIT_SECONDS": "10"}) @pytest.mark.parametrize("indent", (None, 2)) @pytest.mark.parametrize("vars_location", ("env_vars", "env_file_vars")) @@ -519,7 +519,7 @@ def test_defaults(self): assert c.model.stt == "deepgram" assert c.model.tts == "cartesia" assert c.max_concurrent_conversations == 1 - assert 
c.conversation_timeout_seconds == 600 + assert c.conversation_time_limit_seconds == 600 assert c.base_port == 10000 assert c.port_pool_size == 150 assert c.max_rerun_attempts == 3 @@ -654,9 +654,9 @@ def test_max_concurrent_conversations(self): c = _config(env_vars=_BASE_ENV | {"EVA_MAX_CONCURRENT_CONVERSATIONS": "20"}) assert c.max_concurrent_conversations == 20 - def test_conversation_timeout_seconds(self): - c = _config(env_vars=_BASE_ENV | {"EVA_CONVERSATION_TIMEOUT_SECONDS": "600"}) - assert c.conversation_timeout_seconds == 600 + def test_conversation_time_limit_seconds(self): + c = _config(env_vars=_BASE_ENV | {"EVA_CONVERSATION_TIME_LIMIT_SECONDS": "600"}) + assert c.conversation_time_limit_seconds == 600 def test_base_port(self): c = _config(env_vars=_BASE_ENV | {"EVA_BASE_PORT": "8000"}) diff --git a/tests/unit/orchestrator/test_worker.py b/tests/unit/orchestrator/test_worker.py index 97540a82..5497a6b3 100644 --- a/tests/unit/orchestrator/test_worker.py +++ b/tests/unit/orchestrator/test_worker.py @@ -33,7 +33,7 @@ def test_nearest_rank_rounds_up(self): def _make_worker(tmp_path: Path) -> ConversationWorker: config = MagicMock() - config.conversation_timeout_seconds = 60 + config.conversation_time_limit_seconds = 60 record = MagicMock() record.id = "test-record" record.current_date_time = "2026-01-01T00:00:00" From be24d1da9bef34645584b98beaed27f5680e0c16 Mon Sep 17 00:00:00 2001 From: Katrina Date: Mon, 8 Jun 2026 15:45:49 -0400 Subject: [PATCH 7/8] update metric name to time limit instead of timeout --- .env.example | 60 +- Dockerfile | 11 +- README.md | 43 + apps/README.md | 106 +- apps/analysis.py | 10 +- apps/config_editor.py | 62 +- apps/config_io.py | 7 +- apps/config_schema.py | 6 +- configs/agents/airline_agent.yaml | 11 +- configs/agents/itsm_agent.yaml | 2 +- configs/agents/medical_hr_agent.yaml | 2 +- configs/prompts/judge.yaml | 77 +- configs/prompts/simulation.yaml | 81 +- data/airline_dataset.json | 2893 ++++++-- 
data/airline_scenarios/1.1.2.json | 8 +- data/airline_scenarios/1.1.3.json | 8 +- data/airline_scenarios/1.1.4.json | 8 +- data/airline_scenarios/1.1.5.json | 8 +- data/airline_scenarios/1.2.1.json | 8 +- data/airline_scenarios/1.2.2.json | 8 +- data/airline_scenarios/1.2.3.json | 8 +- data/airline_scenarios/1.3.1.json | 8 +- data/airline_scenarios/1.3.2.json | 8 +- data/airline_scenarios/2.1.1.json | 8 +- data/airline_scenarios/2.1.2.json | 8 +- data/airline_scenarios/2.1.6.json | 8 +- data/airline_scenarios/2.2.2.json | 8 +- data/airline_scenarios/2.2.4.json | 8 +- data/airline_scenarios/2.2.5.json | 8 +- data/airline_scenarios/2.3.2.json | 8 +- data/airline_scenarios/2.3.4.json | 8 +- data/airline_scenarios/2.4.1.json | 8 +- data/airline_scenarios/2.4.2.json | 8 +- data/airline_scenarios/3.1.3.json | 8 +- data/airline_scenarios/3.1.5.json | 8 +- data/airline_scenarios/3.3.4.json | 8 +- data/airline_scenarios/4.1.1.json | 8 +- data/airline_scenarios/4.1.2.json | 8 +- data/airline_scenarios/4.1.3.json | 8 +- data/airline_scenarios/4.1.5.json | 8 +- data/airline_scenarios/4.2.1.json | 8 +- data/airline_scenarios/4.2.4.json | 8 +- data/airline_scenarios/4.2.5.json | 8 +- data/airline_scenarios/5.1.1.json | 8 +- data/airline_scenarios/5.1.2.json | 8 +- data/airline_scenarios/5.1.3.json | 8 +- data/airline_scenarios/5.1.5.json | 8 +- data/airline_scenarios/5.2.1.json | 8 +- data/airline_scenarios/5.2.2.json | 8 +- data/airline_scenarios/5.2.5.json | 8 +- data/airline_scenarios/5.2.6.json | 8 +- data/airline_scenarios/6.1.1.json | 8 +- data/airline_scenarios/6.1.4.json | 8 +- data/airline_scenarios/6.3.1.json | 8 +- data/airline_scenarios/6.3.4.json | 8 +- data/airline_scenarios/7.1.1.json | 8 +- data/airline_scenarios/7.2.1.json | 8 +- data/airline_scenarios/7.2.2.json | 8 +- data/airline_scenarios/7.2.5.json | 8 +- data/airline_scenarios/7.2.6.json | 8 +- data/airline_scenarios/7.2.8.json | 8 +- data/airline_scenarios/7.2.9.json | 8 +- 
data/airline_scenarios/7.3.1.json | 8 +- data/airline_scenarios/7.4.1.json | 8 +- data/itsm_aliases/adobe_acrobat_pro.json | 10 + data/itsm_aliases/adobe_creative_cloud.json | 10 + data/itsm_aliases/alpha_garage.json | 24 + data/itsm_aliases/central_garage.json | 21 + data/itsm_aliases/cisco_anyconnect.json | 9 + data/itsm_aliases/confluence.json | 10 + data/itsm_aliases/datadog.json | 9 + data/itsm_aliases/docker_desktop.json | 9 + data/itsm_aliases/downtown.json | 29 + .../downtown_engineering_center.json | 32 + data/itsm_aliases/downtown_garage_a.json | 30 + data/itsm_aliases/downtown_office.json | 31 + data/itsm_aliases/east_campus.json | 17 + data/itsm_aliases/east_campus_garage.json | 24 + data/itsm_aliases/executive_garage.json | 34 + data/itsm_aliases/figma.json | 8 + data/itsm_aliases/garage_a.json | 18 + data/itsm_aliases/garage_b.json | 23 + data/itsm_aliases/general_garage.json | 25 + data/itsm_aliases/general_surface_lot.json | 33 + .../harbor_engineering_center.json | 32 + data/itsm_aliases/headquarters.json | 31 + data/itsm_aliases/headquarters_east.json | 34 + data/itsm_aliases/headquarters_garage.json | 27 + data/itsm_aliases/headquarters_garage_a.json | 25 + data/itsm_aliases/intellij_idea.json | 11 + data/itsm_aliases/jira.json | 9 + data/itsm_aliases/lucidchart.json | 9 + data/itsm_aliases/main_garage.json | 21 + data/itsm_aliases/microsoft_365_apps.json | 8 + data/itsm_aliases/microsoft_365_e3.json | 8 + data/itsm_aliases/microsoft_project.json | 8 + data/itsm_aliases/microsoft_visio.json | 9 + data/itsm_aliases/north_garage.json | 22 + data/itsm_aliases/north_surface_lot.json | 35 + data/itsm_aliases/operations_center.json | 31 + .../riverside_engineering_center.json | 30 + data/itsm_aliases/salesforce.json | 10 + data/itsm_aliases/slack.json | 11 + data/itsm_aliases/south_garage.json | 19 + .../visual_studio_enterprise.json | 10 + data/itsm_dataset.json | 6378 ++++++++++------- data/itsm_scenarios/1.json | 25 +- data/itsm_scenarios/10.json | 
24 +- data/itsm_scenarios/100.json | 22 +- data/itsm_scenarios/101.json | 25 +- data/itsm_scenarios/102.json | 22 +- data/itsm_scenarios/103.json | 24 +- data/itsm_scenarios/11.json | 22 +- data/itsm_scenarios/12.json | 26 +- data/itsm_scenarios/13.json | 23 +- data/itsm_scenarios/14.json | 26 +- data/itsm_scenarios/15.json | 30 +- data/itsm_scenarios/16.json | 31 +- data/itsm_scenarios/17.json | 28 +- data/itsm_scenarios/18.json | 26 +- data/itsm_scenarios/19.json | 10 +- data/itsm_scenarios/2.json | 28 +- data/itsm_scenarios/20.json | 22 +- data/itsm_scenarios/21.json | 26 +- data/itsm_scenarios/22.json | 24 +- data/itsm_scenarios/23.json | 12 +- data/itsm_scenarios/24.json | 22 +- data/itsm_scenarios/25.json | 13 +- data/itsm_scenarios/26.json | 24 +- data/itsm_scenarios/27.json | 24 +- data/itsm_scenarios/28.json | 22 +- data/itsm_scenarios/29.json | 21 +- data/itsm_scenarios/30.json | 20 +- data/itsm_scenarios/33.json | 21 +- data/itsm_scenarios/35.json | 24 +- data/itsm_scenarios/37.json | 26 +- data/itsm_scenarios/38.json | 10 +- data/itsm_scenarios/39.json | 12 +- data/itsm_scenarios/4.json | 35 +- data/itsm_scenarios/40.json | 28 +- data/itsm_scenarios/41.json | 31 +- data/itsm_scenarios/42.json | 20 +- data/itsm_scenarios/43.json | 21 +- data/itsm_scenarios/44.json | 26 +- data/itsm_scenarios/45.json | 21 +- data/itsm_scenarios/46.json | 25 +- data/itsm_scenarios/5.json | 27 +- data/itsm_scenarios/51.json | 25 +- data/itsm_scenarios/52.json | 21 +- data/itsm_scenarios/54.json | 27 +- data/itsm_scenarios/56.json | 26 +- data/itsm_scenarios/58.json | 24 +- data/itsm_scenarios/6.json | 19 +- data/itsm_scenarios/60.json | 21 +- data/itsm_scenarios/62.json | 26 +- data/itsm_scenarios/63.json | 24 +- data/itsm_scenarios/64.json | 24 +- data/itsm_scenarios/65.json | 24 +- data/itsm_scenarios/66.json | 21 +- data/itsm_scenarios/68.json | 21 +- data/itsm_scenarios/69.json | 24 +- data/itsm_scenarios/7.json | 24 +- data/itsm_scenarios/71.json | 8 +- 
data/itsm_scenarios/72.json | 28 +- data/itsm_scenarios/73.json | 19 +- data/itsm_scenarios/74.json | 28 +- data/itsm_scenarios/75.json | 24 +- data/itsm_scenarios/76.json | 23 +- data/itsm_scenarios/77.json | 21 +- data/itsm_scenarios/78.json | 14 +- data/itsm_scenarios/79.json | 30 +- data/itsm_scenarios/8.json | 15 +- data/itsm_scenarios/80.json | 36 +- data/itsm_scenarios/81.json | 25 +- data/itsm_scenarios/82.json | 25 +- data/itsm_scenarios/85.json | 13 +- data/itsm_scenarios/86.json | 28 +- data/itsm_scenarios/88.json | 25 +- data/itsm_scenarios/89.json | 17 +- data/itsm_scenarios/90.json | 24 +- data/itsm_scenarios/91.json | 24 +- data/itsm_scenarios/95.json | 27 +- data/itsm_scenarios/96.json | 22 +- data/itsm_scenarios/97.json | 16 +- data/itsm_scenarios/98.json | 25 +- data/itsm_scenarios/99.json | 21 +- data/medical_hr_dataset.json | 3610 ++++++---- data/medical_hr_scenarios/1.1.json | 8 +- data/medical_hr_scenarios/1.2.json | 8 +- data/medical_hr_scenarios/10.1.json | 4 +- data/medical_hr_scenarios/10.2.json | 4 +- data/medical_hr_scenarios/11.1.json | 4 +- data/medical_hr_scenarios/11.2.json | 8 +- data/medical_hr_scenarios/12.1.json | 4 +- data/medical_hr_scenarios/12.2.json | 8 +- data/medical_hr_scenarios/2.1.json | 8 +- data/medical_hr_scenarios/2.2.json | 8 +- data/medical_hr_scenarios/3.1.json | 8 +- data/medical_hr_scenarios/3.2.json | 8 +- data/medical_hr_scenarios/4.1.json | 8 +- data/medical_hr_scenarios/4.2.json | 4 +- data/medical_hr_scenarios/5.1.json | 8 +- data/medical_hr_scenarios/5.2.json | 8 +- data/medical_hr_scenarios/6.1.json | 8 +- data/medical_hr_scenarios/6.2.json | 8 +- data/medical_hr_scenarios/7.1.json | 4 +- data/medical_hr_scenarios/7.2.json | 4 +- data/medical_hr_scenarios/8.1.json | 8 +- data/medical_hr_scenarios/8.2.json | 8 +- data/medical_hr_scenarios/9.1.json | 8 +- data/medical_hr_scenarios/9.2.json | 4 +- data/medical_hr_scenarios/A1.json | 8 +- data/medical_hr_scenarios/A10.json | 4 +- 
data/medical_hr_scenarios/A2.json | 8 +- data/medical_hr_scenarios/A3.json | 8 +- data/medical_hr_scenarios/A4.json | 8 +- data/medical_hr_scenarios/A5.json | 8 +- data/medical_hr_scenarios/A6.json | 8 +- data/medical_hr_scenarios/A7.json | 4 +- data/medical_hr_scenarios/A9.json | 4 +- data/medical_hr_scenarios/D1.1.json | 8 +- data/medical_hr_scenarios/D1.2.json | 8 +- data/medical_hr_scenarios/D1.3.json | 8 +- data/medical_hr_scenarios/D10.1.json | 4 +- data/medical_hr_scenarios/D10.2.json | 4 +- data/medical_hr_scenarios/D10.3.json | 8 +- data/medical_hr_scenarios/D2.1.json | 8 +- data/medical_hr_scenarios/D2.2.json | 8 +- data/medical_hr_scenarios/D2.3.json | 8 +- data/medical_hr_scenarios/D3.1.json | 8 +- data/medical_hr_scenarios/D3.2.json | 8 +- data/medical_hr_scenarios/D3.3.json | 8 +- data/medical_hr_scenarios/D4.1.json | 8 +- data/medical_hr_scenarios/D4.2.json | 8 +- data/medical_hr_scenarios/D4.3.json | 8 +- data/medical_hr_scenarios/D5.1.json | 8 +- data/medical_hr_scenarios/D5.2.json | 8 +- data/medical_hr_scenarios/D5.3.json | 8 +- data/medical_hr_scenarios/D6.1.json | 8 +- data/medical_hr_scenarios/D6.2.json | 8 +- data/medical_hr_scenarios/D6.3.json | 8 +- data/medical_hr_scenarios/D7.1.json | 4 +- data/medical_hr_scenarios/D7.2.json | 8 +- data/medical_hr_scenarios/D7.3.json | 8 +- data/medical_hr_scenarios/D8.1.json | 8 +- data/medical_hr_scenarios/D8.2.json | 8 +- data/medical_hr_scenarios/D8.3.json | 8 +- data/medical_hr_scenarios/D9.1.json | 4 +- data/medical_hr_scenarios/D9.3.json | 4 +- data/medical_hr_scenarios/T1.1.json | 8 +- data/medical_hr_scenarios/T1.2.json | 8 +- data/medical_hr_scenarios/T1.3.json | 8 +- data/medical_hr_scenarios/T2.1.json | 8 +- data/medical_hr_scenarios/T2.2.json | 8 +- data/medical_hr_scenarios/T2.3.json | 8 +- data/medical_hr_scenarios/T3.1.json | 8 +- data/medical_hr_scenarios/T3.2.json | 8 +- data/medical_hr_scenarios/T3.3.json | 8 +- data/medical_hr_scenarios/T4.1.json | 4 +- 
data/medical_hr_scenarios/T4.2.json | 4 +- data/medical_hr_scenarios/T4.3.json | 8 +- data/medical_hr_scenarios/T5.1.json | 8 +- data/medical_hr_scenarios/T5.2.json | 4 +- data/medical_hr_scenarios/T5.3.json | 8 +- data/medical_hr_scenarios/T6.1.json | 4 +- data/medical_hr_scenarios/T6.2.json | 8 +- data/medical_hr_scenarios/T6.3.json | 4 +- data/medical_hr_scenarios/T7.1.json | 8 +- data/medical_hr_scenarios/T7.2.json | 8 +- data/medical_hr_scenarios/T7.3.json | 8 +- docs/metrics/stt_wer.md | 10 + scripts/regen_metric_signatures.py | 75 +- scripts/run_text_only.py | 65 +- src/eva/__init__.py | 4 +- src/eva/assistant/agentic/audit_log.py | 6 +- src/eva/assistant/agentic/system.py | 16 +- src/eva/assistant/audio_bridge.py | 2 +- src/eva/assistant/base_server.py | 10 +- src/eva/assistant/elevenlabs_server.py | 8 +- src/eva/assistant/gemini_live_server.py | 15 +- src/eva/assistant/openai_realtime_server.py | 8 +- src/eva/assistant/pipecat_server.py | 13 +- src/eva/assistant/pipeline/alm_base.py | 21 + src/eva/assistant/pipeline/alm_gemini.py | 5 +- src/eva/assistant/pipeline/alm_vllm.py | 5 +- .../assistant/pipeline/audio_llm_processor.py | 4 +- src/eva/assistant/pipeline/observers.py | 4 +- src/eva/assistant/pipeline/services.py | 115 +- src/eva/assistant/tools/itsm_tools.py | 22 +- .../metrics/accuracy/agent_speech_fidelity.py | 2 +- .../accuracy/agent_speech_fidelity_s2s.py | 11 +- src/eva/metrics/accuracy/faithfulness.py | 2 +- src/eva/metrics/base.py | 8 +- src/eva/metrics/diagnostic/speakability.py | 2 +- src/eva/metrics/diagnostic/stt_wer.py | 50 +- .../transcription_accuracy_key_entities.py | 2 +- src/eva/metrics/processor.py | 2 +- src/eva/metrics/runner.py | 39 +- src/eva/metrics/speech_fidelity_base.py | 19 +- .../validation/user_behavioral_fidelity.py | 7 +- .../validation/user_speech_fidelity.py | 2 +- src/eva/models/config.py | 60 +- src/eva/models/record.py | 15 +- src/eva/orchestrator/runner.py | 69 +- src/eva/orchestrator/validation_runner.py | 2 +- 
src/eva/orchestrator/worker.py | 49 +- src/eva/user_simulator/client.py | 19 +- src/eva/user_simulator/event_logger.py | 2 +- .../utils/wer_normalization/normalizers.py | 13 - src/eva/utils/wer_normalization/wer_utils.py | 44 +- .../whisper_normalizer/__init__.py | 5 +- .../whisper_normalizer/basic.py | 16 + .../whisper_normalizer/english.json | 1741 ----- .../whisper_normalizer/english.py | 544 -- tests/fixtures/metric_signatures.json | 46 +- tests/integration/test_metrics_end_to_end.py | 11 +- tests/unit/test_config_io.py | 2 +- tests/unit/utils/test_wer_utils.py | 278 +- 318 files changed, 11470 insertions(+), 8983 deletions(-) create mode 100644 data/itsm_aliases/adobe_acrobat_pro.json create mode 100644 data/itsm_aliases/adobe_creative_cloud.json create mode 100644 data/itsm_aliases/alpha_garage.json create mode 100644 data/itsm_aliases/central_garage.json create mode 100644 data/itsm_aliases/cisco_anyconnect.json create mode 100644 data/itsm_aliases/confluence.json create mode 100644 data/itsm_aliases/datadog.json create mode 100644 data/itsm_aliases/docker_desktop.json create mode 100644 data/itsm_aliases/downtown.json create mode 100644 data/itsm_aliases/downtown_engineering_center.json create mode 100644 data/itsm_aliases/downtown_garage_a.json create mode 100644 data/itsm_aliases/downtown_office.json create mode 100644 data/itsm_aliases/east_campus.json create mode 100644 data/itsm_aliases/east_campus_garage.json create mode 100644 data/itsm_aliases/executive_garage.json create mode 100644 data/itsm_aliases/figma.json create mode 100644 data/itsm_aliases/garage_a.json create mode 100644 data/itsm_aliases/garage_b.json create mode 100644 data/itsm_aliases/general_garage.json create mode 100644 data/itsm_aliases/general_surface_lot.json create mode 100644 data/itsm_aliases/harbor_engineering_center.json create mode 100644 data/itsm_aliases/headquarters.json create mode 100644 data/itsm_aliases/headquarters_east.json create mode 100644 
data/itsm_aliases/headquarters_garage.json create mode 100644 data/itsm_aliases/headquarters_garage_a.json create mode 100644 data/itsm_aliases/intellij_idea.json create mode 100644 data/itsm_aliases/jira.json create mode 100644 data/itsm_aliases/lucidchart.json create mode 100644 data/itsm_aliases/main_garage.json create mode 100644 data/itsm_aliases/microsoft_365_apps.json create mode 100644 data/itsm_aliases/microsoft_365_e3.json create mode 100644 data/itsm_aliases/microsoft_project.json create mode 100644 data/itsm_aliases/microsoft_visio.json create mode 100644 data/itsm_aliases/north_garage.json create mode 100644 data/itsm_aliases/north_surface_lot.json create mode 100644 data/itsm_aliases/operations_center.json create mode 100644 data/itsm_aliases/riverside_engineering_center.json create mode 100644 data/itsm_aliases/salesforce.json create mode 100644 data/itsm_aliases/slack.json create mode 100644 data/itsm_aliases/south_garage.json create mode 100644 data/itsm_aliases/visual_studio_enterprise.json delete mode 100644 src/eva/utils/wer_normalization/normalizers.py delete mode 100644 src/eva/utils/wer_normalization/whisper_normalizer/english.json delete mode 100644 src/eva/utils/wer_normalization/whisper_normalizer/english.py diff --git a/.env.example b/.env.example index b8a15245..570434ed 100644 --- a/.env.example +++ b/.env.example @@ -58,6 +58,11 @@ AWS_SECRET_ACCESS_KEY=your_aws_secret_access_key_here # Voice Pipeline # ============================================== +#i LLM model alias for the assistant. Must match a model_name in EVA_MODEL_LIST. +#d enum +#x pipeline_mode=LLM +EVA_MODEL__LLM=gpt-5.2 + # Pipeline mode is controlled by the UI radio (LLM / S2S / AudioLLM). # The #x conditions below ensure each variable is only active for the right mode. 
@@ -73,16 +78,16 @@ EVA_MODEL__STT=cartesia #x pipeline_mode=LLM EVA_MODEL__STT_PARAMS='{"api_key": "your_cartesia_api_key", "model": "ink-whisper"}' -# --- LLM mode: TTS --- +# --- TTS (LLM and AudioLLM modes) --- #i TTS provider for the voice pipeline. #d enum #e cartesia,chatterbox,elevenlabs,gemini,kokoro,nvidia-baseten,openai,xtts -#x pipeline_mode=LLM +#x pipeline_mode=LLM,AudioLLM EVA_MODEL__TTS=cartesia #i TTS provider parameters. Must include "api_key" and "model". Use "urls" for round-robin load balancing. #d json_object -#x pipeline_mode=LLM +#x pipeline_mode=LLM,AudioLLM EVA_MODEL__TTS_PARAMS='{"api_key": "your_cartesia_api_key", "model": "sonic"}' # --- S2S mode --- @@ -150,11 +155,6 @@ EVA_MODEL_LIST='[ } ]' -#i LLM model alias for the assistant. Must match a model_name in EVA_MODEL_LIST. -#d enum -#x pipeline_mode=LLM -EVA_MODEL__LLM=gpt-5.2 - # ============================================== # Framework & Runtime # ============================================== @@ -234,6 +234,50 @@ EVA_MODEL__LLM=gpt-5.2 # User Config # ============================================== +# --- Language (mutually exclusive with Accent and Behavior) --- +#i ISO 639-1 language code for the user simulator. Datasets must exist for the selected language. Pattern for the agent ID pairs below: EVA_{LANG}_USER_{F|M}. +#d enum +#e en,fr,fr-CA,fr-ca +#x perturbation_mode=Language +#v EVA_LANGUAGE=en + +# --- Language agent IDs --- +#i ElevenLabs agent ID — Canadian French, female voice. +#d string +#x perturbation_mode=Language +#x EVA_LANGUAGE=fr-CA +#v EVA_FR_CA_USER_F= + +#i ElevenLabs agent ID — Canadian French, male voice. +#d string +#x perturbation_mode=Language +#x EVA_LANGUAGE=fr-CA +#v EVA_FR_CA_USER_M= + +#i ElevenLabs agent ID — European French, female voice. +#d string +#x perturbation_mode=Language +#x EVA_LANGUAGE=fr +#v EVA_FR_USER_F= + +#i ElevenLabs agent ID — European French, male voice. 
+#d string +#x perturbation_mode=Language +#x EVA_LANGUAGE=fr +#v EVA_FR_USER_M= + +#i ElevenLabs agent ID — English, female voice. +#d string +#x perturbation_mode=Language +#x EVA_LANGUAGE=en +#v EVA_EN_USER_F= + +#i ElevenLabs agent ID — English, male voice. +#d string +#x perturbation_mode=Language +#x EVA_LANGUAGE=en +#v EVA_EN_USER_M= + # --- Default user simulator agents --- #i ElevenLabs agent ID for the default female-voice user persona. #d string diff --git a/Dockerfile b/Dockerfile index fbb1d789..afe51b83 100644 --- a/Dockerfile +++ b/Dockerfile @@ -14,16 +14,19 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ git \ && rm -rf /var/lib/apt/lists/* -# Copy dependency files and source code +# Install dependencies — cached as long as pyproject.toml doesn't change COPY pyproject.toml README.md ./ -COPY src/ ./src/ - -# Install dependencies into a virtual environment +# Stub src so hatchling can resolve the package during dep install +RUN mkdir -p src/eva && echo '__version__ = "0.0.0"' > src/eva/__init__.py RUN python -m venv /opt/venv ENV PATH="/opt/venv/bin:$PATH" RUN pip install --no-cache-dir --upgrade pip && \ pip install --no-cache-dir . +# Copy real source and reinstall only the package (deps already cached above) +COPY src/ ./src/ +RUN pip install --no-cache-dir --no-deps . + # ============================================ # Stage 2: Runtime # ============================================ diff --git a/README.md b/README.md index d28951f9..89d77bde 100644 --- a/README.md +++ b/README.md @@ -162,6 +162,49 @@ streamlit run apps/config_editor.py The editor covers all variables grouped by tab (API keys, voice pipeline, model deployments, runtime settings, perturbations, etc.), with proper widgets for each type. See [`apps/README.md`](apps/README.md) for details. +### Adding a Language + +**1. 
Run `add_culture_data.py`** — handles all one-time setup: generates culturally appropriate names and translated utterances for every dataset record, writes a "respond in X" addendum to `configs/agents/language_addenda.yaml`, translates the assistant's opening greeting into `configs/agents/initial_messages.yaml`, generates a WER normalizer config, and patches `.env.example` with the new agent ID stubs. + +```bash +PYTHONPATH=src python scripts/add_culture_data.py \ + --language it \ + --language-name Italian \ + --native-name italiano \ + --auto-generate-names +``` + +Re-running is safe — existing entries are skipped (idempotent). Use `--dry-run` to preview changes before writing. + +For languages with significant regional spelling divergence (e.g. Portuguese, where pt-BR and pt-PT differ orthographically), pass `--include-spelling-variation` to also generate a spelling normalization map used during WER evaluation: + +```bash +PYTHONPATH=src python scripts/add_culture_data.py \ + --language pt \ + --language-name Portuguese \ + --auto-generate-names \ + --include-spelling-variation +``` + +See the script's `--help` for the full argument reference. + +**2. Add your ElevenLabs agent IDs** — the script adds the variable stubs to `.env.example`; fill in the values in your `.env` (or use the config editor's **User Config** tab): + +```bash +EVA_IT_USER_F=your_elevenlabs_agent_id_female +EVA_IT_USER_M=your_elevenlabs_agent_id_male +``` + +**3. Set `EVA_LANGUAGE` and run**: + +```bash +EVA_LANGUAGE=it EVA_DOMAIN=airline python main.py +``` + +#### WER normalization for new languages + +The `add_culture_data.py` script automatically generates a set of normalization rules used for WER calculation. For the full implications of this auto-generation, see [metrics/stt_wer.md](docs/metrics/stt_wer.md). 
+ ### Exploring Results EVA includes a Streamlit analysis app for visualizing and comparing results: diff --git a/apps/README.md b/apps/README.md index 69a524c2..3a457ac7 100644 --- a/apps/README.md +++ b/apps/README.md @@ -12,7 +12,111 @@ Interactive UI for building and editing `.env` configuration files without hand- streamlit run apps/config_editor.py ``` -The app reads `.env.example` for the full variable set and loads existing values from `.env` if present. Each variable's widget type, enum options, ranges, and tooltips are declared directly in `.env.example` using annotation prefixes (`#i`, `#d`, `#e`, `#r`, `#x`, `#v`). Use the **Preview** button to inspect the generated file before saving, or **Download** to export it without writing to disk. +The app reads `.env.example` for the full variable set and loads existing values from `.env` if present. Each variable's widget type, enum options, ranges, and tooltips are declared directly in `.env.example` using annotation prefixes. Use the **Preview** button to inspect the generated file before saving, or **Download** to export it without writing to disk. + +### `.env.example` Annotation Scheme + +The editor is driven entirely from annotated comments in `.env.example` — there is no separate schema file. Each annotation prefix applies to the **immediately following** variable definition (active or inactive). Annotation order doesn't matter, but the block must be contiguous: any blank line or `# ` true-comment between annotations resets the accumulator. + +| Prefix | Name | Purpose | +|---|---|---| +| `# ` | True comment | Human-readable prose. Preserved verbatim, never parsed as metadata. | +| `#i ` | Info | Tooltip text shown next to the widget. Multiple `#i` lines join with spaces. | +| `#d ` | Datatype | Widget type — see table below. If omitted, inferred from name + value. | +| `#e ` | Enum options | Comma-separated valid values for `enum` / `multi_enum`. 
| +| `#r ` | Range | Numeric `min,max` or `min,max,step` for `int` / `float`. | +| `#g ` | Group | Override tab assignment (otherwise inherited from section header). | +| `#x ` | Condition | `VAR=value` — only render when that var equals that value. Comma-separated values are OR (`#x pipeline_mode=LLM,AudioLLM`). Multiple `#x` lines = AND. | +| `#v ` | Inactive var | `#v VARNAME=value` — a variable definition that ships off by default but is fully configurable. | + +#### Widget types (`#d`) + +| Type | Renders as | +|---|---| +| `string` | `st.text_input` | +| `secret` | `st.text_input(type="password")` | +| `bool` | `st.checkbox` | +| `int` | `st.number_input` (integers, range from `#r`) | +| `float` | `st.number_input` (floats, range from `#r`) | +| `enum` | `st.selectbox` (options from `#e`) | +| `multi_enum` | `st.multiselect` (options from `#e`) | +| `csv_list` | `st.text_input` split/joined on comma | +| `path` | `st.text_input` with existence hint | +| `json_object` | Key/value table + raw JSON expander | +| `json_deployment_list` | Special-cased deployment-card editor for `EVA_MODEL_LIST` | + +#### Widget inference (when `#d` is omitted) + +- Name contains `KEY`, `SECRET`, `TOKEN`, or `PASSWORD` → `secret` +- Name contains `CREDENTIALS` or ends with `_PATH` / `_DIR` → `path` +- Value is `true` / `false` → `bool` +- Value parses as an integer → `int`, as a float → `float` +- Value looks like a JSON array containing `model_name` → `json_deployment_list` +- Value looks like a JSON array or object → `json_object` +- Otherwise → `string` + +#### Section headers + +Top-level groups are declared by a 3-line header block. Variables that follow inherit the group name until the next header. 
+ +```bash +# ============================================== +# Voice Pipeline +# ============================================== +``` + +The section title must match one of the tab name constants in [`config_schema.py`](config_schema.py) (`API Configs`, `Voice Pipeline`, `LiteLLM Deployments`, `Framework & Runtime`, `Turn Detection & VAD`, `User Config`, `Debug & Logging`). + +#### Variable states + +```bash +# Just a note — ignored entirely. + +#i Maximum parallel conversations. +#d int +#r 1,100,1 +EVA_MAX_CONCURRENT_CONVERSATIONS=5 # active — written to .env + +#i Domain for dataset/agent paths. +#d enum +#e airline,itsm,medical_hr +#v EVA_DOMAIN=airline # inactive — user can enable in UI + +#i French accent agent ID. +#d secret +#x perturbation_mode=Accent +#x EVA_PERTURBATION__ACCENT=french +#v EVA_FRENCH_ACCENT_USER_F= # only renders when both conditions hold +``` + +#### Conditions and modes + +`#x` conditions can reference either: +- Another env variable's value (e.g. `#x EVA_PERTURBATION__ACCENT=french`) +- A UI-only state key managed by a mutex radio button (e.g. `#x pipeline_mode=LLM`) + +Mutex radio buttons are declared in [`config_schema.py`](config_schema.py) via `MUTEX_RADIOS`. Each radio writes to a session-state key (`pipeline_mode`, `perturbation_mode`) that `#x` conditions can match against. 
+ +#### Serialization rules + +When the user saves `.env`: + +| In `.env.example` | User sets a value | Disabled by mutex / `#x` | Output | +|---|---|---|---| +| Active (`VAR=…`) | yes | no | `VAR=value` | +| Active (`VAR=…`) | no | no | original line verbatim | +| Active (`VAR=…`) | any | yes | `#v VAR=value` (or example value) | +| Inactive (`#v VAR=…`) | yes | no | `VAR=value` (activated) | +| Inactive (`#v VAR=…`) | no | any | `#v VAR=…` verbatim | +| Not in template, in user's loaded `.env` | — | — | appended in matching tab section (KEY/URL → API Configs, `EVA_*` → Framework & Runtime, otherwise Misc) | + +Round-tripping is lossless: `serialize_env({}, parse_env_example(...))` reproduces the original file byte-for-byte. + +#### Implementation + +- [`config_io.py`](config_io.py) — `parse_env_example`, `load_env`, `serialize_env`, `compute_disabled`. Pure functions, no Streamlit dependency. +- [`config_schema.py`](config_schema.py) — group constants, tab ordering, mutex radio definitions. Everything else lives in `.env.example`. +- [`config_editor.py`](config_editor.py) — Streamlit UI that dispatches on `AnnotatedVar.widget`. 
--- diff --git a/apps/analysis.py b/apps/analysis.py index 101fdcdc..e4ff8d5d 100644 --- a/apps/analysis.py +++ b/apps/analysis.py @@ -2143,8 +2143,8 @@ def render_conversation_trace_tab(metrics: RecordMetrics | None, record_dir: Pat expected_db = context.get("expected_scenario_db") final_db = context.get("final_scenario_db") if expected_db and final_db: - expected_str = json.dumps(expected_db, indent=2, sort_keys=True, default=str) - actual_str = json.dumps(final_db, indent=2, sort_keys=True, default=str) + expected_str = json.dumps(expected_db, indent=2, sort_keys=True, default=str, ensure_ascii=False) + actual_str = json.dumps(final_db, indent=2, sort_keys=True, default=str, ensure_ascii=False) diff_viewer(expected_str, actual_str, lang="json", key="task_completion_diff") elif details_to_show: st.json(details_to_show) @@ -2387,13 +2387,15 @@ def render_conversation_trace_tab(metrics: RecordMetrics | None, record_dir: Pat col_left = st.container() with col_left: if entry_type == "tool_call": - params_str = json.dumps(entry.get("parameters", {}), indent=2) + params_str = json.dumps(entry.get("parameters", {}), indent=2, ensure_ascii=False) with st.expander(f"Tool Call — `{tool_name}`", expanded=False): st.code(params_str, language="json") elif entry_type == "tool_response": tool_response = entry.get("tool_response", "") response_str = ( - json.dumps(tool_response, indent=2) if isinstance(tool_response, dict) else str(tool_response) + json.dumps(tool_response, indent=2, ensure_ascii=False) + if isinstance(tool_response, dict) + else str(tool_response) ) with st.expander(f"Tool Response — `{tool_name}`", expanded=False): st.code(response_str, language="json") diff --git a/apps/config_editor.py b/apps/config_editor.py index f59995b4..1a62767a 100644 --- a/apps/config_editor.py +++ b/apps/config_editor.py @@ -12,6 +12,7 @@ from __future__ import annotations +import hashlib import html as html_module import json import sys @@ -133,12 +134,17 @@ def _init_state() -> 
None: def _is_visible_av(var: AnnotatedVar) -> bool: - """Return True when all #x conditions for this var are satisfied.""" + """Return True when all #x conditions for this var are satisfied. + + Comma-separated values in a single condition are treated as OR + (e.g. `#x pipeline_mode=LLM,AudioLLM`). + """ for cond_key, cond_val in var.conditions: actual = st.session_state.get(cond_key) if actual is None: actual = st.session_state.get("field_values", {}).get(cond_key) - if actual != cond_val: + allowed = {v.strip() for v in cond_val.split(",") if v.strip()} + if actual not in allowed: return False return True @@ -215,9 +221,10 @@ def _enum_options_for(var: AnnotatedVar) -> list[str]: def _render_json_object(name: str, info: str, current: dict) -> None: st.markdown(f"**{name}**" + (f" — {info}" if info else "")) - raw_key = f"raw_{name}" - if raw_key not in st.session_state: - st.session_state[raw_key] = json.dumps(current, indent=2) if current else "" + + # Both widgets are keyed by a hash of the current value so they always + # re-initialize from field_values after any write + rerun. 
+ val_hash = hashlib.md5(json.dumps(current, sort_keys=True, ensure_ascii=False).encode()).hexdigest()[:8] rows = [{"key": k, "value": _scalar_to_str(v)} for k, v in current.items()] or [{"key": "", "value": ""}] edited = st.data_editor( @@ -228,30 +235,43 @@ def _render_json_object(name: str, info: str, current: dict) -> None: "key": st.column_config.TextColumn("key", required=False), "value": st.column_config.TextColumn("value", required=False), }, - key=f"de_{name}", + key=f"_de_{name}_{val_hash}", ) - parsed_kv: dict[str, Any] = {} - for row in edited: - k = (row.get("key") or "").strip() - if k: - parsed_kv[k] = _str_to_scalar(row.get("value")) + parsed_from_table: dict[str, Any] = { + (r.get("key") or "").strip(): _str_to_scalar(r.get("value")) for r in edited if (r.get("key") or "").strip() + } + + if json.dumps(parsed_from_table, sort_keys=True, ensure_ascii=False) != json.dumps( + current, sort_keys=True, ensure_ascii=False + ): + st.session_state.field_values[name] = parsed_from_table + st.rerun() with st.expander("Raw JSON", expanded=False): text = st.text_area( - "Edit as JSON", value=json.dumps(parsed_kv, indent=2) if parsed_kv else "", key=raw_key, height=140 + "Edit as JSON", + value=json.dumps(current, indent=2, ensure_ascii=False) if current else "", + key=f"_rawtxt_{name}_{val_hash}", + height=140, ) - if text.strip(): - try: - parsed_kv = json.loads(text) - except json.JSONDecodeError as e: - st.warning(f"Invalid JSON: {e}") - st.session_state.field_values[name] = parsed_kv + if text.strip(): + try: + parsed_kv = json.loads(text) + if json.dumps(parsed_kv, sort_keys=True, ensure_ascii=False) != json.dumps( + current, sort_keys=True, ensure_ascii=False + ): + st.session_state.field_values[name] = parsed_kv + st.rerun() + except json.JSONDecodeError as e: + st.warning(f"Invalid JSON: {e}") + + st.session_state.field_values[name] = current def _scalar_to_str(v: Any) -> str: if isinstance(v, (dict, list)): - return json.dumps(v) + return 
json.dumps(v, ensure_ascii=False) if isinstance(v, bool): return "true" if v else "false" if v is None: @@ -397,7 +417,7 @@ def _render_unmapped_var(name: str) -> None: values = st.session_state.field_values v = values.get(name, "") if not isinstance(v, str): - v = json.dumps(v) if v else "" + v = json.dumps(v, ensure_ascii=False) if v else "" widget_type = "password" if "KEY" in name else "default" values[name] = st.text_input(name, value=v, key=f"w_{name}", type=widget_type) @@ -623,7 +643,7 @@ def main() -> None: mime="text/plain", width="stretch", ) - data_attr = html_module.escape(json.dumps(text), quote=True) + data_attr = html_module.escape(json.dumps(text, ensure_ascii=False), quote=True) st_components.html( f"""