From 706846c2606d44f9a4e475770e8c353aae5e89e6 Mon Sep 17 00:00:00 2001 From: octo-patch Date: Tue, 21 Apr 2026 10:32:25 +0800 Subject: [PATCH] fix: mark evaluation run as failed when interrupted to prevent stuck Running state (fixes #436) Previously, if an evaluation run was interrupted (e.g. KeyboardInterrupt) or raised an unexpected exception, write_evaluation_run_end was never called, leaving the run permanently stuck in Running state in the store. Add a BaseException handler that sets success=False, records the error message, and calls write_evaluation_run_end before re-raising, so the run is always finalized regardless of how it exits. Also guard EvaluationResults.from_rowar_results against an empty rowar_results list, which would otherwise crash with an IndexError when the run is interrupted before any results are collected. --- src/ell/evaluation/evaluation.py | 12 ++++++++++-- src/ell/evaluation/results.py | 2 +- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/src/ell/evaluation/evaluation.py b/src/ell/evaluation/evaluation.py index 9ecee050a..3a2443608 100644 --- a/src/ell/evaluation/evaluation.py +++ b/src/ell/evaluation/evaluation.py @@ -182,7 +182,7 @@ def run( def written_result(o): write_evaluation_run_intermediate(self, evaluation_run, (res := o())) return res - + metric_futures.extend([executor.submit(written_result, o) for o in get_outputs]) for result_future in ( @@ -206,7 +206,15 @@ def written_result(o): write_evaluation_run_end(self, evaluation_run) return evaluation_run - # TODO: add error handling and unsccessful runs. + except BaseException as e: + # Mark the run as failed so it doesn't remain stuck in "Running" state + # when the process is interrupted (e.g. KeyboardInterrupt) or an error occurs. + evaluation_run.end_time = datetime.now(timezone.utc) + evaluation_run.success = False + evaluation_run.error = str(e) + evaluation_run.results = EvaluationResults.from_rowar_results(rowar_results) + write_evaluation_run_end(self, evaluation_run) + raise finally: config.verbose = original_verbose diff --git a/src/ell/evaluation/results.py b/src/ell/evaluation/results.py index 71da991e7..015a492fb 100644 --- a/src/ell/evaluation/results.py +++ b/src/ell/evaluation/results.py @@ -60,7 +60,7 @@ def from_rowar_results( rowar_results: List[_ResultDatapoint], ) -> "EvaluationResults": def extract_labels(is_invocation: bool): - if not rowar_results[0].labels: + if not rowar_results or not rowar_results[0].labels: return [] # Group labels by name and type