From 706846c2606d44f9a4e475770e8c353aae5e89e6 Mon Sep 17 00:00:00 2001
From: octo-patch <octo-patch@github.com>
Date: Tue, 21 Apr 2026 10:32:25 +0800
Subject: [PATCH] fix: mark evaluation run as failed when interrupted to
 prevent stuck Running state (fixes #436)

Previously, if an evaluation run was interrupted (e.g. KeyboardInterrupt)
or raised an unexpected exception, write_evaluation_run_end was never
called, leaving the run permanently stuck in Running state in the store.

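Add a BaseException handler that sets end_time and success=False,
records str(e) as the error, stores any partial results collected so
far, and calls write_evaluation_run_end before re-raising, so the run
is finalized whenever an exception (including KeyboardInterrupt)
propagates out of the try block.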

Also guard EvaluationResults.from_rowar_results against an empty
rowar_results list, which would otherwise crash with an IndexError
when the run is interrupted before any results are collected.
---
 src/ell/evaluation/evaluation.py | 12 ++++++++++--
 src/ell/evaluation/results.py    |  2 +-
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/src/ell/evaluation/evaluation.py b/src/ell/evaluation/evaluation.py
index 9ecee050a..3a2443608 100644
--- a/src/ell/evaluation/evaluation.py
+++ b/src/ell/evaluation/evaluation.py
@@ -182,7 +182,7 @@ def run(
                     def written_result(o):
                         write_evaluation_run_intermediate(self, evaluation_run, (res := o()))
                         return res
-                
+
                     metric_futures.extend([executor.submit(written_result, o) for o in get_outputs])
 
                 for result_future in (
@@ -206,7 +206,15 @@ def written_result(o):
             write_evaluation_run_end(self, evaluation_run)
 
             return evaluation_run
-            # TODO: add error handling and unsccessful runs.
+        except BaseException as e:
+            # Mark the run as failed so it doesn't remain stuck in "Running" state
+            # when the process is interrupted (e.g. KeyboardInterrupt) or an error occurs.
+            evaluation_run.end_time = datetime.now(timezone.utc)
+            evaluation_run.success = False
+            evaluation_run.error = str(e)
+            evaluation_run.results = EvaluationResults.from_rowar_results(rowar_results)
+            write_evaluation_run_end(self, evaluation_run)
+            raise
         finally:
             config.verbose = original_verbose
 
diff --git a/src/ell/evaluation/results.py b/src/ell/evaluation/results.py
index 71da991e7..015a492fb 100644
--- a/src/ell/evaluation/results.py
+++ b/src/ell/evaluation/results.py
@@ -60,7 +60,7 @@ def from_rowar_results(
         rowar_results: List[_ResultDatapoint],
     ) -> "EvaluationResults":
         def extract_labels(is_invocation: bool):
-            if not rowar_results[0].labels:
+            if not rowar_results or not rowar_results[0].labels:
                 return []
             
             # Group labels by name and type