diff --git a/src/ell/evaluation/evaluation.py b/src/ell/evaluation/evaluation.py index 9ecee050a..3a2443608 100644 --- a/src/ell/evaluation/evaluation.py +++ b/src/ell/evaluation/evaluation.py @@ -182,7 +182,7 @@ def run( def written_result(o): write_evaluation_run_intermediate(self, evaluation_run, (res := o())) return res - + metric_futures.extend([executor.submit(written_result, o) for o in get_outputs]) for result_future in ( @@ -206,7 +206,15 @@ def written_result(o): write_evaluation_run_end(self, evaluation_run) return evaluation_run - # TODO: add error handling and unsccessful runs. + except BaseException as e: + # Mark the run as failed so it doesn't remain stuck in "Running" state + # when the process is interrupted (e.g. KeyboardInterrupt) or an error occurs. + evaluation_run.end_time = datetime.now(timezone.utc) + evaluation_run.success = False + evaluation_run.error = str(e) + evaluation_run.results = EvaluationResults.from_rowar_results(rowar_results) + write_evaluation_run_end(self, evaluation_run) + raise finally: config.verbose = original_verbose diff --git a/src/ell/evaluation/results.py b/src/ell/evaluation/results.py index 71da991e7..015a492fb 100644 --- a/src/ell/evaluation/results.py +++ b/src/ell/evaluation/results.py @@ -60,7 +60,7 @@ def from_rowar_results( rowar_results: List[_ResultDatapoint], ) -> "EvaluationResults": def extract_labels(is_invocation: bool): - if not rowar_results[0].labels: + if not rowar_results or not rowar_results[0].labels: return [] # Group labels by name and type