-
Notifications
You must be signed in to change notification settings - Fork 59
Resolve database/GitHub task status disconnect #2961
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
25f9e0c
a221734
8c831af
78511dc
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -7,6 +7,7 @@ | |
| from typing import Optional | ||
|
|
||
| from celery import Task, signature | ||
| from ogr.exceptions import GithubAPIException, GitlabAPIException, PagureAPIException | ||
| from ogr.services.github import GithubProject | ||
| from ogr.services.gitlab import GitlabProject | ||
| from packit.config import ( | ||
|
|
@@ -160,6 +161,11 @@ def get_checkers() -> tuple[type[Checker], ...]: | |
| BuildNotAlreadyStarted, | ||
| ) | ||
|
|
||
| def set_status_reporter_reraise_transient_errors(self, reraise: bool) -> None: | ||
| """Set whether to re-raise transient GitHub errors or fall back to comments.""" | ||
| # CoprBuildStartHandler doesn't use status reporting with transient error handling, | ||
| # but needs this method for babysit compatibility | ||
|
|
||
| def set_start_time(self): | ||
| start_time = ( | ||
| datetime.utcfromtimestamp(self.copr_event.timestamp) | ||
|
|
@@ -240,6 +246,23 @@ class CoprBuildEndHandler(AbstractCoprBuildReportHandler): | |
| topic = "org.fedoraproject.prod.copr.build.end" | ||
| task_name = TaskName.copr_build_end | ||
|
|
||
| def __init__( | ||
| self, | ||
| package_config: PackageConfig, | ||
| job_config: JobConfig, | ||
| event: dict, | ||
| ): | ||
| super().__init__( | ||
| package_config=package_config, | ||
| job_config=job_config, | ||
| event=event, | ||
| ) | ||
| self._status_reporter_reraise_transient_errors = True | ||
|
|
||
| def set_status_reporter_reraise_transient_errors(self, reraise: bool) -> None: | ||
| """Set whether to re-raise transient GitHub errors or fall back to comments.""" | ||
| self._status_reporter_reraise_transient_errors = reraise | ||
|
|
||
| def set_srpm_url(self) -> None: | ||
| # TODO how to do better | ||
| srpm_build = ( | ||
|
|
@@ -306,6 +329,9 @@ def _run(self) -> TaskResults: | |
| f"chroot={self.copr_event.chroot} " | ||
| f"at {run_start_time.isoformat()}" | ||
| ) | ||
| self.copr_build_helper.status_reporter.reraise_transient_errors = ( | ||
| self._status_reporter_reraise_transient_errors | ||
| ) | ||
| if not self.build: | ||
| # TODO: how could this happen? | ||
| model = "SRPMBuildDB" if self.copr_event.chroot == COPR_SRPM_CHROOT else "CoprBuildDB" | ||
|
|
@@ -330,28 +356,32 @@ def _run(self) -> TaskResults: | |
| if self.copr_event.chroot == COPR_SRPM_CHROOT: | ||
| return self.handle_srpm_end() | ||
|
|
||
| self.pushgateway.copr_builds_finished.inc() | ||
|
|
||
| # if the build is needed only for test, it doesn't have the task_accepted_time | ||
| if self.build.task_accepted_time: | ||
| copr_build_time = elapsed_seconds( | ||
| begin=self.build.task_accepted_time, | ||
| end=datetime.now(timezone.utc), | ||
| ) | ||
| self.pushgateway.copr_build_finished_time.observe(copr_build_time) | ||
|
|
||
| # https://pagure.io/copr/copr/blob/master/f/common/copr_common/enums.py#_42 | ||
| if self.copr_event.status != COPR_API_SUCC_STATE: | ||
| failed_msg = "RPMs failed to be built." | ||
| packit_dashboard_url = get_copr_build_info_url(self.build.id) | ||
| # if SRPM build failed it has been reported already so skip reporting | ||
| if self.build.get_srpm_build().status != BuildStatus.failure: | ||
| self.copr_build_helper.report_status_to_all_for_chroot( | ||
| state=BaseCommitStatus.failure, | ||
| description=failed_msg, | ||
| url=packit_dashboard_url, | ||
| chroot=self.copr_event.chroot, | ||
| ) | ||
| try: | ||
| self.copr_build_helper.report_status_to_all_for_chroot( | ||
| state=BaseCommitStatus.failure, | ||
| description=failed_msg, | ||
| url=packit_dashboard_url, | ||
| chroot=self.copr_event.chroot, | ||
| ) | ||
| except (GithubAPIException, GitlabAPIException, PagureAPIException): | ||
| # Transient error - return early before setting the state | ||
| return TaskResults(success=False, details={"msg": "Status reporting failed"}) | ||
|
|
||
| # Only execute the following if GitHub reporting succeeded | ||
| self.pushgateway.copr_builds_finished.inc() | ||
| if self.build.task_accepted_time: | ||
| copr_build_time = elapsed_seconds( | ||
| begin=self.build.task_accepted_time, | ||
| end=datetime.now(timezone.utc), | ||
| ) | ||
| self.pushgateway.copr_build_finished_time.observe(copr_build_time) | ||
|
|
||
| self.measure_time_after_reporting() | ||
| self.copr_build_helper.notify_about_failure_if_configured( | ||
| packit_dashboard_url=packit_dashboard_url, | ||
|
|
@@ -362,9 +392,22 @@ def _run(self) -> TaskResults: | |
| report_long_runtime("Copr build failed end", 120, run_start_time) | ||
| return TaskResults(success=False, details={"msg": failed_msg}) | ||
|
|
||
| self.report_successful_build() | ||
| self.measure_time_after_reporting() | ||
| try: | ||
| self.report_successful_build() | ||
| except (GithubAPIException, GitlabAPIException, PagureAPIException): | ||
| # Transient error - return early before setting the state | ||
| return TaskResults(success=False, details={"msg": "Status reporting failed"}) | ||
|
|
||
| # Only execute the following if GitHub reporting succeeded | ||
| self.pushgateway.copr_builds_finished.inc() | ||
| if self.build.task_accepted_time: | ||
| copr_build_time = elapsed_seconds( | ||
| begin=self.build.task_accepted_time, | ||
| end=datetime.now(timezone.utc), | ||
| ) | ||
| self.pushgateway.copr_build_finished_time.observe(copr_build_time) | ||
|
Comment on lines
+402
to
+408
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
|
|
||
| self.measure_time_after_reporting() | ||
| self.set_built_packages() | ||
| self.build.set_status(BuildStatus.success) | ||
| self.handle_testing_farm() | ||
|
|
@@ -432,11 +475,16 @@ def handle_srpm_end(self): | |
|
|
||
| if self.copr_event.status != COPR_API_SUCC_STATE: | ||
| failed_msg = "SRPM build failed, check the logs for details." | ||
| self.copr_build_helper.report_status_to_all( | ||
| state=BaseCommitStatus.failure, | ||
| description=failed_msg, | ||
| url=url, | ||
| ) | ||
| try: | ||
| self.copr_build_helper.report_status_to_all( | ||
| state=BaseCommitStatus.failure, | ||
| description=failed_msg, | ||
| url=url, | ||
| ) | ||
| except (GithubAPIException, GitlabAPIException, PagureAPIException): | ||
| # Transient error - return early before setting the state | ||
| return TaskResults(success=False, details={"msg": "Status reporting failed"}) | ||
|
|
||
| self.copr_build_helper.notify_about_failure_if_configured( | ||
| packit_dashboard_url=url, | ||
| external_dashboard_url=self.build.copr_web_url, | ||
|
|
@@ -449,23 +497,29 @@ def handle_srpm_end(self): | |
| ) | ||
| return TaskResults(success=False, details={"msg": failed_msg}) | ||
|
|
||
| report_status = ( | ||
| self.copr_build_helper.report_status_to_all | ||
| if self.job_config.sync_test_job_statuses_with_builds | ||
| else self.copr_build_helper.report_status_to_build | ||
| ) | ||
| try: | ||
| report_status( | ||
| state=BaseCommitStatus.running, | ||
| description="SRPM build succeeded. Waiting for RPM build to start...", | ||
| url=url, | ||
| ) | ||
| except (GithubAPIException, GitlabAPIException, PagureAPIException): | ||
| # Transient error - return early before setting the state | ||
| return TaskResults(success=False, details={"msg": "Status reporting failed"}) | ||
|
|
||
| # Set DB status after successful GitHub reporting | ||
| for build in CoprBuildTargetModel.get_all_by_build_id( | ||
| str(self.copr_event.build_id), | ||
| ): | ||
| # from waiting_for_srpm to pending | ||
| build.set_status(BuildStatus.pending) | ||
|
|
||
| self.build.set_status(BuildStatus.success) | ||
| report_status = ( | ||
| self.copr_build_helper.report_status_to_all | ||
| if self.job_config.sync_test_job_statuses_with_builds | ||
| else self.copr_build_helper.report_status_to_build | ||
| ) | ||
| report_status( | ||
| state=BaseCommitStatus.running, | ||
| description="SRPM build succeeded. Waiting for RPM build to start...", | ||
| url=url, | ||
| ) | ||
| msg = "SRPM build in Copr has finished." | ||
| logger.debug(msg) | ||
| return TaskResults(success=True, details={"msg": msg}) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -197,6 +197,8 @@ def update_testing_farm_run(event: testing_farm.Result, run: TFTTestRunTargetMod | |
| job_config=job_config, | ||
| event=event_dict, | ||
| ) | ||
| # TODO: Consider time-based heuristic instead of always False | ||
| upstream_handler.set_status_reporter_reraise_transient_errors(False) | ||
| # Check if handler should process this test | ||
| if upstream_handler.pre_check(package_config, job_config, event_dict): | ||
| signatures.append(upstream_handler.get_signature(event=event, job=job_config)) | ||
|
|
@@ -487,6 +489,8 @@ def update_copr_build_state( | |
| job_config=job_config, | ||
| event=event_dict, | ||
| ) | ||
| # TODO: Consider time-based heuristic instead of always False | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. what about doing something like here, and use that as condition for setting the reporter?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is there such a thing as a "last try" for a babysit task? My impression is that babysit runs indefinitely, until the task status in db changes from "pending".
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ah, good point, I haven't realised we do this only for the babysit of individual copr build, see here. For the global build/test babysit based on DB, there is the timeout of 7 days. So this becomes trickier. But I fear if we will be setting this always to False for babysitting, we might be still bumping often into the issue of the database/forge disconnect.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I agree that the babysit copr build task does much of the work.
Reason 1 and 4 are the most common, and you can see in this graph how often we retry the babysit Copr build task. Personally, I would like to be able to spot problems, like the transient errors, in the above graph, but we can't because of points 1 and 4, for those, shouldn't we instead schedule a fresh run for later?
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @majamassarini this applies to only the inidividual babysit task, right? I agree, it could be fixed, but maybe outside of the scope of this PR? Looking into the babysitting, I am also thinking we could set reraise_transient_errors=True for individual babysit and False for the periodic ones. If the individual babysit fails due to a GitHub API error, the periodic babysit runs regularly and will eventually catch any builds still stuck as pending and retry the update. WDYT @m-blaha ? Would you like to keep this PR and test on stg and implement this as followup?
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes sure, outside of the scope of this PR sounds good to me. And yes this is something only related with babysit task.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Actually it seems that |
||
| handler.set_status_reporter_reraise_transient_errors(False) | ||
| if handler.pre_check(package_config, job_config, event_dict): | ||
| signatures.append(handler.get_signature(event=event, job=job_config)) | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -58,6 +58,12 @@ def set_status( | |
| trim=True, | ||
| ) | ||
| except GithubAPIException as e: | ||
| if self.is_transient_error(e) and self.reraise_transient_errors: | ||
| logger.debug( | ||
| f"Re-raising transient GitHub API error when setting " | ||
| f"status for '{check_name}': {e}." | ||
| ) | ||
| raise | ||
| self._comment_as_set_status_fallback(e, state, description, check_name, url) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Did we get the comments though? 👀 |
||
|
|
||
|
|
||
|
|
@@ -137,6 +143,12 @@ def set_status( | |
| output=create_github_check_run_output(description, summary), | ||
| ) | ||
| except GithubAPIException as e: | ||
| if self.is_transient_error(e) and self.reraise_transient_errors: | ||
| logger.debug( | ||
| f"Re-raising transient GitHub API error when setting " | ||
| f"status for '{check_name}': {e}." | ||
| ) | ||
| raise | ||
| logger.debug( | ||
| f"Failed to set status check, setting status as a fallback: {e!s}", | ||
| ) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Moving the metrics recording (
pushgateway.copr_builds_finished.inc()andpushgateway.copr_build_finished_time.observe()) inside this block, after thereport_status_to_all_for_chrootcall, is a good improvement. It ensures that these metrics are only updated if the GitHub status reporting was successful, aligning with the PR's goal of maintaining consistency between internal state and external reports.