Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
120 changes: 87 additions & 33 deletions packit_service/worker/handlers/copr.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from typing import Optional

from celery import Task, signature
from ogr.exceptions import GithubAPIException, GitlabAPIException, PagureAPIException
from ogr.services.github import GithubProject
from ogr.services.gitlab import GitlabProject
from packit.config import (
Expand Down Expand Up @@ -160,6 +161,11 @@ def get_checkers() -> tuple[type[Checker], ...]:
BuildNotAlreadyStarted,
)

def set_status_reporter_reraise_transient_errors(self, reraise: bool) -> None:
    """Accept the transient-error re-raise preference without acting on it.

    Args:
        reraise: Requested behaviour; ignored by this handler.
    """
    # Deliberate no-op: CoprBuildStartHandler doesn't use status reporting with
    # transient error handling, but the method has to exist for babysit compatibility.
    return None

def set_start_time(self):
start_time = (
datetime.utcfromtimestamp(self.copr_event.timestamp)
Expand Down Expand Up @@ -240,6 +246,23 @@ class CoprBuildEndHandler(AbstractCoprBuildReportHandler):
topic = "org.fedoraproject.prod.copr.build.end"
task_name = TaskName.copr_build_end

def __init__(
    self,
    package_config: PackageConfig,
    job_config: JobConfig,
    event: dict,
):
    """Initialise the handler and default to re-raising transient reporting errors.

    Args:
        package_config: Forwarded unchanged to the parent initialiser.
        job_config: Forwarded unchanged to the parent initialiser.
        event: Forwarded unchanged to the parent initialiser.
    """
    super().__init__(package_config=package_config, job_config=job_config, event=event)
    # Starts as True; set_status_reporter_reraise_transient_errors() can override it
    # before _run() copies the value onto the status reporter.
    self._status_reporter_reraise_transient_errors = True

def set_status_reporter_reraise_transient_errors(self, reraise: bool) -> None:
    """Record how transient forge API errors should be treated during status reporting.

    Args:
        reraise: Value stored on the handler; ``_run`` later copies it to the
            status reporter's ``reraise_transient_errors`` attribute.
    """
    self._status_reporter_reraise_transient_errors = reraise

def set_srpm_url(self) -> None:
# TODO how to do better
srpm_build = (
Expand Down Expand Up @@ -306,6 +329,9 @@ def _run(self) -> TaskResults:
f"chroot={self.copr_event.chroot} "
f"at {run_start_time.isoformat()}"
)
self.copr_build_helper.status_reporter.reraise_transient_errors = (
self._status_reporter_reraise_transient_errors
)
if not self.build:
# TODO: how could this happen?
model = "SRPMBuildDB" if self.copr_event.chroot == COPR_SRPM_CHROOT else "CoprBuildDB"
Expand All @@ -330,28 +356,32 @@ def _run(self) -> TaskResults:
if self.copr_event.chroot == COPR_SRPM_CHROOT:
return self.handle_srpm_end()

self.pushgateway.copr_builds_finished.inc()

# if the build is needed only for test, it doesn't have the task_accepted_time
if self.build.task_accepted_time:
copr_build_time = elapsed_seconds(
begin=self.build.task_accepted_time,
end=datetime.now(timezone.utc),
)
self.pushgateway.copr_build_finished_time.observe(copr_build_time)

# https://pagure.io/copr/copr/blob/master/f/common/copr_common/enums.py#_42
if self.copr_event.status != COPR_API_SUCC_STATE:
failed_msg = "RPMs failed to be built."
packit_dashboard_url = get_copr_build_info_url(self.build.id)
# if SRPM build failed it has been reported already so skip reporting
if self.build.get_srpm_build().status != BuildStatus.failure:
self.copr_build_helper.report_status_to_all_for_chroot(
state=BaseCommitStatus.failure,
description=failed_msg,
url=packit_dashboard_url,
chroot=self.copr_event.chroot,
)
try:
self.copr_build_helper.report_status_to_all_for_chroot(
state=BaseCommitStatus.failure,
description=failed_msg,
url=packit_dashboard_url,
chroot=self.copr_event.chroot,
)
except (GithubAPIException, GitlabAPIException, PagureAPIException):
# Transient error - return early before setting the state
return TaskResults(success=False, details={"msg": "Status reporting failed"})

# Only execute the following if GitHub reporting succeeded
self.pushgateway.copr_builds_finished.inc()
if self.build.task_accepted_time:
copr_build_time = elapsed_seconds(
begin=self.build.task_accepted_time,
end=datetime.now(timezone.utc),
)
self.pushgateway.copr_build_finished_time.observe(copr_build_time)
Comment on lines +376 to +383
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Moving the metrics recording (pushgateway.copr_builds_finished.inc() and pushgateway.copr_build_finished_time.observe()) inside this block, after the report_status_to_all_for_chroot call, is a good improvement. It ensures that these metrics are only updated if the GitHub status reporting was successful, aligning with the PR's goal of maintaining consistency between internal state and external reports.


self.measure_time_after_reporting()
self.copr_build_helper.notify_about_failure_if_configured(
packit_dashboard_url=packit_dashboard_url,
Expand All @@ -362,9 +392,22 @@ def _run(self) -> TaskResults:
report_long_runtime("Copr build failed end", 120, run_start_time)
return TaskResults(success=False, details={"msg": failed_msg})

self.report_successful_build()
self.measure_time_after_reporting()
try:
self.report_successful_build()
except (GithubAPIException, GitlabAPIException, PagureAPIException):
# Transient error - return early before setting the state
return TaskResults(success=False, details={"msg": "Status reporting failed"})

# Only execute the following if GitHub reporting succeeded
self.pushgateway.copr_builds_finished.inc()
if self.build.task_accepted_time:
copr_build_time = elapsed_seconds(
begin=self.build.task_accepted_time,
end=datetime.now(timezone.utc),
)
self.pushgateway.copr_build_finished_time.observe(copr_build_time)
Comment on lines +402 to +408
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Placing the metrics recording after report_successful_build() and under the comment # Only execute the following if GitHub reporting succeeded is a correct and important change. This ensures that the metrics accurately reflect successful operations that have been communicated externally.


self.measure_time_after_reporting()
self.set_built_packages()
self.build.set_status(BuildStatus.success)
self.handle_testing_farm()
Expand Down Expand Up @@ -432,11 +475,16 @@ def handle_srpm_end(self):

if self.copr_event.status != COPR_API_SUCC_STATE:
failed_msg = "SRPM build failed, check the logs for details."
self.copr_build_helper.report_status_to_all(
state=BaseCommitStatus.failure,
description=failed_msg,
url=url,
)
try:
self.copr_build_helper.report_status_to_all(
state=BaseCommitStatus.failure,
description=failed_msg,
url=url,
)
except (GithubAPIException, GitlabAPIException, PagureAPIException):
# Transient error - return early before setting the state
return TaskResults(success=False, details={"msg": "Status reporting failed"})

self.copr_build_helper.notify_about_failure_if_configured(
packit_dashboard_url=url,
external_dashboard_url=self.build.copr_web_url,
Expand All @@ -449,23 +497,29 @@ def handle_srpm_end(self):
)
return TaskResults(success=False, details={"msg": failed_msg})

report_status = (
self.copr_build_helper.report_status_to_all
if self.job_config.sync_test_job_statuses_with_builds
else self.copr_build_helper.report_status_to_build
)
try:
report_status(
state=BaseCommitStatus.running,
description="SRPM build succeeded. Waiting for RPM build to start...",
url=url,
)
except (GithubAPIException, GitlabAPIException, PagureAPIException):
# Transient error - return early before setting the state
return TaskResults(success=False, details={"msg": "Status reporting failed"})

# Set DB status after successful GitHub reporting
for build in CoprBuildTargetModel.get_all_by_build_id(
str(self.copr_event.build_id),
):
# from waiting_for_srpm to pending
build.set_status(BuildStatus.pending)

self.build.set_status(BuildStatus.success)
report_status = (
self.copr_build_helper.report_status_to_all
if self.job_config.sync_test_job_statuses_with_builds
else self.copr_build_helper.report_status_to_build
)
report_status(
state=BaseCommitStatus.running,
description="SRPM build succeeded. Waiting for RPM build to start...",
url=url,
)
msg = "SRPM build in Copr has finished."
logger.debug(msg)
return TaskResults(success=True, details={"msg": msg})
Expand Down
31 changes: 23 additions & 8 deletions packit_service/worker/handlers/testing_farm.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

from celery import Task
from ogr.abstract import GitProject
from ogr.exceptions import GithubAPIException, GitlabAPIException, PagureAPIException
from packit.config import JobConfig, JobType, aliases
from packit.config.package_config import PackageConfig

Expand Down Expand Up @@ -533,6 +534,11 @@ def __init__(
self.log_url = event.get("log_url")
self.summary = event.get("summary")
self.created = event.get("created")
self._status_reporter_reraise_transient_errors = True

def set_status_reporter_reraise_transient_errors(self, reraise: bool) -> None:
    """Record how transient forge API errors should be treated during status reporting.

    Args:
        reraise: Value stored on the handler; ``_run`` later copies it to the
            status reporter's ``reraise_transient_errors`` attribute.
    """
    self._status_reporter_reraise_transient_errors = reraise

@staticmethod
def get_checkers() -> tuple[type[Checker], ...]:
Expand All @@ -550,6 +556,9 @@ def db_project_event(self) -> Optional[ProjectEventModel]:

def _run(self) -> TaskResults:
logger.debug(f"Testing farm {self.pipeline_id} result:\n{self.result}")
self.testing_farm_job_helper.status_reporter.reraise_transient_errors = (
self._status_reporter_reraise_transient_errors
)

test_run_model = TFTTestRunTargetModel.get_by_pipeline_id(
pipeline_id=self.pipeline_id,
Expand Down Expand Up @@ -587,6 +596,20 @@ def _run(self) -> TaskResults:
status = BaseCommitStatus.error
summary = self.summary or "Error ..."

url = get_testing_farm_info_url(test_run_model.id) if test_run_model else None
try:
self.testing_farm_job_helper.report_status_to_tests_for_test_target(
state=status,
description=summary,
target=test_run_model.target,
url=url if url else self.log_url,
links_to_external_services={"Testing Farm": self.log_url},
)
except (GithubAPIException, GitlabAPIException, PagureAPIException):
# Transient error - return early before setting the state
return TaskResults(success=False, details={"msg": "Status reporting failed"})

# Record metrics - only after successful GitHub reporting to avoid double-counting on retry
if self.result == TestingFarmResult.running:
self.pushgateway.test_runs_started.inc()
else:
Expand All @@ -598,14 +621,6 @@ def _run(self) -> TaskResults:
self.pushgateway.test_run_finished_time.observe(test_run_time)

test_run_model.set_web_url(self.log_url)
url = get_testing_farm_info_url(test_run_model.id) if test_run_model else None
self.testing_farm_job_helper.report_status_to_tests_for_test_target(
state=status,
description=summary,
target=test_run_model.target,
url=url if url else self.log_url,
links_to_external_services={"Testing Farm": self.log_url},
)
if failure:
self.testing_farm_job_helper.notify_about_failure_if_configured(
packit_dashboard_url=url,
Expand Down
4 changes: 4 additions & 0 deletions packit_service/worker/helpers/build/babysit.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,8 @@ def update_testing_farm_run(event: testing_farm.Result, run: TFTTestRunTargetMod
job_config=job_config,
event=event_dict,
)
# TODO: Consider time-based heuristic instead of always False
upstream_handler.set_status_reporter_reraise_transient_errors(False)
# Check if handler should process this test
if upstream_handler.pre_check(package_config, job_config, event_dict):
signatures.append(upstream_handler.get_signature(event=event, job=job_config))
Expand Down Expand Up @@ -487,6 +489,8 @@ def update_copr_build_state(
job_config=job_config,
event=event_dict,
)
# TODO: Consider time-based heuristic instead of always False
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what about doing something like here, and use that as condition for setting the reporter?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there such a thing as a "last try" for a babysit task? My impression is that babysit runs indefinitely, until the task status in db changes from "pending".

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, good point, I hadn't realised we do this only for the babysit of an individual Copr build, see here. For the global build/test babysit based on the DB, there is a timeout of 7 days. So this becomes trickier. But I fear that if we always set this to False for babysitting, we might still often bump into the issue of the database/forge disconnect.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree that the babysit copr build task does much of the work.
However, I don't think it's easy to solve the problem.
The babysit task is retried for the following reasons:

  1. build hasn't started yet
  2. exception during SRPM update
  3. exception during build update
  4. build hasn't ended

Reasons 1 and 4 are the most common, and you can see in this graph how often we retry the babysit Copr build task.

Personally, I would like to be able to spot problems, like the transient errors, in the above graph, but we can't because of points 1 and 4. For those, shouldn't we instead schedule a fresh run for later?

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@majamassarini this applies only to the individual babysit task, right? I agree, it could be fixed, but maybe outside of the scope of this PR?

Looking into the babysitting, I am also thinking we could set reraise_transient_errors=True for individual babysit and False for the periodic ones. If the individual babysit fails due to a GitHub API error, the periodic babysit runs regularly and will eventually catch any builds still stuck as pending and retry the update. WDYT @m-blaha ? Would you like to keep this PR and test on stg and implement this as followup?

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, sure, outside of the scope of this PR sounds good to me. And yes, this is something related only to the babysit task.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually, it seems that handler.set_status_reporter_reraise_transient_errors(False) here doesn't have any effect. The handler instance is only transient and is destroyed once it goes out of scope. celery_run_async is called with signatures, so we would need to somehow pass reraise_transient_errors to get_signature(), and then back again when Celery re-creates the handler from the signature.

handler.set_status_reporter_reraise_transient_errors(False)
if handler.pre_check(package_config, job_config, event_dict):
signatures.append(handler.get_signature(event=event, job=job_config))

Expand Down
1 change: 1 addition & 0 deletions packit_service/worker/helpers/job_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,7 @@ def status_reporter(self) -> StatusReporter:
packit_user=self.service_config.get_github_account_name(),
project_event_id=(self.db_project_event.id if self.db_project_event else None),
pr_id=self.metadata.pr_id,
reraise_transient_errors=False,
)
return self._status_reporter

Expand Down
34 changes: 33 additions & 1 deletion packit_service/worker/reporting/reporters/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from typing import Callable, Optional, Union

from ogr.abstract import GitProject, PullRequest
from ogr.exceptions import GithubAPIException, GitlabAPIException, PagureAPIException
from ogr.services.github import GithubProject
from ogr.services.gitlab import GitlabProject
from ogr.services.pagure import PagureProject
Expand All @@ -29,6 +30,7 @@ def __init__(
packit_user: str,
project_event_id: Optional[int] = None,
pr_id: Optional[int] = None,
reraise_transient_errors: bool = False,
):
logger.debug(
f"Status reporter will report for {project}, commit={commit_sha}, pr={pr_id}",
Expand All @@ -41,6 +43,7 @@ def __init__(
self.project_event_id: int = project_event_id
self.pr_id: Optional[int] = pr_id
self._pull_request_object: Optional[PullRequest] = None
self.reraise_transient_errors: bool = reraise_transient_errors

@classmethod
def get_instance(
Expand All @@ -50,6 +53,7 @@ def get_instance(
packit_user: str,
project_event_id: Optional[int] = None,
pr_id: Optional[int] = None,
reraise_transient_errors: bool = False,
) -> "StatusReporter":
"""
Get the StatusReporter instance.
Expand All @@ -67,7 +71,9 @@ def get_instance(
reporter = StatusReporterGitlab
elif isinstance(project, PagureProject):
reporter = StatusReporterPagure
return reporter(project, commit_sha, packit_user, project_event_id, pr_id)
return reporter(
project, commit_sha, packit_user, project_event_id, pr_id, reraise_transient_errors
)

@property
def project_with_commit(self) -> GitProject:
Expand Down Expand Up @@ -97,6 +103,32 @@ def get_commit_status(state: BaseCommitStatus):
def get_check_run(state: BaseCommitStatus):
return MAP_TO_CHECK_RUN[state]

@staticmethod
def is_transient_error(
    exception: Union[GithubAPIException, GitlabAPIException, PagureAPIException],
) -> bool:
    """
    Decide whether an API exception looks temporary and is therefore worth retrying.

    An exception counts as transient when:
    - its ``response_code`` attribute is missing or ``None`` (treated as a network error)
    - the code is HTTP 429 (rate limiting)
    - the code is in the HTTP 5xx range (server errors)

    Args:
        exception: An API exception from ogr

    Returns:
        True for a transient error, False for any other response code
    """
    status = getattr(exception, "response_code", None)

    if status is not None:
        # A response arrived; only rate limiting and server-side failures are retryable.
        return status == 429 or 500 <= status < 600

    # No response code at all: treated as a network-level failure, hence transient.
    return True

def set_status(
self,
state: BaseCommitStatus,
Expand Down
12 changes: 12 additions & 0 deletions packit_service/worker/reporting/reporters/github.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,12 @@ def set_status(
trim=True,
)
except GithubAPIException as e:
if self.is_transient_error(e) and self.reraise_transient_errors:
logger.debug(
f"Re-raising transient GitHub API error when setting "
f"status for '{check_name}': {e}."
)
raise
self._comment_as_set_status_fallback(e, state, description, check_name, url)
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Did we get the comments though? 👀



Expand Down Expand Up @@ -137,6 +143,12 @@ def set_status(
output=create_github_check_run_output(description, summary),
)
except GithubAPIException as e:
if self.is_transient_error(e) and self.reraise_transient_errors:
logger.debug(
f"Re-raising transient GitHub API error when setting "
f"status for '{check_name}': {e}."
)
raise
logger.debug(
f"Failed to set status check, setting status as a fallback: {e!s}",
)
Expand Down
Loading
Loading