From 2d3e26fc223c6c4370822d16743535abf3ca18a2 Mon Sep 17 00:00:00 2001 From: Manas Srivastava Date: Wed, 29 Apr 2026 17:13:05 +0530 Subject: [PATCH 1/3] fix: print "job timed out"/"job cancelled" instead of "context cancelled" When a job is cancelled or times out, the agent surfaced bare Go context errors ("context canceled", "context deadline exceeded") in the job log. These come from a couple of places, primarily the kubernetes runner returning ctx.Err() directly and the executor emitting %v on a context-wrapped command error. This adds an internal/job.FormatJobError helper that rescues context.DeadlineExceeded as "job timed out" and context.Canceled as "job cancelled", and routes the three leakiest log sites through it: - agent/run_job.go "Error running job: ..." - internal/job/executor.go "user command error: ..." - internal/job/executor.go CommandPhase default branch The agent cannot reliably distinguish a server-driven cancel (UI button) from a server-driven job timeout -- both transition the job to the same canceling state and arrive as a context.Canceled. The helper deliberately uses the umbrella term "job cancelled" rather than claiming a timeout it can't confirm. Only context.DeadlineExceeded (set explicitly within the agent, e.g. WithGracePeriod) maps to "job timed out". Fixes #3384. --- agent/run_job.go | 2 +- internal/job/errors.go | 38 ++++++++++++++++++++++++ internal/job/errors_test.go | 58 +++++++++++++++++++++++++++++++++++++ internal/job/executor.go | 4 +-- 4 files changed, 99 insertions(+), 3 deletions(-) create mode 100644 internal/job/errors.go create mode 100644 internal/job/errors_test.go diff --git a/agent/run_job.go b/agent/run_job.go index f47a7bd046..bfd2e37d62 100644 --- a/agent/run_job.go +++ b/agent/run_job.go @@ -330,7 +330,7 @@ func (r *JobRunner) runJob(ctx context.Context) core.ProcessExit { // Run the process. This will block until it finishes. if err := r.process.Run(ctx); err != nil { // Send the error to job logs - _, _ = fmt.Fprintf(r.jobLogs, "Error running job: %s\n", err) + _, _ = fmt.Fprintf(r.jobLogs, "Error running job: %s\n", job.FormatJobError(err)) // The process did not run at all, so make sure it fails return core.ProcessExit{ diff --git a/internal/job/errors.go b/internal/job/errors.go new file mode 100644 index 0000000000..bdc28b71d4 --- /dev/null +++ b/internal/job/errors.go @@ -0,0 +1,38 @@ +package job + +import ( + "context" + "errors" +) + +// FormatJobError returns a human-friendly description of err for inclusion in +// the job log or agent stderr. +// +// Bare context errors (`context canceled`, `context deadline exceeded`) leak +// out of various code paths -- the kubernetes runner returns ctx.Err() +// directly, and shell errors that wrap a cancelled context can otherwise +// surface a Go default string. This helper normalises those cases so the user +// sees something they can act on. +// +// If err is nil, the returned string is empty. +func FormatJobError(err error) string { + switch { + case err == nil: + return "" + case errors.Is(err, context.DeadlineExceeded): + // context.DeadlineExceeded only fires when something explicitly set a + // deadline -- e.g. the in-process signal grace period in + // WithGracePeriod. From the user's perspective, that means a timeout + // was reached. + return "job timed out" + case errors.Is(err, context.Canceled): + // We can't tell from the agent side whether this came from a + // server-issued cancel (UI button or job-level timeout, both of which + // transition the job to the same `canceling` state) or from an + // operator interrupting the agent. "cancelled" is the honest umbrella + // term that doesn't lie about the cause. + return "job cancelled" + default: + return err.Error() + } +} diff --git a/internal/job/errors_test.go b/internal/job/errors_test.go new file mode 100644 index 0000000000..6dfcdb94b4 --- /dev/null +++ b/internal/job/errors_test.go @@ -0,0 +1,58 @@ +package job + +import ( + "context" + "errors" + "fmt" + "testing" +) + +func TestFormatJobError(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + err error + want string + }{ + { + name: "nil error", + err: nil, + want: "", + }, + { + name: "context.Canceled", + err: context.Canceled, + want: "job cancelled", + }, + { + name: "context.DeadlineExceeded", + err: context.DeadlineExceeded, + want: "job timed out", + }, + { + name: "wrapped context.Canceled", + err: fmt.Errorf("running command: %w", context.Canceled), + want: "job cancelled", + }, + { + name: "wrapped context.DeadlineExceeded", + err: fmt.Errorf("running command: %w", context.DeadlineExceeded), + want: "job timed out", + }, + { + name: "non-context error passes through", + err: errors.New("kaboom"), + want: "kaboom", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + if got := FormatJobError(tt.err); got != tt.want { + t.Errorf("FormatJobError(%v) = %q, want %q", tt.err, got, tt.want) + } + }) + } +} diff --git a/internal/job/executor.go b/internal/job/executor.go index 85795c7c51..f19eea73d9 100644 --- a/internal/job/executor.go +++ b/internal/job/executor.go @@ -271,7 +271,7 @@ func (e *Executor) Run(ctx context.Context) (exitCode int) { // an error from the hook/job logic. These are both good to report but // shouldn't override each other in reporting. if commandErr != nil { - e.shell.Printf("user command error: %v", commandErr) + e.shell.Printf("user command error: %s", FormatJobError(commandErr)) span.RecordError(commandErr) } @@ -1140,7 +1140,7 @@ func (e *Executor) CommandPhase(ctx context.Context) (hookErr, commandErr error) return nil, commandErr default: - e.shell.Errorf("%s", commandErr) + e.shell.Errorf("%s", FormatJobError(commandErr)) // error is not an exit error, we don't want to return it return nil, nil From 6dbd178797f17b85aca524f83b55f3cf206c88ae Mon Sep 17 00:00:00 2001 From: Manas Srivastava <40285830+mastermanas805@users.noreply.github.com> Date: Wed, 29 Apr 2026 17:24:08 +0530 Subject: [PATCH 2/3] Update errors.go cleaning --- internal/job/errors.go | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/internal/job/errors.go b/internal/job/errors.go index bdc28b71d4..f306a31387 100644 --- a/internal/job/errors.go +++ b/internal/job/errors.go @@ -5,32 +5,13 @@ import ( "errors" ) -// FormatJobError returns a human-friendly description of err for inclusion in -// the job log or agent stderr. -// -// Bare context errors (`context canceled`, `context deadline exceeded`) leak -// out of various code paths -- the kubernetes runner returns ctx.Err() -// directly, and shell errors that wrap a cancelled context can otherwise -// surface a Go default string. This helper normalises those cases so the user -// sees something they can act on. -// -// If err is nil, the returned string is empty. func FormatJobError(err error) string { switch { case err == nil: return "" case errors.Is(err, context.DeadlineExceeded): - // context.DeadlineExceeded only fires when something explicitly set a - // deadline -- e.g. the in-process signal grace period in - // WithGracePeriod. From the user's perspective, that means a timeout - // was reached. return "job timed out" case errors.Is(err, context.Canceled): - // We can't tell from the agent side whether this came from a - // server-issued cancel (UI button or job-level timeout, both of which - // transition the job to the same `canceling` state) or from an - // operator interrupting the agent. "cancelled" is the honest umbrella - // term that doesn't lie about the cause. return "job cancelled" default: return err.Error() From 552bd2a8705ac61bc087489ecdcac51ea2035391 Mon Sep 17 00:00:00 2001 From: Manas Srivastava Date: Tue, 5 May 2026 11:17:04 +0530 Subject: [PATCH 3/3] test: add regression test reproducing the bare context-canceled leak Drives a real kubernetes.Runner, cancels its context, and verifies both that runner.Run returns context.Canceled bare and that job.FormatJobError translates it to "job cancelled" at the agent/run_job.go call site. Locks in the user-visible message so that #3384 cannot regress, and documents the leak path that pre-dates this PR for any reviewer who can't reproduce against the non-k8s flow. --- kubernetes/repro_3384_test.go | 91 +++++++++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) create mode 100644 kubernetes/repro_3384_test.go diff --git a/kubernetes/repro_3384_test.go b/kubernetes/repro_3384_test.go new file mode 100644 index 0000000000..6cf3548076 --- /dev/null +++ b/kubernetes/repro_3384_test.go @@ -0,0 +1,91 @@ +//go:build !windows + +package kubernetes + +import ( + "context" + "errors" + "fmt" + "os" + "path/filepath" + "testing" + "time" + + "github.com/buildkite/agent/v3/internal/job" + "github.com/buildkite/agent/v3/logger" +) + +// TestRepro3384_K8sRunnerLeaksBareContextError reproduces the user-visible bug +// in #3384: when the agent context is cancelled (UI cancel or job timeout), +// kubernetes.Runner.Run returns ctx.Err() bare. The pre-fix line at +// agent/run_job.go was: +// +// fmt.Fprintf(r.jobLogs, "Error running job: %s\n", err) +// +// which produced `Error running job: context canceled` in the job log. This +// test drives a real Runner, cancels its context, and shows both the pre-fix +// and post-fix log lines. +func TestRepro3384_K8sRunnerLeaksBareContextError(t *testing.T) { + // Short dir name: macOS unix sockets cap at 104 bytes. + tempDir, err := os.MkdirTemp("", "r3384") + if err != nil { + t.Fatalf("MkdirTemp: %v", err) + } + t.Cleanup(func() { _ = os.RemoveAll(tempDir) }) + + runner := NewRunner(logger.Discard, RunnerConfig{ + SocketPath: filepath.Join(tempDir, "bk.sock"), + ClientCount: 0, // startupCheck disabled with ClientStartTimeout==0 + }) + + ctx, cancel := context.WithCancel(context.Background()) + runErrCh := make(chan error, 1) + go func() { runErrCh <- runner.Run(ctx) }() + + // Wait for the socket to appear so we know Run is in its select loop. + deadline := time.After(10 * time.Second) + tick := time.Tick(time.Millisecond) +loop: + for { + select { + case <-tick: + if _, err := os.Lstat(runner.conf.SocketPath); err == nil { + break loop + } + case <-deadline: + t.Fatal("runner socket never appeared") + } + } + + // Simulate the agent receiving a cancel (UI cancel / job timeout). + cancel() + + var runErr error + select { + case runErr = <-runErrCh: + case <-time.After(5 * time.Second): + t.Fatal("runner.Run did not return after ctx cancel") + } + + // Confirm the bare context error. + if !errors.Is(runErr, context.Canceled) { + t.Fatalf("runner.Run returned %v, want context.Canceled", runErr) + } + if runErr.Error() != "context canceled" { + t.Fatalf("runErr.Error() = %q, want %q (Go's bare sentinel)", runErr.Error(), "context canceled") + } + + // Pre-fix: agent/run_job.go printed err directly. + preFix := fmt.Sprintf("Error running job: %s", runErr) + t.Logf("PRE-FIX job log line: %q", preFix) + if preFix != "Error running job: context canceled" { + t.Fatalf("pre-fix line = %q, want %q", preFix, "Error running job: context canceled") + } + + // Post-fix: routed through job.FormatJobError. + postFix := fmt.Sprintf("Error running job: %s", job.FormatJobError(runErr)) + t.Logf("POST-FIX job log line: %q", postFix) + if postFix != "Error running job: job cancelled" { + t.Fatalf("post-fix line = %q, want %q", postFix, "Error running job: job cancelled") + } +}