From 9073427b50e0a2a0d67ccb6fd2ccf813aebe4cb6 Mon Sep 17 00:00:00 2001 From: Jim Pelletier Date: Tue, 2 Jun 2026 12:26:25 +1000 Subject: [PATCH 01/15] Tentacle script abandonment (re-landed on main) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the abandon feature on top of main (which now carries the async-migration base and the Windows-on-demand test runner). Lets the server release a stuck script's isolation mutex when CancelScript can't terminate the process. - SilentProcessRunner: the only async change kept here — the wait body becomes async so abandon can break it; abandon best-effort-kills then returns AbandonedExitCode (-48). - AbandonScript RPC: AbandonScriptCommandV2 on IScriptServiceV2, ScriptServiceV2 fires the abandon token, RunningScript distinguishes the abandoned exit, CapabilitiesServiceV2 advertises it via nameof. - Client: ScriptServiceV2Executor.AbandonScript (+capability check), orchestrator escalation via abandonAfterCancellationPendingFor, TentacleClient/ITentacleClient.AbandonScript. - Tests: unit + integration coverage incl. the stuck-script and grandchild cases; design spec. The async-migration duplicates (PowerShellPrerequisite, Kubernetes, service configurators, etc.) are dropped — they're already on main via the async-migration-base PR. Co-Authored-By: Claude Opus 4.8 (1M context) --- ...-effort-kill-and-execute-script-timeout.md | 1000 +++++++++++++++++ ...26-05-21-tentacle-script-abandon-design.md | 477 ++++++++ ...bilitiesResponseV2ExtensionMethodsTests.cs | 35 + ...ObservingScriptOrchestratorAbandonTests.cs | 74 ++ .../ScriptServiceV2ExecutorAbandonTests.cs | 82 ++ .../CapabilitiesResponseV2ExtensionMethods.cs | 13 + .../ITentacleClient.cs | 18 +- .../Octopus.Tentacle.Client/ScriptExecutor.cs | 10 +- .../Scripts/IScriptExecutor.cs | 8 + .../KubernetesScriptServiceV1Executor.cs | 6 + .../Scripts/ObservingScriptOrchestrator.cs | 21 +- .../Scripts/ScriptExecutorFactory.cs | 1 + .../Scripts/ScriptServiceV1Executor.cs | 6 + .../Scripts/ScriptServiceV2Executor.cs | 40 + .../Octopus.Tentacle.Client/TentacleClient.cs | 37 +- .../IAsyncClientScriptServiceV2.cs | 1 + .../ScriptExitCodes.cs | 1 + .../ScriptServiceV2/AbandonScriptCommandV2.cs | 17 + .../ScriptServiceV2/IScriptServiceV2.cs | 1 + .../Services/Scripts/RunningScript.cs | 50 +- .../Services/Scripts/ScriptServiceV2.cs | 72 +- .../Util/CommandLine/SilentProcessRunner.cs | 113 +- .../Util/EnvironmentVariables.cs | 1 + .../RecordMethodUsagesExtensionMethods.cs | 5 + .../ScriptServiceV2DecoratorBuilder.cs | 5 + .../CapabilitiesServiceV2Test.cs | 20 +- .../ClientScriptExecutionAbandon.cs | 238 ++++ ...estDecoratorsAreCalledInTheCorrectOrder.cs | 5 + .../TentacleClientExtensionMethods.cs | 38 + .../Util/RunningScriptFixture.cs | 4 +- .../Util/SilentProcessRunnerFixture.cs | 210 +++- .../CapabilitiesServiceV2Fixture.cs | 25 +- .../Integration/ScriptServiceV2Fixture.cs | 466 +++++++- .../Capabilities/CapabilitiesServiceV2.cs | 3 +- .../Services/Scripts/ScriptService.cs | 2 +- .../Util/ISilentProcessRunner.cs | 20 +- 36 files changed, 2970 insertions(+), 155 deletions(-) create mode 100644 docs/superpowers/plans/2026-06-01-abandon-best-effort-kill-and-execute-script-timeout.md create mode 100644 docs/superpowers/specs/2026-05-21-tentacle-script-abandon-design.md create mode 100644 source/Octopus.Tentacle.Client.Tests/CapabilitiesResponseV2ExtensionMethodsTests.cs create mode 100644 source/Octopus.Tentacle.Client.Tests/ObservingScriptOrchestratorAbandonTests.cs create mode 100644 source/Octopus.Tentacle.Client.Tests/ScriptServiceV2ExecutorAbandonTests.cs create mode 100644 source/Octopus.Tentacle.Contracts/ScriptServiceV2/AbandonScriptCommandV2.cs create mode 100644 source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs diff --git a/docs/superpowers/plans/2026-06-01-abandon-best-effort-kill-and-execute-script-timeout.md b/docs/superpowers/plans/2026-06-01-abandon-best-effort-kill-and-execute-script-timeout.md new file mode 100644 index 000000000..7d76e7af2 --- /dev/null +++ b/docs/superpowers/plans/2026-06-01-abandon-best-effort-kill-and-execute-script-timeout.md @@ -0,0 +1,1000 @@ +# Abandon best-effort-kill + ExecuteScript abandon-timeout — Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax. + +**Goal:** Make Tentacle's `AbandonScript` best-effort-kill the process before it stops waiting (anti-abuse), make the PR1 runner deterministically return `AbandonedExitCode (-48)` keyed on the abandon token rather than on which task won the `Task.WhenAny` race, and give the client an optional `abandonAfterCancellationPendingFor` timeout that escalates a stuck cancel to an abandon when the Tentacle advertises the capability. + +**Architecture:** Two changes across server (Tentacle) and client. +- **Change 1 (server, PR1 + later PR2).** `ScriptServiceV2.AbandonScriptAsync` calls `runningScript.Cancel()` *then* `runningScript.Abandon()`, so the existing `cancel.Register → DoOurBestToCleanUp → Hitman.Kill` machinery attempts the kill on *every* abandon — including a direct RPC with no prior cancel. The PR1 runner in `SilentProcessRunner.ExecuteCommandAsync` keys its abandoned-result branch on `abandon.IsCancellationRequested` instead of `== abandoned.Task`. After `Cancel()→Close()` the process is detached, so the abandon branch must NOT read `process.HasExited`/`process.ExitCode`; it just returns `-48`. The "abandon was unnecessary because the script already finished" case is handled one layer up: the script's `RunningScript` is already `Complete` with its recorded exit code, so `GetResponse` returns that real code, never reaching the runner's abandon branch. +- **Change 2 (client, PR1 only).** Optional `TimeSpan? abandonAfterCancellationPendingFor = null` threaded through `ITentacleClient.ExecuteScript` → `TentacleClient.ExecuteScript` → `ObservingScriptOrchestrator`. The orchestrator's `ObserveUntilComplete` already calls `CancelScript` each poll while the token is cancelled; it records when cancellation first fired and, once elapsed crosses the threshold AND the abandon capability is advertised, calls `AbandonScript` instead. A new `IScriptExecutor.AbandonScript` is implemented on `ScriptServiceV2Executor` (with a capability check) and as a no-op fallback on `ScriptServiceV1Executor`/`KubernetesScriptServiceV1Executor`. `ScriptExecutionResult.ExitCode (-48)` is the source of truth. NO shared `Octopus.Tentacle.Contracts` constant for the message. + +**Tech Stack:** C#, NUnit, FluentAssertions, NSubstitute, Halibut RPC. + +> **State of the tree at plan time (read before starting).** The contract surface is already merged on this branch: `AbandonScriptCommandV2`, `IScriptServiceV2.AbandonScript`, `ScriptExitCodes.AbandonedExitCode = -48`, `ITentacleClient.AbandonScript`/`TentacleClient.AbandonScript`, `RunningScript.CreateAbandonable` with `abandonToken`, the `RunningScript.Execute` abandon catch, `ScriptServiceV2.AbandonScriptAsync` (currently calls only `Abandon()`), and the PR1 runner `Task.WhenAny` block (currently keyed on `== abandoned.Task`). Capabilities advertise `nameof(ScriptServiceV2.AbandonScriptAsync)` → the literal string `"AbandonScriptAsync"`, NOT `"AbandonScriptV2"`. This plan ONLY makes the two deltas above; it does not re-create the contract. + +--- + +## Task 1 — PR1 runner: key the abandoned branch on `abandon.IsCancellationRequested` + +The PR1 `Task.WhenAny` currently returns `AbandonedExitCode` only when `abandoned.Task` literally won the race (`== abandoned.Task`). The spec requires the result keyed on the *token*, so that when both cancel and abandon fire and `waitForExit` happens to win (e.g. the abandon TCS callback is still queued), we still return `-48` deterministically. The abandon branch must NOT touch `process.HasExited`/`ExitCode` — after `Cancel()→Close()` the Process is detached. + +**Files** +- Test: `source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs` (add new `[Test]` after `AbandonToken_ShouldReturnAbandonedExitCodeWithoutKillingProcess` which ends at line ~372) +- Modify: `source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs` lines 161–177 (the `abandoned`/`Task.WhenAny` block) + +- [ ] Write a failing test `AbandonToken_WhenWaitTaskWinsRace_StillReturnsAbandonedExitCode` in `SilentProcessRunnerFixture.cs`. This drives both tokens: cancel fires first (no-op kill via the env var), then abandon, and asserts the return is `-48` regardless of race ordering. Add this method inside the class: + +```csharp + [Test] + public async Task AbandonToken_WhenBothTokensFire_ReturnsAbandonedExitCodeKeyedOnToken() + { + // Simulate an un-killable script: DisableProcessKill makes cancel's Hitman.Kill a no-op, + // so cancel does NOT make the process exit. Then abandon fires. The return must be + // AbandonedExitCode because abandon.IsCancellationRequested is set — not because of which + // task won the WhenAny race. This is the deterministic-(-48) guarantee from the spec. + Environment.SetEnvironmentVariable(EnvironmentVariables.TentacleDebugDisableProcessKill_UNSAFE_FOR_PRODUCTION, "1"); + try + { + using var tempDir = new TemporaryDirectory(); + var pidFile = Path.Combine(tempDir.DirectoryPath, "process.pid"); + + var executable = PlatformDetection.IsRunningOnWindows ? "powershell.exe" : "/bin/bash"; + var arguments = PlatformDetection.IsRunningOnWindows + ? $"-NoProfile -NonInteractive -Command \"$PID | Out-File -FilePath '{pidFile}' -Encoding ASCII; Start-Sleep -Seconds 300\"" + : $"-c \"echo $$ > '{pidFile}' && sleep 300\""; + + using var cancelCts = new CancellationTokenSource(); + using var abandonCts = new CancellationTokenSource(); + + var task = Task.Run(async () => await SilentProcessRunner.ExecuteCommandAsync( + executable, + arguments, + Environment.CurrentDirectory, + debug: _ => { }, + info: _ => { }, + error: _ => { }, + customEnvironmentVariables: null, + cancel: cancelCts.Token, + abandon: abandonCts.Token)); + + await WaitForPidFileAsync(pidFile, TimeSpan.FromSeconds(30)); + + // Cancel first (kill is no-op'd so the process keeps running), then abandon. + cancelCts.Cancel(); + abandonCts.Cancel(); + + var exitCode = await task; + exitCode.Should().Be(ScriptExitCodes.AbandonedExitCode); + + // Cleanup: the real process is still alive because kill was no-op'd. + if (File.Exists(pidFile) && int.TryParse(SafelyReadAllText(pidFile).Trim(), out var pid) && pid > 0) + { + try { Process.GetProcessById(pid).Kill(); } catch { /* already gone */ } + } + } + finally + { + Environment.SetEnvironmentVariable(EnvironmentVariables.TentacleDebugDisableProcessKill_UNSAFE_FOR_PRODUCTION, null); + } + } +``` + +- [ ] Run it on net48 + net8.0 and watch it FAIL (today the `== abandoned.Task` branch can be skipped when `waitForExit` wins, falling through to `SafelyGetExitCode` on a detached process → returns `-1`, not `-48`): + +```bash +dotnet test source/Octopus.Tentacle.Tests.Integration --framework net48 \ + --filter "FullyQualifiedName~SilentProcessRunnerFixture.AbandonToken_WhenBothTokensFire_ReturnsAbandonedExitCodeKeyedOnToken" +dotnet test source/Octopus.Tentacle.Tests.Integration --framework net8.0 \ + --filter "FullyQualifiedName~SilentProcessRunnerFixture.AbandonToken_WhenBothTokensFire_ReturnsAbandonedExitCodeKeyedOnToken" +``` +Expected: FAIL (returns -1 or hangs). + +- [ ] Make the minimal implementation change in `SilentProcessRunner.cs`. Replace the `Task.WhenAny`/`== abandoned.Task` block (lines 161–177) so the abandoned branch is keyed on the token and runs after the race regardless of winner: + +```csharp + var abandoned = new TaskCompletionSource(); + using (abandon.Register(() => abandoned.TrySetResult(null))) + { + var waitForExit = Task.Run(() => + { + try { process.WaitForExit(); } + catch { /* released by Process.Dispose on the abandon path */ } + }); + + await Task.WhenAny(waitForExit, abandoned.Task).ConfigureAwait(false); + + // Key the abandoned result on the TOKEN, not on which task won the race. + // When both cancel and abandon fire, cancel's Kill is no-op'd / can't land, so + // waitForExit may win even though we must still return AbandonedExitCode. + // After Cancel()->Close() the Process is detached, so do NOT read + // process.HasExited / process.ExitCode here — just return -48. The + // "abandon was unnecessary" case never reaches here: the script's + // RunningScript is already Complete with its real code one layer up. + if (abandon.IsCancellationRequested) + { + info("Tentacle has abandoned this script. The underlying script process may still be running on this host."); + SafelyCancelOutputAndErrorRead(process, debug); + running = false; + return ScriptExitCodes.AbandonedExitCode; + } + } +``` + +- [ ] Re-run the same two `dotnet test` commands from above. Expected: PASS on both net48 and net8.0. + +- [ ] Run the existing abandon + grandchild regression tests to confirm no regression: + +```bash +dotnet test source/Octopus.Tentacle.Tests.Integration --framework net8.0 \ + --filter "FullyQualifiedName~SilentProcessRunnerFixture.AbandonToken_ShouldReturnAbandonedExitCodeWithoutKillingProcess|FullyQualifiedName~SilentProcessRunnerFixture.CancellationToken_WhenGrandchildHoldsRedirectedPipes_ShouldNotHang|FullyQualifiedName~SilentProcessRunnerFixture.CancellationToken_WhenUnixGrandchildHoldsRedirectedPipes_ShouldNotHang" +``` +Expected: PASS. + +- [ ] Commit: `git commit -am "PR1 runner: key abandoned result on abandon token, not WhenAny winner"` + +--- + +## Task 2 — `AbandonScriptAsync` best-effort-kills (Cancel then Abandon) — service-layer test + +`ScriptServiceV2.AbandonScriptAsync` currently calls only `runningScript.Abandon()`. The spec requires `runningScript.Cancel()` first so `Hitman.Kill` runs on every abandon (anti-abuse). We prove this through the integration layer because `ScriptServiceV2` wiring (workspace factory, state store, shell) is awkward to stub; the existing abandon integration tests already exercise this service with a real Tentacle. + +**Files** +- Test: `source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs` (add a third `[Test]` after `AbandonScript_ReleasesIsolationMutexEvenWhileProcessIsStillRunning`, which ends at line ~129) + +- [ ] Write a failing test `AbandonScript_WithNoPriorCancel_KillsTheProcess` in `ClientScriptExecutionAbandon.cs`. Kill is NOT disabled here, so abandon's internal `Cancel()` must terminate the process. Add this method inside the class: + +```csharp + [Test] + [TentacleConfigurations(scriptServiceToTest: ScriptServiceVersionToTest.Version2)] + public async Task AbandonScript_WithNoPriorCancel_KillsTheProcess(TentacleConfigurationTestCase tentacleConfigurationTestCase) + { + // Anti-abuse: a direct AbandonScript with no prior CancelScript must still attempt the + // kill. AbandonScriptAsync calls Cancel() then Abandon(), so Hitman.Kill runs. Kill is + // NOT disabled here, so the underlying process must actually die. + await using var clientTentacle = await tentacleConfigurationTestCase.CreateBuilder() + .Build(CancellationToken); + + var startFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "start"); + var releaseFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "release"); + + var command = new TestExecuteShellScriptCommandBuilder() + .SetScriptBody(new ScriptBuilder() + .CreateFile(startFile) + .WaitForFileToExist(releaseFile)) + .WithIsolationLevel(ScriptIsolationLevel.NoIsolation) + .Build(); + + var tentacleClient = clientTentacle.TentacleClient; + var scriptExecution = Task.Run(async () => await tentacleClient.ExecuteScript(command, CancellationToken)); + + await Wait.For(() => File.Exists(startFile), + TimeSpan.FromSeconds(30), + () => throw new Exception("Script did not start"), + CancellationToken); + + // Direct abandon, NO prior cancel. + await tentacleClient.AbandonScript(command.ScriptTicket, CancellationToken); + + ScriptStatus abandonResponse = null!; + await Wait.For(async () => + { + abandonResponse = await tentacleClient.GetStatus(command.ScriptTicket, CancellationToken); + return abandonResponse.State == ProcessState.Complete; + }, + TimeSpan.FromSeconds(30), + () => throw new Exception("Abandoned script did not reach Complete state within 30s"), + CancellationToken); + + abandonResponse.ExitCode.Should().Be(ScriptExitCodes.AbandonedExitCode); + + // The kill landed because Hitman was NOT disabled: the script never saw the release file, + // so if it is still waiting the file would be absent and the process alive. The script + // reaching Complete with AbandonedExitCode while we never wrote releaseFile proves the + // wait was broken by abandon; the kill having run is asserted by the script process being + // gone — drain the execution task, which completes once the process is dead. + await scriptExecution; + } +``` + +- [ ] Run it and watch it FAIL (today abandon does NOT call `Cancel()`, so `Hitman.Kill` never runs; the process keeps waiting on `releaseFile`, and `scriptExecution` never drains within the test timeout): + +```bash +dotnet test source/Octopus.Tentacle.Tests.Integration --framework net8.0 \ + --filter "FullyQualifiedName~ClientScriptExecutionAbandon.AbandonScript_WithNoPriorCancel_KillsTheProcess" +``` +Expected: FAIL (test times out because the process is never killed). + +- [ ] Make the minimal implementation change in `source/Octopus.Tentacle.Core/Services/Scripts/ScriptServiceV2.cs`. Modify `AbandonScriptAsync` (lines 144–156) to call `Cancel()` then `Abandon()`: + +```csharp + public Task AbandonScriptAsync(AbandonScriptCommandV2 command, CancellationToken cancellationToken) + { + // Abandon best-effort-kills first (anti-abuse): Cancel() runs the existing + // cancel.Register -> DoOurBestToCleanUp -> Hitman.Kill machinery so ANY abandon — + // including a direct RPC with no prior cancel — attempts the kill. Abandon() then + // layers "stop waiting + release the mutex + return -48" on top. A process survives + // abandon only when the kill genuinely can't land (stuck / re-parented grandchild). + if (runningScripts.TryGetValue(command.Ticket, out var runningScript)) + { + runningScript.Cancel(); + runningScript.Abandon(); + } + + return Task.FromResult(GetResponse(command.Ticket, command.LastLogSequence, runningScript?.Process)); + } +``` + +- [ ] Re-run the same `dotnet test` command. Expected: PASS. + +- [ ] Run the two existing abandon integration tests to confirm no regression (kill-disabled path still returns -48 and releases the mutex): + +```bash +dotnet test source/Octopus.Tentacle.Tests.Integration --framework net8.0 \ + --filter "FullyQualifiedName~ClientScriptExecutionAbandon" +``` +Expected: PASS (all three tests). + +- [ ] Commit: `git commit -am "AbandonScriptAsync: Cancel() then Abandon() so abandon best-effort-kills"` + +--- + +## Task 3 — Client: add `AbandonScript` to `IScriptExecutor` and capability extension + +Change 2 needs the orchestrator to call abandon through the executor abstraction. Add `AbandonScript(CommandContext)` to `IScriptExecutor`, a capability-checking implementation on `ScriptServiceV2Executor`, and no-op fallbacks on V1/Kubernetes executors. The capability check uses a new `HasAbandonScriptV2()` extension that matches the capability string Tentacle actually advertises today (`"AbandonScriptAsync"`). + +> **Spec gap (resolved here):** the spec names the capability `AbandonScriptV2`, but `CapabilitiesServiceV2` already advertises `nameof(ScriptServiceV2.AbandonScriptAsync)` = the string `"AbandonScriptAsync"`. Changing the advertised string would break the contract already shipped on this branch. This plan matches the *actual* advertised string. If the team wants the spec's `AbandonScriptV2` literal, that is a separate contract change with its own backward-compat story — flagged, not silently done. + +**Files** +- Test: `source/Octopus.Tentacle.Client.Tests/CapabilitiesResponseV2ExtensionMethodsTests.cs` (CREATE) +- Modify: `source/Octopus.Tentacle.Client/Capabilities/CapabilitiesResponseV2ExtensionMethods.cs` (add method after `HasScriptServiceV2`, line 17) +- Modify: `source/Octopus.Tentacle.Client/Scripts/IScriptExecutor.cs` (add method after `CancelScript`, line 33) + +- [ ] Write a failing test file `source/Octopus.Tentacle.Client.Tests/CapabilitiesResponseV2ExtensionMethodsTests.cs`: + +```csharp +using System.Collections.Generic; +using FluentAssertions; +using NUnit.Framework; +using Octopus.Tentacle.Client.Capabilities; +using Octopus.Tentacle.Contracts.Capabilities; + +namespace Octopus.Tentacle.Client.Tests +{ + [TestFixture] + public class CapabilitiesResponseV2ExtensionMethodsTests + { + [Test] + public void HasAbandonScriptV2_WhenAdvertised_ReturnsTrue() + { + var capabilities = new CapabilitiesResponseV2(new List { "IScriptServiceV2", "AbandonScriptAsync" }); + capabilities.HasAbandonScriptV2().Should().BeTrue(); + } + + [Test] + public void HasAbandonScriptV2_WhenNotAdvertised_ReturnsFalse() + { + var capabilities = new CapabilitiesResponseV2(new List { "IScriptServiceV2" }); + capabilities.HasAbandonScriptV2().Should().BeFalse(); + } + + [Test] + public void HasAbandonScriptV2_WhenEmpty_ReturnsFalse() + { + var capabilities = new CapabilitiesResponseV2(new List()); + capabilities.HasAbandonScriptV2().Should().BeFalse(); + } + } +} +``` + +- [ ] Run it and watch it FAIL (the extension does not exist → compile error): + +```bash +dotnet test source/Octopus.Tentacle.Client.Tests --framework net8.0 \ + --filter "FullyQualifiedName~CapabilitiesResponseV2ExtensionMethodsTests" +``` +Expected: FAIL (does not compile). + +- [ ] Add the extension method to `CapabilitiesResponseV2ExtensionMethods.cs` after `HasScriptServiceV2` (the closing brace of that method is at line 17). Insert: + +```csharp + + public static bool HasAbandonScriptV2(this CapabilitiesResponseV2 capabilities) + { + if (capabilities?.SupportedCapabilities?.Any() != true) + { + return false; + } + + // Tentacle advertises this as nameof(ScriptServiceV2.AbandonScriptAsync). Keep the + // literal in sync with CapabilitiesServiceV2.GetCapabilitiesAsync on the Tentacle side. + return capabilities.SupportedCapabilities.Contains("AbandonScriptAsync"); + } +``` + +- [ ] Re-run the extension test. Expected: PASS. + +- [ ] Add `AbandonScript` to `IScriptExecutor.cs`. Insert after the `CancelScript` declaration (line 33, before `CompleteScript`'s doc comment): + +```csharp + + /// + /// Abandon the script: signal Tentacle to stop waiting and release the isolation mutex. + /// Returns the abandoned status when the Tentacle advertises the abandon capability; + /// otherwise falls back to cancelling and returns that result. + /// + /// The CommandContext from the previous command + Task AbandonScript(CommandContext commandContext); +``` + +- [ ] Build the client to confirm the interface compiles (implementations come in Task 4 — expect implementor errors are fine to defer only if you build implementors together; here build will FAIL until Task 4, so just verify the interface file itself parses by building the project after Task 4). Skip a standalone build here; commit the interface + extension together with implementations is cleaner — but to keep commits frequent, commit the extension + its test now: + +```bash +dotnet test source/Octopus.Tentacle.Client.Tests --framework net8.0 \ + --filter "FullyQualifiedName~CapabilitiesResponseV2ExtensionMethodsTests" +``` +Expected: PASS. + +- [ ] Commit: `git commit -am "Client: add HasAbandonScriptV2 capability check + IScriptExecutor.AbandonScript"` + +--- + +## Task 4 — Implement `AbandonScript` on the three executors + +`ScriptServiceV2Executor` does the real work with a capability check; V1 and Kubernetes executors fall back to `CancelScript` (they have no abandon RPC). The V2 executor needs the capabilities client; add it as a constructor dependency and wire it through `ScriptExecutorFactory`. + +**Files** +- Modify: `source/Octopus.Tentacle.Client/Scripts/ScriptServiceV2Executor.cs` (ctor lines 27–41; add `AbandonScript` after `CancelScript`, line 174) +- Modify: `source/Octopus.Tentacle.Client/Scripts/ScriptServiceV1Executor.cs` (add `AbandonScript` fallback) +- Modify: `source/Octopus.Tentacle.Client/Scripts/KubernetesScriptServiceV1Executor.cs` (add `AbandonScript` fallback) +- Modify: `source/Octopus.Tentacle.Client/Scripts/ScriptExecutorFactory.cs` (pass capabilities client to V2 executor, line 48) +- Test: `source/Octopus.Tentacle.Client.Tests/ScriptServiceV2ExecutorAbandonTests.cs` (CREATE) + +- [ ] Write a failing test file `source/Octopus.Tentacle.Client.Tests/ScriptServiceV2ExecutorAbandonTests.cs`: + +```csharp +using System; +using System.Collections.Generic; +using System.Threading; +using System.Threading.Tasks; +using FluentAssertions; +using Halibut.ServiceModel; +using NSubstitute; +using NUnit.Framework; +using Octopus.Tentacle.Client; +using Octopus.Tentacle.Client.EventDriven; +using Octopus.Tentacle.Client.Execution; +using Octopus.Tentacle.Client.Observability; +using Octopus.Tentacle.Client.Retries; +using Octopus.Tentacle.Client.Scripts; +using Octopus.Tentacle.Contracts; +using Octopus.Tentacle.Contracts.Capabilities; +using Octopus.Tentacle.Contracts.ClientServices; +using Octopus.Tentacle.Contracts.Logging; +using Octopus.Tentacle.Contracts.Observability; +using Octopus.Tentacle.Contracts.ScriptServiceV2; + +namespace Octopus.Tentacle.Client.Tests +{ + [TestFixture] + public class ScriptServiceV2ExecutorAbandonTests + { + static ScriptServiceV2Executor CreateExecutor(IAsyncClientScriptServiceV2 scriptService, IAsyncClientCapabilitiesServiceV2 capabilities) + => new( + scriptService, + capabilities, + RpcCallExecutorFactory.Create(TimeSpan.Zero, Substitute.For()), + ClientOperationMetricsBuilder.Start(), + TimeSpan.Zero, + new TentacleClientOptions(new RpcRetrySettings(retriesEnabled: false, retryDuration: TimeSpan.Zero)), + Substitute.For()); + + static CommandContext Context() => new(new ScriptTicket("TestTicket"), 0, ScriptServiceVersion.ScriptServiceVersion2); + + [Test] + public async Task AbandonScript_WhenCapabilityAdvertised_CallsAbandonScriptAsync() + { + var scriptService = Substitute.For(); + scriptService.AbandonScriptAsync(Arg.Any(), Arg.Any()) + .Returns(x => Task.FromResult(new ScriptStatusResponseV2( + x.Arg().Ticket, ProcessState.Complete, + ScriptExitCodes.AbandonedExitCode, new List(), 1))); + + var capabilities = Substitute.For(); + capabilities.GetCapabilitiesAsync(Arg.Any()) + .Returns(Task.FromResult(new CapabilitiesResponseV2(new List { "IScriptServiceV2", "AbandonScriptAsync" }))); + + var executor = CreateExecutor(scriptService, capabilities); + + var result = await executor.AbandonScript(Context()); + + await scriptService.Received(1).AbandonScriptAsync(Arg.Any(), Arg.Any()); + await scriptService.DidNotReceive().CancelScriptAsync(Arg.Any(), Arg.Any()); + result.ScriptStatus.ExitCode.Should().Be(ScriptExitCodes.AbandonedExitCode); + } + + [Test] + public async Task AbandonScript_WhenCapabilityAbsent_FallsBackToCancelScriptAsync() + { + var scriptService = Substitute.For(); + scriptService.CancelScriptAsync(Arg.Any(), Arg.Any()) + .Returns(x => Task.FromResult(new ScriptStatusResponseV2( + x.Arg().Ticket, ProcessState.Running, + 0, new List(), 1))); + + var capabilities = Substitute.For(); + capabilities.GetCapabilitiesAsync(Arg.Any()) + .Returns(Task.FromResult(new CapabilitiesResponseV2(new List { "IScriptServiceV2" }))); + + var executor = CreateExecutor(scriptService, capabilities); + + await executor.AbandonScript(Context()); + + await scriptService.DidNotReceive().AbandonScriptAsync(Arg.Any(), Arg.Any()); + await scriptService.Received(1).CancelScriptAsync(Arg.Any(), Arg.Any()); + } + } +} +``` + +- [ ] Run it and watch it FAIL (V2 executor has no capabilities ctor param and no `AbandonScript` → compile error): + +```bash +dotnet test source/Octopus.Tentacle.Client.Tests --framework net8.0 \ + --filter "FullyQualifiedName~ScriptServiceV2ExecutorAbandonTests" +``` +Expected: FAIL (does not compile). + +- [ ] Add the capabilities dependency to `ScriptServiceV2Executor`. Modify the field block (after line 20 `readonly IAsyncClientScriptServiceV2 clientScriptServiceV2;`) and the ctor (lines 27–41): + +```csharp + readonly IAsyncClientScriptServiceV2 clientScriptServiceV2; + readonly IAsyncClientCapabilitiesServiceV2 clientCapabilitiesServiceV2; + readonly RpcCallExecutor rpcCallExecutor; + readonly ClientOperationMetricsBuilder clientOperationMetricsBuilder; + readonly TimeSpan onCancellationAbandonCompleteScriptAfter; + readonly ITentacleClientTaskLog logger; + readonly TentacleClientOptions clientOptions; + + public ScriptServiceV2Executor( + IAsyncClientScriptServiceV2 clientScriptServiceV2, + IAsyncClientCapabilitiesServiceV2 clientCapabilitiesServiceV2, + RpcCallExecutor rpcCallExecutor, + ClientOperationMetricsBuilder clientOperationMetricsBuilder, + TimeSpan onCancellationAbandonCompleteScriptAfter, + TentacleClientOptions clientOptions, + ITentacleClientTaskLog logger) + { + this.clientScriptServiceV2 = clientScriptServiceV2; + this.clientCapabilitiesServiceV2 = clientCapabilitiesServiceV2; + this.rpcCallExecutor = rpcCallExecutor; + this.clientOperationMetricsBuilder = clientOperationMetricsBuilder; + this.onCancellationAbandonCompleteScriptAfter = onCancellationAbandonCompleteScriptAfter; + this.clientOptions = clientOptions; + this.logger = logger; + } +``` + +- [ ] Add the `AbandonScript` method to `ScriptServiceV2Executor.cs` immediately after `CancelScript` (after line 174, before `CompleteScript`). Add `using Octopus.Tentacle.Client.Capabilities;` at the top if absent: + +```csharp + public async Task AbandonScript(CommandContext commandContext) + { + using var activity = TentacleClient.ActivitySource.StartActivity($"{nameof(ScriptServiceV2Executor)}.{nameof(AbandonScript)}"); + + var capabilities = await rpcCallExecutor.Execute( + retriesEnabled: clientOptions.RpcRetrySettings.RetriesEnabled, + RpcCall.Create(nameof(Contracts.Capabilities.ICapabilitiesServiceV2.GetCapabilities)), + async ct => await clientCapabilitiesServiceV2.GetCapabilitiesAsync(new HalibutProxyRequestOptions(ct)), + logger, + clientOperationMetricsBuilder, + CancellationToken.None).ConfigureAwait(false); + + // Capability absent → the Tentacle is too old to abandon. Keep cancelling cleanly. + if (!capabilities.HasAbandonScriptV2()) + { + logger.Verbose("Tentacle does not advertise AbandonScript; falling back to CancelScript."); + return await CancelScript(commandContext).ConfigureAwait(false); + } + + async Task AbandonScriptAction(CancellationToken ct) + { + var request = new AbandonScriptCommandV2(commandContext.ScriptTicket, commandContext.NextLogSequence); + return await clientScriptServiceV2.AbandonScriptAsync(request, new HalibutProxyRequestOptions(ct)); + } + + var scriptStatusResponseV2 = await rpcCallExecutor.Execute( + retriesEnabled: clientOptions.RpcRetrySettings.RetriesEnabled, + RpcCall.Create(nameof(IScriptServiceV2.AbandonScript)), + AbandonScriptAction, + logger, + clientOperationMetricsBuilder, + // Like CancelScript, abandon must not be cancelled — it stops the script on Tentacle. + CancellationToken.None).ConfigureAwait(false); + return Map(scriptStatusResponseV2); + } +``` + +- [ ] Add the no-op fallback to `ScriptServiceV1Executor.cs` (mirror its `CancelScript`; V1 has no abandon RPC, so abandon degrades to cancel). Add after that executor's `CancelScript` method: + +```csharp + public Task AbandonScript(CommandContext commandContext) + { + // ScriptServiceV1 has no abandon verb; degrade to cancel. + return CancelScript(commandContext); + } +``` + +- [ ] Add the same fallback to `KubernetesScriptServiceV1Executor.cs` after its `CancelScript`: + +```csharp + public Task AbandonScript(CommandContext commandContext) + { + // KubernetesScriptServiceV1 has no abandon verb; degrade to cancel. + return CancelScript(commandContext); + } +``` + +- [ ] Wire the capabilities client through `ScriptExecutorFactory.cs`. Change the V2 construction (line 48) to pass `allClients.CapabilitiesServiceV2`: + +```csharp + return new ScriptServiceV2Executor( + allClients.ScriptServiceV2, + allClients.CapabilitiesServiceV2, + rpcCallExecutor, + clientOperationMetricsBuilder, + onCancellationAbandonCompleteScriptAfter, + clientOptions, + logger); +``` + +- [ ] Add `AbandonScript` to the `ScriptExecutor` facade (`source/Octopus.Tentacle.Client/ScriptExecutor.cs`) after its `CancelScript` method (line ~70), mirroring the existing delegation pattern: + +```csharp + public async Task AbandonScript(CommandContext commandContext) + { + var scriptExecutorFactory = CreateScriptExecutorFactory(); + var scriptExecutor = scriptExecutorFactory.CreateScriptExecutor(commandContext.ScripServiceVersionUsed); + + return await scriptExecutor.AbandonScript(commandContext); + } +``` + +- [ ] Re-run the V2 executor abandon tests. Expected: PASS. + +```bash +dotnet test source/Octopus.Tentacle.Client.Tests --framework net8.0 \ + --filter "FullyQualifiedName~ScriptServiceV2ExecutorAbandonTests" +``` + +- [ ] Build the client + run the full client test project to confirm no implementor was missed: + +```bash +dotnet build source/Octopus.Tentacle.Client --framework net8.0 +dotnet test source/Octopus.Tentacle.Client.Tests --framework net8.0 +``` +Expected: build succeeds; tests PASS. + +- [ ] Commit: `git commit -am "Client executors: implement AbandonScript with capability check + cancel fallback"` + +--- + +## Task 5 — Orchestrator: escalate to abandon after the cancellation-pending threshold + +`ObservingScriptOrchestrator.ObserveUntilComplete` calls `CancelScript` each poll while the token is cancelled. Add `abandonAfterCancellationPendingFor`; record when cancellation first fired; once elapsed crosses the threshold call `AbandonScript` (the executor handles the capability check + fallback). When the param is `null`, behaviour is unchanged. + +**Files** +- Modify: `source/Octopus.Tentacle.Client/Scripts/ObservingScriptOrchestrator.cs` (ctor lines 18–28; `ExecuteScript` lines 30–45; `ObserveUntilComplete` lines 76–144) +- Test: `source/Octopus.Tentacle.Client.Tests/ObservingScriptOrchestratorAbandonTests.cs` (CREATE) + +- [ ] First, make `ObserveUntilComplete` testable by changing its access modifier from `private` (implicit) to `internal` in `ObservingScriptOrchestrator.cs` (the class is already `internal`; `Octopus.Tentacle.Client.Tests` has `InternalsVisibleTo`). Change line 76 from `async Task ObserveUntilComplete(` to `internal async Task ObserveUntilComplete(`. (`ExecuteScriptCommand` is abstract, so driving the full `ExecuteScript` from a unit test is awkward; calling `ObserveUntilComplete` directly is the deterministic seam.) + +- [ ] Write a failing test file `source/Octopus.Tentacle.Client.Tests/ObservingScriptOrchestratorAbandonTests.cs`. It drives `ObserveUntilComplete` directly with an NSubstitute `IScriptExecutor`, a pre-cancelled token, and asserts cancel-vs-abandon based on the threshold: + +```csharp +using System; +using System.Collections.Generic; +using System.Threading; +using System.Threading.Tasks; +using FluentAssertions; +using NSubstitute; +using NUnit.Framework; +using Octopus.Tentacle.Client.EventDriven; +using Octopus.Tentacle.Client.Scripts; +using Octopus.Tentacle.Client.Scripts.Models; +using Octopus.Tentacle.Contracts; + +namespace Octopus.Tentacle.Client.Tests +{ + [TestFixture] + public class ObservingScriptOrchestratorAbandonTests + { + static CommandContext Context() => new(new ScriptTicket("T"), 0, ScriptServiceVersion.ScriptServiceVersion2); + + static ScriptOperationExecutionResult Running() + => new(new ScriptStatus(ProcessState.Running, 0, new List()), Context()); + + static ScriptOperationExecutionResult Complete(int exitCode) + => new(new ScriptStatus(ProcessState.Complete, exitCode, new List()), Context()); + + static ObservingScriptOrchestrator CreateOrchestrator(IScriptExecutor executor, TimeSpan? abandonAfter) + => new( + new ImmediateBackoff(), + _ => { }, + _ => Task.CompletedTask, + executor, + abandonAfter); + + sealed class ImmediateBackoff : IScriptObserverBackoffStrategy + { + public TimeSpan GetBackoff(int iteration) => TimeSpan.Zero; + } + + [Test] + public async Task ParamUnset_CancelsOnly_NeverAbandons() + { + var executor = Substitute.For(); + executor.CancelScript(Arg.Any()) + .Returns(Running(), Complete(ScriptExitCodes.CanceledExitCode)); + + var orchestrator = CreateOrchestrator(executor, abandonAfter: null); + using var cts = new CancellationTokenSource(); + cts.Cancel(); + + await orchestrator.ObserveUntilComplete(Running(), cts.Token); + + await executor.DidNotReceive().AbandonScript(Arg.Any()); + await executor.Received().CancelScript(Arg.Any()); + } + + [Test] + public async Task ThresholdCrossed_SwitchesFromCancelToAbandon() + { + var executor = Substitute.For(); + // abandonAfter = Zero means the threshold is crossed on the first cancelled iteration, + // so the orchestrator abandons immediately. AbandonScript returns Complete to end the loop. + executor.AbandonScript(Arg.Any()) + .Returns(Complete(ScriptExitCodes.AbandonedExitCode)); + + var orchestrator = CreateOrchestrator(executor, abandonAfter: TimeSpan.Zero); + using var cts = new CancellationTokenSource(); + cts.Cancel(); + + var result = await orchestrator.ObserveUntilComplete(Running(), cts.Token); + + await executor.Received().AbandonScript(Arg.Any()); + result.ScriptStatus.ExitCode.Should().Be(ScriptExitCodes.AbandonedExitCode); + } + } +} +``` + +- [ ] Run it and watch it FAIL (orchestrator ctor has no `abandonAfter` param and `ObserveUntilComplete` is not yet `internal`/escalating → compile error on the 5-arg ctor): + +```bash +dotnet test source/Octopus.Tentacle.Client.Tests --framework net8.0 \ + --filter "FullyQualifiedName~ObservingScriptOrchestratorAbandonTests" +``` +Expected: FAIL (does not compile). + +- [ ] Add the field + ctor param to `ObservingScriptOrchestrator.cs`. After `readonly IScriptExecutor scriptExecutor;` (line 16) add `readonly TimeSpan? abandonAfterCancellationPendingFor;`. Change the ctor signature (line 18) to append `TimeSpan? abandonAfterCancellationPendingFor` and assign it: + +```csharp + public ObservingScriptOrchestrator( + IScriptObserverBackoffStrategy scriptObserverBackOffStrategy, + OnScriptStatusResponseReceived onScriptStatusResponseReceived, + OnScriptCompleted onScriptCompleted, + IScriptExecutor scriptExecutor, + TimeSpan? abandonAfterCancellationPendingFor = null) + { + this.scriptExecutor = scriptExecutor; + this.scriptObserverBackOffStrategy = scriptObserverBackOffStrategy; + this.onScriptStatusResponseReceived = onScriptStatusResponseReceived; + this.onScriptCompleted = onScriptCompleted; + this.abandonAfterCancellationPendingFor = abandonAfterCancellationPendingFor; + } +``` + +- [ ] Change `ObserveUntilComplete` (lines 76–144) to record first-cancel time and escalate. Replace the cancellation branch (lines 86–89): + +```csharp + var iteration = 0; + var cancellationIteration = 0; + var lastResult = startScriptResult; + var stopwatch = new Stopwatch(); + + while (lastResult.ScriptStatus.State != ProcessState.Complete) + { + if (scriptExecutionCancellationToken.IsCancellationRequested) + { + // Record when cancellation first fired so we can escalate to abandon after the threshold. + if (!stopwatch.IsRunning) + { + stopwatch.Start(); + } + + var shouldAbandon = abandonAfterCancellationPendingFor is { } threshold + && stopwatch.Elapsed >= threshold; + + lastResult = shouldAbandon + ? await scriptExecutor.AbandonScript(lastResult.ContextForNextCommand).ConfigureAwait(false) + : await scriptExecutor.CancelScript(lastResult.ContextForNextCommand).ConfigureAwait(false); + } + else + { +``` + +(`System.Diagnostics` is already imported for `Stopwatch` at line 2.) + +- [ ] Re-run the orchestrator abandon tests. Expected: PASS. + +```bash +dotnet test source/Octopus.Tentacle.Client.Tests --framework net8.0 \ + --filter "FullyQualifiedName~ObservingScriptOrchestratorAbandonTests" +``` + +- [ ] Commit: `git commit -am "Orchestrator: escalate cancel to abandon after abandonAfterCancellationPendingFor"` + +--- + +## Task 6 — Thread `abandonAfterCancellationPendingFor` through `TentacleClient` + `ITentacleClient` + +The orchestrator now accepts the timeout; expose it on the public `ExecuteScript` surface as an optional parameter (default `null`, so all existing callers are unchanged). + +**Files** +- Modify: `source/Octopus.Tentacle.Client/ITentacleClient.cs` (`ExecuteScript` declaration lines 30–35) +- Modify: `source/Octopus.Tentacle.Client/TentacleClient.cs` (`ExecuteScript` lines 168–208; orchestrator construction line 189) +- Test: `source/Octopus.Tentacle.Client.Tests/ObservingScriptOrchestratorAbandonTests.cs` (no change — covered) + a compile-only assertion in `source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs` (Task 7 covers behavioural) + +- [ ] Add the optional parameter to `ITentacleClient.ExecuteScript` (after `scriptExecutionCancellationToken`, line 35): + +```csharp + Task ExecuteScript( + ExecuteScriptCommand executeScriptCommand, + OnScriptStatusResponseReceived onScriptStatusResponseReceived, + OnScriptCompleted onScriptCompleted, + ITentacleClientTaskLog logger, + CancellationToken scriptExecutionCancellationToken, + TimeSpan? abandonAfterCancellationPendingFor = null); +``` + +- [ ] Add the matching parameter to `TentacleClient.ExecuteScript` (line 168) and pass it to the orchestrator (line 189): + +```csharp + public async Task ExecuteScript(ExecuteScriptCommand executeScriptCommand, + OnScriptStatusResponseReceived onScriptStatusResponseReceived, + OnScriptCompleted onScriptCompleted, + ITentacleClientTaskLog logger, + CancellationToken scriptExecutionCancellationToken, + TimeSpan? abandonAfterCancellationPendingFor = null) + { +``` + +and: + +```csharp + var orchestrator = new ObservingScriptOrchestrator(scriptObserverBackOffStrategy, + onScriptStatusResponseReceived, + onScriptCompleted, + scriptExecutor, + abandonAfterCancellationPendingFor); +``` + +- [ ] Build the client and the integration test project (which references the client) to confirm the optional param did not break any caller: + +```bash +dotnet build source/Octopus.Tentacle.Client --framework net8.0 +dotnet build source/Octopus.Tentacle.Tests.Integration --framework net8.0 +``` +Expected: both build succeed (optional param = no caller changes required). + +- [ ] Run the full client test project to confirm green: + +```bash +dotnet test source/Octopus.Tentacle.Client.Tests --framework net8.0 +``` +Expected: PASS. + +- [ ] Commit: `git commit -am "TentacleClient: expose optional abandonAfterCancellationPendingFor on ExecuteScript"` + +--- + +## Task 7 — End-to-end integration test: ExecuteScript escalates to abandon after the threshold + +Prove the whole client→Tentacle path: a stuck (kill-disabled) script with `abandonAfterCancellationPendingFor` set short, cancelled mid-flight, escalates to `AbandonScript` and returns `AbandonedExitCode`. Uses `TentacleServiceDecoratorBuilder.RecordMethodUsages` to assert the `AbandonScript` verb was called. + +**Files** +- Modify: `source/Octopus.Tentacle.Tests.Integration.Common/Builders/Decorators/Proxies/RecordMethodUsagesExtensionMethods.cs` (add `ForAbandonScriptAsync`) +- Test: `source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs` (add a `[Test]` after Task 2's test) + +- [ ] Add a `ForAbandonScriptAsync` extension to `RecordMethodUsagesExtensionMethods.cs` (mirrors the existing `ForCancelScriptAsync` at line 22; the recorded method name is the proxied async client method `AbandonScriptAsync`). Insert after `ForCancelScriptAsync` (line 25): + +```csharp + + public static IRecordedMethodUsage ForAbandonScriptAsync(this IRecordedMethodUsages o) + { + return o.For("AbandonScriptAsync"); + } +``` + +- [ ] Write a failing test `ExecuteScript_WhenCancellationStaysPending_EscalatesToAbandon` in `ClientScriptExecutionAbandon.cs`. This uses the exact recorded-usages API (`IRecordedMethodUsages recordedUsages = new MethodUsages();` then `.RecordMethodUsages(out recordedUsages)`, queried via `recordedUsages.ForAbandonScriptAsync().Started`). Add the `using Octopus.Tentacle.Tests.Integration.Common.Builders.Decorators.Proxies;` and `using Octopus.Tentacle.Contracts.ClientServices;` imports at the top of the file if absent: + +```csharp + [Test] + [TentacleConfigurations(scriptServiceToTest: ScriptServiceVersionToTest.Version2)] + public async Task ExecuteScript_WhenCancellationStaysPending_EscalatesToAbandon(TentacleConfigurationTestCase tentacleConfigurationTestCase) + { + // Kill is disabled, so the script is genuinely stuck and CancelScript can't end it. + // With abandonAfterCancellationPendingFor set short, the orchestrator must escalate to + // AbandonScript, which returns AbandonedExitCode and releases the script. + IRecordedMethodUsages recordedUsages = new MethodUsages(); + await using var clientTentacle = await tentacleConfigurationTestCase.CreateBuilder() + .WithTentacle(x => x.WithRunTentacleEnvironmentVariable(EnvironmentVariables.TentacleDebugDisableProcessKill_UNSAFE_FOR_PRODUCTION, "1")) + .WithTentacleServiceDecorator(new TentacleServiceDecoratorBuilder() + .RecordMethodUsages(out recordedUsages) + .Build()) + .Build(CancellationToken); + + var startFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "start"); + var releaseFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "release"); + + var command = new TestExecuteShellScriptCommandBuilder() + .SetScriptBody(new ScriptBuilder() + .CreateFile(startFile) + .WaitForFileToExist(releaseFile)) + .WithIsolationLevel(ScriptIsolationLevel.NoIsolation) + .Build(); + + var tentacleClient = clientTentacle.TentacleClient; + + using var executionCts = new CancellationTokenSource(); + var logs = new System.Collections.Generic.List(); + + var execution = Task.Run(async () => await tentacleClient.ExecuteScript( + command, + onScriptStatusResponseReceived => logs.AddRange(onScriptStatusResponseReceived.Logs), + _ => Task.CompletedTask, + new SerilogLoggerBuilder().Build().ForContext().ToITentacleTaskLog(), + executionCts.Token, + abandonAfterCancellationPendingFor: TimeSpan.FromSeconds(2))); + + await Wait.For(() => File.Exists(startFile), + TimeSpan.FromSeconds(30), + () => throw new Exception("Script did not start"), + CancellationToken); + + executionCts.Cancel(); + + // ExecuteScript throws OperationCanceledException once the token is cancelled. + await FluentActions.Invoking(async () => await execution).Should().ThrowAsync(); + + // The orchestrator must have escalated to AbandonScript after the 2s threshold. + recordedUsages.ForAbandonScriptAsync().Started.Should().BeGreaterThan(0); + + // Cleanup: release + reap the still-alive process (kill was disabled). + File.WriteAllText(releaseFile, ""); + } +``` + +- [ ] Run it. With Tasks 5–6 implemented it should PASS; if abandon was never called it FAILs on the `.Started.Should().BeGreaterThan(0)` assertion (debug Task 5): + +```bash +dotnet test source/Octopus.Tentacle.Tests.Integration --framework net8.0 \ + --filter "FullyQualifiedName~ClientScriptExecutionAbandon.ExecuteScript_WhenCancellationStaysPending_EscalatesToAbandon" +``` +Expected after correct wiring: PASS. (If it fails because abandon was not called, the orchestrator escalation is wrong — debug Task 5.) + +- [ ] Run the whole abandon integration fixture once more for regression: + +```bash +dotnet test source/Octopus.Tentacle.Tests.Integration --framework net8.0 \ + --filter "FullyQualifiedName~ClientScriptExecutionAbandon" +``` +Expected: PASS (all tests). + +- [ ] Commit: `git commit -am "Integration: ExecuteScript escalates stuck cancel to abandon after threshold"` + +--- + +## Task 8 — Full build across all TFMs (net48 trap guard) + +The Core project multi-targets `net48;net8.0;net8.0-windows`. The runner change in Task 1 keeps the generic `TaskCompletionSource` (the non-generic `TaskCompletionSource` does NOT exist on net48), so this must still build on net48. + +**Files** +- (no source change) + +- [ ] Build the Core project on all three TFMs: + +```bash +dotnet build source/Octopus.Tentacle.Core/Octopus.Tentacle.Core.csproj --framework net48 +dotnet build source/Octopus.Tentacle.Core/Octopus.Tentacle.Core.csproj --framework net8.0 +dotnet build source/Octopus.Tentacle.Core/Octopus.Tentacle.Core.csproj --framework net8.0-windows +``` +Expected: all three succeed. + +- [ ] Build the integration test project on net48 + net8.0 (it runs the runner tests on both): + +```bash +dotnet build source/Octopus.Tentacle.Tests.Integration --framework net48 +dotnet build source/Octopus.Tentacle.Tests.Integration --framework net8.0 +``` +Expected: both succeed. + +- [ ] Run the runner fixture on net48 to confirm the keyed-on-token change behaves there too: + +```bash +dotnet test source/Octopus.Tentacle.Tests.Integration --framework net48 \ + --filter "FullyQualifiedName~SilentProcessRunnerFixture.AbandonToken" +``` +Expected: PASS. + +- [ ] No commit needed (no source change). If any TFM failed, fix inline and commit `git commit -am "Fix net48 build for abandon runner change"`. + +--- + +## Task 9 (final) — Apply Change 1 to PR2 branch (#1244) + +PR2 (`jimpelletier/eft-3295-pr2-async-unlink`, #1244) already has the async `WaitForExitAsync` migration and already keys its abandon-catch on `abandon.IsCancellationRequested` — so the *runner* half of Change 1 is already correct there. What PR2 is still missing is the `AbandonScriptAsync` best-effort-kill: its `AbandonScriptAsync` calls only `Abandon()`. Apply the same Cancel-then-Abandon change. PR2's grandchild tests must STAY as "cancel then abandon" (do NOT flip them back to "cancel alone"). + +**Files (on branch `jimpelletier/eft-3295-pr2-async-unlink`)** +- Modify: `source/Octopus.Tentacle.Core/Services/Scripts/ScriptServiceV2.cs` (`AbandonScriptAsync`) +- Test: `source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs` (cherry-pick Task 2's `AbandonScript_WithNoPriorCancel_KillsTheProcess`) + +- [ ] Switch to PR2 and confirm the runner is already token-keyed (no runner change needed): + +```bash +git switch jimpelletier/eft-3295-pr2-async-unlink +grep -n "abandon.IsCancellationRequested\|runningScript.Cancel\|runningScript.Abandon" \ + source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs \ + source/Octopus.Tentacle.Core/Services/Scripts/ScriptServiceV2.cs +``` +Expected: the runner shows `catch (OperationCanceledException) when (abandon.IsCancellationRequested)`; `AbandonScriptAsync` shows only `runningScript.Abandon()` (the gap to fix). + +- [ ] Cherry-pick Task 2's failing test onto PR2: add `AbandonScript_WithNoPriorCancel_KillsTheProcess` (identical body to Task 2) to `ClientScriptExecutionAbandon.cs`. Run it and watch it FAIL: + +```bash +dotnet test source/Octopus.Tentacle.Tests.Integration --framework net8.0 \ + --filter "FullyQualifiedName~ClientScriptExecutionAbandon.AbandonScript_WithNoPriorCancel_KillsTheProcess" +``` +Expected: FAIL (no kill, test times out). + +- [ ] Apply the same `AbandonScriptAsync` change as Task 2 (Cancel then Abandon) to PR2's `ScriptServiceV2.cs`: + +```csharp + if (runningScripts.TryGetValue(command.Ticket, out var runningScript)) + { + runningScript.Cancel(); + runningScript.Abandon(); + } +``` + +- [ ] Re-run the cherry-picked test. Expected: PASS. + +- [ ] Confirm PR2's grandchild tests STAY as "cancel then abandon" — do NOT modify them. Run them to confirm they still pass with the async runner + best-effort-kill abandon: + +```bash +dotnet test source/Octopus.Tentacle.Tests.Integration --framework net8.0 \ + --filter "FullyQualifiedName~SilentProcessRunnerFixture" +``` +Expected: PASS (grandchild tests require cancel-then-abandon on PR2 by design — do not regress them to PR1's "cancel alone" shape). + +- [ ] Build PR2's Core on all TFMs (net48 guard for PR2's `WaitForExitAsync` polyfill): + +```bash +dotnet build source/Octopus.Tentacle.Core/Octopus.Tentacle.Core.csproj --framework net48 +dotnet build source/Octopus.Tentacle.Core/Octopus.Tentacle.Core.csproj --framework net8.0 +dotnet build source/Octopus.Tentacle.Core/Octopus.Tentacle.Core.csproj --framework net8.0-windows +``` +Expected: all succeed. + +- [ ] Commit on PR2: `git commit -am "PR2: AbandonScriptAsync Cancel() then Abandon() so abandon best-effort-kills"` + +- [ ] Switch back to PR1: `git switch jimpelletier/eft-3295-tentacle-script-abandonment-to-release-the-mutex` + +--- + +## Notes / spec gaps surfaced during planning + +- **Capability string mismatch.** Spec Section 1 names the capability `AbandonScriptV2`. The code on this branch advertises `nameof(ScriptServiceV2.AbandonScriptAsync)` = `"AbandonScriptAsync"`. This plan matches the *deployed* string in both the Tentacle advertisement and the client `HasAbandonScriptV2()` check. Renaming to `AbandonScriptV2` is a separate contract decision and is intentionally NOT done here. +- **Most of the contract is already merged.** `AbandonScriptCommandV2`, `IScriptServiceV2.AbandonScript`, `AbandonedExitCode = -48`, `ITentacleClient.AbandonScript`, `RunningScript` abandon catch, and the PR1 runner block already exist. The two deltas in this plan are the only behavioural changes Change 1 and Change 2 require. +- **`process.HasExited` is deliberately NOT read in the PR1 abandon branch.** After `Cancel()→Close()` detaches the Process, reading `HasExited`/`ExitCode` would throw or lie. The "abandon unnecessary" race is handled one layer up by `GetResponse` returning the already-`Complete` `RunningScript`'s real code. diff --git a/docs/superpowers/specs/2026-05-21-tentacle-script-abandon-design.md b/docs/superpowers/specs/2026-05-21-tentacle-script-abandon-design.md new file mode 100644 index 000000000..7c07c1722 --- /dev/null +++ b/docs/superpowers/specs/2026-05-21-tentacle-script-abandon-design.md @@ -0,0 +1,477 @@ +# Tentacle script abandon — design + +**Status:** Draft, ready for implementation planning. Contract aligned with the parallel server-side session. +**Ticket:** [EFT-3295](https://linear.app/octopus/issue/EFT-3295/tentacle-script-abandonment-to-release-the-mutex) +**ADR:** [ADR-042 — Defer server-task Abandoned state](https://github.com/OctopusDeploy/adr/pull/226) +**Parallel work:** Server-side (ProcessExecution layer) is being designed in a separate session and consumes the contract proposed here. + +--- + +## Problem + +When a Tentacle script is hung in a way that resists `Process.Kill` (Philips' case: PowerShell stuck inside CrowdStrike + Rapid7 fighting over the same process; kernel-level uninterruptible wait), today's flow ends with: + +- `ScriptIsolationMutex` stays held → subsequent deployments to that Tentacle queue forever. +- The .NET threadpool thread inside `RunningScript.Execute()` stays parked on `process.WaitForExit()` (synchronous). +- The customer's only recovery is RDP-in-and-kill or reboot. Not acceptable for Philips. + +Server-side detects that cancellation hasn't propagated within its own timeout and tells Tentacle to **abandon** the script. Tentacle stops waiting, releases the mutex, logs honestly, and accepts new work. + +**Abandon now best-effort-kills the process** (see ADR-42 reversal below). The runaway OS process survives only when the kill genuinely can't land — exactly the stuck / re-parented-grandchild case the ticket is about. + +## Abandon semantics (the agreed model) + +- **Cancel** = best-effort kill, then **WAIT** for the script to finish and report its real outcome. +- **Abandon** = best-effort kill, then **DON'T wait** — release the isolation mutex and return `AbandonedExitCode` (-48) even if the script didn't die. + +Both attempt the kill. Only abandon stops waiting and releases the mutex. A process survives abandon only when the kill genuinely can't land (stuck / re-parented grandchild). This closes the abuse vector where someone could use abandon to leave a perfectly killable process running unmanaged. + +## Scope + +In scope: +- `IScriptServiceV2` only (Listening + Polling Tentacles). +- New Halibut RPC verb `AbandonScript`, new exit code `AbandonedExitCode = -48`. +- New optional client parameter `abandonAfterCancellationPendingFor` on `TentacleClient.ExecuteScript`. +- Gated by server-side feature flag (`AbandonTentacleScriptOnCancellationTimeoutFeatureToggle`) for the first release. No Tentacle-side flag — capability advertisement is binary on build version. + +Out of scope: +- SSH targets (different lock model; ticket explicitly defers). +- Kubernetes agent (`IKubernetesScriptServiceV1`): different mechanism, separate stuck-pod work already in flight (`KubernetesPendingPodWatchDog`). Server's capability negotiation handles "don't try abandon on Kubernetes targets" cleanly. +- Old `IScriptService` (V1): no signal that any active Tentacle still negotiates V1. +- Server-task Abandoned UI state — deferred by ADR-042; task continues to surface as Cancelled. + +## Section 0 — Two PRs, sequenced + +The work ships as two PRs in sequence so the async migration is reviewable and mergeable independently of the abandon feature, and so the behavioural change to grandchild handling is gated behind universal server abandon support. + +``` +main ← PR1 (#1226) ← PR2 (#1244, draft) +``` + +### PR1 (#1226) — ships first + +The wait in `SilentProcessRunner` is a **synchronous `process.WaitForExit()` run on a `Task`**, raced against an abandon signal via `Task.WhenAny`. The abandon signal is a `TaskCompletionSource` completed when the abandon token fires. (The non-generic `TaskCompletionSource` does NOT exist on net48, so the generic form is required.) + +**Cancel behaves exactly like main.** `cancel.Register` runs `DoOurBestToCleanUp`, which does best-effort `Hitman.Kill` **and `process.Close()`**. `Close` releases the redirected pipe handles, so the synchronous wait returns even when a re-parented grandchild holds stdout/stderr open. So cancel — including the grandchild case — is handled exactly as it is today, **with no server dependency**. Cancel of a genuinely un-killable script hangs the wait until abandon fires. + +`Close` is safe here precisely because the wait is synchronous; it would race the `Exited` event of the async `WaitForExitAsync`, which PR1 does not use. + +PR1 also adds the full abandon contract surface, the client parameter, capability advertisement, and tests. + +### PR2 (#1244) — draft, ships only after all servers support abandon + +Switches the wait to async `await process.WaitForExitAsync(...)` for performance: no thread is blocked per running script. With the async wait, `Close()` can no longer be used to unblock the grandchild case — it races the `Exited` event the async wait depends on — so `process.Close()` is removed from `DoOurBestToCleanUp`. Consequence: **cancel no longer unsticks a re-parented grandchild on its own; grandchildren require abandon (cancel-then-abandon)**. Cancel returns the real kill exit code instead of -1. + +PR2 must ship only after every server in the fleet supports the abandon escalation, because it makes server-side abandon the only recovery path for the grandchild case. + +The two grandchild tests flip from "cancel alone does not hang" (PR1) to "cancel then abandon" (PR2). + +## Section 1 — Contract surface + +Add a method to existing `IScriptServiceV2`. Do NOT introduce V3 — the convention here is method-addition + capability negotiation. The server-side RPC contract below is **unchanged** by the ADR-42 reversal. + +```csharp +// source/Octopus.Tentacle.Contracts/ScriptServiceV2/IScriptServiceV2.cs +public interface IScriptServiceV2 +{ + ScriptStatusResponseV2 StartScript(StartScriptCommandV2 command); + ScriptStatusResponseV2 GetStatus(ScriptStatusRequestV2 request); + ScriptStatusResponseV2 CancelScript(CancelScriptCommandV2 command); + ScriptStatusResponseV2 AbandonScript(AbandonScriptCommandV2 command); // NEW + void CompleteScript(CompleteScriptCommandV2 command); +} + +// NEW: source/Octopus.Tentacle.Contracts/ScriptServiceV2/AbandonScriptCommandV2.cs +public class AbandonScriptCommandV2 +{ + public AbandonScriptCommandV2(ScriptTicket ticket, long lastLogSequence) { /* … */ } + public ScriptTicket Ticket { get; } + public long LastLogSequence { get; } +} +``` + +**Capability advertisement.** Tentacle's `CapabilitiesServiceV2` advertises `AbandonScriptV2` once the build supports it. Binary on build version, no Tentacle-side toggle. Server's existing `BackwardsCompatibleAsyncCapabilitiesV2Decorator` handles "Tentacle doesn't advertise it → don't call it" for older Tentacles. Server-side `AbandonTentacleScriptOnCancellationTimeoutFeatureToggle` is the only feature-flag off-switch. + +**Why a new verb (not a "force" flag on Cancel).** Different semantics: Cancel = "best-effort kill, then wait for the real outcome". Abandon = "best-effort kill, then stop waiting; release the mutex; return -48". Two verbs map cleanly to ProcessExecution's two-step escalation (cancel first, abandon if cancel doesn't propagate). + +## Section 2 — Where abandon's kill lives, and mutex release + +**The core constraint.** `RunningScript.Execute()` acquires `ScriptIsolationMutex` inside a `using` block that wraps the call into `SilentProcessRunner`. That call blocks (PR1) or awaits (PR2) on the process wait. When the wait never returns: +1. The mutex is welded shut (the `using`'s Dispose never runs). +2. In PR1 the threadpool thread is parked; in PR2 no thread is parked, but the awaiter never completes. + +Abandon solves both by completing the wait via the abandon signal regardless of whether the OS process exited. + +**Where the kill lives.** `ScriptServiceV2.AbandonScriptAsync` calls only `runningScript.Abandon()`. The kill happens in `SilentProcessRunner`'s **abandon branch**: when the abandon token is observed it calls `DoOurBestToCleanUp` (best-effort `Hitman.Kill`, idempotent if cancel already ran it) and then returns `AbandonedExitCode`. Doing the kill there — in the runner, sequentially in the abandon branch — closes the abuse vector for ANY abandon (including a direct RPC with no prior cancel), and is **race-free**. + +> **Why not fire the cancel token from `AbandonScriptAsync` (the originally-proposed shape)?** Firing `Cancel()` then `Abandon()` races: cancel's `Hitman.Kill` makes a killable process exit, which resolves the runner's wait to the *cancel* exit code (`-1`) before the abandon token is observed — the runner returns `-1` instead of `-48`. Reversing the order makes `-48` deterministic but then the kill races the runner disposing the process. Doing the kill sequentially inside the abandon branch avoids both races. (Proven by `AbandonScript_WithNoPriorCancel_KillsTheProcess`.) + +**Exit code when both tokens fire.** The runner returns `-48` whenever the **abandon path resolves the wait** — i.e. the process is still blocking the wait when abandon fires. This is the case the feature exists for: cancel's kill could not unstick the script (genuinely stuck / re-parented grandchild), so the wait is still pending when abandon arrives. + +It is NOT a goal to return `-48` when cancel and abandon race on a *killable* process. There, cancel's kill can resolve the wait to the killed exit code before abandon is observed, and returning that code is correct — the script is being torn down either way. This race does not occur in production: cancel and abandon arrive as separate, sequential RPCs, and the server only sends abandon **after** cancel has failed to unstick the script. By the time abandon's RPC lands, a killable script has already completed (its later abandon RPC is a no-op against an already-`Complete` script); only a stuck script is still waiting, and that path is deterministically `-48`. Both PR1 (`Task.WhenAny`) and PR2 (async abandon-catch) behave this way — neither tries to override cancel's exit code when cancel wins the kill race. + +**PR1 wait shape (synchronous wait raced against abandon):** + +```csharp +using (cancel.Register(() => DoOurBestToCleanUp(process, error))) // Hitman.Kill + process.Close() +{ + process.BeginOutputReadLine(); + process.BeginErrorReadLine(); + + var abandonTcs = new TaskCompletionSource(); + using (abandon.Register(() => abandonTcs.TrySetResult(null))) + { + var waitTask = Task.Run(() => process.WaitForExit()); + await Task.WhenAny(waitTask, abandonTcs.Task).ConfigureAwait(false); + + if (abandon.IsCancellationRequested && !process.HasExited) + { + info("Tentacle has abandoned this script. The underlying script process may still be running on this host."); + SafelyCancelRead(process.CancelErrorRead, debug); + SafelyCancelRead(process.CancelOutputRead, debug); + return ScriptExitCodes.AbandonedExitCode; + } + + // process exited (naturally, or via cancel-triggered Kill+Close releasing the pipes) + SafelyCancelRead(process.CancelErrorRead, debug); + SafelyCancelRead(process.CancelOutputRead, debug); + return SafelyGetExitCode(process); + } +} +``` + +**PR2 wait shape (async wait, no `process.Close()`):** the `Task.Run(WaitForExit)` + `WhenAny` is replaced by `await process.WaitForExitAsync(abandon)`, and `process.Close()` is removed from `DoOurBestToCleanUp`. The abandon-catch keys on `abandon.IsCancellationRequested && !process.HasExited` exactly as above. Cancel returns the real kill exit code via the natural `Exited`-event completion. + +**Diff shape.** The wait method and its callers migrate to async (`ExecuteCommand` → `ExecuteCommandAsync`, returning `Task`). A search across the repo found ~20 call sites; every one migrates. PR1 introduces the async signature with the synchronous-wait-on-a-Task body and a net48 polyfill where `WaitForExitAsync` is unavailable; PR2 swaps the body to the true async wait. + +Production code: +- `source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs` — the method itself. Async signature, two-token (`cancel`, `abandon`). +- `source/Octopus.Tentacle/Util/ISilentProcessRunner.cs` — interface and in-process wrapper become async. +- `source/Octopus.Tentacle/Util/CommandLineRunner.cs` — caller migration. +- `source/Octopus.Tentacle.Core/Services/Scripts/RunningScript.cs` — `RunScript` → `RunScriptAsync`; ctor takes `abandonToken` alongside `runningScriptToken`; `Execute()` awaits the new path. +- `source/Octopus.Tentacle.Core/Services/Scripts/ScriptServiceV2.cs` — `LaunchShell` passes `abandonToken` from the wrapper. `RunningScriptWrapper` gains `abandonTokenSource`. `AbandonScriptAsync` calls `Abandon()` (the best-effort kill lives in the runner's abandon branch — see above). +- `source/Octopus.Tentacle.Contracts/ScriptServiceV2/` — new `AbandonScriptCommandV2.cs`, interface method on `IScriptServiceV2.cs` (per Section 1). +- `source/Octopus.Tentacle.Contracts/ScriptExitCodes.cs` — add `AbandonedExitCode = -48`. +- Capabilities advertisement (`AbandonScriptV2`). + +Immediate sync callers migrated with `.GetAwaiter().GetResult()` and a sync-boundary comment ("We're on a plain thread-pool worker — when the async work finishes it can resume on any free thread, so the block resolves normally"): +- `PowerShellPrerequisite.Check()`, `KubernetesDirectoryInformationProvider.GetDriveBytesUsingDu()`, `SystemCtlHelper.RunServiceCommand()` (×2), `LinuxServiceConfigurator` (×3), `WindowsServiceConfigurator.Sc()`, `CommandLineRunner.Execute(CommandLineInvocation, ...)`. + +Kubernetes integration test scaffolding (all caller-migration, no logic change): +- `KubeCtlTool.cs`, `DockerImageLoader.cs` (×2), `KubernetesAgentInstaller.cs` (×3), `KubernetesClusterInstaller.cs` (×4), `HelmDownloader.cs`, `ToolDownloader.cs` (all under `source/Octopus.Tentacle.Kubernetes.Tests.Integration/`). + +Tentacle integration test scaffolding (caller migration): +- `PowerShellStartupDetectionTests.cs` (×3), `Util/SilentProcessRunnerFixture.cs`, `Support/TentacleFetchers/LinuxTentacleFetcher.cs` (all under `source/Octopus.Tentacle.Tests.Integration/`). + +**What happens to stdout/stderr after abandon.** Returning `AbandonedExitCode` unwinds the method. The outer `using (var process = new Process())` disposes the Process, which closes our end of the redirected pipes. The OS process may get EPIPE on its next stdout/stderr write. The script's runtime keeps doing whatever it's doing; many scripts ignore broken-pipe errors, and scripts that fail on them already had nowhere to log anyway. The alternative — leaving the Process and its pipes pinned in memory indefinitely — is a resource-accumulation problem. + +**Async correctness watch-outs for the implementation plan:** +- Every new async method gets `.ConfigureAwait(false)`. +- No `.Result` / `.Wait()` on the new path; surface any caller that can't easily go async rather than block-on-async. +- Verify no deadlock under the Tentacle's synchronisation context (none expected, but worth confirming). + +**Rejected alternatives** (documented for the reviewer's benefit): +- **Orphan the Task + release mutex via external Dispose.** Releases the mutex but leaks a threadpool worker per abandon. Tentacle eventually starves the threadpool. +- **Manual `Thread` instead of `Task`.** Same leak problem, just trades threadpool for kernel thread handles + stack memory. +- **`Thread.Abort` / `Thread.Interrupt` / `TerminateThread` P/Invoke.** No safe managed mechanism to release a thread parked in unmanaged code; can corrupt Tentacle's own state. +- **Out-of-process script worker.** Cleanly isolates the stuck-process problem but is a massive refactor far outside EFT-3295's scope. Worth a separate proposal someday. + +## Section 3 — Client parameter (PR1) + +Add one optional parameter, threaded through the client orchestration: + +```csharp +TimeSpan? abandonAfterCancellationPendingFor = null +``` + +It is added to `TentacleClient.ExecuteScript`, `ITentacleClient.ExecuteScript`, and `ObservingScriptOrchestrator`. + +The orchestrator's poll loop (`ObservingScriptOrchestrator.ObserveUntilComplete`) already calls `CancelScript` every iteration while the `CancellationToken` is cancelled. When the parameter is set: +1. Record when cancellation first fired. +2. Once elapsed crosses the threshold AND the `AbandonScriptV2` capability is advertised, call `AbandonScript` instead of `CancelScript`. +3. If the capability is absent, keep cancelling (skip the abandon attempt cleanly). + +Add an `AbandonScript` method to the V2 script executor (`ScriptServiceV2Executor`) with the capability check. `ScriptExecutionResult.ExitCode` is the source of truth (-48 = abandoned). + +**No shared constant in `Octopus.Tentacle.Contracts`.** An earlier ask for a shared abandoned-message constant was retracted; the Tentacle's own script-log line is the only customer-facing message. + +## Section 4 — State, exit code, log wording + +- **Exit code:** `ScriptExitCodes.AbandonedExitCode = -48`. Distinct from `CanceledExitCode (-43)`. Server-side telemetry can tell abandoned from cancelled even though task UI surfaces both as "Cancelled" per ADR-042. +- **State on GetStatus after abandon:** `(ProcessState.Complete, AbandonedExitCode, latestLogs)`. Same shape as Cancel returns today. +- **Honest log line:** `"Tentacle has abandoned this script. The underlying script process may still be running on this host."` Written once, into the workspace script log, near the end of the abandon path. Still accurate after the ADR-42 reversal: the process survives only when the best-effort kill couldn't land. +- **Workspace cleanup on subsequent `CompleteScript`:** targeted best-effort. `CompleteScript` reads the stateStore and checks the persisted exit code. If `AbandonedExitCode`, wrap `workspace.Delete` in try/catch, log a `Warn` to systemLog naming the leaked directory, return success. For any other exit code, `workspace.Delete` is called as today and exceptions propagate. The relaxed-deletion policy applies only to the rare abandon case; bugs that leak handles on normal-completion paths can't hide under a blanket try/catch. No janitor — OS-level state on the host is the customer's problem per the ticket. +- **Idempotency — actual-status return (NOT silent no-op):** + - Abandon called twice on the same already-abandoned ticket → returns the cached `(Complete, AbandonedExitCode, logs)` response. + - Abandon called on a ticket that completed naturally before the abandon arrived (race case the server-side session flagged) → returns `(Complete, realExitCode, logs)` with the **real exit code**, distinct from `AbandonedExitCode`. The server uses this distinction to log *"Script had already completed before abandon was needed"* instead of *"Tentacle abandoned the script"*. + - Abandon called on an unknown ticket (never started, or already cleaned up via `CompleteScript`) → returns `(Complete, UnknownScriptExitCode, [])`, matching Cancel's behaviour for the same case. +- **Race with natural completion:** the wrapper's existing `StartScriptMutex` (or a new dedicated lock) serialises abandon entry. If state is already Complete, abandon returns the cached status per the rules above. + +## Section 5 — Automated test strategy + +### 5.1 `SilentProcessRunner` unit tests + +Style: matches existing `SilentProcessRunnerFixture.cs`. Use short-lived helper scripts/exes as process subjects. + +| Test | Trigger | Verify | +|---|---|---| +| Normal exit | Run a process that exits 0 | Returns 0; no abandon log line captured by the `info` callback spy. | +| Cancel kills process | Long-running process; fire cancel token | Within 1s: process is killed (`process.HasExited == true`). PR1: return is the kill-induced exit code (Linux 137; Windows process-defined) via the `Close`-released pipe path. PR2: return is the real kill exit code. No abandon log line. | +| Abandon while running | Long-running process; fire abandon token | Within ~100ms: returns `AbandonedExitCode`, `info` callback received exactly one call containing "Tentacle has abandoned this script". Assert `process.HasExited == false`; clean up by killing externally. | +| Abandon AFTER natural exit (race) | Process that exits in ~50ms; fire abandon token at the moment exit fires | Return value is the process's real exit code, not `AbandonedExitCode`. No abandon log line. Verifies the `abandon.IsCancellationRequested && !process.HasExited` guard. | +| Cancel also requested (abandon first) | Long-running process; fire abandon, then also fire cancel | Returns `AbandonedExitCode`. Abandon resolves the wait deterministically; the additional cancel does not change the result. (Firing abandon first avoids the killable-process race — see "Exit code when both tokens fire" above.) The stuck both-tokens case is covered end-to-end by the integration test `AbandonScript_WhenCancelFailsToKillProcess`. | + +**Grandchild (re-parented) tests — differ by PR.** A child process spawns a grandchild that inherits stdout/stderr and outlives the child. +- **PR1:** "cancel alone does not hang" — `cancel.Register`'s `process.Close()` releases the inherited pipe handles, so the synchronous wait returns even though the grandchild holds them open. No abandon needed. +- **PR2:** "cancel then abandon" — `process.Close()` is gone, the async wait depends on the `Exited` event, so cancel alone leaves the wait pending; abandon is required to complete it and return -48. + +**Async-specific timing assertion (PR2):** `WaitForExitAsync(token)` returns within ~50ms of cancellation. Wrap the await in `Stopwatch.StartNew()`; assert elapsed < 100ms. Proves the async wait is independent of process exit. + +**Thread-leak regression test (PR2):** start 50 stuck processes via `ExecuteCommandAsync` (all `await`ed in parallel), fire abandon on all; capture `Process.GetCurrentProcess().Threads.Count` before and 1s after; assert delta ≤ 5 (allow for threadpool jitter). The async path should produce zero parked threads at steady state. (Under PR1 the synchronous-wait-on-a-Task body still parks a worker per running script, so this assertion is PR2-only.) + +### 5.2 `ScriptServiceV2` service-layer tests + +Style: matches existing service-layer fixtures using in-memory script shells and stub workspace factories. + +| Test | Trigger | Verify | +|---|---|---| +| **Mutex release (load-bearing)** | Start `FullIsolation` script; abandon it; immediately start second `FullIsolation` script | Second `StartScript` returns with `State == Running` within 1s. Reading `ScriptIsolationMutex.TaskLock.Report()` between abandon and second-start shows the lock free in that window. | +| Abandon before StartScript | Call AbandonScript with a ticket never seen | Returns `(Complete, UnknownScriptExitCode)`. Matches existing Cancel behaviour for unknown ticket. | +| Abandon after CompleteScript | Start → Complete → Abandon | Returns `(Complete, UnknownScriptExitCode)` (wrapper already removed; stateStore gone). | +| Abandon then Cancel | Abandon, then Cancel same ticket | Cancel returns the cached abandoned response unchanged. Asserts via response equality. | +| **Cancel then Abandon (real flow)** | Long-running script; cancel; `cancel.Register` no-op'd to simulate un-killable; abandon | Final GetStatus returns `(Complete, AbandonedExitCode, logs)`. Log content includes the honest line. Subsequent same-ticket StartScript returns the cached state. | +| **Abandon best-effort-kills (anti-abuse)** | Long-running but killable script; call AbandonScript directly with no prior cancel | The process is killed (the runner's abandon branch ran `DoOurBestToCleanUp → Hitman.Kill`). Final state is `(Complete, AbandonedExitCode, logs)`. Confirms a direct abandon does not leave a killable process running unmanaged. | +| Abandon during StartScript launch | Concurrent: StartScript holding `StartScriptMutex`, AbandonScript called | Abandon serialises behind StartScript via the existing wrapper mutex. Final state is consistent (no half-abandoned wrapper). | +| Capability advertisement | Tentacle build with the abandon feature; query `CapabilitiesServiceV2.GetCapabilities()` | Response includes `AbandonScriptV2`. Builds without the feature do not advertise it. | + +### 5.3 Client orchestrator tests + +| Test | Trigger | Verify | +|---|---|---| +| Param unset → cancel only | `abandonAfterCancellationPendingFor = null`; cancel the token mid-execution | Orchestrator calls `CancelScript` each poll; never calls `AbandonScript`. | +| Threshold crossed + capability present | Param set to a short span; capability advertised; cancellation pending past the span | Orchestrator switches from `CancelScript` to `AbandonScript` after the threshold. `RecordMethodUsages` shows the `AbandonScript` call. | +| Threshold crossed + capability absent | Param set; `AbandonScriptV2` NOT advertised; cancellation pending past the span | Orchestrator keeps calling `CancelScript`; never calls `AbandonScript`; no error. | + +### 5.4 Integration tests (real shells, real processes) + +Style: matches `Octopus.Tentacle.Tests.Integration/ClientScriptExecutionIsolationMutex.cs` (real Tentacle, real script, mutex semantics under test). + +**Timing flakiness: use the existing builders, not raw shell + `Thread.Sleep`.** +- `ScriptBuilder` (`Octopus.Tentacle.CommonTestUtils/Builders/ScriptBuilder.cs`): `.CreateFile(path)` signals "script reached this line"; `.WaitForFileToExist(path)` blocks the script on an event, not a sleep race. +- `TestExecuteShellScriptCommandBuilder` (`Octopus.Tentacle.Tests.Integration/Util/Builders/`): `.SetScriptBody(ScriptBuilder)`, `.WithIsolationLevel(...)`, `.WithIsolationMutexName(...)`, `.Build()`. +- `TentacleConfigurationTestCase.CreateBuilder()` and `ClientAndTentacleBuilder` set up real Tentacle + Halibut. +- `TentacleServiceDecoratorBuilder.RecordMethodUsages(...)` decorates the script service so the test can assert call counts for the new `AbandonScript` verb and capability negotiation. +- `Wait.For(condition, timeout, onFail, ct)` is the event-driven polling helper. Always preferred over `Task.Delay`. + +**Pattern to follow:** mirror `ClientScriptExecutionIsolationMutex.cs`. Stuck-script tests use `ScriptBuilder.WaitForFileToExist(...)` as the "kernel-blocked" simulant rather than `sleep 600`. For the un-killable variant, combine the file-wait with the `Tentacle.Debug.DisableProcessKill` flag so `Hitman` becomes a no-op for the test's duration. + +| Test | Trigger | Verify | +|---|---|---| +| PowerShell + cancel (kill works) | Real PowerShell, `Start-Sleep -Seconds 600`, fire Cancel, normal kill path | Final response is `(Complete, CanceledExitCode)` via the existing path. **Negative check:** abandon log line NOT present. Confirms Cancel isn't regressed into the abandon path. | +| PowerShell + abandon (kill mocked off) | Real PowerShell, sleep; `Hitman` mocked to no-op; fire Cancel; wait; fire AbandonScript | Within 2s of abandon: response is `(Complete, AbandonedExitCode, [...honest log line...])`; mutex is free (start a second `FullIsolation` script that acquires within 1s); the real PowerShell process is still alive (verified via `Process.GetProcessById`). Test cleanup: kill the leftover PowerShell. | +| **Multi-level-deep hang (ticket-mandated)** | bootstrap → Calamari-shim → user script, with `Hitman` no-op flag set | All verifications from the previous row pass through the multi-level launch chain. Confirms abandon works when the stuck process is not the immediate child of Tentacle. | +| **Re-parented grandchild — PR1** | Child spawns a grandchild that inherits stdout/stderr and outlives it; fire Cancel | Cancel alone returns (does NOT hang): `process.Close()` releases the inherited pipes. No abandon needed. | +| **Re-parented grandchild — PR2** | Same setup; fire Cancel, then AbandonScript | Cancel alone leaves the wait pending; abandon completes it and returns `(Complete, AbandonedExitCode)`. The two grandchild tests flip from PR1's "cancel alone does not hang" to PR2's "cancel then abandon". | +| Windows workspace cleanup with open handles | Run the abandon path; leave the simulated zombie holding the workspace log file open; call CompleteScript | CompleteScript returns without exception. Tentacle systemLog contains a `Warn` naming the leaked workspace directory. Workspace dir on disk still exists (assert via `Directory.Exists`). No exception bubbles up. | +| Polling Tentacle variant | Configure test fixture as Polling | All verifications from the kill-mocked-off row pass against a Polling Tentacle. | + +**End-to-end async thread audit (PR2).** Capture `Process.GetCurrentProcess().Threads.Count` 5s into a stuck-script scenario; assert no thread parked attributable to the script pipeline. Most reliable proxy: total thread count not higher than baseline + epsilon. + +**Normal-path timing regression check.** Run a 100-iteration benchmark of normal short-script execution (`Write-Host "x"`); compare median wall-clock time vs. a baseline build. **Verify:** median delta within margin of error. + +## Section 6 — Manual testing plan + +Manual scenarios on a real test Tentacle. All scenarios assume the parallel server-side build is deployed. + +### Setup + +- Test Octopus Server with EFT-3295 server-side build. +- Windows Tentacle (primary) + Linux Tentacle (smoke). +- Debug Tentacle build with `Tentacle.Debug.DisableProcessKill=true` making `Hitman.TryKillProcessAndChildrenRecursively` a no-op — simulant for "kill doesn't work" without engineering real kernel-level waits. +- Server-side feature flag `AbandonTentacleScriptOnCancellationTimeoutFeatureToggle` (default ON, configured on the test Octopus Server). + +### Where to find things (reference for verification steps below) + +- **Tentacle systemLog (Windows):** `C:\Octopus\Logs\OctopusTentacle.txt` (confirm via `Tentacle show-configuration`). +- **Tentacle systemLog (Linux):** `/etc/octopus//Logs/OctopusTentacle.txt`. +- **Tentacle workspace root:** `/Work/`. Each script gets a subdirectory named after its `ScriptTicket`. Inside: `bootstrapRunner.log`, `Output.log`, `script.ps1`/`Bootstrap.sh`, the state store file. +- **Script log in UI:** Octopus Server → the task → expand the deployment step. This is what the customer sees and what gets the honest abandon line. +- **Thread count (Windows):** PowerShell `(Get-Process Tentacle).Threads.Count`, or Process Explorer's Threads tab. +- **Thread count (Linux):** `ps -o nlwp= -p $(pgrep -f Tentacle)`. +- **Capability advertisement:** Tentacle systemLog at startup contains `Negotiated capabilities: [...]`. Or enable Halibut verbose tracing server-side and inspect the `CapabilitiesResponseV2` payload. +- **Mutex state in Tentacle log:** grep for `acquiring isolation mutex` / `Lock acquired` / `Releasing lock` with the relevant task ID. + +### M1 — Regression smoke (flag ON, normal script) + +Deploy `Write-Host "hello"; Start-Sleep 5; Write-Host "done"`. + +**Verify (all must pass):** +1. Octopus UI task status → **Success** (green tick). +2. Script log in UI shows `hello` and `done`; no abandon line. +3. Tentacle systemLog: `grep "abandon" OctopusTentacle.txt` → zero matches for this task ID. +4. Tentacle systemLog shows the normal acquire/release pair for this task ID. +5. Thread count (sampled 5s after task completes) → within ±2 of pre-test baseline. + +### M2 — Cancel still works (flag ON, killable script) + +`DisableProcessKill=false`. Deploy `Start-Sleep -Seconds 300`. Wait ~10s. Click **Cancel** in Octopus UI. + +**Verify:** +1. UI task status transitions to **Cancelled** within 30s. +2. Tentacle systemLog shows the kill attempt followed by mutex release for this task ID. +3. PowerShell process is gone (match by PID captured from Tentacle log at script start). +4. `grep "abandon" OctopusTentacle.txt` → zero matches for this task ID. Cancel path was used, not abandon. +5. Deploy a second project to the same Tentacle → starts immediately (mutex released by the normal Cancel path). + +### M3 — The Philips scenario (flag ON, unkillable script) + +`Tentacle.Debug.DisableProcessKill=true`. Restart Tentacle. Capture thread-count baseline. Deploy `Start-Sleep -Seconds 600`. Note the PowerShell PID from the Tentacle log. Click **Cancel** after ~10s. Wait for the server-side abandon timeout (2 min for first release). + +**Verify (all must pass; this is the load-bearing scenario):** + +1. **Server side called Abandon.** Server log shows an `AbandonScript` call for this task's ticket, timestamped after the Cancel attempt + the abandon timeout. +2. **Honest log line in the customer-visible task log.** Confirm `Tentacle has abandoned this script. The underlying script process may still be running on this host.` is present in the script log section. +3. **Tentacle systemLog records the abandon path.** Shows: AbandonScript invocation received, best-effort kill attempted (no-op'd here), abandon signal fired, mutex released, wrapper removed. +4. **Mutex released — load-bearing check.** Immediately deploy a second trivial project. **Pass:** starts within 5s. **Fail:** queues with "Waiting for the script in task...". +5. **Task UI status = Cancelled** (no separate "Abandoned" state — per ADR-042). +6. **Thread count returned to baseline** (PR2). Sample 10s after the abandon. **Pass:** within ±2 of baseline. (Under PR1 a worker is parked per running script in the normal case, so use the PR2 build for this check.) +7. **The PowerShell process is still alive on the host.** `Get-Process -Id ` returns the process — because the best-effort kill was no-op'd. Kill it manually at end of test for cleanup. (With `DisableProcessKill=false` abandon would have killed it; this scenario specifically simulates the kill not landing.) +8. **Exit code in the task log = -48 (AbandonedExitCode)**, distinct from `-43` (CanceledExitCode). + +### M4 — Repeated abandon (thread/handle-leak check under repetition) + +Capture baseline thread count and Tentacle working-set memory. Run M3 ten times back-to-back (script the loop). + +**Verify:** +1. Sample thread count after each iteration (PR2 build). **Pass:** within ±5 of baseline across all ten runs. **Fail:** monotonic growth. +2. Sample working-set memory after each iteration. **Pass:** within ~50MB of baseline. **Fail:** grows by more than ~10MB per iteration. +3. After all ten runs, deploy a normal project. **Pass:** runs normally. +4. Kill leftover `powershell.exe` / `sleep` processes manually at end of test. + +### M5 — Server-side flag off (Tentacle behaves as today) + +Set `AbandonTentacleScriptOnCancellationTimeoutFeatureToggle` to OFF in the test Octopus Server. Restart Server. Leave Tentacle untouched. + +**Verify:** +1. **Server doesn't dispatch Abandon.** Repeat the M3 setup. Wait past the 2-minute point. Server log: zero `AbandonScript` matches for this task ID. +2. **Tentacle still advertises the capability.** `CapabilitiesResponseV2` still contains `AbandonScriptV2`. The flag lives on the Server, not on Tentacle. +3. **Tentacle stays wedged.** Subsequent deployment queues with "Waiting for the script in task...". Confirms today's behaviour is preserved when Server has the feature off. +4. Recovery: restart Tentacle (the existing workaround). + +### M6 — Workspace cleanup with open handles (Windows-specific) + +Run M3 to completion. Note the `ScriptTicket` from the Tentacle log. + +**Verify:** +1. **Workspace dir still exists.** Listing shows log files present; open handles prevent deletion. +2. **systemLog records the failure.** A `Warn`-level entry names the directory that could not be deleted, with the underlying I/O exception message. +3. **No propagated exception to Server.** `CompleteScript` returns normally; Server log shows successful completion. +4. **Tentacle continues to function.** Deploy a third project (not to the wedged workspace). **Pass:** runs normally. +5. **Manual cleanup works after the zombie process is killed.** Confirms the leak is bounded. + +### M7 — Polling Tentacle variant + +Register a Polling Tentacle. Repeat M3 setup and execution. + +**Verify:** +1. All M3 verification points pass with no Polling-specific differences. The Halibut RPC path is the same — only connection initiation direction differs. +2. **Polling-specific check:** during the abandon, Tentacle's polling loop continues. **Pass:** polling not blocked by the abandon flow. +3. After abandon, the Polling Tentacle picks up the next deployment. **Pass:** new deployment dispatched and runs (mutex released). + +### M8 — Linux smoke + +On a Linux Tentacle, deploy a Bash script: `sleep 600`. Repeat M2 (kill works) and M3 (kill mocked off). + +**Verify:** +1. **M2 on Linux:** `ps -p ` shows the bash/sleep process gone after Cancel. Same outcomes as Windows M2. +2. **M3 on Linux:** all M3 verification points pass. Thread count via `ps -o nlwp= -p $(pgrep -f Tentacle)`. Workspace location: `/etc/octopus//Work//`. +3. **Linux file-handle behaviour differs:** Linux generally allows deletion of files held open by other processes. For M6's analogue on Linux, workspace deletion is more likely to succeed even with the zombie running. Note in test result. +4. Confirms the implementation isn't accidentally Windows-only. + +### M9 — Server escalation ordering + +Server escalation is hardcoded at **2 minutes** post-Cancel for the first release. Ask the server-side session for a debug-build override constant to run this faster. + +**Verify the killable case (no escalation expected):** +1. Run M2 (killable + cancel). Wait at least 3 minutes. +2. Server log: zero `AbandonScript` matches for this task ID. Cancel succeeded inside the 2-minute window. +3. Tentacle log: zero abandon entries for this task ID. + +**Verify the unkillable case (escalation expected):** +4. Run M3 (kill mocked off + cancel). Wait through the 2-minute timeout. +5. Server log: exactly one `AbandonScript` match, timestamped ~2 minutes after the Cancel. +6. Tentacle log: one abandon entry for this task ID. + +**Verify the actual-status race case** (server-side session's idempotency concern): +7. Set up M3, but let the script complete naturally just before the 2-minute timer fires (~110-second script). +8. Server fires AbandonScript anyway because the completion event hasn't reached it yet. +9. Tentacle returns `(Complete, realExitCode, logs)` — NOT `AbandonedExitCode`. +10. Server task log entry: *Script had already completed before abandon was needed.* + +**Bug indicators to flag back to the server session:** +- Server calls AbandonScript on every Cancel (even killable cases) → server's escalation predicate is wrong. +- Server retries AbandonScript multiple times for the same ticket → server idempotency broken. +- Server calls AbandonScript before the 2-minute window → timer is wrong. +- Server calls AbandonScript with the Tentacle capability missing → capability gating broken. + +### Sign-off criteria + +To turn the feature flag on by default in a future release: M1–M5 pass on Windows; M3 + M4 pass on Linux; M7 passes on Polling; M9 confirms server escalation policy; M6 confirms workspace leak is bounded and logged. + +## ADR-42 / "not killed" reversal + +The original ticket and ADR-42 said the runaway process is **not** killed. This design **reverses** that: abandon now best-effort-kills the process (anti-abuse — see Abandon semantics). The Tentacle script-log line remains accurate because the process survives only when the kill genuinely couldn't land. + +Follow-ups required outside this Tentacle work: +- **ADR-42** — update to reflect best-effort-kill-on-abandon. +- **EFT-3295 ticket scope** — update the "do not kill" statement. +- **Docs PR #3175** — update the customer-facing wording to match best-effort-kill-on-abandon. + +**Unchanged:** the server-side RPC contract — `AbandonScript(AbandonScriptCommandV2) → ScriptStatusResponseV2`, exit code -48 — is identical to what was already agreed. + +## Risks and rollout + +- **Feature flag off by default** for the first release. Customer-by-customer opt-in. +- **PR2 sequencing:** PR2 (#1244) must ship only after every server in the fleet supports the abandon escalation, because PR2 removes `process.Close()` and makes server abandon the only recovery for the re-parented-grandchild case. +- **Sequence:** after EFT V1 cleanup closes (target end May 2026), before Task Cap 320, targeting Philips' July self-host release. +- **Telemetry:** count of AbandonScript calls per Tentacle per day. Spike = signal that either Cancel is broken or this feature is masking a different bug. +- **Soak test pre-release:** 1000 normal scripts with the server-side flag ON, verify no resource leak vs. flag OFF baseline. + +## Open questions for external reviewer + +(None remaining. Workspace cleanup policy resolved 2026-05-21 — targeted best-effort gated on `AbandonedExitCode` in the stateStore. No janitor; OS-level state on the host is the customer's responsibility per the ticket.) + +## Coordination — locked with the server-side session (2026-05-21) + +Aligned via Linear thread on EFT-3295 (commenter Jim, both sessions). Items below are locked unless explicitly noted. + +**Contract (final shape):** + +- `ScriptStatusResponseV2 AbandonScript(AbandonScriptCommandV2 command)` on `IScriptServiceV2`. +- `AbandonScriptCommandV2 { ScriptTicket Ticket; long LastLogSequence; }` — same shape as `CancelScriptCommandV2`. Server-side dropped its initial `ServerTaskId` and "cancellation correlation id" proposal; `ScriptTicket` is sufficient. +- Capability name: `AbandonScriptV2`. + +**Idempotency (final):** Tentacle returns actual current status. Already-completed script returns `(Complete, realExitCode, logs)` — distinct from `AbandonedExitCode`, so the server's task log entry can record that the abandon was unnecessary. Unknown/already-cleaned-up ticket returns `(Complete, UnknownScriptExitCode, [])`, matching Cancel's existing shape. + +**Capability check is the primary gate.** Server uses `BackwardsCompatibleAsyncCapabilitiesV2Decorator` to query `AbandonScriptV2` once per session. Capability absent → server does not schedule the abandon dispatch at all. The RPC-fail-then-log path stays as a defensive fallback for capability-cache staleness. + +**One off-switch, server-side:** `AbandonTentacleScriptOnCancellationTimeoutFeatureToggle` (default ON). Governs whether server escalates to AbandonScript at all. No Tentacle-side flag — Tentacle's capability advertisement is binary on build version. + +**Escalation timing (locked for first release):** 2 minutes. Both V1 and V2 execution pipelines escalate to AbandonScript on their next status-poll once cancellation has been pending that long. Hardcoded on the server toggle class, not configurable. Server-side trigger is a polling-loop check, not a delayed NSB message; no new server-side timers. The Tentacle-side contract is unchanged either way. + +**Execution-pipeline scope (server-side, 2026-05-21):** V1 *and* V2 server-side execution pipelines call AbandonScript via the same contract. Philips is V1 self-host so V1 is the urgent path. Doesn't change anything Tentacle is building. + +**Post-abandon flow:** + +1. Server calls `AbandonScript` → gets `ScriptStatusResponseV2`. +2. Server publishes `TentacleScriptAbandonedEvent`. +3. Existing post-cancel path proceeds (eventually calls `CompleteScript` downstream). + +**Task log wording:** + +- Tentacle script log (this doc's Section 4): *Tentacle has abandoned this script. The underlying script process may still be running on this host.* +- Server task log (server session's surface). Server session's working proposal: + - On dispatch: *Cancellation hasn't taken effect on Tentacle after 2 minutes. Abandoning the script to release the script-isolation mutex.* + - On Tentacle returning `AbandonedExitCode`: *Tentacle abandoned the script.* + - On Tentacle returning a real exit code (abandon unnecessary): *Script had already completed before abandon was needed.* +- I pushed back on the dispatch wording — "script-isolation mutex" exposes internal terminology to the customer. Suggested rewrite: *Cancellation hasn't taken effect on Tentacle after 2 minutes. Abandoning the script so this target can accept new deployments.* Server session's call which to ship with. diff --git a/source/Octopus.Tentacle.Client.Tests/CapabilitiesResponseV2ExtensionMethodsTests.cs b/source/Octopus.Tentacle.Client.Tests/CapabilitiesResponseV2ExtensionMethodsTests.cs new file mode 100644 index 000000000..2eb0b7f67 --- /dev/null +++ b/source/Octopus.Tentacle.Client.Tests/CapabilitiesResponseV2ExtensionMethodsTests.cs @@ -0,0 +1,35 @@ +using System.Collections.Generic; +using FluentAssertions; +using NUnit.Framework; +using Octopus.Tentacle.Client.Capabilities; +using Octopus.Tentacle.Contracts.Capabilities; +using Octopus.Tentacle.Contracts.ClientServices; +using Octopus.Tentacle.Contracts.ScriptServiceV2; + +namespace Octopus.Tentacle.Client.Tests +{ + [TestFixture] + public class CapabilitiesResponseV2ExtensionMethodsTests + { + [Test] + public void HasAbandonScriptV2_WhenAdvertised_ReturnsTrue() + { + var capabilities = new CapabilitiesResponseV2(new List { nameof(IScriptServiceV2), nameof(IAsyncClientScriptServiceV2.AbandonScriptAsync) }); + capabilities.HasAbandonScriptV2().Should().BeTrue(); + } + + [Test] + public void HasAbandonScriptV2_WhenNotAdvertised_ReturnsFalse() + { + var capabilities = new CapabilitiesResponseV2(new List { nameof(IScriptServiceV2) }); + capabilities.HasAbandonScriptV2().Should().BeFalse(); + } + + [Test] + public void HasAbandonScriptV2_WhenEmpty_ReturnsFalse() + { + var capabilities = new CapabilitiesResponseV2(new List()); + capabilities.HasAbandonScriptV2().Should().BeFalse(); + } + } +} diff --git a/source/Octopus.Tentacle.Client.Tests/ObservingScriptOrchestratorAbandonTests.cs b/source/Octopus.Tentacle.Client.Tests/ObservingScriptOrchestratorAbandonTests.cs new file mode 100644 index 000000000..8348bdd78 --- /dev/null +++ b/source/Octopus.Tentacle.Client.Tests/ObservingScriptOrchestratorAbandonTests.cs @@ -0,0 +1,74 @@ +using System; +using System.Collections.Generic; +using System.Threading; +using System.Threading.Tasks; +using FluentAssertions; +using NSubstitute; +using NUnit.Framework; +using Octopus.Tentacle.Client.EventDriven; +using Octopus.Tentacle.Client.Scripts; +using Octopus.Tentacle.Contracts; + +namespace Octopus.Tentacle.Client.Tests +{ + [TestFixture] + public class ObservingScriptOrchestratorAbandonTests + { + static CommandContext Context() => new(new ScriptTicket("T"), 0, ScriptServiceVersion.ScriptServiceVersion2); + + static ScriptOperationExecutionResult Running() + => new(new ScriptStatus(ProcessState.Running, 0, new List()), Context()); + + static ScriptOperationExecutionResult Complete(int exitCode) + => new(new ScriptStatus(ProcessState.Complete, exitCode, new List()), Context()); + + static ObservingScriptOrchestrator CreateOrchestrator(IScriptExecutor executor, TimeSpan? abandonAfter) + => new( + new ImmediateBackoff(), + _ => { }, + _ => Task.CompletedTask, + executor, + abandonAfter); + + sealed class ImmediateBackoff : IScriptObserverBackoffStrategy + { + public TimeSpan GetBackoff(int iteration) => TimeSpan.Zero; + } + + [Test] + public async Task ParamUnset_CancelsOnly_NeverAbandons() + { + var executor = Substitute.For(); + executor.CancelScript(Arg.Any()) + .Returns(Running(), Complete(ScriptExitCodes.CanceledExitCode)); + + var orchestrator = CreateOrchestrator(executor, abandonAfter: null); + using var cts = new CancellationTokenSource(); + cts.Cancel(); + + await orchestrator.ObserveUntilComplete(Running(), cts.Token); + + await executor.DidNotReceive().AbandonScript(Arg.Any()); + await executor.Received().CancelScript(Arg.Any()); + } + + [Test] + public async Task ThresholdCrossed_SwitchesFromCancelToAbandon() + { + var executor = Substitute.For(); + // abandonAfter = Zero means the threshold is crossed on the first cancelled iteration, + // so the orchestrator abandons immediately. AbandonScript returns Complete to end the loop. + executor.AbandonScript(Arg.Any()) + .Returns(Complete(ScriptExitCodes.AbandonedExitCode)); + + var orchestrator = CreateOrchestrator(executor, abandonAfter: TimeSpan.Zero); + using var cts = new CancellationTokenSource(); + cts.Cancel(); + + var result = await orchestrator.ObserveUntilComplete(Running(), cts.Token); + + await executor.Received().AbandonScript(Arg.Any()); + result.ScriptStatus.ExitCode.Should().Be(ScriptExitCodes.AbandonedExitCode); + } + } +} diff --git a/source/Octopus.Tentacle.Client.Tests/ScriptServiceV2ExecutorAbandonTests.cs b/source/Octopus.Tentacle.Client.Tests/ScriptServiceV2ExecutorAbandonTests.cs new file mode 100644 index 000000000..4d3dca224 --- /dev/null +++ b/source/Octopus.Tentacle.Client.Tests/ScriptServiceV2ExecutorAbandonTests.cs @@ -0,0 +1,82 @@ +using System; +using System.Collections.Generic; +using System.Threading; +using System.Threading.Tasks; +using FluentAssertions; +using Halibut.ServiceModel; +using NSubstitute; +using NUnit.Framework; +using Octopus.Tentacle.Client; +using Octopus.Tentacle.Client.EventDriven; +using Octopus.Tentacle.Client.Execution; +using Octopus.Tentacle.Client.Observability; +using Octopus.Tentacle.Client.Retries; +using Octopus.Tentacle.Client.Scripts; +using Octopus.Tentacle.Contracts; +using Octopus.Tentacle.Contracts.Capabilities; +using Octopus.Tentacle.Contracts.ClientServices; +using Octopus.Tentacle.Contracts.Logging; +using Octopus.Tentacle.Contracts.Observability; +using Octopus.Tentacle.Contracts.ScriptServiceV2; + +namespace Octopus.Tentacle.Client.Tests +{ + [TestFixture] + public class ScriptServiceV2ExecutorAbandonTests + { + static ScriptServiceV2Executor CreateExecutor(IAsyncClientScriptServiceV2 scriptService, IAsyncClientCapabilitiesServiceV2 capabilities) + => new( + scriptService, + capabilities, + RpcCallExecutorFactory.Create(TimeSpan.Zero, Substitute.For()), + ClientOperationMetricsBuilder.Start(), + TimeSpan.Zero, + new TentacleClientOptions(new RpcRetrySettings(RetriesEnabled: false, RetryDuration: TimeSpan.Zero)), + Substitute.For()); + + static CommandContext Context() => new(new ScriptTicket("TestTicket"), 0, ScriptServiceVersion.ScriptServiceVersion2); + + [Test] + public async Task AbandonScript_WhenCapabilityAdvertised_CallsAbandonScriptAsync() + { + var scriptService = Substitute.For(); + scriptService.AbandonScriptAsync(Arg.Any(), Arg.Any()) + .Returns(x => Task.FromResult(new ScriptStatusResponseV2( + x.Arg().Ticket, ProcessState.Complete, + ScriptExitCodes.AbandonedExitCode, new List(), 1))); + + var capabilities = Substitute.For(); + capabilities.GetCapabilitiesAsync(Arg.Any()) + .Returns(Task.FromResult(new CapabilitiesResponseV2(new List { "IScriptServiceV2", "AbandonScriptAsync" }))); + + var executor = CreateExecutor(scriptService, capabilities); + + var result = await executor.AbandonScript(Context()); + + await scriptService.Received(1).AbandonScriptAsync(Arg.Any(), Arg.Any()); + await scriptService.DidNotReceive().CancelScriptAsync(Arg.Any(), Arg.Any()); + result.ScriptStatus.ExitCode.Should().Be(ScriptExitCodes.AbandonedExitCode); + } + + [Test] + public async Task AbandonScript_WhenCapabilityAbsent_FallsBackToCancelScriptAsync() + { + var scriptService = Substitute.For(); + scriptService.CancelScriptAsync(Arg.Any(), Arg.Any()) + .Returns(x => Task.FromResult(new ScriptStatusResponseV2( + x.Arg().Ticket, ProcessState.Running, + 0, new List(), 1))); + + var capabilities = Substitute.For(); + capabilities.GetCapabilitiesAsync(Arg.Any()) + .Returns(Task.FromResult(new CapabilitiesResponseV2(new List { "IScriptServiceV2" }))); + + var executor = CreateExecutor(scriptService, capabilities); + + await executor.AbandonScript(Context()); + + await scriptService.DidNotReceive().AbandonScriptAsync(Arg.Any(), Arg.Any()); + await scriptService.Received(1).CancelScriptAsync(Arg.Any(), Arg.Any()); + } + } +} diff --git a/source/Octopus.Tentacle.Client/Capabilities/CapabilitiesResponseV2ExtensionMethods.cs b/source/Octopus.Tentacle.Client/Capabilities/CapabilitiesResponseV2ExtensionMethods.cs index bc49a1e95..2b17d288c 100644 --- a/source/Octopus.Tentacle.Client/Capabilities/CapabilitiesResponseV2ExtensionMethods.cs +++ b/source/Octopus.Tentacle.Client/Capabilities/CapabilitiesResponseV2ExtensionMethods.cs @@ -1,5 +1,6 @@ using System.Linq; using Octopus.Tentacle.Contracts.Capabilities; +using Octopus.Tentacle.Contracts.ClientServices; using Octopus.Tentacle.Contracts.ScriptServiceV2; namespace Octopus.Tentacle.Client.Capabilities @@ -15,5 +16,17 @@ public static bool HasScriptServiceV2(this CapabilitiesResponseV2 capabilities) return capabilities.SupportedCapabilities.Contains(nameof(IScriptServiceV2)); } + + public static bool HasAbandonScriptV2(this CapabilitiesResponseV2 capabilities) + { + if (capabilities?.SupportedCapabilities?.Any() != true) + { + return false; + } + + // Both sides nameof their own AbandonScriptAsync. The strings match, and a rename on + // either side can't silently drift the capability check. + return capabilities.SupportedCapabilities.Contains(nameof(IAsyncClientScriptServiceV2.AbandonScriptAsync)); + } } } \ No newline at end of file diff --git a/source/Octopus.Tentacle.Client/ITentacleClient.cs b/source/Octopus.Tentacle.Client/ITentacleClient.cs index c79a285fc..a3c7b9ebe 100644 --- a/source/Octopus.Tentacle.Client/ITentacleClient.cs +++ b/source/Octopus.Tentacle.Client/ITentacleClient.cs @@ -8,6 +8,7 @@ using Octopus.Tentacle.Client.Scripts.Models; using Octopus.Tentacle.Contracts; using Octopus.Tentacle.Contracts.Logging; +using Octopus.Tentacle.Contracts.ScriptServiceV2; namespace Octopus.Tentacle.Client { @@ -31,7 +32,8 @@ Task ExecuteScript( OnScriptStatusResponseReceived onScriptStatusResponseReceived, OnScriptCompleted onScriptCompleted, ITentacleClientTaskLog logger, - CancellationToken scriptExecutionCancellationToken); + CancellationToken scriptExecutionCancellationToken, + TimeSpan? abandonAfterCancellationPendingFor = null); /// /// Start the script. @@ -59,6 +61,20 @@ Task StartScript(ExecuteScriptCommand command, /// The result, which includes the CommandContext for the next command Task CancelScript(CommandContext commandContext, ITentacleClientTaskLog logger); + /// + /// Abandon a running script: signal Tentacle to best-effort-kill the process, stop waiting, + /// and release the script's isolation mutex, returning straight away without waiting for the + /// process to exit. This does not delete the workspace. The later CompleteScript call does + /// that, and tolerates cleanup failures for an abandoned script whose process is still alive. + /// Used as an escape hatch when CancelScript can't terminate a stuck process; a prior + /// CancelScript is not required because abandon does its own best-effort kill. + /// + /// The ticket of the script to abandon + /// Used to output user orientated log messages + /// Cancels the RPC call + /// The current status snapshot of the script at the time abandon was processed + Task AbandonScript(ScriptTicket scriptTicket, ITentacleClientTaskLog logger, CancellationToken cancellationToken); + /// /// Complete the script. /// diff --git a/source/Octopus.Tentacle.Client/ScriptExecutor.cs b/source/Octopus.Tentacle.Client/ScriptExecutor.cs index 572db403a..1ce5e1f2c 100644 --- a/source/Octopus.Tentacle.Client/ScriptExecutor.cs +++ b/source/Octopus.Tentacle.Client/ScriptExecutor.cs @@ -69,7 +69,15 @@ public async Task CancelScript(CommandContext co return await scriptExecutor.CancelScript(commandContext); } - + + public async Task AbandonScript(CommandContext commandContext) + { + var scriptExecutorFactory = CreateScriptExecutorFactory(); + var scriptExecutor = scriptExecutorFactory.CreateScriptExecutor(commandContext.ScripServiceVersionUsed); + + return await scriptExecutor.AbandonScript(commandContext); + } + public async Task CompleteScript(CommandContext commandContext, CancellationToken cancellationToken) { var scriptExecutorFactory = CreateScriptExecutorFactory(); diff --git a/source/Octopus.Tentacle.Client/Scripts/IScriptExecutor.cs b/source/Octopus.Tentacle.Client/Scripts/IScriptExecutor.cs index 0544ef9ed..48fb6536c 100644 --- a/source/Octopus.Tentacle.Client/Scripts/IScriptExecutor.cs +++ b/source/Octopus.Tentacle.Client/Scripts/IScriptExecutor.cs @@ -32,6 +32,14 @@ Task StartScript(ExecuteScriptCommand command, /// The result, which includes the CommandContext for the next command Task CancelScript(CommandContext commandContext); + /// + /// Abandon the script: signal Tentacle to stop waiting and release the isolation mutex. + /// Returns the abandoned status when the Tentacle advertises the abandon capability; + /// otherwise falls back to cancelling and returns that result. + /// + /// The CommandContext from the previous command + Task AbandonScript(CommandContext commandContext); + /// /// Complete the script. /// diff --git a/source/Octopus.Tentacle.Client/Scripts/KubernetesScriptServiceV1Executor.cs b/source/Octopus.Tentacle.Client/Scripts/KubernetesScriptServiceV1Executor.cs index b5b1b1e44..16dfc559b 100644 --- a/source/Octopus.Tentacle.Client/Scripts/KubernetesScriptServiceV1Executor.cs +++ b/source/Octopus.Tentacle.Client/Scripts/KubernetesScriptServiceV1Executor.cs @@ -184,6 +184,12 @@ async Task CancelScriptAction(CancellationToke return Map(kubernetesScriptStatusResponseV1); } + public Task AbandonScript(CommandContext commandContext) + { + // KubernetesScriptServiceV1 has no abandon verb; degrade to cancel. + return CancelScript(commandContext); + } + public async Task CompleteScript(CommandContext lastStatusResponse, CancellationToken scriptExecutionCancellationToken) { using var activity = TentacleClient.ActivitySource.StartActivity($"{nameof(KubernetesScriptServiceV1Executor)}.{nameof(CompleteScript)}"); diff --git a/source/Octopus.Tentacle.Client/Scripts/ObservingScriptOrchestrator.cs b/source/Octopus.Tentacle.Client/Scripts/ObservingScriptOrchestrator.cs index 209941900..f71d77084 100644 --- a/source/Octopus.Tentacle.Client/Scripts/ObservingScriptOrchestrator.cs +++ b/source/Octopus.Tentacle.Client/Scripts/ObservingScriptOrchestrator.cs @@ -14,17 +14,20 @@ sealed class ObservingScriptOrchestrator readonly OnScriptStatusResponseReceived onScriptStatusResponseReceived; readonly OnScriptCompleted onScriptCompleted; readonly IScriptExecutor scriptExecutor; + readonly TimeSpan? abandonAfterCancellationPendingFor; public ObservingScriptOrchestrator( IScriptObserverBackoffStrategy scriptObserverBackOffStrategy, OnScriptStatusResponseReceived onScriptStatusResponseReceived, OnScriptCompleted onScriptCompleted, - IScriptExecutor scriptExecutor) + IScriptExecutor scriptExecutor, + TimeSpan? abandonAfterCancellationPendingFor = null) { this.scriptExecutor = scriptExecutor; this.scriptObserverBackOffStrategy = scriptObserverBackOffStrategy; this.onScriptStatusResponseReceived = onScriptStatusResponseReceived; this.onScriptCompleted = onScriptCompleted; + this.abandonAfterCancellationPendingFor = abandonAfterCancellationPendingFor; } public async Task ExecuteScript(ExecuteScriptCommand command, CancellationToken scriptExecutionCancellationToken) @@ -73,19 +76,31 @@ async Task ObserveUntilCompleteThenFinish( return observingUntilCompleteResult.ScriptStatus; } - async Task ObserveUntilComplete( + internal async Task ObserveUntilComplete( ScriptOperationExecutionResult startScriptResult, CancellationToken scriptExecutionCancellationToken) { var iteration = 0; var cancellationIteration = 0; var lastResult = startScriptResult; + var stopwatch = new Stopwatch(); while (lastResult.ScriptStatus.State != ProcessState.Complete) { if (scriptExecutionCancellationToken.IsCancellationRequested) { - lastResult = await scriptExecutor.CancelScript(lastResult.ContextForNextCommand).ConfigureAwait(false); + // Record when cancellation first fired so we can escalate to abandon after the threshold. + if (!stopwatch.IsRunning) + { + stopwatch.Start(); + } + + var shouldAbandon = abandonAfterCancellationPendingFor.HasValue + && stopwatch.Elapsed >= abandonAfterCancellationPendingFor.Value; + + lastResult = shouldAbandon + ? await scriptExecutor.AbandonScript(lastResult.ContextForNextCommand).ConfigureAwait(false) + : await scriptExecutor.CancelScript(lastResult.ContextForNextCommand).ConfigureAwait(false); } else { diff --git a/source/Octopus.Tentacle.Client/Scripts/ScriptExecutorFactory.cs b/source/Octopus.Tentacle.Client/Scripts/ScriptExecutorFactory.cs index a67824d6f..d5b0af94a 100644 --- a/source/Octopus.Tentacle.Client/Scripts/ScriptExecutorFactory.cs +++ b/source/Octopus.Tentacle.Client/Scripts/ScriptExecutorFactory.cs @@ -47,6 +47,7 @@ public IScriptExecutor CreateScriptExecutor(ScriptServiceVersion scriptServiceTo { return new ScriptServiceV2Executor( allClients.ScriptServiceV2, + allClients.CapabilitiesServiceV2, rpcCallExecutor, clientOperationMetricsBuilder, onCancellationAbandonCompleteScriptAfter, diff --git a/source/Octopus.Tentacle.Client/Scripts/ScriptServiceV1Executor.cs b/source/Octopus.Tentacle.Client/Scripts/ScriptServiceV1Executor.cs index b777b8e1d..8d0d32c81 100644 --- a/source/Octopus.Tentacle.Client/Scripts/ScriptServiceV1Executor.cs +++ b/source/Octopus.Tentacle.Client/Scripts/ScriptServiceV1Executor.cs @@ -164,6 +164,12 @@ public async Task CancelScript(CommandContext co return Map(response); } + public Task AbandonScript(CommandContext commandContext) + { + // ScriptServiceV1 has no abandon verb; degrade to cancel. + return CancelScript(commandContext); + } + public async Task CompleteScript(CommandContext lastStatusResponse, CancellationToken scriptExecutionCancellationToken) { try diff --git a/source/Octopus.Tentacle.Client/Scripts/ScriptServiceV2Executor.cs b/source/Octopus.Tentacle.Client/Scripts/ScriptServiceV2Executor.cs index c3fc68898..abc3270a3 100644 --- a/source/Octopus.Tentacle.Client/Scripts/ScriptServiceV2Executor.cs +++ b/source/Octopus.Tentacle.Client/Scripts/ScriptServiceV2Executor.cs @@ -3,6 +3,7 @@ using System.Threading.Tasks; using Halibut; using Halibut.ServiceModel; +using Octopus.Tentacle.Client.Capabilities; using Octopus.Tentacle.Client.EventDriven; using Octopus.Tentacle.Client.Execution; using Octopus.Tentacle.Client.Observability; @@ -18,6 +19,7 @@ namespace Octopus.Tentacle.Client.Scripts class ScriptServiceV2Executor : IScriptExecutor { readonly IAsyncClientScriptServiceV2 clientScriptServiceV2; + readonly IAsyncClientCapabilitiesServiceV2 clientCapabilitiesServiceV2; readonly RpcCallExecutor rpcCallExecutor; readonly ClientOperationMetricsBuilder clientOperationMetricsBuilder; readonly TimeSpan onCancellationAbandonCompleteScriptAfter; @@ -26,6 +28,7 @@ class ScriptServiceV2Executor : IScriptExecutor public ScriptServiceV2Executor( IAsyncClientScriptServiceV2 clientScriptServiceV2, + IAsyncClientCapabilitiesServiceV2 clientCapabilitiesServiceV2, RpcCallExecutor rpcCallExecutor, ClientOperationMetricsBuilder clientOperationMetricsBuilder, TimeSpan onCancellationAbandonCompleteScriptAfter, @@ -33,6 +36,7 @@ public ScriptServiceV2Executor( ITentacleClientTaskLog logger) { this.clientScriptServiceV2 = clientScriptServiceV2; + this.clientCapabilitiesServiceV2 = clientCapabilitiesServiceV2; this.rpcCallExecutor = rpcCallExecutor; this.clientOperationMetricsBuilder = clientOperationMetricsBuilder; this.onCancellationAbandonCompleteScriptAfter = onCancellationAbandonCompleteScriptAfter; @@ -173,6 +177,42 @@ async Task CancelScriptAction(CancellationToken ct) return Map(scriptStatusResponseV2); } + public async Task AbandonScript(CommandContext commandContext) + { + using var activity = TentacleClient.ActivitySource.StartActivity($"{nameof(ScriptServiceV2Executor)}.{nameof(AbandonScript)}"); + + var capabilities = await rpcCallExecutor.Execute( + retriesEnabled: clientOptions.RpcRetrySettings.RetriesEnabled, + RpcCall.Create(nameof(Contracts.Capabilities.ICapabilitiesServiceV2.GetCapabilities)), + async ct => await clientCapabilitiesServiceV2.GetCapabilitiesAsync(new HalibutProxyRequestOptions(ct)), + logger, + clientOperationMetricsBuilder, + CancellationToken.None).ConfigureAwait(false); + + // Capability absent → the Tentacle is too old to abandon. Keep cancelling cleanly. + if (!capabilities.HasAbandonScriptV2()) + { + logger.Verbose("Tentacle does not advertise AbandonScript; falling back to CancelScript."); + return await CancelScript(commandContext).ConfigureAwait(false); + } + + async Task AbandonScriptAction(CancellationToken ct) + { + var request = new AbandonScriptCommandV2(commandContext.ScriptTicket, commandContext.NextLogSequence); + return await clientScriptServiceV2.AbandonScriptAsync(request, new HalibutProxyRequestOptions(ct)); + } + + var scriptStatusResponseV2 = await rpcCallExecutor.Execute( + retriesEnabled: clientOptions.RpcRetrySettings.RetriesEnabled, + RpcCall.Create(nameof(IScriptServiceV2.AbandonScript)), + AbandonScriptAction, + logger, + clientOperationMetricsBuilder, + // Like CancelScript, abandon must not be cancelled — it stops the script on Tentacle. + CancellationToken.None).ConfigureAwait(false); + return Map(scriptStatusResponseV2); + } + public async Task CompleteScript(CommandContext lastStatusResponse, CancellationToken scriptExecutionCancellationToken) { using var activity = TentacleClient.ActivitySource.StartActivity($"{nameof(ScriptServiceV2Executor)}.{nameof(CompleteScript)}"); diff --git a/source/Octopus.Tentacle.Client/TentacleClient.cs b/source/Octopus.Tentacle.Client/TentacleClient.cs index 8176cccc0..ca8003d95 100644 --- a/source/Octopus.Tentacle.Client/TentacleClient.cs +++ b/source/Octopus.Tentacle.Client/TentacleClient.cs @@ -16,6 +16,7 @@ using Octopus.Tentacle.Contracts.Capabilities; using Octopus.Tentacle.Contracts.Logging; using Octopus.Tentacle.Contracts.Observability; +using Octopus.Tentacle.Contracts.ScriptServiceV2; using ITentacleClientObserver = Octopus.Tentacle.Contracts.Observability.ITentacleClientObserver; namespace Octopus.Tentacle.Client @@ -168,7 +169,8 @@ public async Task ExecuteScript(ExecuteScriptCommand exec OnScriptStatusResponseReceived onScriptStatusResponseReceived, OnScriptCompleted onScriptCompleted, ITentacleClientTaskLog logger, - CancellationToken scriptExecutionCancellationToken) + CancellationToken scriptExecutionCancellationToken, + TimeSpan? abandonAfterCancellationPendingFor = null) { using var activity = ActivitySource.StartActivity($"{nameof(TentacleClient)}.{nameof(ExecuteScript)}"); activity?.AddTag("octopus.tentacle.script.files", string.Join(",", executeScriptCommand.Files.Select(f => f.Name))); @@ -188,7 +190,8 @@ public async Task ExecuteScript(ExecuteScriptCommand exec var orchestrator = new ObservingScriptOrchestrator(scriptObserverBackOffStrategy, onScriptStatusResponseReceived, onScriptCompleted, - scriptExecutor); + scriptExecutor, + abandonAfterCancellationPendingFor); var result = await orchestrator.ExecuteScript(executeScriptCommand, scriptExecutionCancellationToken); @@ -260,6 +263,36 @@ public async Task CancelScript(CommandContext co return await scriptExecutor.CancelScript(commandContext); } + public async Task AbandonScript(ScriptTicket scriptTicket, ITentacleClientTaskLog logger, CancellationToken cancellationToken) + { + using var activity = ActivitySource.StartActivity($"{nameof(TentacleClient)}.{nameof(AbandonScript)}"); + activity?.AddTag("octopus.tentacle.script.ticket", scriptTicket.TaskId); + + var operationMetricsBuilder = ClientOperationMetricsBuilder.Start(); + + async Task AbandonScriptAction(CancellationToken ct) + { + var request = new AbandonScriptCommandV2(scriptTicket, lastLogSequence: 0); + return await allClients.ScriptServiceV2.AbandonScriptAsync(request, new HalibutProxyRequestOptions(ct)); + } + + try + { + return await rpcCallExecutor.Execute( + retriesEnabled: clientOptions.RpcRetrySettings.RetriesEnabled, + RpcCall.Create(nameof(IScriptServiceV2.AbandonScript)), + AbandonScriptAction, + logger, + operationMetricsBuilder, + cancellationToken).ConfigureAwait(false); + } + catch (Exception e) + { + operationMetricsBuilder.Failure(e, cancellationToken); + throw; + } + } + public async Task CompleteScript(CommandContext commandContext, ITentacleClientTaskLog logger, CancellationToken scriptExecutionCancellationToken) { using var activity = ActivitySource.StartActivity($"{nameof(TentacleClient)}.{nameof(CompleteScript)}"); diff --git a/source/Octopus.Tentacle.Contracts/ClientServices/IAsyncClientScriptServiceV2.cs b/source/Octopus.Tentacle.Contracts/ClientServices/IAsyncClientScriptServiceV2.cs index 996f915c1..0c89048e4 100644 --- a/source/Octopus.Tentacle.Contracts/ClientServices/IAsyncClientScriptServiceV2.cs +++ b/source/Octopus.Tentacle.Contracts/ClientServices/IAsyncClientScriptServiceV2.cs @@ -10,6 +10,7 @@ public interface IAsyncClientScriptServiceV2 Task StartScriptAsync(StartScriptCommandV2 command, HalibutProxyRequestOptions proxyRequestOptions); Task GetStatusAsync(ScriptStatusRequestV2 request, HalibutProxyRequestOptions proxyRequestOptions); Task CancelScriptAsync(CancelScriptCommandV2 command, HalibutProxyRequestOptions proxyRequestOptions); + Task AbandonScriptAsync(AbandonScriptCommandV2 command, HalibutProxyRequestOptions proxyRequestOptions); Task CompleteScriptAsync(CompleteScriptCommandV2 command, HalibutProxyRequestOptions proxyRequestOptions); } } \ No newline at end of file diff --git a/source/Octopus.Tentacle.Contracts/ScriptExitCodes.cs b/source/Octopus.Tentacle.Contracts/ScriptExitCodes.cs index 2e0ce1b15..2319410c4 100644 --- a/source/Octopus.Tentacle.Contracts/ScriptExitCodes.cs +++ b/source/Octopus.Tentacle.Contracts/ScriptExitCodes.cs @@ -12,6 +12,7 @@ public static class ScriptExitCodes public const int UnknownScriptExitCode = -45; public const int UnknownResultExitCode = -46; public const int PowerShellNeverStartedExitCode = -47; + public const int AbandonedExitCode = -48; //Kubernetes Agent public const int KubernetesScriptPodNotFound = -81; diff --git a/source/Octopus.Tentacle.Contracts/ScriptServiceV2/AbandonScriptCommandV2.cs b/source/Octopus.Tentacle.Contracts/ScriptServiceV2/AbandonScriptCommandV2.cs new file mode 100644 index 000000000..66efba446 --- /dev/null +++ b/source/Octopus.Tentacle.Contracts/ScriptServiceV2/AbandonScriptCommandV2.cs @@ -0,0 +1,17 @@ +using System; + +namespace Octopus.Tentacle.Contracts.ScriptServiceV2 +{ + public class AbandonScriptCommandV2 + { + public AbandonScriptCommandV2(ScriptTicket ticket, long lastLogSequence) + { + Ticket = ticket; + LastLogSequence = lastLogSequence; + } + + public ScriptTicket Ticket { get; } + + public long LastLogSequence { get; } + } +} diff --git a/source/Octopus.Tentacle.Contracts/ScriptServiceV2/IScriptServiceV2.cs b/source/Octopus.Tentacle.Contracts/ScriptServiceV2/IScriptServiceV2.cs index 3effc17b8..9858111fa 100644 --- a/source/Octopus.Tentacle.Contracts/ScriptServiceV2/IScriptServiceV2.cs +++ b/source/Octopus.Tentacle.Contracts/ScriptServiceV2/IScriptServiceV2.cs @@ -7,6 +7,7 @@ public interface IScriptServiceV2 ScriptStatusResponseV2 StartScript(StartScriptCommandV2 command); ScriptStatusResponseV2 GetStatus(ScriptStatusRequestV2 request); ScriptStatusResponseV2 CancelScript(CancelScriptCommandV2 command); + ScriptStatusResponseV2 AbandonScript(AbandonScriptCommandV2 command); void CompleteScript(CompleteScriptCommandV2 command); } } \ No newline at end of file diff --git a/source/Octopus.Tentacle.Core/Services/Scripts/RunningScript.cs b/source/Octopus.Tentacle.Core/Services/Scripts/RunningScript.cs index 412a415e3..7109d0ff1 100644 --- a/source/Octopus.Tentacle.Core/Services/Scripts/RunningScript.cs +++ b/source/Octopus.Tentacle.Core/Services/Scripts/RunningScript.cs @@ -22,18 +22,20 @@ public class RunningScript: IRunningScript readonly IShell shell; readonly string taskId; readonly CancellationToken runningScriptToken; + readonly CancellationToken abandonToken; readonly IReadOnlyDictionary environmentVariables; readonly ILog log; readonly ScriptIsolationMutex scriptIsolationMutex; readonly TimeSpan powerShellStartupTimeout; - public RunningScript(IShell shell, + RunningScript(IShell shell, IScriptWorkspace workspace, IScriptStateStore? stateStore, IScriptLog scriptLog, string taskId, ScriptIsolationMutex scriptIsolationMutex, CancellationToken runningScriptToken, + CancellationToken abandonToken, IReadOnlyDictionary environmentVariables, TimeSpan powerShellStartupTimeout, ILog log @@ -44,6 +46,7 @@ ILog log this.stateStore = stateStore; this.taskId = taskId; this.runningScriptToken = runningScriptToken; + this.abandonToken = abandonToken; this.environmentVariables = environmentVariables; this.log = log; this.scriptIsolationMutex = scriptIsolationMutex; @@ -52,7 +55,7 @@ ILog log this.powerShellStartupTimeout = powerShellStartupTimeout; } - public RunningScript(IShell shell, + RunningScript(IShell shell, IScriptWorkspace workspace, IScriptLog scriptLog, string taskId, @@ -60,10 +63,34 @@ public RunningScript(IShell shell, CancellationToken runningScriptToken, IReadOnlyDictionary environmentVariables, TimeSpan powerShellStartupTimeout, - ILog log) : this(shell, workspace, null, scriptLog, taskId, scriptIsolationMutex, runningScriptToken, environmentVariables, powerShellStartupTimeout, log) + ILog log) : this(shell, workspace, null, scriptLog, taskId, scriptIsolationMutex, runningScriptToken, CancellationToken.None, environmentVariables, powerShellStartupTimeout, log) { } + public static RunningScript Create(IShell shell, + IScriptWorkspace workspace, + IScriptLog scriptLog, + string taskId, + ScriptIsolationMutex scriptIsolationMutex, + CancellationToken runningScriptToken, + IReadOnlyDictionary environmentVariables, + TimeSpan powerShellStartupTimeout, + ILog log) + => new RunningScript(shell, workspace, null, scriptLog, taskId, scriptIsolationMutex, runningScriptToken, CancellationToken.None, environmentVariables, powerShellStartupTimeout, log); + + public static RunningScript CreateAbandonable(IShell shell, + IScriptWorkspace workspace, + IScriptStateStore? stateStore, + IScriptLog scriptLog, + string taskId, + ScriptIsolationMutex scriptIsolationMutex, + CancellationToken runningScriptToken, + CancellationToken abandonToken, + IReadOnlyDictionary environmentVariables, + TimeSpan powerShellStartupTimeout, + ILog log) + => new RunningScript(shell, workspace, stateStore, scriptLog, taskId, scriptIsolationMutex, runningScriptToken, abandonToken, environmentVariables, powerShellStartupTimeout, log); + public ProcessState State { get; private set; } public int ExitCode { get; private set; } @@ -96,14 +123,22 @@ public async Task Execute() exitCode = workspace.ShouldMonitorPowerShellStartup() ? await RunPowershellScriptWithMonitoring(shellPath, writer, runningScriptToken) - : await RunScriptAsync(shellPath, writer, runningScriptToken); + : await RunScriptAsync(shellPath, writer, runningScriptToken, abandonToken); } } + // Fires when the caller abandoned the script: leave the OS process running and signal the distinct AbandonedExitCode so the server can tell it apart from a cancel. + catch (OperationCanceledException) when (abandonToken.IsCancellationRequested) + { + writer.WriteOutput(ProcessOutputSource.StdOut, "Script execution abandoned."); + exitCode = ScriptExitCodes.AbandonedExitCode; + } + // Fires when the caller cancelled the script and the underlying process honored the cancellation token. catch (OperationCanceledException) { writer.WriteOutput(ProcessOutputSource.StdOut, "Script execution canceled."); exitCode = ScriptExitCodes.CanceledExitCode; } + // Fires when acquiring the isolation mutex timed out before the script could start. catch (TimeoutException) { writer.WriteOutput(ProcessOutputSource.StdOut, "Script execution timed out."); @@ -147,7 +182,7 @@ async Task RunPowershellScriptWithMonitoring(string shellPath, IScriptLogWr var monitor = new PowerShellStartupMonitor(workspace.WorkingDirectory, powerShellStartupTimeout, log, taskId); var monitoringTask = monitor.WaitForStartup(monitoringTaskCts.Token); - var scriptTask = Task.Run(async () => await RunScriptAsync(shellPath, writer, scriptTaskCts.Token), scriptTaskCts.Token); + var scriptTask = Task.Run(async () => await RunScriptAsync(shellPath, writer, scriptTaskCts.Token, abandonToken), scriptTaskCts.Token); var completedTask = await Task.WhenAny(monitoringTask, scriptTask); @@ -222,7 +257,7 @@ void RecordScriptHasCompleted(int exitCode) } } - async Task RunScriptAsync(string shellPath, IScriptLogWriter writer, CancellationToken cancellationToken) + async Task RunScriptAsync(string shellPath, IScriptLogWriter writer, CancellationToken cancellationToken, CancellationToken abandon) { try { @@ -234,7 +269,8 @@ async Task RunScriptAsync(string shellPath, IScriptLogWriter writer, Cancel LogScriptOutputTo(writer, ProcessOutputSource.StdOut), LogScriptOutputTo(writer, ProcessOutputSource.StdErr), environmentVariables, - cancel: cancellationToken); + cancellationToken, + abandon); return exitCode; } diff --git a/source/Octopus.Tentacle.Core/Services/Scripts/ScriptServiceV2.cs b/source/Octopus.Tentacle.Core/Services/Scripts/ScriptServiceV2.cs index 0a200fc19..152d4b13c 100644 --- a/source/Octopus.Tentacle.Core/Services/Scripts/ScriptServiceV2.cs +++ b/source/Octopus.Tentacle.Core/Services/Scripts/ScriptServiceV2.cs @@ -72,7 +72,7 @@ public async Task StartScriptAsync(StartScriptCommandV2 { IScriptWorkspace workspace; - // If the state already exists then this runningScript is already running/has already run and we should not run it again + // StartScript may be called multiple times for the same ticket (e.g. if server retries the tentacle command), so we must guard against actually starting the script twice. if (runningScript.ScriptStateStore.Exists()) { var state = runningScript.ScriptStateStore.Load(); @@ -103,7 +103,8 @@ public async Task StartScriptAsync(StartScriptCommandV2 command.TaskId, workspace, runningScript.ScriptStateStore, - runningScript.CancellationToken); + runningScript.CancellationToken, + runningScript.AbandonToken); runningScript.Process = process; @@ -140,22 +141,66 @@ public async Task CancelScriptAsync(CancelScriptCommandV return GetResponse(command.Ticket, command.LastLogSequence, runningScript?.Process); } - public async Task CompleteScriptAsync(CompleteScriptCommandV2 command, CancellationToken cancellationToken) + public Task AbandonScriptAsync(AbandonScriptCommandV2 command, CancellationToken cancellationToken) { - await Task.CompletedTask; + // Fire the abandon token. The runner's abandon branch best-effort-kills the process, + // stops waiting, releases the mutex, and returns -48. We don't kill from here: firing + // cancel would race the runner's wait. The process only survives if the kill can't land + // (genuinely stuck, or a re-parented grandchild). + if (runningScripts.TryGetValue(command.Ticket, out var runningScript)) + { + runningScript.Abandon(); + } + + return Task.FromResult(GetResponse(command.Ticket, command.LastLogSequence, runningScript?.Process)); + } + public async Task CompleteScriptAsync(CompleteScriptCommandV2 command, CancellationToken cancellationToken) + { if (runningScripts.TryRemove(command.Ticket, out var runningScript)) { runningScript.Dispose(); } var workspace = workspaceFactory.GetWorkspace(command.Ticket, WorkspaceReadinessCheck.Skip); - await workspace.Delete(cancellationToken); + + // For abandoned scripts (see AbandonScriptCommandV2 and + // https://octopus.com/docs/infrastructure/deployment-targets/tentacle/tentacle-script-abandonment) + // the underlying OS process is, by design, still alive + // and unable to be killed by Tentacle. It may still hold open file handles inside + // the workspace (logs being written to, working files, etc.). workspace.Delete() + // will fail in that case on Windows due to sharing violations and may partially + // delete on Linux. We need to tolerate the failure, which will leave the workspace + // on disk to hopefully be cleaned up by another mechanism (manual cleanup, + // instance restart) etc. This is the best we can do. For all other completion paths + // the process has exited and Delete should succeed; surface any failure there. + if (WasAbandoned(workspace)) + { + try + { + await workspace.Delete(cancellationToken); + } + catch (Exception ex) + { + log.Warn(ex, $"Could not delete abandoned workspace at {workspace.WorkingDirectory}. Leaving on disk; the underlying script process may still hold open file handles."); + } + } + else + { + await workspace.Delete(cancellationToken); + } } - RunningScript LaunchShell(ScriptTicket ticket, string serverTaskId, IScriptWorkspace workspace, IScriptStateStore stateStore, CancellationToken cancellationToken) + bool WasAbandoned(IScriptWorkspace workspace) { - var runningScript = new RunningScript(shell, workspace, stateStore, workspace.CreateLog(), serverTaskId, scriptIsolationMutex, cancellationToken, environmentVariables, powerShellStartupTimeout, log); + var stateStore = scriptStateStoreFactory.Create(workspace); + return stateStore.Exists() + && stateStore.Load().ExitCode == ScriptExitCodes.AbandonedExitCode; + } + + RunningScript LaunchShell(ScriptTicket ticket, string serverTaskId, IScriptWorkspace workspace, IScriptStateStore stateStore, CancellationToken cancellationToken, CancellationToken abandonToken) + { + var runningScript = RunningScript.CreateAbandonable(shell, workspace, stateStore, workspace.CreateLog(), serverTaskId, scriptIsolationMutex, cancellationToken, abandonToken, environmentVariables, powerShellStartupTimeout, log); _ = Task.Run(async () => await runningScript.Execute()); return runningScript; } @@ -204,13 +249,14 @@ public bool IsRunningScript(ScriptTicket ticket) class RunningScriptWrapper : IDisposable { - readonly CancellationTokenSource cancellationTokenSource = new (); + readonly CancellationTokenSource cancellationTokenSource = new(); + readonly CancellationTokenSource abandonTokenSource = new(); public RunningScriptWrapper(ScriptStateStore scriptStateStore) { ScriptStateStore = scriptStateStore; - CancellationToken = cancellationTokenSource.Token; + AbandonToken = abandonTokenSource.Token; } public RunningScript? Process { get; set; } @@ -218,15 +264,15 @@ public RunningScriptWrapper(ScriptStateStore scriptStateStore) public SemaphoreSlim StartScriptMutex { get; } = new(1, 1); public CancellationToken CancellationToken { get; } + public CancellationToken AbandonToken { get; } - public void Cancel() - { - cancellationTokenSource.Cancel(); - } + public void Cancel() => cancellationTokenSource.Cancel(); + public void Abandon() => abandonTokenSource.Cancel(); public void Dispose() { cancellationTokenSource.Dispose(); + abandonTokenSource.Dispose(); } } } diff --git a/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs b/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs index 0e57221c6..8964ad12a 100644 --- a/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs +++ b/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs @@ -9,7 +9,9 @@ using System.Text; using System.Threading; using System.Threading.Tasks; +using Octopus.Tentacle.Contracts; using Octopus.Tentacle.Core.Diagnostics; +using Octopus.Tentacle.Core.Util; namespace Octopus.Tentacle.Util { @@ -22,12 +24,13 @@ public static Task ExecuteCommandAsync( Action debug, Action info, Action error, - CancellationToken cancel) + CancellationToken cancel, + CancellationToken abandon) { - return ExecuteCommandAsync(executable, arguments, workingDirectory, debug, info, error, customEnvironmentVariables: null, cancel: cancel); + return ExecuteCommandAsync(executable, arguments, workingDirectory, debug, info, error, customEnvironmentVariables: null, cancel: cancel, abandon: abandon); } - public static Task ExecuteCommandAsync( + public static async Task ExecuteCommandAsync( string executable, string arguments, string workingDirectory, @@ -35,7 +38,8 @@ public static Task ExecuteCommandAsync( Action info, Action error, IReadOnlyDictionary? customEnvironmentVariables = null, - CancellationToken cancel = default) + CancellationToken cancel = default, + CancellationToken abandon = default) { if (executable == null) throw new ArgumentNullException(nameof(executable)); @@ -136,15 +140,60 @@ void WriteData(Action action, ManualResetEventSlim resetEvent, DataRecei })) { if (cancel.IsCancellationRequested) - DoOurBestToCleanUp(process, error); + DoOurBestToCleanUp(process, error); process.BeginOutputReadLine(); process.BeginErrorReadLine(); - process.WaitForExit(); + // Wait synchronously for the process to exit, but let `abandon` break out. + // + // process.WaitForExit() (no-timeout) waits for the process to exit AND for the + // redirected streams to reach EOF. Cancel kills + Close()s the process via + // cancel.Register; Close releases the pipe handles, which unblocks this wait even + // when a re-parented grandchild holds the streams open. So cancel — including the + // grandchild case — behaves exactly as it does today. + // + // Abandon is the new behaviour: we race the blocking wait against a task that + // completes when the abandon token fires. If abandon wins we return + // AbandonedExitCode and leave the OS process running; the still-blocked + // WaitForExit thread is released when the using (process) disposes on return. + // TaskCompletionSource (not the non-generic overload, which doesn't exist on net48). + var abandoned = new TaskCompletionSource(); + using (abandon.Register(() => abandoned.TrySetResult(null))) + { + var waitForExit = Task.Run(() => + { + try { process.WaitForExit(); } + catch { /* released by Process.Dispose on the abandon path */ } + }); + + await Task.WhenAny(waitForExit, abandoned.Task).ConfigureAwait(false); + + // Key the abandoned result on the TOKEN, not on which task won the race. + // When both cancel and abandon fire, cancel's Kill may be no-op'd / can't land, + // so waitForExit can win even though we must still return AbandonedExitCode. + // After Cancel()->Close() the Process is detached, so do NOT read + // process.HasExited / process.ExitCode here — just return -48. The + // "abandon was unnecessary" case never reaches here: the script's RunningScript + // is already Complete with its real exit code one layer up. + if (abandon.IsCancellationRequested) + { + // Abandon best-effort-kills (anti-abuse): kill it if we can, then stop + // waiting and release. Doing the kill here — sequentially, in the abandon + // branch — is race-free, unlike firing the cancel token from the RPC handler + // (the kill would race the WhenAny resolution / process disposal). The kill + // is idempotent if cancel already ran it. The process survives only if the + // kill genuinely can't land (stuck / re-parented grandchild). Do NOT read + // HasExited/ExitCode here — Close() may have detached the Process. + DoOurBestToCleanUp(process, error); + info("Tentacle has abandoned this script. The underlying script process may still be running on this host."); + SafelyCancelOutputAndErrorRead(process, debug); + running = false; + return ScriptExitCodes.AbandonedExitCode; + } + } - SafelyCancelRead(process.CancelErrorRead, debug); - SafelyCancelRead(process.CancelOutputRead, debug); + SafelyCancelOutputAndErrorRead(process, debug); SafelyWaitForAllOutput(outputResetEvent, cancel, debug); SafelyWaitForAllOutput(errorResetEvent, cancel, debug); @@ -153,7 +202,7 @@ void WriteData(Action action, ManualResetEventSlim resetEvent, DataRecei debug($"Process {exeFileNameOrFullPath} in {workingDirectory} exited with code {exitCode}"); running = false; - return Task.FromResult(exitCode); + return exitCode; } } } @@ -181,9 +230,20 @@ static void SafelyWaitForAllOutput(ManualResetEventSlim outputResetEvent, CancellationToken cancel, Action debug) { + // outputResetEvent.Wait is waiting for the OutputDataReceived/ErrorDataReceived + // handlers to signal EOF on the stream (when it receives a null + // DataReceivedEventArgs.Data, .NET's EOF marker). This does NOT close the pipe, + // it just gives the OS up to 5 seconds to deliver the EOF. + // + // If a re-parented grandchild is holding the pipe open, EOF never arrives, the wait + // times out, and we proceed without the final flush of buffered output. The pipe is + // released later by Process.Dispose() at end of ExecuteCommandAsync via the + // `using (var process = new Process())` block. + // + // 5 seconds is somewhat arbitrary — the process has already exited by the time we + // reach here, so under normal circumstances EOF arrives within milliseconds. try { - //5 seconds is a bit arbitrary, but the process should have already exited by now, so unwise to wait too long outputResetEvent.Wait(TimeSpan.FromSeconds(5), cancel); } catch (OperationCanceledException ex) @@ -192,6 +252,17 @@ static void SafelyWaitForAllOutput(ManualResetEventSlim outputResetEvent, } } + static void SafelyCancelOutputAndErrorRead(Process process, Action debug) + { + // Cancel the output/error readers so a late OutputDataReceived/ErrorDataReceived + // callback doesn't try to write to a workspace log that's already been disposed by + // the using-block above; that write would throw ObjectDisposedException. Called in + // both the normal completion path and the abandon path; extracted here so the two + // callers stay consistent. + SafelyCancelRead(process.CancelErrorRead, debug); + SafelyCancelRead(process.CancelOutputRead, debug); + } + static void SafelyCancelRead(Action action, Action debug) { try @@ -226,15 +297,16 @@ static void DoOurBestToCleanUp(Process process, Action error) { try { - // When cancelling, close the file handles. - // If the child finishes, but the grandchild holds stdout/stderr and never completes, - // THEN we won't issue a kill to the grandchild but will wait for the grandchild to - // close the handles. + // Close the handles after killing, on this same thread (after Kill). This releases + // the redirected pipes so the synchronous process.WaitForExit() above can return + // even when a re-parented grandchild holds stdout/stderr open. (Close is safe with + // the synchronous wait — main has relied on exactly this; it only races the Exited + // event of the *async* WaitForExitAsync, which this code does not use.) process.Close(); } - catch (Exception ex) + catch (Exception closeException) { - error($"Failed to close process resources: {ex.Message}"); + error($"Failed to close process resources: {closeException.Message}"); } } } @@ -252,11 +324,18 @@ class Hitman { public static void TryKillProcessAndChildrenRecursively(Process process) { + if (!string.IsNullOrEmpty(Environment.GetEnvironmentVariable(EnvironmentVariables.TentacleDebugDisableProcessKill_UNSAFE_FOR_PRODUCTION))) + { + // Test-only no-op: simulate "kill was attempted but didn't terminate the process". + // Only activated when the test harness sets this env var on the Tentacle process. + return; + } + #if NETFRAMEWORK TryKillWindowsProcessAndChildrenRecursively(process.Id); #endif #if !NETFRAMEWORK - // Since .NET Core 3.0 there is support for killing a process and it's children + // Since .NET Core 3.0 there is support for killing a process and it's children process.Kill(true); #endif } diff --git a/source/Octopus.Tentacle.Core/Util/EnvironmentVariables.cs b/source/Octopus.Tentacle.Core/Util/EnvironmentVariables.cs index 2b5f83a59..faf01eb00 100644 --- a/source/Octopus.Tentacle.Core/Util/EnvironmentVariables.cs +++ b/source/Octopus.Tentacle.Core/Util/EnvironmentVariables.cs @@ -29,6 +29,7 @@ public static class EnvironmentVariables public const string TentacleMachineConfigurationHomeDirectory = "TentacleMachineConfigurationHomeDirectory"; public const string TentaclePollingConnectionCount = "TentaclePollingConnectionCount"; public const string TentaclePowerShellStartupTimeout = "TentaclePowerShellStartupTimeout"; + public const string TentacleDebugDisableProcessKill_UNSAFE_FOR_PRODUCTION = "TentacleDebugDisableProcessKill_UNSAFE_FOR_PRODUCTION"; public const string NfsWatchdogDirectory = "watchdog_directory"; public static string TentacleUseTcpNoDelay = "TentacleUseTcpNoDelay"; public static string TentacleUseAsyncListener = "TentacleUseAsyncListener"; diff --git a/source/Octopus.Tentacle.Tests.Integration.Common/Builders/Decorators/Proxies/RecordMethodUsagesExtensionMethods.cs b/source/Octopus.Tentacle.Tests.Integration.Common/Builders/Decorators/Proxies/RecordMethodUsagesExtensionMethods.cs index 643999ffb..b2a4a943b 100644 --- a/source/Octopus.Tentacle.Tests.Integration.Common/Builders/Decorators/Proxies/RecordMethodUsagesExtensionMethods.cs +++ b/source/Octopus.Tentacle.Tests.Integration.Common/Builders/Decorators/Proxies/RecordMethodUsagesExtensionMethods.cs @@ -24,6 +24,11 @@ public static IRecordedMethodUsage ForCancelScriptAsync(this IRecordedMethodUsag return o.For("CancelScriptAsync"); } + public static IRecordedMethodUsage ForAbandonScriptAsync(this IRecordedMethodUsages o) + { + return o.For("AbandonScriptAsync"); + } + public static IRecordedMethodUsage ForCompleteScriptAsync(this IRecordedMethodUsages o) { return o.For("CompleteScriptAsync"); diff --git a/source/Octopus.Tentacle.Tests.Integration.Common/Builders/Decorators/ScriptServiceV2DecoratorBuilder.cs b/source/Octopus.Tentacle.Tests.Integration.Common/Builders/Decorators/ScriptServiceV2DecoratorBuilder.cs index 17b499a06..5cd747cf0 100644 --- a/source/Octopus.Tentacle.Tests.Integration.Common/Builders/Decorators/ScriptServiceV2DecoratorBuilder.cs +++ b/source/Octopus.Tentacle.Tests.Integration.Common/Builders/Decorators/ScriptServiceV2DecoratorBuilder.cs @@ -188,6 +188,11 @@ public async Task CancelScriptAsync(CancelScriptCommandV return await cancelScriptFunc(inner, command, options); } + public async Task AbandonScriptAsync(AbandonScriptCommandV2 command, HalibutProxyRequestOptions options) + { + return await inner.AbandonScriptAsync(command, options); + } + public async Task CompleteScriptAsync(CompleteScriptCommandV2 command, HalibutProxyRequestOptions options) { await completeScriptAction(inner, command, options); diff --git a/source/Octopus.Tentacle.Tests.Integration/CapabilitiesServiceV2Test.cs b/source/Octopus.Tentacle.Tests.Integration/CapabilitiesServiceV2Test.cs index 6f9b7300a..f52ef1b59 100644 --- a/source/Octopus.Tentacle.Tests.Integration/CapabilitiesServiceV2Test.cs +++ b/source/Octopus.Tentacle.Tests.Integration/CapabilitiesServiceV2Test.cs @@ -9,6 +9,7 @@ using Octopus.Tentacle.Contracts.Capabilities; using Octopus.Tentacle.Contracts.KubernetesScriptServiceV1; using Octopus.Tentacle.Contracts.ScriptServiceV2; +using Octopus.Tentacle.Core.Services.Scripts; using Octopus.Tentacle.Tests.Integration.Common.Builders.Decorators; using Octopus.Tentacle.Tests.Integration.Support; using Octopus.Tentacle.Tests.Integration.Util.Builders; @@ -31,15 +32,10 @@ public async Task CapabilitiesFromAnOlderTentacleWhichHasNoCapabilitiesService_W capabilities.Should().Contain(nameof(IScriptService)); capabilities.Should().Contain(nameof(IFileTransferService)); - //all versions have ScriptServiceV1 & IFileTransferService - var expectedCapabilitiesCount = 2; if (version.HasScriptServiceV2()) { capabilities.Should().Contain(nameof(IScriptServiceV2)); - expectedCapabilitiesCount++; } - - capabilities.Count.Should().Be(expectedCapabilitiesCount); } [Test] @@ -55,17 +51,23 @@ public async Task CapabilitiesServiceDoesNotReturnKubernetesScriptServiceForNonK capabilities.Should().Contain(nameof(IScriptService)); capabilities.Should().Contain(nameof(IFileTransferService)); - //all versions have ScriptServiceV1 & IFileTransferService - var expectedCapabilitiesCount = 2; if (version.HasScriptServiceV2()) { capabilities.Should().Contain(nameof(IScriptServiceV2)); - expectedCapabilitiesCount++; } capabilities.Should().NotContain(nameof(IKubernetesScriptServiceV1)); + } + + [Test] + [TentacleConfigurations] + public async Task LatestTentacle_AdvertisesAbandonScriptCapability(TentacleConfigurationTestCase tentacleConfigurationTestCase) + { + await using var clientAndTentacle = await tentacleConfigurationTestCase.CreateLegacyBuilder().Build(CancellationToken); + + var capabilities = (await clientAndTentacle.TentacleClient.CapabilitiesServiceV2.GetCapabilitiesAsync(new(CancellationToken))).SupportedCapabilities; - capabilities.Count.Should().Be(expectedCapabilitiesCount); + capabilities.Should().Contain(nameof(ScriptServiceV2.AbandonScriptAsync)); } [Test] diff --git a/source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs b/source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs new file mode 100644 index 000000000..ac946b292 --- /dev/null +++ b/source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs @@ -0,0 +1,238 @@ +using System; +using System.IO; +using System.Linq; +using System.Threading.Tasks; +using FluentAssertions; +using NUnit.Framework; +using Octopus.Tentacle.Client; +using Octopus.Tentacle.Contracts; +using Octopus.Tentacle.Contracts.ClientServices; +using Octopus.Tentacle.Core.Util; +using Octopus.Tentacle.Tests.Integration.Common.Builders.Decorators; +using Octopus.Tentacle.Tests.Integration.Common.Builders.Decorators.Proxies; +using Octopus.Tentacle.Tests.Integration.Support; +using Octopus.Tentacle.Tests.Integration.Util; +using Octopus.Tentacle.Tests.Integration.Util.Builders; + +namespace Octopus.Tentacle.Tests.Integration +{ + [IntegrationTestTimeout] + public class ClientScriptExecutionAbandon : IntegrationTest + { + [Test] + [TentacleConfigurations(scriptServiceToTest: ScriptServiceVersionToTest.Version2)] + public async Task AbandonScript_WhenCancelFailsToKillProcess_ReturnsAbandonedExitCode(TentacleConfigurationTestCase tentacleConfigurationTestCase) + { + // TentacleDebugDisableProcessKill_UNSAFE_FOR_PRODUCTION=1 makes Hitman a no-op, so + // CancelScript cannot actually terminate the underlying script process. The script + // becomes genuinely "stuck" from Tentacle's perspective. AbandonScript should then + // return promptly with AbandonedExitCode without waiting for the process to exit. + await using var clientTentacle = await tentacleConfigurationTestCase.CreateBuilder() + .WithTentacle(x => x.WithRunTentacleEnvironmentVariable(EnvironmentVariables.TentacleDebugDisableProcessKill_UNSAFE_FOR_PRODUCTION, "1")) + .Build(CancellationToken); + + var startFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "start"); + var releaseFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "release"); + + var firstCommand = new TestExecuteShellScriptCommandBuilder() + .SetScriptBody(new ScriptBuilder() + .CreateFile(startFile) + .WaitForFileToExist(releaseFile)) + .WithIsolationLevel(ScriptIsolationLevel.NoIsolation) + .Build(); + + var tentacleClient = clientTentacle.TentacleClient; + + var scriptExecution = Task.Run(async () => await tentacleClient.ExecuteScript(firstCommand, CancellationToken)); + + await Wait.For(() => File.Exists(startFile), + TimeSpan.FromSeconds(30), + () => throw new Exception("Script did not start"), + CancellationToken); + + // Cancel: Hitman is a no-op so the process keeps running. + await tentacleClient.CancelScript(firstCommand.ScriptTicket); + await Task.Delay(TimeSpan.FromSeconds(1)); + + // Abandon: fires the abandon token. The RPC returns the current status snapshot + // immediately, so we poll GetStatus until the script reaches Complete state. + await tentacleClient.AbandonScript(firstCommand.ScriptTicket, CancellationToken); + + ScriptStatus abandonResponse = null!; + await Wait.For(async () => + { + abandonResponse = await tentacleClient.GetStatus(firstCommand.ScriptTicket, CancellationToken); + return abandonResponse.State == ProcessState.Complete; + }, + TimeSpan.FromSeconds(30), + () => throw new Exception("Abandoned script did not reach Complete state within 30s"), + CancellationToken); + abandonResponse.ExitCode.Should().Be(ScriptExitCodes.AbandonedExitCode); + + // Release the script process so it exits cleanly and stops leaking. + File.WriteAllText(releaseFile, ""); + await scriptExecution; + } + + [Test] + [TentacleConfigurations(scriptServiceToTest: ScriptServiceVersionToTest.Version2)] + public async Task AbandonScript_ReleasesIsolationMutexEvenWhileProcessIsStillRunning(TentacleConfigurationTestCase tentacleConfigurationTestCase) + { + // The whole reason Tentacle needs an abandon RPC is to release the isolation mutex + // when CancelScript can't unstick the script. This test proves that contract: a + // FullIsolation script gets stuck (because TentacleDebugDisableProcessKill_UNSAFE_FOR_PRODUCTION + // makes cancel a no-op), abandon is called, and a second FullIsolation script with + // the same mutex name must then be able to acquire the mutex and run. + await using var clientTentacle = await tentacleConfigurationTestCase.CreateBuilder() + .WithTentacle(x => x.WithRunTentacleEnvironmentVariable(EnvironmentVariables.TentacleDebugDisableProcessKill_UNSAFE_FOR_PRODUCTION, "1")) + .Build(CancellationToken); + + var startFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "start"); + var releaseFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "release"); + + const string sharedMutex = "abandon-test-mutex"; + + var firstCommand = new TestExecuteShellScriptCommandBuilder() + .SetScriptBody(new ScriptBuilder() + .CreateFile(startFile) + .WaitForFileToExist(releaseFile)) + .WithIsolationLevel(ScriptIsolationLevel.FullIsolation) + .WithIsolationMutexName(sharedMutex) + .Build(); + + var tentacleClient = clientTentacle.TentacleClient; + + var firstScriptExecution = Task.Run(async () => await tentacleClient.ExecuteScript(firstCommand, CancellationToken)); + + await Wait.For(() => File.Exists(startFile), + TimeSpan.FromSeconds(30), + () => throw new Exception("First script did not start"), + CancellationToken); + + await tentacleClient.CancelScript(firstCommand.ScriptTicket); + await Task.Delay(TimeSpan.FromSeconds(1)); + + await tentacleClient.AbandonScript(firstCommand.ScriptTicket, CancellationToken); + + // Second FullIsolation script with the SAME mutex name. If the abandon released + // the mutex, this script can acquire it and run to completion. Otherwise it would + // block waiting for the (still-alive) first script's mutex hold. + var secondStartFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "second-start"); + var secondCommand = new TestExecuteShellScriptCommandBuilder() + .SetScriptBody(new ScriptBuilder().CreateFile(secondStartFile)) + .WithIsolationLevel(ScriptIsolationLevel.FullIsolation) + .WithIsolationMutexName(sharedMutex) + .Build(); + + var (secondResult, _) = await tentacleClient.ExecuteScript(secondCommand, CancellationToken); + secondResult.ExitCode.Should().Be(0); + File.Exists(secondStartFile).Should().BeTrue("second script should have run after the mutex was released"); + + File.WriteAllText(releaseFile, ""); + await firstScriptExecution; + } + + [Test] + [TentacleConfigurations(scriptServiceToTest: ScriptServiceVersionToTest.Version2)] + public async Task AbandonScript_WithNoPriorCancel_KillsTheProcess(TentacleConfigurationTestCase tentacleConfigurationTestCase) + { + // Anti-abuse: a direct AbandonScript with no prior CancelScript must still attempt the + // kill. The runner's abandon branch best-effort-kills (DoOurBestToCleanUp) before returning + // -48. Kill is NOT disabled here, so the underlying process must actually die (the execution drains). + await using var clientTentacle = await tentacleConfigurationTestCase.CreateBuilder() + .Build(CancellationToken); + + var startFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "start"); + var releaseFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "release"); + + var command = new TestExecuteShellScriptCommandBuilder() + .SetScriptBody(new ScriptBuilder() + .CreateFile(startFile) + .WaitForFileToExist(releaseFile)) + .WithIsolationLevel(ScriptIsolationLevel.NoIsolation) + .Build(); + + var tentacleClient = clientTentacle.TentacleClient; + var scriptExecution = Task.Run(async () => await tentacleClient.ExecuteScript(command, CancellationToken)); + + await Wait.For(() => File.Exists(startFile), + TimeSpan.FromSeconds(30), + () => throw new Exception("Script did not start"), + CancellationToken); + + // Direct abandon, NO prior cancel. + await tentacleClient.AbandonScript(command.ScriptTicket, CancellationToken); + + ScriptStatus abandonResponse = null!; + await Wait.For(async () => + { + abandonResponse = await tentacleClient.GetStatus(command.ScriptTicket, CancellationToken); + return abandonResponse.State == ProcessState.Complete; + }, + TimeSpan.FromSeconds(30), + () => throw new Exception("Abandoned script did not reach Complete state within 30s"), + CancellationToken); + + abandonResponse.ExitCode.Should().Be(ScriptExitCodes.AbandonedExitCode); + + // We never wrote releaseFile, so the script only completes because the abandon branch + // killed it. Draining the execution confirms the process is gone. + await scriptExecution; + } + + [Test] + [TentacleConfigurations(scriptServiceToTest: ScriptServiceVersionToTest.Version2)] + public async Task ExecuteScript_WhenCancellationStaysPending_EscalatesToAbandon(TentacleConfigurationTestCase tentacleConfigurationTestCase) + { + // Kill is disabled, so the script is genuinely stuck and CancelScript can't end it. + // With abandonAfterCancellationPendingFor set short, the orchestrator must escalate to + // AbandonScript, which returns AbandonedExitCode and releases the script. + IRecordedMethodUsages recordedUsages = new MethodUsages(); + await using var clientTentacle = await tentacleConfigurationTestCase.CreateBuilder() + .WithTentacle(x => x.WithRunTentacleEnvironmentVariable(EnvironmentVariables.TentacleDebugDisableProcessKill_UNSAFE_FOR_PRODUCTION, "1")) + .WithTentacleServiceDecorator(new TentacleServiceDecoratorBuilder() + .RecordMethodUsages(out recordedUsages) + .Build()) + .Build(CancellationToken); + + var startFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "start"); + var releaseFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "release"); + + var command = new TestExecuteShellScriptCommandBuilder() + .SetScriptBody(new ScriptBuilder() + .CreateFile(startFile) + .WaitForFileToExist(releaseFile)) + .WithIsolationLevel(ScriptIsolationLevel.NoIsolation) + .Build(); + + var tentacleClient = clientTentacle.TentacleClient; + + using var executionCts = new System.Threading.CancellationTokenSource(); + var logs = new System.Collections.Generic.List(); + + var execution = Task.Run(async () => await tentacleClient.ExecuteScript( + command, + onScriptStatusResponseReceived => logs.AddRange(onScriptStatusResponseReceived.Logs), + _ => Task.CompletedTask, + new SerilogLoggerBuilder().Build().ForContext().ToITentacleTaskLog(), + executionCts.Token, + abandonAfterCancellationPendingFor: TimeSpan.FromSeconds(2))); + + await Wait.For(() => File.Exists(startFile), + TimeSpan.FromSeconds(30), + () => throw new Exception("Script did not start"), + CancellationToken); + + executionCts.Cancel(); + + // ExecuteScript throws OperationCanceledException once the token is cancelled. + await FluentActions.Invoking(async () => await execution).Should().ThrowAsync(); + + // The orchestrator must have escalated to AbandonScript after the 2s threshold. + recordedUsages.ForAbandonScriptAsync().Started.Should().BeGreaterThan(0); + + // Cleanup: release + reap the still-alive process (kill was disabled). + File.WriteAllText(releaseFile, ""); + } + } +} diff --git a/source/Octopus.Tentacle.Tests.Integration/Support/TestDecoratorsAreCalledInTheCorrectOrder.cs b/source/Octopus.Tentacle.Tests.Integration/Support/TestDecoratorsAreCalledInTheCorrectOrder.cs index 09cbc58fb..fc3fd090b 100644 --- a/source/Octopus.Tentacle.Tests.Integration/Support/TestDecoratorsAreCalledInTheCorrectOrder.cs +++ b/source/Octopus.Tentacle.Tests.Integration/Support/TestDecoratorsAreCalledInTheCorrectOrder.cs @@ -105,6 +105,11 @@ public Task CancelScriptAsync(CancelScriptCommandV2 comm throw new NotImplementedException(); } + public Task AbandonScriptAsync(AbandonScriptCommandV2 command, HalibutProxyRequestOptions proxyRequestOptions) + { + throw new NotImplementedException(); + } + public async Task CompleteScriptAsync(CompleteScriptCommandV2 command, HalibutProxyRequestOptions proxyRequestOptions) { await Task.CompletedTask; diff --git a/source/Octopus.Tentacle.Tests.Integration/Util/Builders/TentacleClientExtensionMethods.cs b/source/Octopus.Tentacle.Tests.Integration/Util/Builders/TentacleClientExtensionMethods.cs index c0975c788..bf6ba7343 100644 --- a/source/Octopus.Tentacle.Tests.Integration/Util/Builders/TentacleClientExtensionMethods.cs +++ b/source/Octopus.Tentacle.Tests.Integration/Util/Builders/TentacleClientExtensionMethods.cs @@ -5,10 +5,12 @@ using System.Threading.Tasks; using Halibut; using Octopus.Tentacle.Client; +using Octopus.Tentacle.Client.EventDriven; using Octopus.Tentacle.Client.Scripts; using Octopus.Tentacle.Client.Scripts.Models; using Octopus.Tentacle.Contracts; using Octopus.Tentacle.Contracts.Logging; +using Octopus.Tentacle.Contracts.ScriptServiceV2; using Octopus.Tentacle.Tests.Integration.Support; using Octopus.Tentacle.Tests.Integration.Support.ExtensionMethods; @@ -56,6 +58,42 @@ public static async Task UploadFile( return result; } + public static async Task AbandonScript( + this TentacleClient tentacleClient, + ScriptTicket scriptTicket, + CancellationToken token, + ITentacleClientTaskLog? log = null) + { + return await tentacleClient.AbandonScript(scriptTicket, + new SerilogLoggerBuilder().Build().ForContext().ToITentacleTaskLog().Chain(log), + token).ConfigureAwait(false); + } + + public static async Task CancelScript( + this TentacleClient tentacleClient, + ScriptTicket scriptTicket, + ITentacleClientTaskLog? log = null) + { + var commandContext = new CommandContext(scriptTicket, 0, ScriptServiceVersion.ScriptServiceVersion2); + var result = await tentacleClient.CancelScript(commandContext, + new SerilogLoggerBuilder().Build().ForContext().ToITentacleTaskLog().Chain(log)) + .ConfigureAwait(false); + return result.ScriptStatus; + } + + public static async Task GetStatus( + this TentacleClient tentacleClient, + ScriptTicket scriptTicket, + CancellationToken token, + ITentacleClientTaskLog? log = null) + { + var commandContext = new CommandContext(scriptTicket, 0, ScriptServiceVersion.ScriptServiceVersion2); + var result = await tentacleClient.GetStatus(commandContext, + new SerilogLoggerBuilder().Build().ForContext().ToITentacleTaskLog().Chain(log), + token).ConfigureAwait(false); + return result.ScriptStatus; + } + public static async Task DownloadFile( this TentacleClient tentacleClient, string remotePath, diff --git a/source/Octopus.Tentacle.Tests.Integration/Util/RunningScriptFixture.cs b/source/Octopus.Tentacle.Tests.Integration/Util/RunningScriptFixture.cs index c0e3b05ad..2c6b9ceec 100644 --- a/source/Octopus.Tentacle.Tests.Integration/Util/RunningScriptFixture.cs +++ b/source/Octopus.Tentacle.Tests.Integration/Util/RunningScriptFixture.cs @@ -67,7 +67,7 @@ public void SetUpLocal() scriptLog = new TestScriptLog(); cancellationTokenSource = new CancellationTokenSource(TimeSpan.FromSeconds(10)); scriptIsolationMutex = new ScriptIsolationMutex(); - runningScript = new RunningScript(shell, + runningScript = RunningScript.Create(shell, workspace, scriptLog, taskId, @@ -169,7 +169,7 @@ public async Task CancellationToken_ShouldKillTheProcess() ? (new PowerShell(), "Start-Sleep -seconds") : (new Bash() as IShell, "sleep"); - var script = new RunningScript(shell, + var script = RunningScript.Create(shell, workspace, scriptLog, taskId, diff --git a/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs b/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs index 44125a578..c048b70ed 100644 --- a/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs +++ b/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs @@ -7,6 +7,7 @@ using FluentAssertions; using NUnit.Framework; using Octopus.Tentacle.CommonTestUtils; +using Octopus.Tentacle.Contracts; using Octopus.Tentacle.Tests.Integration.Support; using Octopus.Tentacle.Tests.Integration.Support.TestAttributes; using Octopus.Tentacle.Util; @@ -130,29 +131,37 @@ public async Task CancellationToken_WhenGrandchildHoldsRedirectedPipes_ShouldNot using var tempDir = new TemporaryDirectory(); var grandchildPidFile = Path.Combine(tempDir.DirectoryPath, "grandchild.pid"); - // This test reproduces the cancel-time hang. - // The issue was SilentProcessRunner will wait for stdout/stderr pipes to be closed. - // The pipes can be inherited by grandchildren, and remain open even after the - // child process has died. + // This test guards the cancel path of ExecuteCommandAsync against a regression + // involving re-parented grandchildren that inherit our redirected pipes. // - // Normally Process.Kill(entireProcessTree:true) would kill the entire process tree. - // However if the child process dies THEN we issue the kill command, we do NOT see any - // process under the child and so the Kill() command completes. This leaves the grandchild - // running, holding our pipes we are waiting on. + // The scenario: + // * We launch a process that spawns a child, which spawns a long-running grandchild + // and then immediately exits. + // * Because the child exited BEFORE our cancel fires, the grandchild has been + // re-parented (PPID broken). Process.Kill(entireProcessTree:true) follows PPID + // links, so it does NOT find the grandchild — Kill returns having killed nothing + // beyond the (already-dead) child. + // * The grandchild inherited our redirected stdout/stderr pipes and holds them + // open. The stream readers therefore never see EOF. // - // The test stacks three processes: PowerShell (the child) launches cmd.exe (a - // throwaway middle layer) which does `start /b ping` to background ping (the - // grandchild) and then exits. cmd exiting before we cancel is what breaks the - // PPID chain — without that, ping would still be a direct child of PowerShell - // and Kill(true) would find it. + // How cancel handles this: cancel.Register kills the (already-dead) child and then + // calls process.Close(), which releases our pipe handles. That lets the synchronous + // process.WaitForExit() return even though the orphaned grandchild still holds the + // streams open. (Close is safe here because we wait synchronously; it would race the + // Exited event of the async WaitForExitAsync, which this code deliberately does not use.) // - // Two non-obvious bits below, both load-bearing: + // This test asserts cancel alone returns in well under the test timeout in the + // grandchild scenario. If it hangs, process.Close() has been removed from + // DoOurBestToCleanUp (see that method) or the cancel-kill path has otherwise broken. + // + // Two non-obvious bits in the PowerShell script below, both load-bearing: // * $psi.RedirectStandardInput = $true — we don't use stdin, but redirecting // any stream is what flips bInheritHandles=true in .NET's Process.Start. That // is what makes cmd (and by extension ping) inherit our pipe write-ends. // Without this the grandchild doesn't hold our pipes and there is no bug to // reproduce. - // * The WMI lookup — we need the grandchild's PID so the test can clean it up. + // * The WMI lookup — we need the grandchild's PID so the test can clean it up + // afterwards (otherwise we'd leak a long-running ping on the CI host). var psScript = @" $pidFile = 'PIDFILE_PLACEHOLDER' $pingPath = Join-Path $env:WINDIR 'System32\PING.EXE' @@ -196,18 +205,21 @@ public async Task CancellationToken_WhenGrandchildHoldsRedirectedPipes_ShouldNot cts.Token)); // Wait for the grandchild to actually be spawned before cancelling - await WaitForGrandchildSpawnAsync(grandchildPidFile, TimeSpan.FromSeconds(60)); + await WaitForPidFileAsync(grandchildPidFile, TimeSpan.FromSeconds(60)); var sw = Stopwatch.StartNew(); cts.Cancel(); - var completed = task.Wait(TimeSpan.FromSeconds(30)); + var completed = task.Wait(TimeSpan.FromSeconds(60)); sw.Stop(); completed.Should().BeTrue( - $"ExecuteCommand should return shortly after cancellation even when a grandchild " + - $"holds the redirected pipes. Without proactively closing the redirected streams " + - $"after Kill, Process.WaitForExit() blocks indefinitely. Elapsed since cancel: {sw.Elapsed.TotalSeconds:F1}s"); + $"ExecuteCommandAsync should return promptly after cancellation even when a " + + $"grandchild holds the redirected pipes. Worst case is ~10s (5s timeout × 2 streams " + + $"in SafelyWaitForAllOutput). If we hit the 60s test timeout, either someone " + + $"re-introduced process.Close() in DoOurBestToCleanUp (which races with the Exited " + + $"event WaitForExitAsync depends on) or SafelyWaitForAllOutput's per-stream timeout " + + $"has been removed. Elapsed since cancel: {sw.Elapsed.TotalSeconds:F1}s"); } } finally @@ -222,19 +234,20 @@ public async Task CancellationToken_WhenUnixGrandchildHoldsRedirectedPipes_Shoul if (PlatformDetection.IsRunningOnWindows) Assert.Ignore("Unix-only repro (Mac/Linux). The Windows equivalent is covered by the [WindowsTest] above."); - // This test reproduces the cancel-time hang. - // The issue is SilentProcessRunner will wait for stdout/stderr pipes to be closed. - // The pipes can be inherited by grandchildren, and remain open even after the - // child process has died. + // Unix equivalent of CancellationToken_WhenGrandchildHoldsRedirectedPipes_ShouldNotHang + // above. See that test's leading comment for the full rationale — the short version is: // - // Normally Process.Kill(entireProcessTree:true) would kill the entire process tree. - // However if the child process dies THEN we issue the kill command, we do NOT see any - // process under the child and so the Kill() command completes. This leaves the grandchild - // running, holding our pipes we are waiting on. + // * We start sh, which backgrounds a long sleep (the grandchild) and exits + // immediately. The grandchild gets re-parented to init/launchd and inherits our + // redirected stdout/stderr pipes, holding them open. + // * Process.Kill(entireProcessTree:true) follows PPID links, so by the time we + // cancel, the now-orphan grandchild is invisible to Kill — it keeps running. + // * On cancel, cancel.Register kills the (already-dead) child and calls + // process.Close(), which releases our pipe handles so the synchronous + // process.WaitForExit() returns despite the orphaned grandchild holding the + // streams open. (Close is safe with the synchronous wait — see DoOurBestToCleanUp.) // - // The test is simple we run "sh", the child process, and tell it to yeet - // sleep, the grandchild, into the background. The grandchild now keeps - // running holding on to our pipes we will wait for an EOF on. + // This test asserts cancel alone returns in well under the test timeout in the grandchild scenario. using var tempDir = new TemporaryDirectory(); var grandchildPidFile = Path.Combine(tempDir.DirectoryPath, "grandchild.pid"); @@ -253,20 +266,27 @@ public async Task CancellationToken_WhenUnixGrandchildHoldsRedirectedPipes_Shoul out _, cts.Token)); - await WaitForGrandchildSpawnAsync(grandchildPidFile, TimeSpan.FromSeconds(30)); + await WaitForPidFileAsync(grandchildPidFile, TimeSpan.FromSeconds(60)); var sw = Stopwatch.StartNew(); cts.Cancel(); - // Cancel should be super quick, if it takes a long time, then we have an issue where we are waiting for the grandchild. - var completed = task.Wait(TimeSpan.FromSeconds(30)); + // Cancel should return within ~10s worst case (5s SafelyWaitForAllOutput timeout + // per stream). The 60s test bound is generous for slow Linux CI agents while + // still flagging a real regression — if process.Close() got re-added to + // DoOurBestToCleanUp (see that method for why) or SafelyWaitForAllOutput's + // per-stream timeout was removed, the wait hangs indefinitely and we trip the + // bound by miles. + var completed = task.Wait(TimeSpan.FromSeconds(60)); sw.Stop(); completed.Should().BeTrue( - $"ExecuteCommand should return shortly after cancellation even when a Unix " + - $"grandchild (reparented to init/launchd) holds the redirected pipes. " + - $"Without proactively closing the redirected streams after Kill, " + - $"Process.WaitForExit() blocks indefinitely. Elapsed since cancel: {sw.Elapsed.TotalSeconds:F1}s"); + $"ExecuteCommandAsync should return promptly after cancellation even when a Unix " + + $"grandchild (reparented to init/launchd) holds the redirected pipes. Worst case " + + $"is ~10s. If we hit the 60s test timeout, either process.Close() was re-introduced " + + $"in DoOurBestToCleanUp (which races with the Exited event WaitForExitAsync depends " + + $"on) or SafelyWaitForAllOutput's per-stream timeout has been removed. Elapsed " + + $"since cancel: {sw.Elapsed.TotalSeconds:F1}s"); } } finally @@ -275,7 +295,115 @@ public async Task CancellationToken_WhenUnixGrandchildHoldsRedirectedPipes_Shoul } } - static async Task WaitForGrandchildSpawnAsync(string pidFile, TimeSpan timeout) + [Test] + public async Task AbandonToken_ReturnsAbandonedExitCode() + { + // Abandon stops waiting and returns the distinct AbandonedExitCode. Abandon also + // best-effort-kills the process — the kill itself is asserted by + // ClientScriptExecutionAbandon.AbandonScript_WithNoPriorCancel_KillsTheProcess, and the + // un-killable (survives) case by AbandonScript_WhenCancelFailsToKillProcess. (We do NOT + // disable kill here: setting that env var is process-wide and leaks into the Tentacle + // subprocesses other integration tests spawn.) + using var tempDir = new TemporaryDirectory(); + var pidFile = Path.Combine(tempDir.DirectoryPath, "process.pid"); + + var abandonCommand = PlatformDetection.IsRunningOnWindows ? "powershell.exe" : "/bin/bash"; + var arguments = PlatformDetection.IsRunningOnWindows + ? $"-NoProfile -NonInteractive -Command \"$PID | Out-File -FilePath '{pidFile}' -Encoding ASCII; Start-Sleep -Seconds 300\"" + : $"-c \"echo $$ > '{pidFile}' && sleep 300\""; + + using var cancelCts = new CancellationTokenSource(); + using var abandonCts = new CancellationTokenSource(); + + var infoMessages = new StringBuilder(); + + var task = Task.Run(async () => await SilentProcessRunner.ExecuteCommandAsync( + abandonCommand, + arguments, + Environment.CurrentDirectory, + debug: _ => { }, + info: msg => { lock (infoMessages) infoMessages.AppendLine(msg); }, + error: _ => { }, + customEnvironmentVariables: null, + cancel: cancelCts.Token, + abandon: abandonCts.Token)); + + // Wait deterministically for the process to write its PID before we abandon + await WaitForPidFileAsync(pidFile, TimeSpan.FromSeconds(30)); + abandonCts.Cancel(); + + try + { + var exitCode = await task; + exitCode.Should().Be(ScriptExitCodes.AbandonedExitCode); + infoMessages.ToString().Should().Contain("Tentacle has abandoned this script"); + } + finally + { + // Force-kill the script process if it's somehow still around, to avoid leaking on CI + if (File.Exists(pidFile) && int.TryParse(SafelyReadAllText(pidFile).Trim(), out var pid) && pid > 0) + { + try { Process.GetProcessById(pid).Kill(); } catch { /* already gone */ } + } + } + } + + [Test] + public async Task AbandonToken_WhenCancelAlsoRequested_StillReturnsAbandonedExitCode() + { + // Once abandon is requested, the runner returns AbandonedExitCode even though cancel is + // also requested — the abandon branch resolves the wait and returns -48. + // + // We fire abandon FIRST so this is deterministic. The exit code is NOT deterministic if + // cancel and abandon race on a *killable* process: cancel's kill can resolve the wait to + // the killed exit code before abandon is observed. That race does not happen in production + // — cancel and abandon arrive as separate, sequential RPCs, and the server only sends + // abandon AFTER cancel has failed to unstick the script. The deterministic both-tokens + // case (cancel fired, kill genuinely fails, abandon -> -48) is covered end-to-end by + // ClientScriptExecutionAbandon.AbandonScript_WhenCancelFailsToKillProcess. + using var tempDir = new TemporaryDirectory(); + var pidFile = Path.Combine(tempDir.DirectoryPath, "process.pid"); + + var executable = PlatformDetection.IsRunningOnWindows ? "powershell.exe" : "/bin/bash"; + var arguments = PlatformDetection.IsRunningOnWindows + ? $"-NoProfile -NonInteractive -Command \"$PID | Out-File -FilePath '{pidFile}' -Encoding ASCII; Start-Sleep -Seconds 300\"" + : $"-c \"echo $$ > '{pidFile}' && sleep 300\""; + + using var cancelCts = new CancellationTokenSource(); + using var abandonCts = new CancellationTokenSource(); + + var task = Task.Run(async () => await SilentProcessRunner.ExecuteCommandAsync( + executable, + arguments, + Environment.CurrentDirectory, + debug: _ => { }, + info: _ => { }, + error: _ => { }, + customEnvironmentVariables: null, + cancel: cancelCts.Token, + abandon: abandonCts.Token)); + + await WaitForPidFileAsync(pidFile, TimeSpan.FromSeconds(30)); + + // Abandon first (resolves the wait to -48), then cancel is also requested. + abandonCts.Cancel(); + cancelCts.Cancel(); + + try + { + var exitCode = await task; + exitCode.Should().Be(ScriptExitCodes.AbandonedExitCode); + } + finally + { + if (File.Exists(pidFile) && int.TryParse(SafelyReadAllText(pidFile).Trim(), out var pid) && pid > 0) + { + try { Process.GetProcessById(pid).Kill(); } catch { /* already gone */ } + } + } + } + + static async Task WaitForPidFileAsync(string pidFile, TimeSpan timeout) { var deadline = DateTime.UtcNow + timeout; while (DateTime.UtcNow < deadline) @@ -285,8 +413,8 @@ static async Task WaitForGrandchildSpawnAsync(string pidFile, TimeSpan timeout) await Task.Delay(50); } throw new TimeoutException( - $"Test setup failed: the grandchild PID was never written to '{pidFile}'. " + - $"The grandchild-pipe scenario is not being exercised."); + $"Test setup failed: a valid PID was never written to '{pidFile}'. " + + $"The scenario under test is not being exercised."); } static string SafelyReadAllText(string path) diff --git a/source/Octopus.Tentacle.Tests/Capabilities/CapabilitiesServiceV2Fixture.cs b/source/Octopus.Tentacle.Tests/Capabilities/CapabilitiesServiceV2Fixture.cs index 0c6326a05..e219353cd 100644 --- a/source/Octopus.Tentacle.Tests/Capabilities/CapabilitiesServiceV2Fixture.cs +++ b/source/Octopus.Tentacle.Tests/Capabilities/CapabilitiesServiceV2Fixture.cs @@ -6,6 +6,7 @@ using Octopus.Tentacle.Contracts; using Octopus.Tentacle.Contracts.KubernetesScriptServiceV1; using Octopus.Tentacle.Contracts.ScriptServiceV2; +using Octopus.Tentacle.Core.Services.Scripts; using Octopus.Tentacle.Kubernetes; using Octopus.Tentacle.Services.Capabilities; @@ -20,8 +21,7 @@ public async Task CapabilitiesAreReturned() .GetCapabilitiesAsync(CancellationToken.None)) .SupportedCapabilities; - capabilities.Should().BeEquivalentTo(nameof(IScriptService), nameof(IFileTransferService), nameof(IScriptServiceV2)); - capabilities.Count.Should().Be(3); + capabilities.Should().BeEquivalentTo(nameof(IScriptService), nameof(IFileTransferService), nameof(IScriptServiceV2), nameof(ScriptServiceV2.AbandonScriptAsync)); capabilities.Should().NotContainMatch("IKubernetesScriptService*"); } @@ -36,11 +36,30 @@ public async Task OnlyKubernetesScriptServicesAreReturnedWhenRunningAsKubernetes .SupportedCapabilities; capabilities.Should().BeEquivalentTo(nameof(IFileTransferService), nameof(IKubernetesScriptServiceV1)); - capabilities.Count.Should().Be(2); capabilities.Should().NotContainMatch("IScriptService*"); Environment.SetEnvironmentVariable(KubernetesConfig.NamespaceVariableName, null); } + + [Test] + public async Task GetCapabilities_OnNonKubernetesTentacle_AdvertisesAbandonScriptV2() + { + var service = new CapabilitiesServiceV2(); + var response = await service.GetCapabilitiesAsync(CancellationToken.None); + response.SupportedCapabilities.Should().Contain(nameof(ScriptServiceV2.AbandonScriptAsync)); + } + + [Test] + public async Task GetCapabilities_OnKubernetesTentacle_DoesNotAdvertiseAbandonScriptV2() + { + Environment.SetEnvironmentVariable(KubernetesConfig.NamespaceVariableName, "ABC"); + + var service = new CapabilitiesServiceV2(); + var response = await service.GetCapabilitiesAsync(CancellationToken.None); + response.SupportedCapabilities.Should().NotContain(nameof(ScriptServiceV2.AbandonScriptAsync)); + + Environment.SetEnvironmentVariable(KubernetesConfig.NamespaceVariableName, null); + } } } \ No newline at end of file diff --git a/source/Octopus.Tentacle.Tests/Integration/ScriptServiceV2Fixture.cs b/source/Octopus.Tentacle.Tests/Integration/ScriptServiceV2Fixture.cs index d5c771eef..9317a874e 100644 --- a/source/Octopus.Tentacle.Tests/Integration/ScriptServiceV2Fixture.cs +++ b/source/Octopus.Tentacle.Tests/Integration/ScriptServiceV2Fixture.cs @@ -1,4 +1,4 @@ -using System; +using System; using System.Collections.Concurrent; using System.Collections.Generic; using System.Diagnostics; @@ -16,6 +16,7 @@ using Octopus.Tentacle.Core.Diagnostics; using Octopus.Tentacle.Core.Services.Scripts; using Octopus.Tentacle.Core.Services.Scripts.Locking; +using Octopus.Tentacle.Core.Services.Scripts.Logging; using Octopus.Tentacle.Core.Services.Scripts.Security.Masking; using Octopus.Tentacle.Core.Services.Scripts.Shell; using Octopus.Tentacle.Core.Services.Scripts.StateStore; @@ -29,30 +30,12 @@ namespace Octopus.Tentacle.Tests.Integration [TestFixture] public class ScriptServiceV2Fixture { - ScriptServiceV2 service = null!; - ScriptWorkspaceFactory workspaceFactory = null!; - ScriptStateStoreFactory stateStoreFactory = null!; - - [SetUp] - public void SetUp() - { - var homeConfiguration = Substitute.For(); - homeConfiguration.HomeDirectory.Returns(Environment.CurrentDirectory); - - var octopusPhysicalFileSystem = new OctopusPhysicalFileSystem(Substitute.For()); - workspaceFactory = new ScriptWorkspaceFactory(octopusPhysicalFileSystem, homeConfiguration, new SensitiveValueMasker()); - stateStoreFactory = new ScriptStateStoreFactory(octopusPhysicalFileSystem); - service = new ScriptServiceV2( - PlatformDetection.IsRunningOnWindows ? (IShell)new PowerShell() : new Bash(), - workspaceFactory, - stateStoreFactory, - new ScriptIsolationMutex(), - Substitute.For()); - } - [Test] public async Task ShouldExecuteAScriptSuccessfully() { + var builder = new ScriptServiceV2Builder(); + var service = builder.Build(); + var windowsScript = "& ping.exe localhost -n 1"; var bashScript = "ping localhost -c 1"; @@ -63,7 +46,7 @@ public async Task ShouldExecuteAScriptSuccessfully() .Build(); var startScriptResponse = await service.StartScriptAsync(startScriptCommand, CancellationToken.None); - var (logs, finalResponse) = await RunUntilScriptCompletes(startScriptCommand, startScriptResponse); + var (logs, finalResponse) = await RunUntilScriptCompletes(service, startScriptCommand, startScriptResponse); finalResponse.State.Should().Be(ProcessState.Complete); finalResponse.ExitCode.Should().Be(0); @@ -73,6 +56,9 @@ public async Task ShouldExecuteAScriptSuccessfully() [Test] public async Task ShouldReturnANonZeroExitCodeForAFailingScript() { + var builder = new ScriptServiceV2Builder(); + var service = builder.Build(); + var windowsScript = "& ping.exe nope -n 1"; var bashScript = "ping nope -c 1"; @@ -83,7 +69,7 @@ public async Task ShouldReturnANonZeroExitCodeForAFailingScript() .Build(); var startScriptResponse = await service.StartScriptAsync(startScriptCommand, CancellationToken.None); - var (logs, finalResponse) = await RunUntilScriptCompletes(startScriptCommand, startScriptResponse); + var (logs, finalResponse) = await RunUntilScriptCompletes(service, startScriptCommand, startScriptResponse); finalResponse.State.Should().Be(ProcessState.Complete); finalResponse.ExitCode.Should().NotBe(0); @@ -93,6 +79,9 @@ public async Task ShouldReturnANonZeroExitCodeForAFailingScript() [Test] public async Task ShouldExecuteMultipleScriptsConcurrently() { + var builder = new ScriptServiceV2Builder(); + var service = builder.Build(); + var bashScript = "sleep 10"; var windowsScript = "Start-Sleep -Seconds 10"; @@ -114,7 +103,7 @@ public async Task ShouldExecuteMultipleScriptsConcurrently() }); await Task.WhenAll(tasks); - + var startDuration = started.Elapsed; startDuration.Should().BeLessThan(TimeSpan.FromSeconds(5)); @@ -122,7 +111,7 @@ public async Task ShouldExecuteMultipleScriptsConcurrently() foreach (var script in scripts) { - var (logs, finalResponse) = await RunUntilScriptCompletes(script.Command, script.Response); + var (logs, finalResponse) = await RunUntilScriptCompletes(service, script.Command, script.Response); finalResponse.State.Should().Be(ProcessState.Complete); finalResponse.ExitCode.Should().Be(0); @@ -136,6 +125,9 @@ public async Task ShouldExecuteMultipleScriptsConcurrently() [Test] public async Task ShouldStartExecuteAScriptQuickly() { + var builder = new ScriptServiceV2Builder(); + var service = builder.Build(); + var script = "echo \"finished\""; var startScriptCommand = new StartScriptCommandV2Builder() @@ -170,6 +162,9 @@ public async Task ShouldStartExecuteAScriptQuickly() [Test] public async Task ShouldExecuteALongRunningScriptSuccessfully() { + var builder = new ScriptServiceV2Builder(); + var service = builder.Build(); + var bashScript = "sleep 10"; var windowsScript = "Start-Sleep -Seconds 10"; @@ -180,7 +175,7 @@ public async Task ShouldExecuteALongRunningScriptSuccessfully() .Build(); var startScriptResponse = await service.StartScriptAsync(startScriptCommand, CancellationToken.None); - var (_, finalResponse) = await RunUntilScriptCompletes(startScriptCommand, startScriptResponse); + var (_, finalResponse) = await RunUntilScriptCompletes(service, startScriptCommand, startScriptResponse); finalResponse.State.Should().Be(ProcessState.Complete); finalResponse.ExitCode.Should().Be(0); @@ -189,6 +184,9 @@ public async Task ShouldExecuteALongRunningScriptSuccessfully() [Test] public async Task StartScriptShouldWaitForAShortScriptToFinish() { + var builder = new ScriptServiceV2Builder(); + var service = builder.Build(); + var startScriptCommand = new StartScriptCommandV2Builder() .WithScriptBody("echo \"finished\"") .WithDurationStartScriptCanWaitForScriptToFinish(TimeSpan.FromSeconds(5)) @@ -207,6 +205,9 @@ public async Task StartScriptShouldWaitForAShortScriptToFinish() [Test] public async Task StartScriptShouldNotWaitForALongScriptToFinish() { + var builder = new ScriptServiceV2Builder(); + var service = builder.Build(); + var bashScript = "sleep 10"; var windowsScript = "Start-Sleep -Seconds 10"; @@ -217,7 +218,7 @@ public async Task StartScriptShouldNotWaitForALongScriptToFinish() .Build(); var startScriptResponse = await service.StartScriptAsync(startScriptCommand, CancellationToken.None); - await RunUntilScriptCompletes(startScriptCommand, startScriptResponse); + await RunUntilScriptCompletes(service, startScriptCommand, startScriptResponse); startScriptResponse.State.Should().Be(ProcessState.Running); startScriptResponse.ExitCode.Should().Be(0); @@ -227,6 +228,9 @@ public async Task StartScriptShouldNotWaitForALongScriptToFinish() [Test] public async Task StartScriptShouldNotStartTheScriptForTheSameScriptTicketMoreThanOnce_SequentialRequests() { + var builder = new ScriptServiceV2Builder(); + var service = builder.Build(); + // Arrange var scriptTicket = new ScriptTicket(Guid.NewGuid().ToString()); var script1 = GetStartScriptCommandForScriptThatCreatesAFile(scriptTicket); @@ -236,7 +240,7 @@ public async Task StartScriptShouldNotStartTheScriptForTheSameScriptTicketMoreTh await service.StartScriptAsync(script1.StartScriptCommand, CancellationToken.None); var startScriptResponse = await service.StartScriptAsync(script2.StartScriptCommand, CancellationToken.None); - var (_, finalResponse) = await RunUntilScriptCompletes(script2.StartScriptCommand, startScriptResponse); + var (_, finalResponse) = await RunUntilScriptCompletes(service, script2.StartScriptCommand, startScriptResponse); // Assert finalResponse.State.Should().Be(ProcessState.Complete); @@ -249,6 +253,9 @@ public async Task StartScriptShouldNotStartTheScriptForTheSameScriptTicketMoreTh [Test] public async Task StartScriptShouldNotStartTheScriptForTheSameScriptTicketMoreThanOnce_ConcurrentRequests() { + var builder = new ScriptServiceV2Builder(); + var service = builder.Build(); + // Arrange var scriptTicket = new ScriptTicket(Guid.NewGuid().ToString()); @@ -271,6 +278,7 @@ public async Task StartScriptShouldNotStartTheScriptForTheSameScriptTicketMoreTh await Task.WhenAll(tasks); var (_, finalResponse) = await RunUntilScriptCompletes( + service, scripts[0].StartScriptCommand, new ScriptStatusResponseV2(scripts[0].StartScriptCommand.ScriptTicket, ProcessState.Pending, 0, new List(), 0)); @@ -292,6 +300,9 @@ public async Task StartScriptShouldNotStartTheScriptForTheSameScriptTicketMoreTh [Test] public async Task CancelScriptShouldCancelAnExecutingScript() { + var builder = new ScriptServiceV2Builder(); + var service = builder.Build(); + var bashScript = "sleep 60"; var windowsScript = "Start-Sleep -Seconds 60"; @@ -337,6 +348,10 @@ public async Task CancelScriptShouldCancelAnExecutingScript() [Test] public async Task CompleteScriptShouldCleanupTheWorkspace() { + var builder = new ScriptServiceV2Builder(); + var service = builder.Build(); + var workspaceFactory = (ScriptWorkspaceFactory)builder.WorkspaceFactory; + var script = "echo \"finished\""; var startScriptCommand = new StartScriptCommandV2Builder() @@ -351,13 +366,14 @@ public async Task CompleteScriptShouldCleanupTheWorkspace() var startScriptResponse = await service.StartScriptAsync(startScriptCommand, CancellationToken.None); Directory.Exists(workspaceDirectory).Should().BeTrue(); - await RunUntilScriptCompletes(startScriptCommand, startScriptResponse); + await RunUntilScriptCompletes(service, startScriptCommand, startScriptResponse); Directory.Exists(workspaceDirectory).Should().BeFalse(); } [Test] public async Task GetStatusShouldReturnAnExitCodeOf45ForAnUnknownScriptTicket() { + var service = new ScriptServiceV2Builder().Build(); var response = await service.GetStatusAsync(new ScriptStatusRequestV2(new ScriptTicket("nope"), 0), CancellationToken.None); response.ExitCode.Should().Be(-45); @@ -366,6 +382,7 @@ public async Task GetStatusShouldReturnAnExitCodeOf45ForAnUnknownScriptTicket() [Test] public async Task CancelScriptShouldReturnAnExitCodeOf45ForAnUnknownScriptTicket() { + var service = new ScriptServiceV2Builder().Build(); var response = await service.CancelScriptAsync(new CancelScriptCommandV2(new ScriptTicket("nope"), 0), CancellationToken.None); response.ExitCode.Should().Be(-45); @@ -374,43 +391,53 @@ public async Task CancelScriptShouldReturnAnExitCodeOf45ForAnUnknownScriptTicket [Test] public async Task CompleteScriptShouldNotErrorForAnUnknownScriptTicket() { + var service = new ScriptServiceV2Builder().Build(); await service.CompleteScriptAsync(new CompleteScriptCommandV2(new ScriptTicket("nope")), CancellationToken.None); } [Test] public async Task GetStatusShouldReturnAnExitCodeOf46ForAScriptWithAnUnknownResult() { + var builder = new ScriptServiceV2Builder(); + var service = builder.Build(); + var request = new ScriptStatusRequestV2(new ScriptTicket($"did-not-finish-{Guid.NewGuid()}"), 0); var ticket = request.Ticket; - SetupScriptState(ticket); + SetupScriptState(builder.WorkspaceFactory, builder.StateStoreFactory, ticket); var response = await service.GetStatusAsync(request, CancellationToken.None); response.ExitCode.Should().Be(-46); - await CleanupWorkspace(ticket, CancellationToken.None); + await CleanupWorkspace(builder.WorkspaceFactory, ticket, CancellationToken.None); } [Test] public async Task CancelScriptShouldReturnAnExitCodeOf46ForAScriptWithAnUnknownResult() { + var builder = new ScriptServiceV2Builder(); + var service = builder.Build(); + var request = new CancelScriptCommandV2(new ScriptTicket($"did-not-finish-{Guid.NewGuid()}"), 0); var ticket = request.Ticket; - SetupScriptState(ticket); + SetupScriptState(builder.WorkspaceFactory, builder.StateStoreFactory, ticket); var response = await service.CancelScriptAsync(request, CancellationToken.None); response.ExitCode.Should().Be(-46); - await CleanupWorkspace(ticket, CancellationToken.None); + await CleanupWorkspace(builder.WorkspaceFactory, ticket, CancellationToken.None); } [Test] public async Task CompleteScriptShouldNotErrorForAScriptWithAnUnknownResult() { + var builder = new ScriptServiceV2Builder(); + var service = builder.Build(); + var request = new CompleteScriptCommandV2(new ScriptTicket($"did-not-finish-{Guid.NewGuid()}")); var ticket = request.Ticket; - SetupScriptState(ticket); + SetupScriptState(builder.WorkspaceFactory, builder.StateStoreFactory, ticket); await service.CompleteScriptAsync(request, CancellationToken.None); } @@ -418,6 +445,9 @@ public async Task CompleteScriptShouldNotErrorForAScriptWithAnUnknownResult() [Test] public async Task ShouldStoreTheStateOfTheScriptInTheScriptStateStore() { + var builder = new ScriptServiceV2Builder(); + var service = builder.Build(); + var testStarted = DateTimeOffset.UtcNow; var bashScript = "sleep 10"; @@ -429,13 +459,13 @@ public async Task ShouldStoreTheStateOfTheScriptInTheScriptStateStore() .WithDurationStartScriptCanWaitForScriptToFinish(null) .Build(); - var scriptStateStore = SetupScriptStateStore(startScriptCommand.ScriptTicket); + var scriptStateStore = SetupScriptStateStore(builder.WorkspaceFactory, builder.StateStoreFactory, startScriptCommand.ScriptTicket); var startScriptResponse = await service.StartScriptAsync(startScriptCommand, CancellationToken.None); var runningScriptState = scriptStateStore.Load(); - var (logs, finalResponse) = await RunUntilScriptFinishes(startScriptCommand, startScriptResponse); + var (logs, finalResponse) = await RunUntilScriptFinishes(service, startScriptCommand, startScriptResponse); var testFinished = DateTimeOffset.UtcNow; var finishedScriptState = scriptStateStore.Load(); @@ -461,6 +491,8 @@ public async Task ShouldStoreTheStateOfTheScriptInTheScriptStateStore() [Test] public async Task ScriptTicketCasingShouldNotAffectCommands() { + var service = new ScriptServiceV2Builder().Build(); + // Arrange var startScriptCommand = new StartScriptCommandV2Builder() .WithScriptBody("echo \"finished\"") @@ -483,22 +515,364 @@ public async Task ScriptTicketCasingShouldNotAffectCommands() lowerCaseResponse.ExitCode.Should().Be(0); } + [Test] + public async Task AbandonScript_OnUnknownTicket_ReturnsCompleteWithUnknownScriptExitCode() + { + var service = new ScriptServiceV2Builder().Build(); + + var ticket = new ScriptTicket("unknown-ticket-" + Guid.NewGuid().ToString("N")); + var response = await service.AbandonScriptAsync(new AbandonScriptCommandV2(ticket, 0), CancellationToken.None); + + response.State.Should().Be(ProcessState.Complete); + response.ExitCode.Should().Be(ScriptExitCodes.UnknownScriptExitCode); + } + + [Test] + public async Task AbandonScript_OnRunningScript_FiresAbandonToken_ReturnsAbandonedExitCode() + { + var service = new ScriptServiceV2Builder().Build(); + + var startCommand = new StartScriptCommandV2Builder() + .WithScriptBodyForCurrentOs("Start-Sleep -Seconds 60", "sleep 60") + .WithIsolation(ScriptIsolationLevel.NoIsolation) + .WithDurationStartScriptCanWaitForScriptToFinish(null) + .Build(); + + await service.StartScriptAsync(startCommand, CancellationToken.None); + + // Wait for the script to reach Running state + ScriptStatusResponseV2 status; + var deadline = DateTime.UtcNow.AddSeconds(30); + do + { + status = await service.GetStatusAsync(new ScriptStatusRequestV2(startCommand.ScriptTicket, 0), CancellationToken.None); + if (status.State == ProcessState.Running) break; + await Task.Delay(50); + } while (DateTime.UtcNow < deadline); + status.State.Should().Be(ProcessState.Running, "script should have reached Running state within 30 seconds"); + + // Fire abandon + await service.AbandonScriptAsync(new AbandonScriptCommandV2(startCommand.ScriptTicket, 0), CancellationToken.None); + + // Poll until the script completes (the abandon token causes the process runner to return AbandonedExitCode) + ScriptStatusResponseV2 finalResponse; + var completionDeadline = DateTime.UtcNow.AddSeconds(30); + do + { + finalResponse = await service.GetStatusAsync(new ScriptStatusRequestV2(startCommand.ScriptTicket, 0), CancellationToken.None); + if (finalResponse.State == ProcessState.Complete) break; + await Task.Delay(100); + } while (DateTime.UtcNow < completionDeadline); + + finalResponse.State.Should().Be(ProcessState.Complete); + finalResponse.ExitCode.Should().Be(ScriptExitCodes.AbandonedExitCode); + } + + [Test] + public async Task AbandonScript_OnAlreadyCompletedScript_ReturnsRealExitCode() + { + var service = new ScriptServiceV2Builder().Build(); + + var startCommand = new StartScriptCommandV2Builder() + .WithScriptBody("echo \"finished\"") + .WithIsolation(ScriptIsolationLevel.NoIsolation) + .WithDurationStartScriptCanWaitForScriptToFinish(null) + .Build(); + + await service.StartScriptAsync(startCommand, CancellationToken.None); + + // Wait for the script to complete + ScriptStatusResponseV2 status; + var deadline = DateTime.UtcNow.AddSeconds(30); + do + { + status = await service.GetStatusAsync(new ScriptStatusRequestV2(startCommand.ScriptTicket, 0), CancellationToken.None); + if (status.State == ProcessState.Complete) break; + await Task.Delay(50); + } while (DateTime.UtcNow < deadline); + status.State.Should().Be(ProcessState.Complete); + + var abandonResponse = await service.AbandonScriptAsync(new AbandonScriptCommandV2(startCommand.ScriptTicket, 0), CancellationToken.None); + abandonResponse.ExitCode.Should().Be(0, "real exit code should be returned, not AbandonedExitCode"); + } + + [Test] + public async Task CompleteScript_AfterAbandon_WhenWorkspaceDeleteFails_LogsWarnAndReturnsNormally() + { + var deleteException = new IOException("file in use"); + var builder = new ScriptServiceV2Builder(); + var (throwingFactory, mockLog) = BuildFactoryWithThrowingDelete(builder.WorkspaceFactory, deleteException); + var serviceUnderTest = builder + .WithWorkspaceFactory(throwingFactory) + .WithLog(mockLog) + .Build(); + + var startCommand = new StartScriptCommandV2Builder() + .WithScriptBodyForCurrentOs("Start-Sleep -Seconds 60", "sleep 60") + .WithIsolation(ScriptIsolationLevel.NoIsolation) + .WithDurationStartScriptCanWaitForScriptToFinish(null) + .Build(); + + await serviceUnderTest.StartScriptAsync(startCommand, CancellationToken.None); + + // Wait for Running + ScriptStatusResponseV2 status; + var runningDeadline = DateTime.UtcNow.AddSeconds(30); + do + { + status = await serviceUnderTest.GetStatusAsync(new ScriptStatusRequestV2(startCommand.ScriptTicket, 0), CancellationToken.None); + if (status.State == ProcessState.Running) break; + await Task.Delay(50); + } while (DateTime.UtcNow < runningDeadline); + status.State.Should().Be(ProcessState.Running, "script should have reached Running state within 30 seconds"); + + await serviceUnderTest.AbandonScriptAsync(new AbandonScriptCommandV2(startCommand.ScriptTicket, 0), CancellationToken.None); + + // Poll until Complete + var completeDeadline = DateTime.UtcNow.AddSeconds(30); + do + { + status = await serviceUnderTest.GetStatusAsync(new ScriptStatusRequestV2(startCommand.ScriptTicket, 0), CancellationToken.None); + if (status.State == ProcessState.Complete) break; + await Task.Delay(50); + } while (DateTime.UtcNow < completeDeadline); + status.State.Should().Be(ProcessState.Complete); + status.ExitCode.Should().Be(ScriptExitCodes.AbandonedExitCode); + + Func complete = async () => await serviceUnderTest.CompleteScriptAsync(new CompleteScriptCommandV2(startCommand.ScriptTicket), CancellationToken.None); + + await complete.Should().NotThrowAsync(); + mockLog.Received().Warn(deleteException, Arg.Is(m => m.Contains("Could not delete") && m.Contains(startCommand.ScriptTicket.TaskId))); + } + + [Test] + public async Task CompleteScript_AfterNormalCompletion_WhenWorkspaceDeleteFails_PropagatesException() + { + var deleteException = new IOException("file in use"); + var builder = new ScriptServiceV2Builder(); + var (throwingFactory, mockLog) = BuildFactoryWithThrowingDelete(builder.WorkspaceFactory, deleteException); + var serviceUnderTest = builder + .WithWorkspaceFactory(throwingFactory) + .WithLog(mockLog) + .Build(); + + var startCommand = new StartScriptCommandV2Builder() + .WithScriptBody("echo \"finished\"") + .WithIsolation(ScriptIsolationLevel.NoIsolation) + .WithDurationStartScriptCanWaitForScriptToFinish(null) + .Build(); + + await serviceUnderTest.StartScriptAsync(startCommand, CancellationToken.None); + + // Poll until natural completion + ScriptStatusResponseV2 status; + var deadline = DateTime.UtcNow.AddSeconds(30); + do + { + status = await serviceUnderTest.GetStatusAsync(new ScriptStatusRequestV2(startCommand.ScriptTicket, 0), CancellationToken.None); + if (status.State == ProcessState.Complete) break; + await Task.Delay(50); + } while (DateTime.UtcNow < deadline); + status.State.Should().Be(ProcessState.Complete); + status.ExitCode.Should().Be(0, "the script exited cleanly, not via abandon"); + + Func complete = async () => await serviceUnderTest.CompleteScriptAsync(new CompleteScriptCommandV2(startCommand.ScriptTicket), CancellationToken.None); + + await complete.Should().ThrowAsync(); + } + + /// + /// Builds an IScriptWorkspaceFactory decorator over the supplied workspaceFactory whose returned + /// workspaces forward every member except Delete(CancellationToken), which throws the supplied + /// exception. Also returns a mock ISystemLog for assertion. + /// + static (IScriptWorkspaceFactory factory, ISystemLog log) BuildFactoryWithThrowingDelete(IScriptWorkspaceFactory workspaceFactory, Exception deleteException) + { + var throwingFactory = new DeleteThrowingScriptWorkspaceFactory(workspaceFactory, deleteException); + var fakeLog = Substitute.For(); + return (throwingFactory, fakeLog); + } + + /// + /// Builder for ScriptServiceV2 SUT construction. Defaults match what the previous [SetUp] + /// produced; tests opt into mock overrides via the chainable With* methods. The + /// WorkspaceFactory and StateStoreFactory properties materialize on first access so tests + /// can grab them before Build() (e.g. to wrap the default factory in a decorator). + /// + class ScriptServiceV2Builder + { + IScriptWorkspaceFactory? workspaceFactory; + ScriptStateStoreFactory? stateStoreFactory; + ISystemLog? log; + IShell? shell; + ScriptIsolationMutex? mutex; + OctopusPhysicalFileSystem? cachedFileSystem; + + public IScriptWorkspaceFactory WorkspaceFactory => workspaceFactory ??= BuildDefaultWorkspaceFactory(); + public ScriptStateStoreFactory StateStoreFactory => stateStoreFactory ??= new ScriptStateStoreFactory(FileSystem); + + OctopusPhysicalFileSystem FileSystem => cachedFileSystem ??= new OctopusPhysicalFileSystem(Substitute.For()); + + ScriptWorkspaceFactory BuildDefaultWorkspaceFactory() + { + var homeConfiguration = Substitute.For(); + homeConfiguration.HomeDirectory.Returns(Environment.CurrentDirectory); + return new ScriptWorkspaceFactory(FileSystem, homeConfiguration, new SensitiveValueMasker()); + } + + public ScriptServiceV2Builder WithWorkspaceFactory(IScriptWorkspaceFactory factory) + { + workspaceFactory = factory; + return this; + } + + public ScriptServiceV2Builder WithStateStoreFactory(ScriptStateStoreFactory factory) + { + stateStoreFactory = factory; + return this; + } + + public ScriptServiceV2Builder WithLog(ISystemLog log) + { + this.log = log; + return this; + } + + public ScriptServiceV2Builder WithShell(IShell shell) + { + this.shell = shell; + return this; + } + + public ScriptServiceV2Builder WithMutex(ScriptIsolationMutex mutex) + { + this.mutex = mutex; + return this; + } + + public ScriptServiceV2 Build() + { + return new ScriptServiceV2( + shell ?? (PlatformDetection.IsRunningOnWindows ? (IShell)new PowerShell() : new Bash()), + WorkspaceFactory, + StateStoreFactory, + mutex ?? new ScriptIsolationMutex(), + log ?? Substitute.For()); + } + } + + /// + /// IScriptWorkspaceFactory decorator that wraps every workspace it returns in a + /// DeleteThrowingScriptWorkspace so that Delete throws the configured exception while all + /// other members forward to the real workspace. + /// + class DeleteThrowingScriptWorkspaceFactory : IScriptWorkspaceFactory + { + readonly IScriptWorkspaceFactory inner; + readonly Exception deleteException; + + public DeleteThrowingScriptWorkspaceFactory(IScriptWorkspaceFactory inner, Exception deleteException) + { + this.inner = inner; + this.deleteException = deleteException; + } + + public IScriptWorkspace GetWorkspace(ScriptTicket ticket, WorkspaceReadinessCheck readinessCheck) + => new DeleteThrowingScriptWorkspace(inner.GetWorkspace(ticket, readinessCheck), deleteException); + + public async Task PrepareWorkspace( + ScriptTicket ticket, + string scriptBody, + Dictionary scripts, + ScriptIsolationLevel isolationLevel, + TimeSpan scriptMutexAcquireTimeout, + string? scriptMutexName, + string[]? scriptArguments, + List files, + CancellationToken cancellationToken) + { + var workspace = await inner.PrepareWorkspace(ticket, scriptBody, scripts, isolationLevel, scriptMutexAcquireTimeout, scriptMutexName, scriptArguments, files, cancellationToken); + return new DeleteThrowingScriptWorkspace(workspace, deleteException); + } + + public List GetUncompletedWorkspaces() + => inner.GetUncompletedWorkspaces().Select(w => (IScriptWorkspace)new DeleteThrowingScriptWorkspace(w, deleteException)).ToList(); + } + + /// + /// IScriptWorkspace decorator that forwards every member to an inner real workspace, except + /// Delete(CancellationToken), which throws the configured exception. Used to exercise the + /// CompleteScript abandon-aware tolerance of Delete failures without disturbing anything else + /// StartScript / RunningScript may touch on the workspace. + /// + class DeleteThrowingScriptWorkspace : IScriptWorkspace + { + readonly IScriptWorkspace inner; + readonly Exception deleteException; + + public DeleteThrowingScriptWorkspace(IScriptWorkspace inner, Exception deleteException) + { + this.inner = inner; + this.deleteException = deleteException; + } + + public ScriptTicket ScriptTicket => inner.ScriptTicket; + public string WorkingDirectory => inner.WorkingDirectory; + public string BootstrapScriptFilePath => inner.BootstrapScriptFilePath; + public string LogFilePath => inner.LogFilePath; + + public string[]? ScriptArguments + { + get => inner.ScriptArguments; + set => inner.ScriptArguments = value; + } + + public ScriptIsolationLevel IsolationLevel + { + get => inner.IsolationLevel; + set => inner.IsolationLevel = value; + } + + public TimeSpan ScriptMutexAcquireTimeout + { + get => inner.ScriptMutexAcquireTimeout; + set => inner.ScriptMutexAcquireTimeout = value; + } + + public string? ScriptMutexName + { + get => inner.ScriptMutexName; + set => inner.ScriptMutexName = value; + } + + public bool ShouldMonitorPowerShellStartup() => inner.ShouldMonitorPowerShellStartup(); + public void BootstrapScript(string scriptBody) => inner.BootstrapScript(scriptBody); + public string ResolvePath(string fileName) => inner.ResolvePath(fileName); + public IScriptLog CreateLog() => inner.CreateLog(); + public void WriteFile(string filename, string contents) => inner.WriteFile(filename, contents); + public void CopyFile(string sourceFilePath, string destFileName, bool overwrite) => inner.CopyFile(sourceFilePath, destFileName, overwrite); + public void CheckReadiness() => inner.CheckReadiness(); + public string? TryReadFile(string filename) => inner.TryReadFile(filename); + + public Task Delete(CancellationToken cancellationToken) => throw deleteException; + } + // TODO - Test the stateStore is updated. - private void SetupScriptState(ScriptTicket ticket) + static void SetupScriptState(IScriptWorkspaceFactory workspaceFactory, ScriptStateStoreFactory stateStoreFactory, ScriptTicket ticket) { - var stateWorkspace = SetupScriptStateStore(ticket); + var stateWorkspace = SetupScriptStateStore(workspaceFactory, stateStoreFactory, ticket); stateWorkspace.Create(); } - private ScriptStateStore SetupScriptStateStore(ScriptTicket ticket) + static ScriptStateStore SetupScriptStateStore(IScriptWorkspaceFactory workspaceFactory, ScriptStateStoreFactory stateStoreFactory, ScriptTicket ticket) { var workspace = workspaceFactory.GetWorkspace(ticket, WorkspaceReadinessCheck.Perform); var stateWorkspace = stateStoreFactory.Create(workspace); return stateWorkspace; } - private async Task CleanupWorkspace(ScriptTicket ticket, CancellationToken cancellationToken) + static async Task CleanupWorkspace(IScriptWorkspaceFactory workspaceFactory, ScriptTicket ticket, CancellationToken cancellationToken) { var workspace = workspaceFactory.GetWorkspace(ticket, WorkspaceReadinessCheck.Skip); await workspace.Delete(cancellationToken); @@ -527,9 +901,9 @@ private async Task CleanupWorkspace(ScriptTicket ticket, CancellationToken cance return (startScriptCommand, new FileInfo(filePath)); } - async Task<(List, ScriptStatusResponseV2)> RunUntilScriptCompletes(StartScriptCommandV2 startScriptCommand, ScriptStatusResponseV2 response) + static async Task<(List, ScriptStatusResponseV2)> RunUntilScriptCompletes(ScriptServiceV2 service, StartScriptCommandV2 startScriptCommand, ScriptStatusResponseV2 response) { - var (logs, lastResponse) = await RunUntilScriptFinishes(startScriptCommand, response); + var (logs, lastResponse) = await RunUntilScriptFinishes(service, startScriptCommand, response); await service.CompleteScriptAsync(new CompleteScriptCommandV2(startScriptCommand.ScriptTicket), CancellationToken.None); @@ -538,7 +912,7 @@ private async Task CleanupWorkspace(ScriptTicket ticket, CancellationToken cance return (logs, lastResponse); } - async Task<(List logs, ScriptStatusResponseV2 response)> RunUntilScriptFinishes(StartScriptCommandV2 startScriptCommand, ScriptStatusResponseV2 response) + static async Task<(List logs, ScriptStatusResponseV2 response)> RunUntilScriptFinishes(ScriptServiceV2 service, StartScriptCommandV2 startScriptCommand, ScriptStatusResponseV2 response) { var logs = new List(response.Logs); @@ -557,7 +931,7 @@ private async Task CleanupWorkspace(ScriptTicket ticket, CancellationToken cance return (logs, response); } - void WriteLogsToConsole(List logs) + static void WriteLogsToConsole(List logs) { foreach (var log in logs) { @@ -576,4 +950,4 @@ public StartScriptCommandAndResponse(StartScriptCommandV2 command) public ScriptStatusResponseV2? Response { get; set; } } } -} \ No newline at end of file +} diff --git a/source/Octopus.Tentacle/Services/Capabilities/CapabilitiesServiceV2.cs b/source/Octopus.Tentacle/Services/Capabilities/CapabilitiesServiceV2.cs index 7dda62791..a4537f26a 100644 --- a/source/Octopus.Tentacle/Services/Capabilities/CapabilitiesServiceV2.cs +++ b/source/Octopus.Tentacle/Services/Capabilities/CapabilitiesServiceV2.cs @@ -6,6 +6,7 @@ using Octopus.Tentacle.Contracts.KubernetesScriptServiceV1; using Octopus.Tentacle.Contracts.ScriptServiceV2; using Octopus.Tentacle.Core.Services; +using Octopus.Tentacle.Core.Services.Scripts; using Octopus.Tentacle.Util; namespace Octopus.Tentacle.Services.Capabilities @@ -24,7 +25,7 @@ public async Task GetCapabilitiesAsync(CancellationToken } //non-kubernetes agent tentacles only support the standard script services - return new CapabilitiesResponseV2(new List { nameof(IScriptService), nameof(IFileTransferService), nameof(IScriptServiceV2) }); + return new CapabilitiesResponseV2(new List { nameof(IScriptService), nameof(IFileTransferService), nameof(IScriptServiceV2), nameof(ScriptServiceV2.AbandonScriptAsync) }); } } } \ No newline at end of file diff --git a/source/Octopus.Tentacle/Services/Scripts/ScriptService.cs b/source/Octopus.Tentacle/Services/Scripts/ScriptService.cs index e1b952202..82b482d6f 100644 --- a/source/Octopus.Tentacle/Services/Scripts/ScriptService.cs +++ b/source/Octopus.Tentacle/Services/Scripts/ScriptService.cs @@ -93,7 +93,7 @@ public async Task CompleteScriptAsync(CompleteScriptComman RunningScript LaunchShell(ScriptTicket ticket, string serverTaskId, IScriptWorkspace workspace, CancellationTokenSource cancel) { - var runningScript = new RunningScript(shell, workspace, workspace.CreateLog(), serverTaskId, scriptIsolationMutex, cancel.Token, new Dictionary(), PowerShellStartupDetection.PowerShellStartupTimeout, log); + var runningScript = RunningScript.Create(shell, workspace, workspace.CreateLog(), serverTaskId, scriptIsolationMutex, cancel.Token, new Dictionary(), PowerShellStartupDetection.PowerShellStartupTimeout, log); _ = Task.Run(async () => await runningScript.Execute()); return runningScript; } diff --git a/source/Octopus.Tentacle/Util/ISilentProcessRunner.cs b/source/Octopus.Tentacle/Util/ISilentProcessRunner.cs index 49c54fd6b..fa6d21f45 100644 --- a/source/Octopus.Tentacle/Util/ISilentProcessRunner.cs +++ b/source/Octopus.Tentacle/Util/ISilentProcessRunner.cs @@ -14,7 +14,8 @@ Task ExecuteCommandAsync( string workingDirectory, Action info, Action error, - CancellationToken cancel = default); + CancellationToken cancel = default, + CancellationToken abandon = default); Task ExecuteCommandAsync( string executable, @@ -23,19 +24,20 @@ Task ExecuteCommandAsync( Action debug, Action info, Action error, - CancellationToken cancel = default); + CancellationToken cancel = default, + CancellationToken abandon = default); } public class SilentProcessRunnerWrapper : ISilentProcessRunner { - public Task ExecuteCommandAsync(string executable, string arguments, string workingDirectory, Action info, Action error, CancellationToken cancel = default) + public Task ExecuteCommandAsync(string executable, string arguments, string workingDirectory, Action info, Action error, CancellationToken cancel = default, CancellationToken abandon = default) { - return SilentProcessRunnerExtended.ExecuteCommandAsync(executable, arguments, workingDirectory, info, error, cancel); + return SilentProcessRunnerExtended.ExecuteCommandAsync(executable, arguments, workingDirectory, info, error, cancel, abandon); } - public Task ExecuteCommandAsync(string executable, string arguments, string workingDirectory, Action debug, Action info, Action error, CancellationToken cancel = default) + public Task ExecuteCommandAsync(string executable, string arguments, string workingDirectory, Action debug, Action info, Action error, CancellationToken cancel = default, CancellationToken abandon = default) { - return SilentProcessRunner.ExecuteCommandAsync(executable, arguments, workingDirectory, debug, info, error, cancel: cancel); + return SilentProcessRunner.ExecuteCommandAsync(executable, arguments, workingDirectory, debug, info, error, cancel: cancel, abandon: abandon); } } @@ -70,7 +72,8 @@ public static Task ExecuteCommandAsync( string workingDirectory, Action info, Action error, - CancellationToken cancel = default) + CancellationToken cancel = default, + CancellationToken abandon = default) => SilentProcessRunner.ExecuteCommandAsync(executable, arguments, workingDirectory, @@ -78,6 +81,7 @@ public static Task ExecuteCommandAsync( info, error, customEnvironmentVariables: null, - cancel: cancel); + cancel: cancel, + abandon: abandon); } } From 4f3fc4d6c4ed41c16cfd9805fdd03d2e46af906b Mon Sep 17 00:00:00 2001 From: Jim Pelletier Date: Tue, 2 Jun 2026 12:40:31 +1000 Subject: [PATCH 02/15] Address review: remove abandon capability check, drop abandon metrics, tighten runner - ScriptServiceV2Executor.AbandonScript: drop the GetCapabilities check + CancelScript fallback (every tentacle from this build forward advertises abandon); remove the now-dead clientCapabilitiesServiceV2 dependency, the dead HasAbandonScriptV2 extension + its tests, and the obsolete fallback unit test. IScriptExecutor doc no longer mentions a fallback. - TentacleClient.AbandonScript: drop the per-call operation metrics builder + try/catch to match CancelScript (abandon is a one-shot RPC). - SilentProcessRunner: extract WhenCancelled(token) for the abandon-signal task; tighten the wait/cleanup comments and correct the both-tokens note (the -48 race is test-only; production sends abandon only after cancel fails). Co-Authored-By: Claude Opus 4.8 (1M context) --- ...bilitiesResponseV2ExtensionMethodsTests.cs | 35 ------ .../ScriptServiceV2ExecutorAbandonTests.cs | 34 +----- .../CapabilitiesResponseV2ExtensionMethods.cs | 13 --- .../Scripts/IScriptExecutor.cs | 5 +- .../Scripts/ScriptExecutorFactory.cs | 1 - .../Scripts/ScriptServiceV2Executor.cs | 18 --- .../Octopus.Tentacle.Client/TentacleClient.cs | 25 ++-- .../Util/CommandLine/SilentProcessRunner.cs | 108 +++++++----------- 8 files changed, 54 insertions(+), 185 deletions(-) delete mode 100644 source/Octopus.Tentacle.Client.Tests/CapabilitiesResponseV2ExtensionMethodsTests.cs diff --git a/source/Octopus.Tentacle.Client.Tests/CapabilitiesResponseV2ExtensionMethodsTests.cs b/source/Octopus.Tentacle.Client.Tests/CapabilitiesResponseV2ExtensionMethodsTests.cs deleted file mode 100644 index 2eb0b7f67..000000000 --- a/source/Octopus.Tentacle.Client.Tests/CapabilitiesResponseV2ExtensionMethodsTests.cs +++ /dev/null @@ -1,35 +0,0 @@ -using System.Collections.Generic; -using FluentAssertions; -using NUnit.Framework; -using Octopus.Tentacle.Client.Capabilities; -using Octopus.Tentacle.Contracts.Capabilities; -using Octopus.Tentacle.Contracts.ClientServices; -using Octopus.Tentacle.Contracts.ScriptServiceV2; - -namespace Octopus.Tentacle.Client.Tests -{ - [TestFixture] - public class CapabilitiesResponseV2ExtensionMethodsTests - { - [Test] - public void HasAbandonScriptV2_WhenAdvertised_ReturnsTrue() - { - var capabilities = new CapabilitiesResponseV2(new List { nameof(IScriptServiceV2), nameof(IAsyncClientScriptServiceV2.AbandonScriptAsync) }); - capabilities.HasAbandonScriptV2().Should().BeTrue(); - } - - [Test] - public void HasAbandonScriptV2_WhenNotAdvertised_ReturnsFalse() - { - var capabilities = new CapabilitiesResponseV2(new List { nameof(IScriptServiceV2) }); - capabilities.HasAbandonScriptV2().Should().BeFalse(); - } - - [Test] - public void HasAbandonScriptV2_WhenEmpty_ReturnsFalse() - { - var capabilities = new CapabilitiesResponseV2(new List()); - capabilities.HasAbandonScriptV2().Should().BeFalse(); - } - } -} diff --git a/source/Octopus.Tentacle.Client.Tests/ScriptServiceV2ExecutorAbandonTests.cs b/source/Octopus.Tentacle.Client.Tests/ScriptServiceV2ExecutorAbandonTests.cs index 4d3dca224..9e3a2f888 100644 --- a/source/Octopus.Tentacle.Client.Tests/ScriptServiceV2ExecutorAbandonTests.cs +++ b/source/Octopus.Tentacle.Client.Tests/ScriptServiceV2ExecutorAbandonTests.cs @@ -1,6 +1,5 @@ using System; using System.Collections.Generic; -using System.Threading; using System.Threading.Tasks; using FluentAssertions; using Halibut.ServiceModel; @@ -13,7 +12,6 @@ using Octopus.Tentacle.Client.Retries; using Octopus.Tentacle.Client.Scripts; using Octopus.Tentacle.Contracts; -using Octopus.Tentacle.Contracts.Capabilities; using Octopus.Tentacle.Contracts.ClientServices; using Octopus.Tentacle.Contracts.Logging; using Octopus.Tentacle.Contracts.Observability; @@ -24,10 +22,9 @@ namespace Octopus.Tentacle.Client.Tests [TestFixture] public class ScriptServiceV2ExecutorAbandonTests { - static ScriptServiceV2Executor CreateExecutor(IAsyncClientScriptServiceV2 scriptService, IAsyncClientCapabilitiesServiceV2 capabilities) + static ScriptServiceV2Executor CreateExecutor(IAsyncClientScriptServiceV2 scriptService) => new( scriptService, - capabilities, RpcCallExecutorFactory.Create(TimeSpan.Zero, Substitute.For()), ClientOperationMetricsBuilder.Start(), TimeSpan.Zero, @@ -37,7 +34,7 @@ static ScriptServiceV2Executor CreateExecutor(IAsyncClientScriptServiceV2 script static CommandContext Context() => new(new ScriptTicket("TestTicket"), 0, ScriptServiceVersion.ScriptServiceVersion2); [Test] - public async Task AbandonScript_WhenCapabilityAdvertised_CallsAbandonScriptAsync() + public async Task AbandonScript_CallsAbandonScriptAsync() { var scriptService = Substitute.For(); scriptService.AbandonScriptAsync(Arg.Any(), Arg.Any()) @@ -45,11 +42,7 @@ public async Task AbandonScript_WhenCapabilityAdvertised_CallsAbandonScriptAsync x.Arg().Ticket, ProcessState.Complete, ScriptExitCodes.AbandonedExitCode, new List(), 1))); - var capabilities = Substitute.For(); - capabilities.GetCapabilitiesAsync(Arg.Any()) - .Returns(Task.FromResult(new CapabilitiesResponseV2(new List { "IScriptServiceV2", "AbandonScriptAsync" }))); - - var executor = CreateExecutor(scriptService, capabilities); + var executor = CreateExecutor(scriptService); var result = await executor.AbandonScript(Context()); @@ -57,26 +50,5 @@ public async Task AbandonScript_WhenCapabilityAdvertised_CallsAbandonScriptAsync await scriptService.DidNotReceive().CancelScriptAsync(Arg.Any(), Arg.Any()); result.ScriptStatus.ExitCode.Should().Be(ScriptExitCodes.AbandonedExitCode); } - - [Test] - public async Task AbandonScript_WhenCapabilityAbsent_FallsBackToCancelScriptAsync() - { - var scriptService = Substitute.For(); - scriptService.CancelScriptAsync(Arg.Any(), Arg.Any()) - .Returns(x => Task.FromResult(new ScriptStatusResponseV2( - x.Arg().Ticket, ProcessState.Running, - 0, new List(), 1))); - - var capabilities = Substitute.For(); - capabilities.GetCapabilitiesAsync(Arg.Any()) - .Returns(Task.FromResult(new CapabilitiesResponseV2(new List { "IScriptServiceV2" }))); - - var executor = CreateExecutor(scriptService, capabilities); - - await executor.AbandonScript(Context()); - - await scriptService.DidNotReceive().AbandonScriptAsync(Arg.Any(), Arg.Any()); - await scriptService.Received(1).CancelScriptAsync(Arg.Any(), Arg.Any()); - } } } diff --git a/source/Octopus.Tentacle.Client/Capabilities/CapabilitiesResponseV2ExtensionMethods.cs b/source/Octopus.Tentacle.Client/Capabilities/CapabilitiesResponseV2ExtensionMethods.cs index 2b17d288c..bc49a1e95 100644 --- a/source/Octopus.Tentacle.Client/Capabilities/CapabilitiesResponseV2ExtensionMethods.cs +++ b/source/Octopus.Tentacle.Client/Capabilities/CapabilitiesResponseV2ExtensionMethods.cs @@ -1,6 +1,5 @@ using System.Linq; using Octopus.Tentacle.Contracts.Capabilities; -using Octopus.Tentacle.Contracts.ClientServices; using Octopus.Tentacle.Contracts.ScriptServiceV2; namespace Octopus.Tentacle.Client.Capabilities @@ -16,17 +15,5 @@ public static bool HasScriptServiceV2(this CapabilitiesResponseV2 capabilities) return capabilities.SupportedCapabilities.Contains(nameof(IScriptServiceV2)); } - - public static bool HasAbandonScriptV2(this CapabilitiesResponseV2 capabilities) - { - if (capabilities?.SupportedCapabilities?.Any() != true) - { - return false; - } - - // Both sides nameof their own AbandonScriptAsync. The strings match, and a rename on - // either side can't silently drift the capability check. - return capabilities.SupportedCapabilities.Contains(nameof(IAsyncClientScriptServiceV2.AbandonScriptAsync)); - } } } \ No newline at end of file diff --git a/source/Octopus.Tentacle.Client/Scripts/IScriptExecutor.cs b/source/Octopus.Tentacle.Client/Scripts/IScriptExecutor.cs index 48fb6536c..3d9acc709 100644 --- a/source/Octopus.Tentacle.Client/Scripts/IScriptExecutor.cs +++ b/source/Octopus.Tentacle.Client/Scripts/IScriptExecutor.cs @@ -33,9 +33,8 @@ Task StartScript(ExecuteScriptCommand command, Task CancelScript(CommandContext commandContext); /// - /// Abandon the script: signal Tentacle to stop waiting and release the isolation mutex. - /// Returns the abandoned status when the Tentacle advertises the abandon capability; - /// otherwise falls back to cancelling and returns that result. + /// Abandon the script: signal Tentacle to stop waiting and release the isolation mutex, + /// returning the abandoned status. /// /// The CommandContext from the previous command Task AbandonScript(CommandContext commandContext); diff --git a/source/Octopus.Tentacle.Client/Scripts/ScriptExecutorFactory.cs b/source/Octopus.Tentacle.Client/Scripts/ScriptExecutorFactory.cs index d5b0af94a..a67824d6f 100644 --- a/source/Octopus.Tentacle.Client/Scripts/ScriptExecutorFactory.cs +++ b/source/Octopus.Tentacle.Client/Scripts/ScriptExecutorFactory.cs @@ -47,7 +47,6 @@ public IScriptExecutor CreateScriptExecutor(ScriptServiceVersion scriptServiceTo { return new ScriptServiceV2Executor( allClients.ScriptServiceV2, - allClients.CapabilitiesServiceV2, rpcCallExecutor, clientOperationMetricsBuilder, onCancellationAbandonCompleteScriptAfter, diff --git a/source/Octopus.Tentacle.Client/Scripts/ScriptServiceV2Executor.cs b/source/Octopus.Tentacle.Client/Scripts/ScriptServiceV2Executor.cs index abc3270a3..229e917ad 100644 --- a/source/Octopus.Tentacle.Client/Scripts/ScriptServiceV2Executor.cs +++ b/source/Octopus.Tentacle.Client/Scripts/ScriptServiceV2Executor.cs @@ -19,7 +19,6 @@ namespace Octopus.Tentacle.Client.Scripts class ScriptServiceV2Executor : IScriptExecutor { readonly IAsyncClientScriptServiceV2 clientScriptServiceV2; - readonly IAsyncClientCapabilitiesServiceV2 clientCapabilitiesServiceV2; readonly RpcCallExecutor rpcCallExecutor; readonly ClientOperationMetricsBuilder clientOperationMetricsBuilder; readonly TimeSpan onCancellationAbandonCompleteScriptAfter; @@ -28,7 +27,6 @@ class ScriptServiceV2Executor : IScriptExecutor public ScriptServiceV2Executor( IAsyncClientScriptServiceV2 clientScriptServiceV2, - IAsyncClientCapabilitiesServiceV2 clientCapabilitiesServiceV2, RpcCallExecutor rpcCallExecutor, ClientOperationMetricsBuilder clientOperationMetricsBuilder, TimeSpan onCancellationAbandonCompleteScriptAfter, @@ -36,7 +34,6 @@ public ScriptServiceV2Executor( ITentacleClientTaskLog logger) { this.clientScriptServiceV2 = clientScriptServiceV2; - this.clientCapabilitiesServiceV2 = clientCapabilitiesServiceV2; this.rpcCallExecutor = rpcCallExecutor; this.clientOperationMetricsBuilder = clientOperationMetricsBuilder; this.onCancellationAbandonCompleteScriptAfter = onCancellationAbandonCompleteScriptAfter; @@ -181,21 +178,6 @@ public async Task AbandonScript(CommandContext c { using var activity = TentacleClient.ActivitySource.StartActivity($"{nameof(ScriptServiceV2Executor)}.{nameof(AbandonScript)}"); - var capabilities = await rpcCallExecutor.Execute( - retriesEnabled: clientOptions.RpcRetrySettings.RetriesEnabled, - RpcCall.Create(nameof(Contracts.Capabilities.ICapabilitiesServiceV2.GetCapabilities)), - async ct => await clientCapabilitiesServiceV2.GetCapabilitiesAsync(new HalibutProxyRequestOptions(ct)), - logger, - clientOperationMetricsBuilder, - CancellationToken.None).ConfigureAwait(false); - - // Capability absent → the Tentacle is too old to abandon. Keep cancelling cleanly. - if (!capabilities.HasAbandonScriptV2()) - { - logger.Verbose("Tentacle does not advertise AbandonScript; falling back to CancelScript."); - return await CancelScript(commandContext).ConfigureAwait(false); - } - async Task AbandonScriptAction(CancellationToken ct) { var request = new AbandonScriptCommandV2(commandContext.ScriptTicket, commandContext.NextLogSequence); diff --git a/source/Octopus.Tentacle.Client/TentacleClient.cs b/source/Octopus.Tentacle.Client/TentacleClient.cs index ca8003d95..188165cf2 100644 --- a/source/Octopus.Tentacle.Client/TentacleClient.cs +++ b/source/Octopus.Tentacle.Client/TentacleClient.cs @@ -268,29 +268,20 @@ public async Task AbandonScript(ScriptTicket scriptTicke using var activity = ActivitySource.StartActivity($"{nameof(TentacleClient)}.{nameof(AbandonScript)}"); activity?.AddTag("octopus.tentacle.script.ticket", scriptTicket.TaskId); - var operationMetricsBuilder = ClientOperationMetricsBuilder.Start(); - async Task AbandonScriptAction(CancellationToken ct) { var request = new AbandonScriptCommandV2(scriptTicket, lastLogSequence: 0); return await allClients.ScriptServiceV2.AbandonScriptAsync(request, new HalibutProxyRequestOptions(ct)); } - try - { - return await rpcCallExecutor.Execute( - retriesEnabled: clientOptions.RpcRetrySettings.RetriesEnabled, - RpcCall.Create(nameof(IScriptServiceV2.AbandonScript)), - AbandonScriptAction, - logger, - operationMetricsBuilder, - cancellationToken).ConfigureAwait(false); - } - catch (Exception e) - { - operationMetricsBuilder.Failure(e, cancellationToken); - throw; - } + return await rpcCallExecutor.Execute( + retriesEnabled: clientOptions.RpcRetrySettings.RetriesEnabled, + RpcCall.Create(nameof(IScriptServiceV2.AbandonScript)), + AbandonScriptAction, + logger, + // Abandon is a one-shot RPC; like CancelScript we don't track operation metrics for it. + ClientOperationMetricsBuilder.Start(), + cancellationToken).ConfigureAwait(false); } public async Task CompleteScript(CommandContext commandContext, ITentacleClientTaskLog logger, CancellationToken scriptExecutionCancellationToken) diff --git a/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs b/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs index 8964ad12a..6e86f4fda 100644 --- a/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs +++ b/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs @@ -145,52 +145,32 @@ void WriteData(Action action, ManualResetEventSlim resetEvent, DataRecei process.BeginOutputReadLine(); process.BeginErrorReadLine(); - // Wait synchronously for the process to exit, but let `abandon` break out. - // - // process.WaitForExit() (no-timeout) waits for the process to exit AND for the - // redirected streams to reach EOF. Cancel kills + Close()s the process via - // cancel.Register; Close releases the pipe handles, which unblocks this wait even - // when a re-parented grandchild holds the streams open. So cancel — including the - // grandchild case — behaves exactly as it does today. - // - // Abandon is the new behaviour: we race the blocking wait against a task that - // completes when the abandon token fires. If abandon wins we return - // AbandonedExitCode and leave the OS process running; the still-blocked - // WaitForExit thread is released when the using (process) disposes on return. - // TaskCompletionSource (not the non-generic overload, which doesn't exist on net48). - var abandoned = new TaskCompletionSource(); - using (abandon.Register(() => abandoned.TrySetResult(null))) + // Wait for the process to exit, but let abandon break us out. WaitForExit() also + // drains the redirected streams; cancel kills then Close()s the process (via + // cancel.Register), releasing the pipes so this returns even when a re-parented + // grandchild holds them open. Abandon is the new path: race the wait against the + // abandon token and, if abandon wins, return -48 and leave the process running. + var waitForExit = Task.Run(() => { - var waitForExit = Task.Run(() => - { - try { process.WaitForExit(); } - catch { /* released by Process.Dispose on the abandon path */ } - }); - - await Task.WhenAny(waitForExit, abandoned.Task).ConfigureAwait(false); - - // Key the abandoned result on the TOKEN, not on which task won the race. - // When both cancel and abandon fire, cancel's Kill may be no-op'd / can't land, - // so waitForExit can win even though we must still return AbandonedExitCode. - // After Cancel()->Close() the Process is detached, so do NOT read - // process.HasExited / process.ExitCode here — just return -48. The - // "abandon was unnecessary" case never reaches here: the script's RunningScript - // is already Complete with its real exit code one layer up. - if (abandon.IsCancellationRequested) - { - // Abandon best-effort-kills (anti-abuse): kill it if we can, then stop - // waiting and release. Doing the kill here — sequentially, in the abandon - // branch — is race-free, unlike firing the cancel token from the RPC handler - // (the kill would race the WhenAny resolution / process disposal). The kill - // is idempotent if cancel already ran it. The process survives only if the - // kill genuinely can't land (stuck / re-parented grandchild). Do NOT read - // HasExited/ExitCode here — Close() may have detached the Process. - DoOurBestToCleanUp(process, error); - info("Tentacle has abandoned this script. The underlying script process may still be running on this host."); - SafelyCancelOutputAndErrorRead(process, debug); - running = false; - return ScriptExitCodes.AbandonedExitCode; - } + // Swallows the exception raised when the abandon path disposes the Process + // out from under this blocking wait. + try { process.WaitForExit(); } + catch { /* released by Process.Dispose on the abandon path */ } + }); + + await Task.WhenAny(waitForExit, WhenCancelled(abandon)).ConfigureAwait(false); + + // Abandon requested: best-effort-kill (anti-abuse), then stop waiting and return -48. + // Don't read HasExited/ExitCode — Close() may have detached the Process. A non-(-48) + // result when both tokens fire only happens in tests; in production abandon only + // arrives after cancel has failed to unstick the script (see the abandon design doc). + if (abandon.IsCancellationRequested) + { + DoOurBestToCleanUp(process, error); + info("Tentacle has abandoned this script. The underlying script process may still be running on this host."); + SafelyCancelOutputAndErrorRead(process, debug); + running = false; + return ScriptExitCodes.AbandonedExitCode; } SafelyCancelOutputAndErrorRead(process, debug); @@ -226,22 +206,22 @@ static int SafelyGetExitCode(Process process) } } + // Completes when the token is cancelled. Lets us race a cancellation against a blocking wait. + static Task WhenCancelled(CancellationToken token) + { + var tcs = new TaskCompletionSource(); + token.Register(() => tcs.TrySetResult(null)); + return tcs.Task; + } + static void SafelyWaitForAllOutput(ManualResetEventSlim outputResetEvent, CancellationToken cancel, Action debug) { - // outputResetEvent.Wait is waiting for the OutputDataReceived/ErrorDataReceived - // handlers to signal EOF on the stream (when it receives a null - // DataReceivedEventArgs.Data, .NET's EOF marker). This does NOT close the pipe, - // it just gives the OS up to 5 seconds to deliver the EOF. - // - // If a re-parented grandchild is holding the pipe open, EOF never arrives, the wait - // times out, and we proceed without the final flush of buffered output. The pipe is - // released later by Process.Dispose() at end of ExecuteCommandAsync via the - // `using (var process = new Process())` block. - // - // 5 seconds is somewhat arbitrary — the process has already exited by the time we - // reach here, so under normal circumstances EOF arrives within milliseconds. + // Wait up to 5s for stream EOF (a null DataReceived marks it). The process has already + // exited, so EOF is normally immediate. If a re-parented grandchild still holds the pipe + // open, EOF never comes; we time out and skip the final flush. Process.Dispose() releases + // the pipe at the end of the method. try { outputResetEvent.Wait(TimeSpan.FromSeconds(5), cancel); @@ -254,11 +234,8 @@ static void SafelyWaitForAllOutput(ManualResetEventSlim outputResetEvent, static void SafelyCancelOutputAndErrorRead(Process process, Action debug) { - // Cancel the output/error readers so a late OutputDataReceived/ErrorDataReceived - // callback doesn't try to write to a workspace log that's already been disposed by - // the using-block above; that write would throw ObjectDisposedException. Called in - // both the normal completion path and the abandon path; extracted here so the two - // callers stay consistent. + // Cancel the readers so a late callback can't write to the workspace log after it's + // disposed (that would throw). Used by both the normal and abandon paths. SafelyCancelRead(process.CancelErrorRead, debug); SafelyCancelRead(process.CancelOutputRead, debug); } @@ -297,11 +274,8 @@ static void DoOurBestToCleanUp(Process process, Action error) { try { - // Close the handles after killing, on this same thread (after Kill). This releases - // the redirected pipes so the synchronous process.WaitForExit() above can return - // even when a re-parented grandchild holds stdout/stderr open. (Close is safe with - // the synchronous wait — main has relied on exactly this; it only races the Exited - // event of the *async* WaitForExitAsync, which this code does not use.) + // Close the handles after the kill, on this thread. Releases the redirected pipes + // so the WaitForExit above returns even when a re-parented grandchild holds them open. process.Close(); } catch (Exception closeException) From e797a676e6ae1164a5cf2e6ccd21e131c28b18cb Mon Sep 17 00:00:00 2001 From: Jim Pelletier Date: Tue, 2 Jun 2026 13:26:40 +1000 Subject: [PATCH 03/15] Apply review: concise AbandonScript summary on ITentacleClient Co-Authored-By: Claude Opus 4.8 (1M context) --- source/Octopus.Tentacle.Client/ITentacleClient.cs | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/source/Octopus.Tentacle.Client/ITentacleClient.cs b/source/Octopus.Tentacle.Client/ITentacleClient.cs index a3c7b9ebe..4475acd3b 100644 --- a/source/Octopus.Tentacle.Client/ITentacleClient.cs +++ b/source/Octopus.Tentacle.Client/ITentacleClient.cs @@ -62,12 +62,8 @@ Task StartScript(ExecuteScriptCommand command, Task CancelScript(CommandContext commandContext, ITentacleClientTaskLog logger); /// - /// Abandon a running script: signal Tentacle to best-effort-kill the process, stop waiting, - /// and release the script's isolation mutex, returning straight away without waiting for the - /// process to exit. This does not delete the workspace. The later CompleteScript call does - /// that, and tolerates cleanup failures for an abandoned script whose process is still alive. - /// Used as an escape hatch when CancelScript can't terminate a stuck process; a prior - /// CancelScript is not required because abandon does its own best-effort kill. + /// Abandon a running script. This attempts cancellation, but if necessary leaves the script + /// running in the OS, no longer watched or managed by Tentacle. /// /// The ticket of the script to abandon /// Used to output user orientated log messages From 06517d6e196462e4fdc0e1e34900ea90ed57dabf Mon Sep 17 00:00:00 2001 From: Jim Pelletier Date: Tue, 2 Jun 2026 14:23:22 +1000 Subject: [PATCH 04/15] Address review: V1/K8s abandon throws + orchestrator is abandon-aware; comment cleanups MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - ScriptServiceVersion gains SupportsAbandon (only V2). ObservingScriptOrchestrator checks it before escalating a stuck cancel to abandon, so it never calls abandon on a version that can't (V1, K8s). - ScriptServiceV1Executor / KubernetesScriptServiceV1Executor AbandonScript now throw NotSupported instead of silently degrading to cancel (per Luke) — a mistaken call is now obvious. - RunningScript threads the abandon token through the PowerShell-monitoring path explicitly, rather than reading the field in one branch only. - SilentProcessRunner: rename WhenCancelled -> WaitForAbandon; move the wait comment over the WhenAny and drop the grandchild detail; trim the abandon/helper comments; apply Jim's SafelyWaitForAllOutput wording. - ScriptServiceV2.AbandonScriptAsync and ITentacleClient.AbandonScript comments shortened per suggestions. Co-Authored-By: Claude Opus 4.8 (1M context) --- ...ObservingScriptOrchestratorAbandonTests.cs | 26 +++++++++++++++++++ .../ITentacleClient.cs | 2 +- .../KubernetesScriptServiceV1Executor.cs | 8 +++--- .../Scripts/ObservingScriptOrchestrator.cs | 5 +++- .../Scripts/ScriptServiceV1Executor.cs | 7 +++-- .../Scripts/ScriptServiceVersion.cs | 6 ++++- .../Services/Scripts/RunningScript.cs | 4 +-- .../Services/Scripts/ScriptServiceV2.cs | 4 +-- .../Util/CommandLine/SilentProcessRunner.cs | 24 +++++------------ 9 files changed, 53 insertions(+), 33 deletions(-) diff --git a/source/Octopus.Tentacle.Client.Tests/ObservingScriptOrchestratorAbandonTests.cs b/source/Octopus.Tentacle.Client.Tests/ObservingScriptOrchestratorAbandonTests.cs index 8348bdd78..f53178cc9 100644 --- a/source/Octopus.Tentacle.Client.Tests/ObservingScriptOrchestratorAbandonTests.cs +++ b/source/Octopus.Tentacle.Client.Tests/ObservingScriptOrchestratorAbandonTests.cs @@ -15,6 +15,7 @@ namespace Octopus.Tentacle.Client.Tests public class ObservingScriptOrchestratorAbandonTests { static CommandContext Context() => new(new ScriptTicket("T"), 0, ScriptServiceVersion.ScriptServiceVersion2); + static CommandContext V1Context() => new(new ScriptTicket("T"), 0, ScriptServiceVersion.ScriptServiceVersion1); static ScriptOperationExecutionResult Running() => new(new ScriptStatus(ProcessState.Running, 0, new List()), Context()); @@ -22,6 +23,12 @@ static ScriptOperationExecutionResult Running() static ScriptOperationExecutionResult Complete(int exitCode) => new(new ScriptStatus(ProcessState.Complete, exitCode, new List()), Context()); + static ScriptOperationExecutionResult RunningV1() + => new(new ScriptStatus(ProcessState.Running, 0, new List()), V1Context()); + + static ScriptOperationExecutionResult CompleteV1(int exitCode) + => new(new ScriptStatus(ProcessState.Complete, exitCode, new List()), V1Context()); + static ObservingScriptOrchestrator CreateOrchestrator(IScriptExecutor executor, TimeSpan? abandonAfter) => new( new ImmediateBackoff(), @@ -70,5 +77,24 @@ public async Task ThresholdCrossed_SwitchesFromCancelToAbandon() await executor.Received().AbandonScript(Arg.Any()); result.ScriptStatus.ExitCode.Should().Be(ScriptExitCodes.AbandonedExitCode); } + + [Test] + public async Task ThresholdCrossed_ButVersionCannotAbandon_KeepsCancelling() + { + // ScriptServiceV1 has no abandon verb, so even with the threshold crossed the orchestrator + // must keep cancelling and never call AbandonScript. + var executor = Substitute.For(); + executor.CancelScript(Arg.Any()) + .Returns(RunningV1(), CompleteV1(ScriptExitCodes.CanceledExitCode)); + + var orchestrator = CreateOrchestrator(executor, abandonAfter: TimeSpan.Zero); + using var cts = new CancellationTokenSource(); + cts.Cancel(); + + await orchestrator.ObserveUntilComplete(RunningV1(), cts.Token); + + await executor.DidNotReceive().AbandonScript(Arg.Any()); + await executor.Received().CancelScript(Arg.Any()); + } } } diff --git a/source/Octopus.Tentacle.Client/ITentacleClient.cs b/source/Octopus.Tentacle.Client/ITentacleClient.cs index 4475acd3b..9f3d28413 100644 --- a/source/Octopus.Tentacle.Client/ITentacleClient.cs +++ b/source/Octopus.Tentacle.Client/ITentacleClient.cs @@ -63,7 +63,7 @@ Task StartScript(ExecuteScriptCommand command, /// /// Abandon a running script. This attempts cancellation, but if necessary leaves the script - /// running in the OS, no longer watched or managed by Tentacle. + /// running in the OS but no longer has Tentacle watching or managing it. /// /// The ticket of the script to abandon /// Used to output user orientated log messages diff --git a/source/Octopus.Tentacle.Client/Scripts/KubernetesScriptServiceV1Executor.cs b/source/Octopus.Tentacle.Client/Scripts/KubernetesScriptServiceV1Executor.cs index 16dfc559b..124de8181 100644 --- a/source/Octopus.Tentacle.Client/Scripts/KubernetesScriptServiceV1Executor.cs +++ b/source/Octopus.Tentacle.Client/Scripts/KubernetesScriptServiceV1Executor.cs @@ -185,10 +185,10 @@ async Task CancelScriptAction(CancellationToke } public Task AbandonScript(CommandContext commandContext) - { - // KubernetesScriptServiceV1 has no abandon verb; degrade to cancel. - return CancelScript(commandContext); - } + // KubernetesScriptServiceV1 has no abandon verb. The orchestrator checks + // ScriptServiceVersion.SupportsAbandon and won't escalate here, so reaching this is a bug — + // throw rather than quietly cancel. Hung pods are recovered by deleting the pod instead. + => throw new NotSupportedException("KubernetesScriptServiceV1 cannot abandon a script; it has no abandon verb. Cancel the script instead."); public async Task CompleteScript(CommandContext lastStatusResponse, CancellationToken scriptExecutionCancellationToken) { diff --git a/source/Octopus.Tentacle.Client/Scripts/ObservingScriptOrchestrator.cs b/source/Octopus.Tentacle.Client/Scripts/ObservingScriptOrchestrator.cs index f71d77084..cd4e8ef0c 100644 --- a/source/Octopus.Tentacle.Client/Scripts/ObservingScriptOrchestrator.cs +++ b/source/Octopus.Tentacle.Client/Scripts/ObservingScriptOrchestrator.cs @@ -95,7 +95,10 @@ internal async Task ObserveUntilComplete( stopwatch.Start(); } - var shouldAbandon = abandonAfterCancellationPendingFor.HasValue + // Only escalate to abandon when the script service can actually abandon. On versions + // that can't (V1, Kubernetes) we keep cancelling rather than calling a verb they don't have. + var shouldAbandon = lastResult.ContextForNextCommand.ScripServiceVersionUsed.SupportsAbandon + && abandonAfterCancellationPendingFor.HasValue && stopwatch.Elapsed >= abandonAfterCancellationPendingFor.Value; lastResult = shouldAbandon diff --git a/source/Octopus.Tentacle.Client/Scripts/ScriptServiceV1Executor.cs b/source/Octopus.Tentacle.Client/Scripts/ScriptServiceV1Executor.cs index 8d0d32c81..75b3c3aad 100644 --- a/source/Octopus.Tentacle.Client/Scripts/ScriptServiceV1Executor.cs +++ b/source/Octopus.Tentacle.Client/Scripts/ScriptServiceV1Executor.cs @@ -165,10 +165,9 @@ public async Task CancelScript(CommandContext co } public Task AbandonScript(CommandContext commandContext) - { - // ScriptServiceV1 has no abandon verb; degrade to cancel. - return CancelScript(commandContext); - } + // ScriptServiceV1 has no abandon verb. The orchestrator checks ScriptServiceVersion.SupportsAbandon + // and won't escalate here, so reaching this is a bug — throw rather than quietly cancel. + => throw new NotSupportedException("ScriptServiceV1 cannot abandon a script; it has no abandon verb. Cancel the script instead."); public async Task CompleteScript(CommandContext lastStatusResponse, CancellationToken scriptExecutionCancellationToken) { diff --git a/source/Octopus.Tentacle.Client/Scripts/ScriptServiceVersion.cs b/source/Octopus.Tentacle.Client/Scripts/ScriptServiceVersion.cs index 972256b9b..6a1c7cfde 100644 --- a/source/Octopus.Tentacle.Client/Scripts/ScriptServiceVersion.cs +++ b/source/Octopus.Tentacle.Client/Scripts/ScriptServiceVersion.cs @@ -1,4 +1,4 @@ -namespace Octopus.Tentacle.Client.Scripts +namespace Octopus.Tentacle.Client.Scripts { public record ScriptServiceVersion(string Value) { @@ -6,6 +6,10 @@ public record ScriptServiceVersion(string Value) public static ScriptServiceVersion ScriptServiceVersion2 = new(nameof(ScriptServiceVersion2)); public static ScriptServiceVersion KubernetesScriptServiceVersion1 = new(nameof(KubernetesScriptServiceVersion1)); + // Only ScriptServiceV2 has the AbandonScript verb. The orchestrator checks this before it + // escalates a stuck cancel to abandon, so we never call abandon where it can't work. + public bool SupportsAbandon => Value == nameof(ScriptServiceVersion2); + public override string ToString() => Value; } } diff --git a/source/Octopus.Tentacle.Core/Services/Scripts/RunningScript.cs b/source/Octopus.Tentacle.Core/Services/Scripts/RunningScript.cs index 7109d0ff1..1a0069050 100644 --- a/source/Octopus.Tentacle.Core/Services/Scripts/RunningScript.cs +++ b/source/Octopus.Tentacle.Core/Services/Scripts/RunningScript.cs @@ -122,7 +122,7 @@ public async Task Execute() RecordScriptHasStarted(writer); exitCode = workspace.ShouldMonitorPowerShellStartup() - ? await RunPowershellScriptWithMonitoring(shellPath, writer, runningScriptToken) + ? await RunPowershellScriptWithMonitoring(shellPath, writer, runningScriptToken, abandonToken) : await RunScriptAsync(shellPath, writer, runningScriptToken, abandonToken); } } @@ -165,7 +165,7 @@ public async Task Execute() } } - async Task RunPowershellScriptWithMonitoring(string shellPath, IScriptLogWriter writer, CancellationToken runningScriptToken) + async Task RunPowershellScriptWithMonitoring(string shellPath, IScriptLogWriter writer, CancellationToken runningScriptToken, CancellationToken abandonToken) { // We want to be able to make some effort to cancel the running script, if the monitor task detects it as hung. // Hence, we make a linked cancellation token with the runningScriptToken diff --git a/source/Octopus.Tentacle.Core/Services/Scripts/ScriptServiceV2.cs b/source/Octopus.Tentacle.Core/Services/Scripts/ScriptServiceV2.cs index 152d4b13c..1909bacfb 100644 --- a/source/Octopus.Tentacle.Core/Services/Scripts/ScriptServiceV2.cs +++ b/source/Octopus.Tentacle.Core/Services/Scripts/ScriptServiceV2.cs @@ -144,9 +144,7 @@ public async Task CancelScriptAsync(CancelScriptCommandV public Task AbandonScriptAsync(AbandonScriptCommandV2 command, CancellationToken cancellationToken) { // Fire the abandon token. The runner's abandon branch best-effort-kills the process, - // stops waiting, releases the mutex, and returns -48. We don't kill from here: firing - // cancel would race the runner's wait. The process only survives if the kill can't land - // (genuinely stuck, or a re-parented grandchild). + // stops waiting, releases the mutex, and returns -48. if (runningScripts.TryGetValue(command.Ticket, out var runningScript)) { runningScript.Abandon(); diff --git a/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs b/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs index 6e86f4fda..5064b26b4 100644 --- a/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs +++ b/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs @@ -145,25 +145,17 @@ void WriteData(Action action, ManualResetEventSlim resetEvent, DataRecei process.BeginOutputReadLine(); process.BeginErrorReadLine(); - // Wait for the process to exit, but let abandon break us out. WaitForExit() also - // drains the redirected streams; cancel kills then Close()s the process (via - // cancel.Register), releasing the pipes so this returns even when a re-parented - // grandchild holds them open. Abandon is the new path: race the wait against the - // abandon token and, if abandon wins, return -48 and leave the process running. var waitForExit = Task.Run(() => { - // Swallows the exception raised when the abandon path disposes the Process - // out from under this blocking wait. try { process.WaitForExit(); } catch { /* released by Process.Dispose on the abandon path */ } }); - await Task.WhenAny(waitForExit, WhenCancelled(abandon)).ConfigureAwait(false); + // Wait for the process to exit, but let abandon break us out. Cancel kills then + // Close()s the process (via cancel.Register) so this returns; abandon races the + // wait and, if it wins, we return -48 and leave the process running. + await Task.WhenAny(waitForExit, WaitForAbandon(abandon)).ConfigureAwait(false); - // Abandon requested: best-effort-kill (anti-abuse), then stop waiting and return -48. - // Don't read HasExited/ExitCode — Close() may have detached the Process. A non-(-48) - // result when both tokens fire only happens in tests; in production abandon only - // arrives after cancel has failed to unstick the script (see the abandon design doc). if (abandon.IsCancellationRequested) { DoOurBestToCleanUp(process, error); @@ -206,8 +198,7 @@ static int SafelyGetExitCode(Process process) } } - // Completes when the token is cancelled. Lets us race a cancellation against a blocking wait. - static Task WhenCancelled(CancellationToken token) + static Task WaitForAbandon(CancellationToken token) { var tcs = new TaskCompletionSource(); token.Register(() => tcs.TrySetResult(null)); @@ -219,9 +210,8 @@ static void SafelyWaitForAllOutput(ManualResetEventSlim outputResetEvent, Action debug) { // Wait up to 5s for stream EOF (a null DataReceived marks it). The process has already - // exited, so EOF is normally immediate. If a re-parented grandchild still holds the pipe - // open, EOF never comes; we time out and skip the final flush. Process.Dispose() releases - // the pipe at the end of the method. + // exited, so EOF is normally immediate. If a re-parented grandchild process is still holding + // the pipe open then EOF will never come. This is why we time out and skip the final flush. try { outputResetEvent.Wait(TimeSpan.FromSeconds(5), cancel); From ffa5b82d57c31f2b2ef6211a42bc3d2afd9d5f18 Mon Sep 17 00:00:00 2001 From: Jim Pelletier Date: Tue, 2 Jun 2026 15:23:09 +1000 Subject: [PATCH 05/15] Test at the right boundary: drop duplicate/wrong-level abandon tests - Capabilities: remove the duplicate AdvertisesAbandonScriptV2 / DoesNotAdvertise fixture tests (CapabilitiesAreReturned / OnlyKubernetes... already pin the full list incl. abandon) and the real-tentacle LatestTentacle_AdvertisesAbandonScriptCapability integration test (static list). - Orchestrator: delete ObservingScriptOrchestratorAbandonTests (the only direct orchestrator test; every other orchestration behaviour is tested at the TentacleClient boundary) and re-privatise ObserveUntilComplete (was only internal for that test). - ClientScriptExecutionAbandon: replace the RecordMethodUsages 'was AbandonScript called' escalation test with ExecuteScript_WhenCancellationStaysPending_EscalatesToAbandon_AndReleasesMutex (asserts the mutex-release outcome), and add ExecuteScript_OnV1Tentacle_DoesNotEscalateToAbandon (proves the V1 gating by outcome: cancel path only, never NotSupportedException). Drop the now-dead ForAbandonScriptAsync helper and unused imports. Co-Authored-By: Claude Opus 4.8 (1M context) --- ...ObservingScriptOrchestratorAbandonTests.cs | 100 ------------------ .../Scripts/ObservingScriptOrchestrator.cs | 2 +- .../RecordMethodUsagesExtensionMethods.cs | 5 - .../CapabilitiesServiceV2Test.cs | 11 -- .../ClientScriptExecutionAbandon.cs | 96 +++++++++++++---- .../CapabilitiesServiceV2Fixture.cs | 20 ---- 6 files changed, 76 insertions(+), 158 deletions(-) delete mode 100644 source/Octopus.Tentacle.Client.Tests/ObservingScriptOrchestratorAbandonTests.cs diff --git a/source/Octopus.Tentacle.Client.Tests/ObservingScriptOrchestratorAbandonTests.cs b/source/Octopus.Tentacle.Client.Tests/ObservingScriptOrchestratorAbandonTests.cs deleted file mode 100644 index f53178cc9..000000000 --- a/source/Octopus.Tentacle.Client.Tests/ObservingScriptOrchestratorAbandonTests.cs +++ /dev/null @@ -1,100 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Threading; -using System.Threading.Tasks; -using FluentAssertions; -using NSubstitute; -using NUnit.Framework; -using Octopus.Tentacle.Client.EventDriven; -using Octopus.Tentacle.Client.Scripts; -using Octopus.Tentacle.Contracts; - -namespace Octopus.Tentacle.Client.Tests -{ - [TestFixture] - public class ObservingScriptOrchestratorAbandonTests - { - static CommandContext Context() => new(new ScriptTicket("T"), 0, ScriptServiceVersion.ScriptServiceVersion2); - static CommandContext V1Context() => new(new ScriptTicket("T"), 0, ScriptServiceVersion.ScriptServiceVersion1); - - static ScriptOperationExecutionResult Running() - => new(new ScriptStatus(ProcessState.Running, 0, new List()), Context()); - - static ScriptOperationExecutionResult Complete(int exitCode) - => new(new ScriptStatus(ProcessState.Complete, exitCode, new List()), Context()); - - static ScriptOperationExecutionResult RunningV1() - => new(new ScriptStatus(ProcessState.Running, 0, new List()), V1Context()); - - static ScriptOperationExecutionResult CompleteV1(int exitCode) - => new(new ScriptStatus(ProcessState.Complete, exitCode, new List()), V1Context()); - - static ObservingScriptOrchestrator CreateOrchestrator(IScriptExecutor executor, TimeSpan? abandonAfter) - => new( - new ImmediateBackoff(), - _ => { }, - _ => Task.CompletedTask, - executor, - abandonAfter); - - sealed class ImmediateBackoff : IScriptObserverBackoffStrategy - { - public TimeSpan GetBackoff(int iteration) => TimeSpan.Zero; - } - - [Test] - public async Task ParamUnset_CancelsOnly_NeverAbandons() - { - var executor = Substitute.For(); - executor.CancelScript(Arg.Any()) - .Returns(Running(), Complete(ScriptExitCodes.CanceledExitCode)); - - var orchestrator = CreateOrchestrator(executor, abandonAfter: null); - using var cts = new CancellationTokenSource(); - cts.Cancel(); - - await orchestrator.ObserveUntilComplete(Running(), cts.Token); - - await executor.DidNotReceive().AbandonScript(Arg.Any()); - await executor.Received().CancelScript(Arg.Any()); - } - - [Test] - public async Task ThresholdCrossed_SwitchesFromCancelToAbandon() - { - var executor = Substitute.For(); - // abandonAfter = Zero means the threshold is crossed on the first cancelled iteration, - // so the orchestrator abandons immediately. AbandonScript returns Complete to end the loop. - executor.AbandonScript(Arg.Any()) - .Returns(Complete(ScriptExitCodes.AbandonedExitCode)); - - var orchestrator = CreateOrchestrator(executor, abandonAfter: TimeSpan.Zero); - using var cts = new CancellationTokenSource(); - cts.Cancel(); - - var result = await orchestrator.ObserveUntilComplete(Running(), cts.Token); - - await executor.Received().AbandonScript(Arg.Any()); - result.ScriptStatus.ExitCode.Should().Be(ScriptExitCodes.AbandonedExitCode); - } - - [Test] - public async Task ThresholdCrossed_ButVersionCannotAbandon_KeepsCancelling() - { - // ScriptServiceV1 has no abandon verb, so even with the threshold crossed the orchestrator - // must keep cancelling and never call AbandonScript. - var executor = Substitute.For(); - executor.CancelScript(Arg.Any()) - .Returns(RunningV1(), CompleteV1(ScriptExitCodes.CanceledExitCode)); - - var orchestrator = CreateOrchestrator(executor, abandonAfter: TimeSpan.Zero); - using var cts = new CancellationTokenSource(); - cts.Cancel(); - - await orchestrator.ObserveUntilComplete(RunningV1(), cts.Token); - - await executor.DidNotReceive().AbandonScript(Arg.Any()); - await executor.Received().CancelScript(Arg.Any()); - } - } -} diff --git a/source/Octopus.Tentacle.Client/Scripts/ObservingScriptOrchestrator.cs b/source/Octopus.Tentacle.Client/Scripts/ObservingScriptOrchestrator.cs index cd4e8ef0c..20f094412 100644 --- a/source/Octopus.Tentacle.Client/Scripts/ObservingScriptOrchestrator.cs +++ b/source/Octopus.Tentacle.Client/Scripts/ObservingScriptOrchestrator.cs @@ -76,7 +76,7 @@ async Task ObserveUntilCompleteThenFinish( return observingUntilCompleteResult.ScriptStatus; } - internal async Task ObserveUntilComplete( + async Task ObserveUntilComplete( ScriptOperationExecutionResult startScriptResult, CancellationToken scriptExecutionCancellationToken) { diff --git a/source/Octopus.Tentacle.Tests.Integration.Common/Builders/Decorators/Proxies/RecordMethodUsagesExtensionMethods.cs b/source/Octopus.Tentacle.Tests.Integration.Common/Builders/Decorators/Proxies/RecordMethodUsagesExtensionMethods.cs index b2a4a943b..643999ffb 100644 --- a/source/Octopus.Tentacle.Tests.Integration.Common/Builders/Decorators/Proxies/RecordMethodUsagesExtensionMethods.cs +++ b/source/Octopus.Tentacle.Tests.Integration.Common/Builders/Decorators/Proxies/RecordMethodUsagesExtensionMethods.cs @@ -24,11 +24,6 @@ public static IRecordedMethodUsage ForCancelScriptAsync(this IRecordedMethodUsag return o.For("CancelScriptAsync"); } - public static IRecordedMethodUsage ForAbandonScriptAsync(this IRecordedMethodUsages o) - { - return o.For("AbandonScriptAsync"); - } - public static IRecordedMethodUsage ForCompleteScriptAsync(this IRecordedMethodUsages o) { return o.For("CompleteScriptAsync"); diff --git a/source/Octopus.Tentacle.Tests.Integration/CapabilitiesServiceV2Test.cs b/source/Octopus.Tentacle.Tests.Integration/CapabilitiesServiceV2Test.cs index f52ef1b59..9dfc78761 100644 --- a/source/Octopus.Tentacle.Tests.Integration/CapabilitiesServiceV2Test.cs +++ b/source/Octopus.Tentacle.Tests.Integration/CapabilitiesServiceV2Test.cs @@ -59,17 +59,6 @@ public async Task CapabilitiesServiceDoesNotReturnKubernetesScriptServiceForNonK capabilities.Should().NotContain(nameof(IKubernetesScriptServiceV1)); } - [Test] - [TentacleConfigurations] - public async Task LatestTentacle_AdvertisesAbandonScriptCapability(TentacleConfigurationTestCase tentacleConfigurationTestCase) - { - await using var clientAndTentacle = await tentacleConfigurationTestCase.CreateLegacyBuilder().Build(CancellationToken); - - var capabilities = (await clientAndTentacle.TentacleClient.CapabilitiesServiceV2.GetCapabilitiesAsync(new(CancellationToken))).SupportedCapabilities; - - capabilities.Should().Contain(nameof(ScriptServiceV2.AbandonScriptAsync)); - } - [Test] [TentacleConfigurations(testCapabilitiesServiceVersions: true)] public async Task CapabilitiesResponseShouldBeCached(TentacleConfigurationTestCase tentacleConfigurationTestCase) diff --git a/source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs b/source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs index ac946b292..15ba89d55 100644 --- a/source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs +++ b/source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs @@ -6,10 +6,7 @@ using NUnit.Framework; using Octopus.Tentacle.Client; using Octopus.Tentacle.Contracts; -using Octopus.Tentacle.Contracts.ClientServices; using Octopus.Tentacle.Core.Util; -using Octopus.Tentacle.Tests.Integration.Common.Builders.Decorators; -using Octopus.Tentacle.Tests.Integration.Common.Builders.Decorators.Proxies; using Octopus.Tentacle.Tests.Integration.Support; using Octopus.Tentacle.Tests.Integration.Util; using Octopus.Tentacle.Tests.Integration.Util.Builders; @@ -182,17 +179,75 @@ await Wait.For(async () => [Test] [TentacleConfigurations(scriptServiceToTest: ScriptServiceVersionToTest.Version2)] - public async Task ExecuteScript_WhenCancellationStaysPending_EscalatesToAbandon(TentacleConfigurationTestCase tentacleConfigurationTestCase) + public async Task ExecuteScript_WhenCancellationStaysPending_EscalatesToAbandon_AndReleasesMutex(TentacleConfigurationTestCase tentacleConfigurationTestCase) { - // Kill is disabled, so the script is genuinely stuck and CancelScript can't end it. - // With abandonAfterCancellationPendingFor set short, the orchestrator must escalate to - // AbandonScript, which returns AbandonedExitCode and releases the script. - IRecordedMethodUsages recordedUsages = new MethodUsages(); + // Kill is disabled, so the script is genuinely stuck and CancelScript can't end it. With + // abandonAfterCancellationPendingFor set short, the orchestrator escalates from cancel to + // abandon, which releases the isolation mutex. We prove that by the outcome: a second + // FullIsolation script with the same mutex name can only run once the mutex is released. + await using var clientTentacle = await tentacleConfigurationTestCase.CreateBuilder() + .WithTentacle(x => x.WithRunTentacleEnvironmentVariable(EnvironmentVariables.TentacleDebugDisableProcessKill_UNSAFE_FOR_PRODUCTION, "1")) + .Build(CancellationToken); + + var startFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "start"); + var releaseFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "release"); + const string sharedMutex = "escalation-test-mutex"; + + var stuckCommand = new TestExecuteShellScriptCommandBuilder() + .SetScriptBody(new ScriptBuilder() + .CreateFile(startFile) + .WaitForFileToExist(releaseFile)) + .WithIsolationLevel(ScriptIsolationLevel.FullIsolation) + .WithIsolationMutexName(sharedMutex) + .Build(); + + var tentacleClient = clientTentacle.TentacleClient; + + using var executionCts = new System.Threading.CancellationTokenSource(); + var stuckExecution = Task.Run(async () => await tentacleClient.ExecuteScript( + stuckCommand, + _ => { }, + _ => Task.CompletedTask, + new SerilogLoggerBuilder().Build().ForContext().ToITentacleTaskLog(), + executionCts.Token, + abandonAfterCancellationPendingFor: TimeSpan.FromSeconds(2))); + + await Wait.For(() => File.Exists(startFile), + TimeSpan.FromSeconds(30), + () => throw new Exception("Script did not start"), + CancellationToken); + + // Cancel stays pending (kill disabled), so after the threshold the orchestrator escalates to abandon. + executionCts.Cancel(); + await FluentActions.Invoking(async () => await stuckExecution).Should().ThrowAsync(); + + // Outcome: a second FullIsolation script with the same mutex now runs — only possible if abandon released it. + var secondStartFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "second-start"); + var secondCommand = new TestExecuteShellScriptCommandBuilder() + .SetScriptBody(new ScriptBuilder().CreateFile(secondStartFile)) + .WithIsolationLevel(ScriptIsolationLevel.FullIsolation) + .WithIsolationMutexName(sharedMutex) + .Build(); + + var (secondResult, _) = await tentacleClient.ExecuteScript(secondCommand, CancellationToken); + secondResult.ExitCode.Should().Be(0); + File.Exists(secondStartFile).Should().BeTrue("escalation to abandon should have released the mutex"); + + // Cleanup: release the still-alive first script. + File.WriteAllText(releaseFile, ""); + } + + [Test] + [TentacleConfigurations(scriptServiceToTest: ScriptServiceVersionToTest.Version1)] + public async Task ExecuteScript_OnV1Tentacle_DoesNotEscalateToAbandon(TentacleConfigurationTestCase tentacleConfigurationTestCase) + { + // ScriptServiceV1 has no abandon verb, so the orchestrator must keep cancelling and never + // escalate. If it wrongly escalated it would call the V1 executor's AbandonScript, which + // throws NotSupportedException. We prove the gating by the outcome: even with + // abandonAfterCancellationPendingFor set, the run ends in OperationCanceledException (the + // cancel path) — never NotSupportedException. await using var clientTentacle = await tentacleConfigurationTestCase.CreateBuilder() .WithTentacle(x => x.WithRunTentacleEnvironmentVariable(EnvironmentVariables.TentacleDebugDisableProcessKill_UNSAFE_FOR_PRODUCTION, "1")) - .WithTentacleServiceDecorator(new TentacleServiceDecoratorBuilder() - .RecordMethodUsages(out recordedUsages) - .Build()) .Build(CancellationToken); var startFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "start"); @@ -208,15 +263,13 @@ public async Task ExecuteScript_WhenCancellationStaysPending_EscalatesToAbandon( var tentacleClient = clientTentacle.TentacleClient; using var executionCts = new System.Threading.CancellationTokenSource(); - var logs = new System.Collections.Generic.List(); - var execution = Task.Run(async () => await tentacleClient.ExecuteScript( command, - onScriptStatusResponseReceived => logs.AddRange(onScriptStatusResponseReceived.Logs), + _ => { }, _ => Task.CompletedTask, new SerilogLoggerBuilder().Build().ForContext().ToITentacleTaskLog(), executionCts.Token, - abandonAfterCancellationPendingFor: TimeSpan.FromSeconds(2))); + abandonAfterCancellationPendingFor: TimeSpan.FromSeconds(1))); await Wait.For(() => File.Exists(startFile), TimeSpan.FromSeconds(30), @@ -225,14 +278,15 @@ await Wait.For(() => File.Exists(startFile), executionCts.Cancel(); - // ExecuteScript throws OperationCanceledException once the token is cancelled. - await FluentActions.Invoking(async () => await execution).Should().ThrowAsync(); - - // The orchestrator must have escalated to AbandonScript after the 2s threshold. - recordedUsages.ForAbandonScriptAsync().Started.Should().BeGreaterThan(0); + // Well past the 1s threshold: if the V1 gating were broken the orchestrator would have + // escalated to AbandonScript (which throws NotSupportedException) by now. + await Task.Delay(TimeSpan.FromSeconds(3)); - // Cleanup: release + reap the still-alive process (kill was disabled). + // Release so the stuck script can finish and the run can unwind. File.WriteAllText(releaseFile, ""); + + // Cancel path only — never NotSupportedException. + await FluentActions.Invoking(async () => await execution).Should().ThrowAsync(); } } } diff --git a/source/Octopus.Tentacle.Tests/Capabilities/CapabilitiesServiceV2Fixture.cs b/source/Octopus.Tentacle.Tests/Capabilities/CapabilitiesServiceV2Fixture.cs index e219353cd..3f6b03be6 100644 --- a/source/Octopus.Tentacle.Tests/Capabilities/CapabilitiesServiceV2Fixture.cs +++ b/source/Octopus.Tentacle.Tests/Capabilities/CapabilitiesServiceV2Fixture.cs @@ -41,25 +41,5 @@ public async Task OnlyKubernetesScriptServicesAreReturnedWhenRunningAsKubernetes Environment.SetEnvironmentVariable(KubernetesConfig.NamespaceVariableName, null); } - - [Test] - public async Task GetCapabilities_OnNonKubernetesTentacle_AdvertisesAbandonScriptV2() - { - var service = new CapabilitiesServiceV2(); - var response = await service.GetCapabilitiesAsync(CancellationToken.None); - response.SupportedCapabilities.Should().Contain(nameof(ScriptServiceV2.AbandonScriptAsync)); - } - - [Test] - public async Task GetCapabilities_OnKubernetesTentacle_DoesNotAdvertiseAbandonScriptV2() - { - Environment.SetEnvironmentVariable(KubernetesConfig.NamespaceVariableName, "ABC"); - - var service = new CapabilitiesServiceV2(); - var response = await service.GetCapabilitiesAsync(CancellationToken.None); - response.SupportedCapabilities.Should().NotContain(nameof(ScriptServiceV2.AbandonScriptAsync)); - - Environment.SetEnvironmentVariable(KubernetesConfig.NamespaceVariableName, null); - } } } \ No newline at end of file From d03dcfbf96c82734dd72ff7682a8d474b637d08e Mon Sep 17 00:00:00 2001 From: Jim Pelletier Date: Tue, 2 Jun 2026 17:00:27 +1000 Subject: [PATCH 06/15] Add AbandonScriptAsync to IAsyncScriptServiceV2 (unblocks server tests) + review feedback - IAsyncScriptServiceV2 (server-side async interface) now declares AbandonScriptAsync, matching the sync IScriptServiceV2. ScriptServiceV2 already implemented it; without the interface declaration Halibut's sync/async convention verifier rejected the registration, blocking the server repo's integration tests. (Requested on EFT-3295.) - Delete ScriptServiceV2ExecutorAbandonTests: after the capability-check removal it's a pure passthrough with no logic to test (unlike ScriptServiceV1ExecutorTests which tests real logic); covered by the service + end-to-end tests. - IScriptExecutor.AbandonScript doc: apply review wording. - ScriptServiceVersion.SupportsAbandon: keep version-based (consistent with how the codebase collapses capabilities to a version); add a comment noting it assumes V2 advertises abandon and the server gates older tentacles. - RunningScript abandon catch: comment now explains the two phases (runner returns -48 during the wait; this catch maps an abandon-driven OCE before/around mutex acquire to the same code). - SilentProcessRunnerFixture: revert the grandchild-test commentary + assertion messages to main's (their behaviour didn't change; the rewrite was gratuitous). Kept the requested WaitForPidFileAsync rename. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../ScriptServiceV2ExecutorAbandonTests.cs | 54 ----------- .../Scripts/IScriptExecutor.cs | 4 +- .../Scripts/ScriptServiceVersion.cs | 6 +- .../Services/Scripts/IAsyncScriptServiceV2.cs | 1 + .../Services/Scripts/RunningScript.cs | 6 +- .../Util/SilentProcessRunnerFixture.cs | 93 ++++++++----------- 6 files changed, 49 insertions(+), 115 deletions(-) delete mode 100644 source/Octopus.Tentacle.Client.Tests/ScriptServiceV2ExecutorAbandonTests.cs diff --git a/source/Octopus.Tentacle.Client.Tests/ScriptServiceV2ExecutorAbandonTests.cs b/source/Octopus.Tentacle.Client.Tests/ScriptServiceV2ExecutorAbandonTests.cs deleted file mode 100644 index 9e3a2f888..000000000 --- a/source/Octopus.Tentacle.Client.Tests/ScriptServiceV2ExecutorAbandonTests.cs +++ /dev/null @@ -1,54 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Threading.Tasks; -using FluentAssertions; -using Halibut.ServiceModel; -using NSubstitute; -using NUnit.Framework; -using Octopus.Tentacle.Client; -using Octopus.Tentacle.Client.EventDriven; -using Octopus.Tentacle.Client.Execution; -using Octopus.Tentacle.Client.Observability; -using Octopus.Tentacle.Client.Retries; -using Octopus.Tentacle.Client.Scripts; -using Octopus.Tentacle.Contracts; -using Octopus.Tentacle.Contracts.ClientServices; -using Octopus.Tentacle.Contracts.Logging; -using Octopus.Tentacle.Contracts.Observability; -using Octopus.Tentacle.Contracts.ScriptServiceV2; - -namespace Octopus.Tentacle.Client.Tests -{ - [TestFixture] - public class ScriptServiceV2ExecutorAbandonTests - { - static ScriptServiceV2Executor CreateExecutor(IAsyncClientScriptServiceV2 scriptService) - => new( - scriptService, - RpcCallExecutorFactory.Create(TimeSpan.Zero, Substitute.For()), - ClientOperationMetricsBuilder.Start(), - TimeSpan.Zero, - new TentacleClientOptions(new RpcRetrySettings(RetriesEnabled: false, RetryDuration: TimeSpan.Zero)), - Substitute.For()); - - static CommandContext Context() => new(new ScriptTicket("TestTicket"), 0, ScriptServiceVersion.ScriptServiceVersion2); - - [Test] - public async Task AbandonScript_CallsAbandonScriptAsync() - { - var scriptService = Substitute.For(); - scriptService.AbandonScriptAsync(Arg.Any(), Arg.Any()) - .Returns(x => Task.FromResult(new ScriptStatusResponseV2( - x.Arg().Ticket, ProcessState.Complete, - ScriptExitCodes.AbandonedExitCode, new List(), 1))); - - var executor = CreateExecutor(scriptService); - - var result = await executor.AbandonScript(Context()); - - await scriptService.Received(1).AbandonScriptAsync(Arg.Any(), Arg.Any()); - await scriptService.DidNotReceive().CancelScriptAsync(Arg.Any(), Arg.Any()); - result.ScriptStatus.ExitCode.Should().Be(ScriptExitCodes.AbandonedExitCode); - } - } -} diff --git a/source/Octopus.Tentacle.Client/Scripts/IScriptExecutor.cs b/source/Octopus.Tentacle.Client/Scripts/IScriptExecutor.cs index 3d9acc709..752b5bbbc 100644 --- a/source/Octopus.Tentacle.Client/Scripts/IScriptExecutor.cs +++ b/source/Octopus.Tentacle.Client/Scripts/IScriptExecutor.cs @@ -33,8 +33,8 @@ Task StartScript(ExecuteScriptCommand command, Task CancelScript(CommandContext commandContext); /// - /// Abandon the script: signal Tentacle to stop waiting and release the isolation mutex, - /// returning the abandoned status. + /// Abandon the script. Signals Tentacle to stop waiting for the script to cancel and make the tentacle + /// available to run more scripts with the same isolation mutex. /// /// The CommandContext from the previous command Task AbandonScript(CommandContext commandContext); diff --git a/source/Octopus.Tentacle.Client/Scripts/ScriptServiceVersion.cs b/source/Octopus.Tentacle.Client/Scripts/ScriptServiceVersion.cs index 6a1c7cfde..e32723a0b 100644 --- a/source/Octopus.Tentacle.Client/Scripts/ScriptServiceVersion.cs +++ b/source/Octopus.Tentacle.Client/Scripts/ScriptServiceVersion.cs @@ -6,8 +6,10 @@ public record ScriptServiceVersion(string Value) public static ScriptServiceVersion ScriptServiceVersion2 = new(nameof(ScriptServiceVersion2)); public static ScriptServiceVersion KubernetesScriptServiceVersion1 = new(nameof(KubernetesScriptServiceVersion1)); - // Only ScriptServiceV2 has the AbandonScript verb. The orchestrator checks this before it - // escalates a stuck cancel to abandon, so we never call abandon where it can't work. + // Only ScriptServiceV2 has the AbandonScript verb, so the orchestrator checks this before it + // escalates a stuck cancel to abandon and never calls abandon where it can't work. This assumes + // a V2 tentacle advertises AbandonScript (true from this build forward); the server doesn't enable + // abandon against older tentacles, so we don't re-check the capability per call here. public bool SupportsAbandon => Value == nameof(ScriptServiceVersion2); public override string ToString() => Value; diff --git a/source/Octopus.Tentacle.Contracts/Services/Scripts/IAsyncScriptServiceV2.cs b/source/Octopus.Tentacle.Contracts/Services/Scripts/IAsyncScriptServiceV2.cs index c1c9d3118..10b24a752 100644 --- a/source/Octopus.Tentacle.Contracts/Services/Scripts/IAsyncScriptServiceV2.cs +++ b/source/Octopus.Tentacle.Contracts/Services/Scripts/IAsyncScriptServiceV2.cs @@ -12,6 +12,7 @@ public interface IAsyncScriptServiceV2 Task StartScriptAsync(StartScriptCommandV2 command, CancellationToken cancellationToken); Task GetStatusAsync(ScriptStatusRequestV2 request, CancellationToken cancellationToken); Task CancelScriptAsync(CancelScriptCommandV2 command, CancellationToken cancellationToken); + Task AbandonScriptAsync(AbandonScriptCommandV2 command, CancellationToken cancellationToken); Task CompleteScriptAsync(CompleteScriptCommandV2 command, CancellationToken cancellationToken); } } \ No newline at end of file diff --git a/source/Octopus.Tentacle.Core/Services/Scripts/RunningScript.cs b/source/Octopus.Tentacle.Core/Services/Scripts/RunningScript.cs index 1a0069050..bd9d51b7b 100644 --- a/source/Octopus.Tentacle.Core/Services/Scripts/RunningScript.cs +++ b/source/Octopus.Tentacle.Core/Services/Scripts/RunningScript.cs @@ -126,7 +126,11 @@ public async Task Execute() : await RunScriptAsync(shellPath, writer, runningScriptToken, abandonToken); } } - // Fires when the caller abandoned the script: leave the OS process running and signal the distinct AbandonedExitCode so the server can tell it apart from a cancel. + // Abandon surfaced as a cancellation before/around the process wait — e.g. while still + // acquiring the isolation mutex. SilentProcessRunner handles abandon during the wait itself + // and returns AbandonedExitCode directly; this catch covers abandon arriving as an + // OperationCanceledException instead, mapping it to the same code so the server can tell it + // apart from a cancel. catch (OperationCanceledException) when (abandonToken.IsCancellationRequested) { writer.WriteOutput(ProcessOutputSource.StdOut, "Script execution abandoned."); diff --git a/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs b/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs index c048b70ed..9083a61c1 100644 --- a/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs +++ b/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs @@ -131,37 +131,29 @@ public async Task CancellationToken_WhenGrandchildHoldsRedirectedPipes_ShouldNot using var tempDir = new TemporaryDirectory(); var grandchildPidFile = Path.Combine(tempDir.DirectoryPath, "grandchild.pid"); - // This test guards the cancel path of ExecuteCommandAsync against a regression - // involving re-parented grandchildren that inherit our redirected pipes. + // This test reproduces the cancel-time hang. + // The issue was SilentProcessRunner will wait for stdout/stderr pipes to be closed. + // The pipes can be inherited by grandchildren, and remain open even after the + // child process has died. // - // The scenario: - // * We launch a process that spawns a child, which spawns a long-running grandchild - // and then immediately exits. - // * Because the child exited BEFORE our cancel fires, the grandchild has been - // re-parented (PPID broken). Process.Kill(entireProcessTree:true) follows PPID - // links, so it does NOT find the grandchild — Kill returns having killed nothing - // beyond the (already-dead) child. - // * The grandchild inherited our redirected stdout/stderr pipes and holds them - // open. The stream readers therefore never see EOF. + // Normally Process.Kill(entireProcessTree:true) would kill the entire process tree. + // However if the child process dies THEN we issue the kill command, we do NOT see any + // process under the child and so the Kill() command completes. This leaves the grandchild + // running, holding our pipes we are waiting on. // - // How cancel handles this: cancel.Register kills the (already-dead) child and then - // calls process.Close(), which releases our pipe handles. That lets the synchronous - // process.WaitForExit() return even though the orphaned grandchild still holds the - // streams open. (Close is safe here because we wait synchronously; it would race the - // Exited event of the async WaitForExitAsync, which this code deliberately does not use.) + // The test stacks three processes: PowerShell (the child) launches cmd.exe (a + // throwaway middle layer) which does `start /b ping` to background ping (the + // grandchild) and then exits. cmd exiting before we cancel is what breaks the + // PPID chain — without that, ping would still be a direct child of PowerShell + // and Kill(true) would find it. // - // This test asserts cancel alone returns in well under the test timeout in the - // grandchild scenario. If it hangs, process.Close() has been removed from - // DoOurBestToCleanUp (see that method) or the cancel-kill path has otherwise broken. - // - // Two non-obvious bits in the PowerShell script below, both load-bearing: + // Two non-obvious bits below, both load-bearing: // * $psi.RedirectStandardInput = $true — we don't use stdin, but redirecting // any stream is what flips bInheritHandles=true in .NET's Process.Start. That // is what makes cmd (and by extension ping) inherit our pipe write-ends. // Without this the grandchild doesn't hold our pipes and there is no bug to // reproduce. - // * The WMI lookup — we need the grandchild's PID so the test can clean it up - // afterwards (otherwise we'd leak a long-running ping on the CI host). + // * The WMI lookup — we need the grandchild's PID so the test can clean it up. var psScript = @" $pidFile = 'PIDFILE_PLACEHOLDER' $pingPath = Join-Path $env:WINDIR 'System32\PING.EXE' @@ -210,16 +202,13 @@ public async Task CancellationToken_WhenGrandchildHoldsRedirectedPipes_ShouldNot var sw = Stopwatch.StartNew(); cts.Cancel(); - var completed = task.Wait(TimeSpan.FromSeconds(60)); + var completed = task.Wait(TimeSpan.FromSeconds(30)); sw.Stop(); completed.Should().BeTrue( - $"ExecuteCommandAsync should return promptly after cancellation even when a " + - $"grandchild holds the redirected pipes. Worst case is ~10s (5s timeout × 2 streams " + - $"in SafelyWaitForAllOutput). If we hit the 60s test timeout, either someone " + - $"re-introduced process.Close() in DoOurBestToCleanUp (which races with the Exited " + - $"event WaitForExitAsync depends on) or SafelyWaitForAllOutput's per-stream timeout " + - $"has been removed. Elapsed since cancel: {sw.Elapsed.TotalSeconds:F1}s"); + $"ExecuteCommand should return shortly after cancellation even when a grandchild " + + $"holds the redirected pipes. Without proactively closing the redirected streams " + + $"after Kill, Process.WaitForExit() blocks indefinitely. Elapsed since cancel: {sw.Elapsed.TotalSeconds:F1}s"); } } finally @@ -234,20 +223,19 @@ public async Task CancellationToken_WhenUnixGrandchildHoldsRedirectedPipes_Shoul if (PlatformDetection.IsRunningOnWindows) Assert.Ignore("Unix-only repro (Mac/Linux). The Windows equivalent is covered by the [WindowsTest] above."); - // Unix equivalent of CancellationToken_WhenGrandchildHoldsRedirectedPipes_ShouldNotHang - // above. See that test's leading comment for the full rationale — the short version is: + // This test reproduces the cancel-time hang. + // The issue is SilentProcessRunner will wait for stdout/stderr pipes to be closed. + // The pipes can be inherited by grandchildren, and remain open even after the + // child process has died. // - // * We start sh, which backgrounds a long sleep (the grandchild) and exits - // immediately. The grandchild gets re-parented to init/launchd and inherits our - // redirected stdout/stderr pipes, holding them open. - // * Process.Kill(entireProcessTree:true) follows PPID links, so by the time we - // cancel, the now-orphan grandchild is invisible to Kill — it keeps running. - // * On cancel, cancel.Register kills the (already-dead) child and calls - // process.Close(), which releases our pipe handles so the synchronous - // process.WaitForExit() returns despite the orphaned grandchild holding the - // streams open. (Close is safe with the synchronous wait — see DoOurBestToCleanUp.) + // Normally Process.Kill(entireProcessTree:true) would kill the entire process tree. + // However if the child process dies THEN we issue the kill command, we do NOT see any + // process under the child and so the Kill() command completes. This leaves the grandchild + // running, holding our pipes we are waiting on. // - // This test asserts cancel alone returns in well under the test timeout in the grandchild scenario. + // The test is simple we run "sh", the child process, and tell it to yeet + // sleep, the grandchild, into the background. The grandchild now keeps + // running holding on to our pipes we will wait for an EOF on. using var tempDir = new TemporaryDirectory(); var grandchildPidFile = Path.Combine(tempDir.DirectoryPath, "grandchild.pid"); @@ -266,27 +254,20 @@ public async Task CancellationToken_WhenUnixGrandchildHoldsRedirectedPipes_Shoul out _, cts.Token)); - await WaitForPidFileAsync(grandchildPidFile, TimeSpan.FromSeconds(60)); + await WaitForPidFileAsync(grandchildPidFile, TimeSpan.FromSeconds(30)); var sw = Stopwatch.StartNew(); cts.Cancel(); - // Cancel should return within ~10s worst case (5s SafelyWaitForAllOutput timeout - // per stream). The 60s test bound is generous for slow Linux CI agents while - // still flagging a real regression — if process.Close() got re-added to - // DoOurBestToCleanUp (see that method for why) or SafelyWaitForAllOutput's - // per-stream timeout was removed, the wait hangs indefinitely and we trip the - // bound by miles. - var completed = task.Wait(TimeSpan.FromSeconds(60)); + // Cancel should be super quick, if it takes a long time, then we have an issue where we are waiting for the grandchild. + var completed = task.Wait(TimeSpan.FromSeconds(30)); sw.Stop(); completed.Should().BeTrue( - $"ExecuteCommandAsync should return promptly after cancellation even when a Unix " + - $"grandchild (reparented to init/launchd) holds the redirected pipes. Worst case " + - $"is ~10s. If we hit the 60s test timeout, either process.Close() was re-introduced " + - $"in DoOurBestToCleanUp (which races with the Exited event WaitForExitAsync depends " + - $"on) or SafelyWaitForAllOutput's per-stream timeout has been removed. Elapsed " + - $"since cancel: {sw.Elapsed.TotalSeconds:F1}s"); + $"ExecuteCommand should return shortly after cancellation even when a Unix " + + $"grandchild (reparented to init/launchd) holds the redirected pipes. " + + $"Without proactively closing the redirected streams after Kill, " + + $"Process.WaitForExit() blocks indefinitely. Elapsed since cancel: {sw.Elapsed.TotalSeconds:F1}s"); } } finally From 65a6db58500f4872ec7e1548979df40e4a1c3563 Mon Sep 17 00:00:00 2001 From: Jim Pelletier Date: Wed, 3 Jun 2026 10:21:49 +1000 Subject: [PATCH 07/15] Fix race in abandon integration tests: assert ExecuteScript's result, not a GetStatus poll CI surfaced AbandonScript_WhenCancelFailsToKillProcess(Polling) returning -45 (UnknownScriptExitCode) instead of -48. Root cause was a test race, not the product: the test ran ExecuteScript AND separately polled GetStatus for the terminal code. When abandon fires, ExecuteScript's own orchestrator observes Complete(-48) and then calls CompleteScript, which removes the running script and deletes the workspace. On slower transports (Polling) the separate GetStatus poll landed after that cleanup and saw UnknownScriptExitCode. Fixed both direct-abandon tests to assert on ExecuteScript's returned ScriptExecutionResult (the authoritative terminal code, read by the orchestrator before it cleans up), which is race-free. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../ClientScriptExecutionAbandon.cs | 43 ++++++------------- 1 file changed, 13 insertions(+), 30 deletions(-) diff --git a/source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs b/source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs index 15ba89d55..8c51a318a 100644 --- a/source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs +++ b/source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs @@ -51,24 +51,17 @@ await Wait.For(() => File.Exists(startFile), await tentacleClient.CancelScript(firstCommand.ScriptTicket); await Task.Delay(TimeSpan.FromSeconds(1)); - // Abandon: fires the abandon token. The RPC returns the current status snapshot - // immediately, so we poll GetStatus until the script reaches Complete state. + // Abandon: fires the abandon token. ExecuteScript observes the abandoned terminal state and + // returns it. We assert on its result rather than a separate GetStatus poll — the poll would + // race the orchestrator's own CompleteScript cleanup (which removes the script and deletes the + // workspace), and on slower transports lands after it, seeing UnknownScriptExitCode. await tentacleClient.AbandonScript(firstCommand.ScriptTicket, CancellationToken); - ScriptStatus abandonResponse = null!; - await Wait.For(async () => - { - abandonResponse = await tentacleClient.GetStatus(firstCommand.ScriptTicket, CancellationToken); - return abandonResponse.State == ProcessState.Complete; - }, - TimeSpan.FromSeconds(30), - () => throw new Exception("Abandoned script did not reach Complete state within 30s"), - CancellationToken); - abandonResponse.ExitCode.Should().Be(ScriptExitCodes.AbandonedExitCode); + var (result, _) = await scriptExecution; + result.ExitCode.Should().Be(ScriptExitCodes.AbandonedExitCode); - // Release the script process so it exits cleanly and stops leaking. + // The process is still alive (kill was disabled); release it so it stops leaking. File.WriteAllText(releaseFile, ""); - await scriptExecution; } [Test] @@ -157,24 +150,14 @@ await Wait.For(() => File.Exists(startFile), () => throw new Exception("Script did not start"), CancellationToken); - // Direct abandon, NO prior cancel. + // Direct abandon, NO prior cancel. Assert on ExecuteScript's own result rather than a separate + // GetStatus poll, which would race the orchestrator's CompleteScript cleanup (see + // AbandonScript_WhenCancelFailsToKillProcess for the same reasoning). await tentacleClient.AbandonScript(command.ScriptTicket, CancellationToken); - ScriptStatus abandonResponse = null!; - await Wait.For(async () => - { - abandonResponse = await tentacleClient.GetStatus(command.ScriptTicket, CancellationToken); - return abandonResponse.State == ProcessState.Complete; - }, - TimeSpan.FromSeconds(30), - () => throw new Exception("Abandoned script did not reach Complete state within 30s"), - CancellationToken); - - abandonResponse.ExitCode.Should().Be(ScriptExitCodes.AbandonedExitCode); - - // We never wrote releaseFile, so the script only completes because the abandon branch - // killed it. Draining the execution confirms the process is gone. - await scriptExecution; + // We never wrote releaseFile, so the script only completes because the abandon branch killed it. + var (result, _) = await scriptExecution; + result.ExitCode.Should().Be(ScriptExitCodes.AbandonedExitCode); } [Test] From 96babddcdf493f867b546d5b9fe3a5116d6367e8 Mon Sep 17 00:00:00 2001 From: Jim Pelletier Date: Thu, 4 Jun 2026 13:03:23 +1000 Subject: [PATCH 08/15] Reword abandon wait/cleanup comments in SilentProcessRunner Co-Authored-By: Claude Opus 4.8 (1M context) --- .../Util/CommandLine/SilentProcessRunner.cs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs b/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs index 5064b26b4..e093ab4c6 100644 --- a/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs +++ b/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs @@ -148,12 +148,14 @@ void WriteData(Action action, ManualResetEventSlim resetEvent, DataRecei var waitForExit = Task.Run(() => { try { process.WaitForExit(); } - catch { /* released by Process.Dispose on the abandon path */ } + catch { /* swalloe exceptions thrown when released by Process.Dispose in DoOurBestToCleanUp */ } }); - // Wait for the process to exit, but let abandon break us out. Cancel kills then - // Close()s the process (via cancel.Register) so this returns; abandon races the - // wait and, if it wins, we return -48 and leave the process running. + // Wait for the process to exit, but break if abandon cancallation token fires. + // Cancel kills then Close()s the process (via cancel.Register) so this returns. + // If the script is not actually stuck when abandon fires, abandoning will be in + // a race with the script to complete, if it wins, we return -48 and leave + // the process running. await Task.WhenAny(waitForExit, WaitForAbandon(abandon)).ConfigureAwait(false); if (abandon.IsCancellationRequested) From 23059e4e91505f6535cb583dac5f7732b08eb411 Mon Sep 17 00:00:00 2001 From: Jim Pelletier Date: Thu, 4 Jun 2026 15:44:15 +1000 Subject: [PATCH 09/15] Address PR review feedback on abandon - Delete the dead abandon-filtered catch in RunningScript; a script cancelled while still acquiring the isolation mutex reports CanceledExitCode. Document that at both the script-service (ScriptServiceV2Fixture) and client (ClientScriptExecutionAbandon) levels, with a comment marking the re-add point. - Re-add the capabilities list-length guard as an exact-set assertion, abandon-aware via Version.HasAbandonScript, in CapabilitiesServiceV2Test. - Extract EnsureProcessKilled test helper that asserts the kill so a leaked process surfaces loudly on CI; fold in TryKillGrandchild; drop the redundant AbandonToken_WhenCancelAlsoRequested runner test. - Comment cleanups: name the abandon-token consumption site in ScriptServiceV2, remove the SupportsAbandon comment, revert SafelyWaitForAllOutput to main's. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../Scripts/ScriptServiceVersion.cs | 4 - .../Services/Scripts/RunningScript.cs | 10 --- .../Services/Scripts/ScriptServiceV2.cs | 6 +- .../Util/CommandLine/SilentProcessRunner.cs | 4 +- .../CapabilitiesServiceV2Test.cs | 26 +++--- .../ClientScriptExecutionAbandon.cs | 66 ++++++++++++++ .../Support/TentacleVersions.cs | 6 ++ .../Util/SilentProcessRunnerFixture.cs | 86 ++++--------------- .../Integration/ScriptServiceV2Fixture.cs | 73 ++++++++++++++++ 9 files changed, 182 insertions(+), 99 deletions(-) diff --git a/source/Octopus.Tentacle.Client/Scripts/ScriptServiceVersion.cs b/source/Octopus.Tentacle.Client/Scripts/ScriptServiceVersion.cs index e32723a0b..ebfba57aa 100644 --- a/source/Octopus.Tentacle.Client/Scripts/ScriptServiceVersion.cs +++ b/source/Octopus.Tentacle.Client/Scripts/ScriptServiceVersion.cs @@ -6,10 +6,6 @@ public record ScriptServiceVersion(string Value) public static ScriptServiceVersion ScriptServiceVersion2 = new(nameof(ScriptServiceVersion2)); public static ScriptServiceVersion KubernetesScriptServiceVersion1 = new(nameof(KubernetesScriptServiceVersion1)); - // Only ScriptServiceV2 has the AbandonScript verb, so the orchestrator checks this before it - // escalates a stuck cancel to abandon and never calls abandon where it can't work. This assumes - // a V2 tentacle advertises AbandonScript (true from this build forward); the server doesn't enable - // abandon against older tentacles, so we don't re-check the capability per call here. public bool SupportsAbandon => Value == nameof(ScriptServiceVersion2); public override string ToString() => Value; diff --git a/source/Octopus.Tentacle.Core/Services/Scripts/RunningScript.cs b/source/Octopus.Tentacle.Core/Services/Scripts/RunningScript.cs index bd9d51b7b..ddfc7544a 100644 --- a/source/Octopus.Tentacle.Core/Services/Scripts/RunningScript.cs +++ b/source/Octopus.Tentacle.Core/Services/Scripts/RunningScript.cs @@ -126,16 +126,6 @@ public async Task Execute() : await RunScriptAsync(shellPath, writer, runningScriptToken, abandonToken); } } - // Abandon surfaced as a cancellation before/around the process wait — e.g. while still - // acquiring the isolation mutex. SilentProcessRunner handles abandon during the wait itself - // and returns AbandonedExitCode directly; this catch covers abandon arriving as an - // OperationCanceledException instead, mapping it to the same code so the server can tell it - // apart from a cancel. - catch (OperationCanceledException) when (abandonToken.IsCancellationRequested) - { - writer.WriteOutput(ProcessOutputSource.StdOut, "Script execution abandoned."); - exitCode = ScriptExitCodes.AbandonedExitCode; - } // Fires when the caller cancelled the script and the underlying process honored the cancellation token. catch (OperationCanceledException) { diff --git a/source/Octopus.Tentacle.Core/Services/Scripts/ScriptServiceV2.cs b/source/Octopus.Tentacle.Core/Services/Scripts/ScriptServiceV2.cs index 1909bacfb..791750c8d 100644 --- a/source/Octopus.Tentacle.Core/Services/Scripts/ScriptServiceV2.cs +++ b/source/Octopus.Tentacle.Core/Services/Scripts/ScriptServiceV2.cs @@ -143,8 +143,10 @@ public async Task CancelScriptAsync(CancelScriptCommandV public Task AbandonScriptAsync(AbandonScriptCommandV2 command, CancellationToken cancellationToken) { - // Fire the abandon token. The runner's abandon branch best-effort-kills the process, - // stops waiting, releases the mutex, and returns -48. + // runningScript.Abandon() cancels AbandonToken. That token is awaited only in + // SilentProcessRunner.ExecuteCommandAsync, via Task.WhenAny(waitForExit, WaitForAbandon(abandon)): + // cancelling it resolves that wait, so the runner best-effort-kills the process, returns -48, and + // RunningScript's isolation-mutex `using` unwinds to release the mutex. if (runningScripts.TryGetValue(command.Ticket, out var runningScript)) { runningScript.Abandon(); diff --git a/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs b/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs index e093ab4c6..cb75b4add 100644 --- a/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs +++ b/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs @@ -211,9 +211,7 @@ static void SafelyWaitForAllOutput(ManualResetEventSlim outputResetEvent, CancellationToken cancel, Action debug) { - // Wait up to 5s for stream EOF (a null DataReceived marks it). The process has already - // exited, so EOF is normally immediate. If a re-parented grandchild process is still holding - // the pipe open then EOF will never come. This is why we time out and skip the final flush. + //5 seconds is a bit arbitrary, but the process should have already exited by now, so unwise to wait too long try { outputResetEvent.Wait(TimeSpan.FromSeconds(5), cancel); diff --git a/source/Octopus.Tentacle.Tests.Integration/CapabilitiesServiceV2Test.cs b/source/Octopus.Tentacle.Tests.Integration/CapabilitiesServiceV2Test.cs index 9dfc78761..37743705c 100644 --- a/source/Octopus.Tentacle.Tests.Integration/CapabilitiesServiceV2Test.cs +++ b/source/Octopus.Tentacle.Tests.Integration/CapabilitiesServiceV2Test.cs @@ -29,13 +29,14 @@ public async Task CapabilitiesFromAnOlderTentacleWhichHasNoCapabilitiesService_W var capabilities = (await clientAndTentacle.TentacleClient.CapabilitiesServiceV2.GetCapabilitiesAsync(new(CancellationToken))).SupportedCapabilities; - capabilities.Should().Contain(nameof(IScriptService)); - capabilities.Should().Contain(nameof(IFileTransferService)); - + var expected = new List { nameof(IScriptService), nameof(IFileTransferService) }; if (version.HasScriptServiceV2()) - { - capabilities.Should().Contain(nameof(IScriptServiceV2)); - } + expected.Add(nameof(IScriptServiceV2)); + if (version.HasAbandonScript()) + expected.Add(nameof(ScriptServiceV2.AbandonScriptAsync)); + + // Exact set, not just Contain: this also guards against over-advertising a capability we didn't intend. + capabilities.Should().BeEquivalentTo(expected); } [Test] @@ -48,14 +49,15 @@ public async Task CapabilitiesServiceDoesNotReturnKubernetesScriptServiceForNonK var capabilities = (await clientAndTentacle.TentacleClient.CapabilitiesServiceV2.GetCapabilitiesAsync(new(CancellationToken))).SupportedCapabilities; - capabilities.Should().Contain(nameof(IScriptService)); - capabilities.Should().Contain(nameof(IFileTransferService)); - + var expected = new List { nameof(IScriptService), nameof(IFileTransferService) }; if (version.HasScriptServiceV2()) - { - capabilities.Should().Contain(nameof(IScriptServiceV2)); - } + expected.Add(nameof(IScriptServiceV2)); + if (version.HasAbandonScript()) + expected.Add(nameof(ScriptServiceV2.AbandonScriptAsync)); + // Exact set, not just Contain: this also guards against over-advertising. In particular a + // non-Kubernetes Tentacle must never surface IKubernetesScriptServiceV1. + capabilities.Should().BeEquivalentTo(expected); capabilities.Should().NotContain(nameof(IKubernetesScriptServiceV1)); } diff --git a/source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs b/source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs index 8c51a318a..a6c4a8fdf 100644 --- a/source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs +++ b/source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs @@ -271,5 +271,71 @@ await Wait.For(() => File.Exists(startFile), // Cancel path only — never NotSupportedException. await FluentActions.Invoking(async () => await execution).Should().ThrowAsync(); } + + [Test] + [TentacleConfigurations(scriptServiceToTest: ScriptServiceVersionToTest.Version2)] + public async Task ExecuteScript_WhenAQueuedScriptIsCancelled_ExitsViaCancelNotAbandon(TentacleConfigurationTestCase tentacleConfigurationTestCase) + { + // End-to-end counterpart to ScriptServiceV2Fixture.AbandonScript_WhileWaitingOnTheIsolationMutex_ExitsCancelled. + // A script still waiting on the isolation mutex never started a process, so when its execution is + // cancelled (with abandon escalation armed) it ends via the cancel path (OperationCanceledException) + // and never runs. Abandon has nothing to act on for a queued script, so it cannot change the outcome. + // If we ever want abandon to win for a queued script, RunningScript.Execute can catch + // (OperationCanceledException) when (abandonToken.IsCancellationRequested) around the mutex Acquire + // and return AbandonedExitCode instead. Deliberately not doing that today. + await using var clientTentacle = await tentacleConfigurationTestCase.CreateBuilder() + .WithTentacle(x => x.WithRunTentacleEnvironmentVariable(EnvironmentVariables.TentacleDebugDisableProcessKill_UNSAFE_FOR_PRODUCTION, "1")) + .Build(CancellationToken); + + var holderStartFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "holder-start"); + var holderReleaseFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "holder-release"); + var waiterRanFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "waiter-ran"); + const string sharedMutex = "queued-cancel-test-mutex"; + + // Holder takes the mutex and stays stuck (kill disabled), so the waiter must queue behind it. + var holderCommand = new TestExecuteShellScriptCommandBuilder() + .SetScriptBody(new ScriptBuilder() + .CreateFile(holderStartFile) + .WaitForFileToExist(holderReleaseFile)) + .WithIsolationLevel(ScriptIsolationLevel.FullIsolation) + .WithIsolationMutexName(sharedMutex) + .Build(); + + var tentacleClient = clientTentacle.TentacleClient; + var holderExecution = Task.Run(async () => await tentacleClient.ExecuteScript(holderCommand, CancellationToken)); + + await Wait.For(() => File.Exists(holderStartFile), + TimeSpan.FromSeconds(30), + () => throw new Exception("Holder script did not start"), + CancellationToken); + + // Waiter wants the same mutex, so it blocks acquiring it — its process never starts. + var waiterCommand = new TestExecuteShellScriptCommandBuilder() + .SetScriptBody(new ScriptBuilder().CreateFile(waiterRanFile)) + .WithIsolationLevel(ScriptIsolationLevel.FullIsolation) + .WithIsolationMutexName(sharedMutex) + .Build(); + + using var waiterCts = new System.Threading.CancellationTokenSource(); + var waiterExecution = Task.Run(async () => await tentacleClient.ExecuteScript( + waiterCommand, + _ => { }, + _ => Task.CompletedTask, + new SerilogLoggerBuilder().Build().ForContext().ToITentacleTaskLog(), + waiterCts.Token, + abandonAfterCancellationPendingFor: TimeSpan.FromSeconds(2))); + + // Let the waiter reach the mutex queue, then cancel it. With escalation armed the orchestrator may + // also try to abandon — but abandon can't act on a queued script, so it still ends via cancel. + await Task.Delay(TimeSpan.FromSeconds(2)); + waiterCts.Cancel(); + + await FluentActions.Invoking(async () => await waiterExecution).Should().ThrowAsync(); + File.Exists(waiterRanFile).Should().BeFalse("a script blocked on the isolation mutex never starts its process"); + + // Cleanup: release the holder. + File.WriteAllText(holderReleaseFile, ""); + await holderExecution; + } } } diff --git a/source/Octopus.Tentacle.Tests.Integration/Support/TentacleVersions.cs b/source/Octopus.Tentacle.Tests.Integration/Support/TentacleVersions.cs index 3edf7984d..dc7d09723 100644 --- a/source/Octopus.Tentacle.Tests.Integration/Support/TentacleVersions.cs +++ b/source/Octopus.Tentacle.Tests.Integration/Support/TentacleVersions.cs @@ -78,5 +78,11 @@ public static bool HasScriptServiceV2(this Version? version) { return version == TentacleVersions.Current || version >= TentacleVersions.v7_1_189_SyncHalibutAndScriptServiceV2; } + + public static bool HasAbandonScript(this Version? version) + { + // AbandonScriptAsync is advertised by ScriptServiceV2 from this build onward; no released version has it yet. + return version == TentacleVersions.Current; + } } } \ No newline at end of file diff --git a/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs b/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs index 9083a61c1..e3a92fc19 100644 --- a/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs +++ b/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs @@ -213,7 +213,7 @@ public async Task CancellationToken_WhenGrandchildHoldsRedirectedPipes_ShouldNot } finally { - TryKillGrandchild(grandchildPidFile); + EnsureProcessKilled(grandchildPidFile); } } @@ -272,7 +272,7 @@ public async Task CancellationToken_WhenUnixGrandchildHoldsRedirectedPipes_Shoul } finally { - TryKillGrandchild(grandchildPidFile); + EnsureProcessKilled(grandchildPidFile); } } @@ -321,66 +321,7 @@ public async Task AbandonToken_ReturnsAbandonedExitCode() } finally { - // Force-kill the script process if it's somehow still around, to avoid leaking on CI - if (File.Exists(pidFile) && int.TryParse(SafelyReadAllText(pidFile).Trim(), out var pid) && pid > 0) - { - try { Process.GetProcessById(pid).Kill(); } catch { /* already gone */ } - } - } - } - - [Test] - public async Task AbandonToken_WhenCancelAlsoRequested_StillReturnsAbandonedExitCode() - { - // Once abandon is requested, the runner returns AbandonedExitCode even though cancel is - // also requested — the abandon branch resolves the wait and returns -48. - // - // We fire abandon FIRST so this is deterministic. The exit code is NOT deterministic if - // cancel and abandon race on a *killable* process: cancel's kill can resolve the wait to - // the killed exit code before abandon is observed. That race does not happen in production - // — cancel and abandon arrive as separate, sequential RPCs, and the server only sends - // abandon AFTER cancel has failed to unstick the script. The deterministic both-tokens - // case (cancel fired, kill genuinely fails, abandon -> -48) is covered end-to-end by - // ClientScriptExecutionAbandon.AbandonScript_WhenCancelFailsToKillProcess. - using var tempDir = new TemporaryDirectory(); - var pidFile = Path.Combine(tempDir.DirectoryPath, "process.pid"); - - var executable = PlatformDetection.IsRunningOnWindows ? "powershell.exe" : "/bin/bash"; - var arguments = PlatformDetection.IsRunningOnWindows - ? $"-NoProfile -NonInteractive -Command \"$PID | Out-File -FilePath '{pidFile}' -Encoding ASCII; Start-Sleep -Seconds 300\"" - : $"-c \"echo $$ > '{pidFile}' && sleep 300\""; - - using var cancelCts = new CancellationTokenSource(); - using var abandonCts = new CancellationTokenSource(); - - var task = Task.Run(async () => await SilentProcessRunner.ExecuteCommandAsync( - executable, - arguments, - Environment.CurrentDirectory, - debug: _ => { }, - info: _ => { }, - error: _ => { }, - customEnvironmentVariables: null, - cancel: cancelCts.Token, - abandon: abandonCts.Token)); - - await WaitForPidFileAsync(pidFile, TimeSpan.FromSeconds(30)); - - // Abandon first (resolves the wait to -48), then cancel is also requested. - abandonCts.Cancel(); - cancelCts.Cancel(); - - try - { - var exitCode = await task; - exitCode.Should().Be(ScriptExitCodes.AbandonedExitCode); - } - finally - { - if (File.Exists(pidFile) && int.TryParse(SafelyReadAllText(pidFile).Trim(), out var pid) && pid > 0) - { - try { Process.GetProcessById(pid).Kill(); } catch { /* already gone */ } - } + EnsureProcessKilled(pidFile); } } @@ -403,16 +344,25 @@ static string SafelyReadAllText(string path) try { return File.ReadAllText(path); } catch { return string.Empty; } } - static void TryKillGrandchild(string pidFile) + static void EnsureProcessKilled(string pidFile) { + // Safety net for tests that spawn a long-lived process. It should already be gone (abandon's + // best-effort kill, or the grandchild reaped). If it's somehow still alive, kill it and confirm + // it died — we'd rather flake loudly here than leak the process onto a CI agent. + if (!File.Exists(pidFile) || !int.TryParse(SafelyReadAllText(pidFile).Trim(), out var pid) || pid <= 0) + return; + try { - if (File.Exists(pidFile) && int.TryParse(SafelyReadAllText(pidFile).Trim(), out var pid)) - { - try { Process.GetProcessById(pid).Kill(); } catch { /* already gone */ } - } + using var process = Process.GetProcessById(pid); + if (process.HasExited) + return; + + process.Kill(); + process.WaitForExit(10_000).Should().BeTrue($"cleanup must terminate PID {pid} so it isn't leaked onto the CI agent"); } - catch { /* ignore */ } + catch (ArgumentException) { /* not running — already gone */ } + catch (InvalidOperationException) { /* exited between lookup and kill */ } } [Test] diff --git a/source/Octopus.Tentacle.Tests/Integration/ScriptServiceV2Fixture.cs b/source/Octopus.Tentacle.Tests/Integration/ScriptServiceV2Fixture.cs index 9317a874e..b8c56e28e 100644 --- a/source/Octopus.Tentacle.Tests/Integration/ScriptServiceV2Fixture.cs +++ b/source/Octopus.Tentacle.Tests/Integration/ScriptServiceV2Fixture.cs @@ -345,6 +345,66 @@ public async Task CancelScriptShouldCancelAnExecutingScript() cancellationTimer.Elapsed.Should().BeLessOrEqualTo(TimeSpan.FromSeconds(5)); } + [Test] + public async Task AbandonScript_WhileWaitingOnTheIsolationMutex_ExitsCancelled() + { + var builder = new ScriptServiceV2Builder(); + var service = builder.Build(); + + const string mutexName = "abandon-while-acquiring-mutex"; + + // The holder takes the full-isolation mutex and keeps it for the duration of the test. + var holderTicket = new ScriptTicket(Guid.NewGuid().ToString()); + var holderCommand = new StartScriptCommandV2Builder() + .WithScriptBodyForCurrentOs("Start-Sleep -Seconds 60", "sleep 60") + .WithIsolation(ScriptIsolationLevel.FullIsolation) + .WithMutexName(mutexName) + .WithScriptTicket(holderTicket) + .WithDurationStartScriptCanWaitForScriptToFinish(null) + .Build(); + + // The waiter wants the same mutex, so it blocks in Acquire and never starts its process. + var waiterTicket = new ScriptTicket(Guid.NewGuid().ToString()); + var waiterCommand = new StartScriptCommandV2Builder() + .WithScriptBodyForCurrentOs("Start-Sleep -Seconds 1", "sleep 1") + .WithIsolation(ScriptIsolationLevel.FullIsolation) + .WithMutexName(mutexName) + .WithScriptTicket(waiterTicket) + .WithDurationStartScriptCanWaitForScriptToFinish(null) + .Build(); + + try + { + var holderResponse = await service.StartScriptAsync(holderCommand, CancellationToken.None); + holderResponse = await WaitForState(service, holderTicket, holderResponse, ProcessState.Running); + + var waiterResponse = await service.StartScriptAsync(waiterCommand, CancellationToken.None); + + // Give the waiter time to reach (and block in) the mutex Acquire. It cannot complete while the + // holder owns the mutex, so it must still be waiting. + await Task.Delay(TimeSpan.FromSeconds(2)); + waiterResponse = await service.GetStatusAsync(new ScriptStatusRequestV2(waiterTicket, waiterResponse.NextLogSequence), CancellationToken.None); + waiterResponse.State.Should().NotBe(ProcessState.Complete, "the waiter should be blocked acquiring the isolation mutex"); + + // A script still waiting on the isolation mutex never started a process, so we report it as + // cancelled (-43), not abandoned. Abandon is a no-op here (Acquire doesn't observe the abandon + // token); cancel unblocks Acquire and wins. If we ever want abandon to win in this case, + // RunningScript.Execute can catch (OperationCanceledException) when (abandonToken.IsCancellationRequested) + // around the mutex Acquire and return AbandonedExitCode (-48) instead. Deliberately not doing that today. + await service.AbandonScriptAsync(new AbandonScriptCommandV2(waiterTicket, waiterResponse.NextLogSequence), CancellationToken.None); + var afterCancel = await service.CancelScriptAsync(new CancelScriptCommandV2(waiterTicket, waiterResponse.NextLogSequence), CancellationToken.None); + + var (_, waiterFinal) = await RunUntilScriptCompletes(service, waiterCommand, afterCancel); + waiterFinal.ExitCode.Should().Be(ScriptExitCodes.CanceledExitCode); + } + finally + { + // Release the holder so we don't sit on the mutex (and its 60s sleep gets killed). + await service.CancelScriptAsync(new CancelScriptCommandV2(holderTicket, 0), CancellationToken.None); + await service.CompleteScriptAsync(new CompleteScriptCommandV2(holderTicket), CancellationToken.None); + } + } + [Test] public async Task CompleteScriptShouldCleanupTheWorkspace() { @@ -931,6 +991,19 @@ static async Task CleanupWorkspace(IScriptWorkspaceFactory workspaceFactory, Scr return (logs, response); } + static async Task WaitForState(ScriptServiceV2 service, ScriptTicket ticket, ScriptStatusResponseV2 response, ProcessState desiredState) + { + var deadline = DateTime.UtcNow + TimeSpan.FromSeconds(30); + while (response.State != desiredState && DateTime.UtcNow < deadline) + { + await Task.Delay(TimeSpan.FromMilliseconds(100)); + response = await service.GetStatusAsync(new ScriptStatusRequestV2(ticket, response.NextLogSequence), CancellationToken.None); + } + + response.State.Should().Be(desiredState, $"the script should reach {desiredState} within the timeout"); + return response; + } + static void WriteLogsToConsole(List logs) { foreach (var log in logs) From 7ee0fdbec573089dce83e4f09200be6fb59931bb Mon Sep 17 00:00:00 2001 From: Jim Pelletier Date: Thu, 4 Jun 2026 16:01:23 +1000 Subject: [PATCH 10/15] Keep grandchild cleanup best-effort (can't assert a reparented non-child's exit) The asserting EnsureProcessKilled failed CancellationToken_WhenUnixGrandchildHoldsRedirectedPipes on the Ubuntu CI agent: the grandchild is reparented to init, so it isn't our child and Process.WaitForExit can't reliably observe it exit. Keep the asserting helper for the abandon test (our child), revert the grandchild sites to best-effort TryKillGrandchild. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../Util/SilentProcessRunnerFixture.cs | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs b/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs index e3a92fc19..c0752f08d 100644 --- a/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs +++ b/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs @@ -213,7 +213,7 @@ public async Task CancellationToken_WhenGrandchildHoldsRedirectedPipes_ShouldNot } finally { - EnsureProcessKilled(grandchildPidFile); + TryKillGrandchild(grandchildPidFile); } } @@ -272,7 +272,7 @@ public async Task CancellationToken_WhenUnixGrandchildHoldsRedirectedPipes_Shoul } finally { - EnsureProcessKilled(grandchildPidFile); + TryKillGrandchild(grandchildPidFile); } } @@ -365,6 +365,19 @@ static void EnsureProcessKilled(string pidFile) catch (InvalidOperationException) { /* exited between lookup and kill */ } } + static void TryKillGrandchild(string pidFile) + { + // Best-effort only. The grandchild is reparented to init, so it is NOT our child: Process.WaitForExit + // can't reliably observe a non-child exiting (and a CI container's PID 1 may not reap the zombie), + // so we can't assert the kill the way EnsureProcessKilled does. The test's real assertion is that + // cancellation returns promptly, not that this cleanup succeeded. + if (!File.Exists(pidFile) || !int.TryParse(SafelyReadAllText(pidFile).Trim(), out var pid) || pid <= 0) + return; + + try { using var process = Process.GetProcessById(pid); process.Kill(); } + catch { /* already gone, or can't be reliably killed/observed — best effort */ } + } + [Test] public void EchoHello_ShouldWriteToStdOut() { From 86ef03a552f1aa44cfd396cc063fc0b52dd6a8cd Mon Sep 17 00:00:00 2001 From: Jim Pelletier Date: Fri, 5 Jun 2026 12:55:20 +1000 Subject: [PATCH 11/15] Gate abandon on the advertised capability, not just "is V2", + test restructure Abandon support was decided by ScriptServiceVersion == V2, but old V2 Tentacles (v7.1, v8.0) are V2 yet predate the abandon verb. Determine it from the advertised AbandonScript capability: - Re-add client CapabilitiesResponseV2.HasAbandonScript(); the selector surfaces it; the orchestrator captures it once on the start result (CommandContext.SupportsAbandon) and gates escalation on it. Removed SupportsAbandon from ScriptServiceVersion (it can't carry it - the record is value-compared in ScriptExecutorFactory). Selector unit test covers all three cases. Test feedback: - CapabilitiesServiceV2Test: focused decorator/K8s tests back to Contain/NotContain; cross-version AbandonScriptIsAdvertisedOnlyByVersionsThatSupportIt proves old versions don't advertise abandon. - SilentProcessRunnerFixture: renamed to AbandonToken_WhenProcessCanBeKilled_..., assert the process is gone, trim the comment. - ClientScriptExecutionAbandon: drive explicit StartScript/CancelScript/AbandonScript; assert the process is still running (stuck) vs gone (killable) via PID; rename for the killable/stuck split; clarify the V1 no-escalation comment. - RunningScript: correct the cancel-catch comment (OCE comes from mutex Acquire, not the process). Co-Authored-By: Claude Opus 4.8 (1M context) --- .../ScriptServiceVersionSelectorTests.cs | 73 +++++ .../CapabilitiesResponseV2ExtensionMethods.cs | 13 + .../EventDriven/CommandContext.cs | 9 +- .../Octopus.Tentacle.Client/ScriptExecutor.cs | 12 +- .../KubernetesScriptServiceV1Executor.cs | 6 +- .../Scripts/ObservingScriptOrchestrator.cs | 8 +- .../Scripts/ScriptServiceV1Executor.cs | 4 +- .../Scripts/ScriptServiceVersion.cs | 2 - .../Scripts/ScriptServiceVersionSelector.cs | 9 +- .../Builders/ScriptBuilder.cs | 11 + .../Services/Scripts/RunningScript.cs | 3 +- .../CapabilitiesServiceV2Test.cs | 37 ++- .../ClientScriptExecutionAbandon.cs | 271 ++++++++++-------- .../Support/TentacleVersions.cs | 4 +- .../Util/SilentProcessRunnerFixture.cs | 35 ++- 15 files changed, 345 insertions(+), 152 deletions(-) create mode 100644 source/Octopus.Tentacle.Client.Tests/ScriptServiceVersionSelectorTests.cs diff --git a/source/Octopus.Tentacle.Client.Tests/ScriptServiceVersionSelectorTests.cs b/source/Octopus.Tentacle.Client.Tests/ScriptServiceVersionSelectorTests.cs new file mode 100644 index 000000000..1eb73556f --- /dev/null +++ b/source/Octopus.Tentacle.Client.Tests/ScriptServiceVersionSelectorTests.cs @@ -0,0 +1,73 @@ +using System; +using System.Collections.Generic; +using System.Threading; +using System.Threading.Tasks; +using FluentAssertions; +using Halibut.ServiceModel; +using NSubstitute; +using NUnit.Framework; +using Octopus.Tentacle.Client.Execution; +using Octopus.Tentacle.Client.Observability; +using Octopus.Tentacle.Client.Retries; +using Octopus.Tentacle.Client.Scripts; +using Octopus.Tentacle.Contracts; +using Octopus.Tentacle.Contracts.Capabilities; +using Octopus.Tentacle.Contracts.ClientServices; +using Octopus.Tentacle.Contracts.Logging; +using Octopus.Tentacle.Contracts.Observability; +using Octopus.Tentacle.Contracts.ScriptServiceV2; + +namespace Octopus.Tentacle.Client.Tests +{ + [TestFixture] + public class ScriptServiceVersionSelectorTests + { + [Test] + public async Task WhenTheTentacleAdvertisesAbandon_SupportsAbandonIsTrue() + { + var (version, supportsAbandon) = await SelectFor( + nameof(IScriptService), nameof(IFileTransferService), nameof(IScriptServiceV2), nameof(IAsyncClientScriptServiceV2.AbandonScriptAsync)); + + version.Should().Be(ScriptServiceVersion.ScriptServiceVersion2); + supportsAbandon.Should().BeTrue(); + } + + [Test] + public async Task WhenAnOlderV2TentacleDoesNotAdvertiseAbandon_SupportsAbandonIsFalse() + { + // Old V2 Tentacles are V2 but predate the abandon verb. Gating on "is V2" would wrongly let the + // orchestrator call AbandonScript on them; gating on the advertised capability does not. + var (version, supportsAbandon) = await SelectFor( + nameof(IScriptService), nameof(IFileTransferService), nameof(IScriptServiceV2)); + + version.Should().Be(ScriptServiceVersion.ScriptServiceVersion2); + supportsAbandon.Should().BeFalse(); + } + + [Test] + public async Task WhenTheTentacleOnlyHasV1_SupportsAbandonIsFalse() + { + var (version, supportsAbandon) = await SelectFor( + nameof(IScriptService), nameof(IFileTransferService)); + + version.Should().Be(ScriptServiceVersion.ScriptServiceVersion1); + supportsAbandon.Should().BeFalse(); + } + + static async Task<(ScriptServiceVersion Version, bool SupportsAbandon)> SelectFor(params string[] capabilities) + { + var capabilitiesService = Substitute.For(); + capabilitiesService.GetCapabilitiesAsync(Arg.Any()) + .Returns(Task.FromResult(new CapabilitiesResponseV2(new List(capabilities)))); + + var selector = new ScriptServiceVersionSelector( + capabilitiesService, + Substitute.For(), + RpcCallExecutorFactory.Create(TimeSpan.Zero, Substitute.For()), + new TentacleClientOptions(new RpcRetrySettings(RetriesEnabled: false, RetryDuration: TimeSpan.Zero)), + ClientOperationMetricsBuilder.Start()); + + return await selector.DetermineScriptServiceVersionToUse(CancellationToken.None); + } + } +} diff --git a/source/Octopus.Tentacle.Client/Capabilities/CapabilitiesResponseV2ExtensionMethods.cs b/source/Octopus.Tentacle.Client/Capabilities/CapabilitiesResponseV2ExtensionMethods.cs index bc49a1e95..3132dcf5e 100644 --- a/source/Octopus.Tentacle.Client/Capabilities/CapabilitiesResponseV2ExtensionMethods.cs +++ b/source/Octopus.Tentacle.Client/Capabilities/CapabilitiesResponseV2ExtensionMethods.cs @@ -1,5 +1,6 @@ using System.Linq; using Octopus.Tentacle.Contracts.Capabilities; +using Octopus.Tentacle.Contracts.ClientServices; using Octopus.Tentacle.Contracts.ScriptServiceV2; namespace Octopus.Tentacle.Client.Capabilities @@ -15,5 +16,17 @@ public static bool HasScriptServiceV2(this CapabilitiesResponseV2 capabilities) return capabilities.SupportedCapabilities.Contains(nameof(IScriptServiceV2)); } + + public static bool HasAbandonScript(this CapabilitiesResponseV2 capabilities) + { + if (capabilities?.SupportedCapabilities?.Any() != true) + { + return false; + } + + // Both sides nameof their own AbandonScriptAsync. The strings match, and a rename on + // either side can't silently drift the capability check. + return capabilities.SupportedCapabilities.Contains(nameof(IAsyncClientScriptServiceV2.AbandonScriptAsync)); + } } } \ No newline at end of file diff --git a/source/Octopus.Tentacle.Client/EventDriven/CommandContext.cs b/source/Octopus.Tentacle.Client/EventDriven/CommandContext.cs index 0f34b8108..7a6139772 100644 --- a/source/Octopus.Tentacle.Client/EventDriven/CommandContext.cs +++ b/source/Octopus.Tentacle.Client/EventDriven/CommandContext.cs @@ -12,15 +12,22 @@ public class CommandContext { public CommandContext(ScriptTicket scriptTicket, long nextLogSequence, - ScriptServiceVersion scripServiceVersionUsed) + ScriptServiceVersion scripServiceVersionUsed, + bool supportsAbandon = false) { ScriptTicket = scriptTicket; NextLogSequence = nextLogSequence; ScripServiceVersionUsed = scripServiceVersionUsed; + SupportsAbandon = supportsAbandon; } public ScriptTicket ScriptTicket { get; } public long NextLogSequence { get; } public ScriptServiceVersion ScripServiceVersionUsed { get; } + + // Whether the Tentacle advertised the AbandonScript capability. Determined once from + // capabilities when the script starts and carried on the start result so the orchestrator + // can gate abandon escalation on it. + public bool SupportsAbandon { get; } } } \ No newline at end of file diff --git a/source/Octopus.Tentacle.Client/ScriptExecutor.cs b/source/Octopus.Tentacle.Client/ScriptExecutor.cs index 1ce5e1f2c..51ce32771 100644 --- a/source/Octopus.Tentacle.Client/ScriptExecutor.cs +++ b/source/Octopus.Tentacle.Client/ScriptExecutor.cs @@ -46,12 +46,18 @@ public async Task StartScript(ExecuteScriptComma { // Note: This class deliberately does not create OpenTelemetry Trace activities. // It is a facade over other ScriptExecutor services, and the facade doesn't do anything interesting - var scriptServiceVersionToUse = await DetermineScriptServiceVersionToUse(cancellationToken); + var (scriptServiceVersionToUse, supportsAbandon) = await DetermineScriptServiceVersionToUse(cancellationToken); var scriptExecutorFactory = CreateScriptExecutorFactory(); var scriptExecutor = scriptExecutorFactory.CreateScriptExecutor(scriptServiceVersionToUse); - return await scriptExecutor.StartScript(executeScriptCommand, startScriptIsBeingReAttempted, cancellationToken); + var result = await scriptExecutor.StartScript(executeScriptCommand, startScriptIsBeingReAttempted, cancellationToken); + + // Carry whether the Tentacle advertised abandon onto the context the orchestrator reads, so it can + // gate escalation on the real capability rather than just "is V2". + var context = result.ContextForNextCommand; + var contextWithAbandon = new CommandContext(context.ScriptTicket, context.NextLogSequence, context.ScripServiceVersionUsed, supportsAbandon); + return new ScriptOperationExecutionResult(result.ScriptStatus, contextWithAbandon); } public async Task GetStatus(CommandContext commandContext, CancellationToken cancellationToken) @@ -96,7 +102,7 @@ ScriptExecutorFactory CreateScriptExecutorFactory() logger); } - async Task DetermineScriptServiceVersionToUse(CancellationToken cancellationToken) + async Task<(ScriptServiceVersion Version, bool SupportsAbandon)> DetermineScriptServiceVersionToUse(CancellationToken cancellationToken) { try { diff --git a/source/Octopus.Tentacle.Client/Scripts/KubernetesScriptServiceV1Executor.cs b/source/Octopus.Tentacle.Client/Scripts/KubernetesScriptServiceV1Executor.cs index 124de8181..f575d7892 100644 --- a/source/Octopus.Tentacle.Client/Scripts/KubernetesScriptServiceV1Executor.cs +++ b/source/Octopus.Tentacle.Client/Scripts/KubernetesScriptServiceV1Executor.cs @@ -185,9 +185,9 @@ async Task CancelScriptAction(CancellationToke } public Task AbandonScript(CommandContext commandContext) - // KubernetesScriptServiceV1 has no abandon verb. The orchestrator checks - // ScriptServiceVersion.SupportsAbandon and won't escalate here, so reaching this is a bug — - // throw rather than quietly cancel. Hung pods are recovered by deleting the pod instead. + // KubernetesScriptServiceV1 has no abandon verb. The orchestrator only escalates to abandon + // when the Tentacle advertised the abandon capability (K8s agents never do), so it won't + // escalate here; reaching this is a bug. Hung pods are recovered by deleting the pod instead. => throw new NotSupportedException("KubernetesScriptServiceV1 cannot abandon a script; it has no abandon verb. Cancel the script instead."); public async Task CompleteScript(CommandContext lastStatusResponse, CancellationToken scriptExecutionCancellationToken) diff --git a/source/Octopus.Tentacle.Client/Scripts/ObservingScriptOrchestrator.cs b/source/Octopus.Tentacle.Client/Scripts/ObservingScriptOrchestrator.cs index 20f094412..b55e5bd6d 100644 --- a/source/Octopus.Tentacle.Client/Scripts/ObservingScriptOrchestrator.cs +++ b/source/Octopus.Tentacle.Client/Scripts/ObservingScriptOrchestrator.cs @@ -83,6 +83,7 @@ async Task ObserveUntilComplete( var iteration = 0; var cancellationIteration = 0; var lastResult = startScriptResult; + var supportsAbandon = startScriptResult.ContextForNextCommand.SupportsAbandon; var stopwatch = new Stopwatch(); while (lastResult.ScriptStatus.State != ProcessState.Complete) @@ -95,9 +96,10 @@ async Task ObserveUntilComplete( stopwatch.Start(); } - // Only escalate to abandon when the script service can actually abandon. On versions - // that can't (V1, Kubernetes) we keep cancelling rather than calling a verb they don't have. - var shouldAbandon = lastResult.ContextForNextCommand.ScripServiceVersionUsed.SupportsAbandon + // Only escalate to abandon when the Tentacle advertised the abandon capability. Old V2 + // Tentacles (pre-abandon) and V1/Kubernetes don't, so we keep cancelling rather than + // calling a verb they don't have. + var shouldAbandon = supportsAbandon && abandonAfterCancellationPendingFor.HasValue && stopwatch.Elapsed >= abandonAfterCancellationPendingFor.Value; diff --git a/source/Octopus.Tentacle.Client/Scripts/ScriptServiceV1Executor.cs b/source/Octopus.Tentacle.Client/Scripts/ScriptServiceV1Executor.cs index 75b3c3aad..0213bcb32 100644 --- a/source/Octopus.Tentacle.Client/Scripts/ScriptServiceV1Executor.cs +++ b/source/Octopus.Tentacle.Client/Scripts/ScriptServiceV1Executor.cs @@ -165,8 +165,8 @@ public async Task CancelScript(CommandContext co } public Task AbandonScript(CommandContext commandContext) - // ScriptServiceV1 has no abandon verb. The orchestrator checks ScriptServiceVersion.SupportsAbandon - // and won't escalate here, so reaching this is a bug — throw rather than quietly cancel. + // ScriptServiceV1 has no abandon verb. The orchestrator only escalates to abandon when the + // Tentacle advertised the abandon capability, so it won't escalate here; reaching this is a bug. => throw new NotSupportedException("ScriptServiceV1 cannot abandon a script; it has no abandon verb. Cancel the script instead."); public async Task CompleteScript(CommandContext lastStatusResponse, CancellationToken scriptExecutionCancellationToken) diff --git a/source/Octopus.Tentacle.Client/Scripts/ScriptServiceVersion.cs b/source/Octopus.Tentacle.Client/Scripts/ScriptServiceVersion.cs index ebfba57aa..ad8cfa420 100644 --- a/source/Octopus.Tentacle.Client/Scripts/ScriptServiceVersion.cs +++ b/source/Octopus.Tentacle.Client/Scripts/ScriptServiceVersion.cs @@ -6,8 +6,6 @@ public record ScriptServiceVersion(string Value) public static ScriptServiceVersion ScriptServiceVersion2 = new(nameof(ScriptServiceVersion2)); public static ScriptServiceVersion KubernetesScriptServiceVersion1 = new(nameof(KubernetesScriptServiceVersion1)); - public bool SupportsAbandon => Value == nameof(ScriptServiceVersion2); - public override string ToString() => Value; } } diff --git a/source/Octopus.Tentacle.Client/Scripts/ScriptServiceVersionSelector.cs b/source/Octopus.Tentacle.Client/Scripts/ScriptServiceVersionSelector.cs index c8d7c57e2..f449d63e4 100644 --- a/source/Octopus.Tentacle.Client/Scripts/ScriptServiceVersionSelector.cs +++ b/source/Octopus.Tentacle.Client/Scripts/ScriptServiceVersionSelector.cs @@ -34,7 +34,7 @@ internal ScriptServiceVersionSelector( this.clientOperationMetricsBuilder = clientOperationMetricsBuilder; } - public async Task DetermineScriptServiceVersionToUse(CancellationToken cancellationToken) + public async Task<(ScriptServiceVersion Version, bool SupportsAbandon)> DetermineScriptServiceVersionToUse(CancellationToken cancellationToken) { logger.Verbose("Determining ScriptService version to use"); @@ -59,10 +59,13 @@ async Task GetCapabilitiesFunc(CancellationToken ct) // It's implied (and tested) that GetCapabilities will only return Kubernetes or non-Kubernetes script services, never a mix if (tentacleCapabilities.HasAnyKubernetesScriptService()) { - return DetermineKubernetesScriptServiceVersionToUse(); + // Kubernetes agents never advertise abandon. + return (DetermineKubernetesScriptServiceVersionToUse(), false); } - return DetermineShellScriptServiceVersionToUse(tentacleCapabilities); + // Abandon support is whether the Tentacle actually advertised the capability, not just that it + // is V2 — old V2 Tentacles (pre-abandon) are V2 but have no abandon verb. + return (DetermineShellScriptServiceVersionToUse(tentacleCapabilities), tentacleCapabilities.HasAbandonScript()); } ScriptServiceVersion DetermineShellScriptServiceVersionToUse(CapabilitiesResponseV2 tentacleCapabilities) diff --git a/source/Octopus.Tentacle.CommonTestUtils/Builders/ScriptBuilder.cs b/source/Octopus.Tentacle.CommonTestUtils/Builders/ScriptBuilder.cs index b238ce154..9b7303d03 100644 --- a/source/Octopus.Tentacle.CommonTestUtils/Builders/ScriptBuilder.cs +++ b/source/Octopus.Tentacle.CommonTestUtils/Builders/ScriptBuilder.cs @@ -58,6 +58,17 @@ public ScriptBuilder CreateFile(string file) return this; } + public ScriptBuilder WritePidToFile(string file) + { + bashScript.AppendLine($@" +echo $$ > '{file}' +"); + windowsScript.AppendLine($@" +$PID | Out-File -FilePath '{file}' -Encoding ASCII +"); + return this; + } + public ScriptBuilder WaitForFileToExist(string fileToWaitFor) { bashScript.AppendLine($@" diff --git a/source/Octopus.Tentacle.Core/Services/Scripts/RunningScript.cs b/source/Octopus.Tentacle.Core/Services/Scripts/RunningScript.cs index ddfc7544a..bfe4982e4 100644 --- a/source/Octopus.Tentacle.Core/Services/Scripts/RunningScript.cs +++ b/source/Octopus.Tentacle.Core/Services/Scripts/RunningScript.cs @@ -126,7 +126,8 @@ public async Task Execute() : await RunScriptAsync(shellPath, writer, runningScriptToken, abandonToken); } } - // Fires when the caller cancelled the script and the underlying process honored the cancellation token. + // We get here when cancel fires before the script starts running. scriptIsolationMutex.Acquire + // watches runningScriptToken and throws while we're still waiting for the mutex. catch (OperationCanceledException) { writer.WriteOutput(ProcessOutputSource.StdOut, "Script execution canceled."); diff --git a/source/Octopus.Tentacle.Tests.Integration/CapabilitiesServiceV2Test.cs b/source/Octopus.Tentacle.Tests.Integration/CapabilitiesServiceV2Test.cs index 37743705c..dbc17cf75 100644 --- a/source/Octopus.Tentacle.Tests.Integration/CapabilitiesServiceV2Test.cs +++ b/source/Octopus.Tentacle.Tests.Integration/CapabilitiesServiceV2Test.cs @@ -29,14 +29,10 @@ public async Task CapabilitiesFromAnOlderTentacleWhichHasNoCapabilitiesService_W var capabilities = (await clientAndTentacle.TentacleClient.CapabilitiesServiceV2.GetCapabilitiesAsync(new(CancellationToken))).SupportedCapabilities; - var expected = new List { nameof(IScriptService), nameof(IFileTransferService) }; + capabilities.Should().Contain(nameof(IScriptService)); + capabilities.Should().Contain(nameof(IFileTransferService)); if (version.HasScriptServiceV2()) - expected.Add(nameof(IScriptServiceV2)); - if (version.HasAbandonScript()) - expected.Add(nameof(ScriptServiceV2.AbandonScriptAsync)); - - // Exact set, not just Contain: this also guards against over-advertising a capability we didn't intend. - capabilities.Should().BeEquivalentTo(expected); + capabilities.Should().Contain(nameof(IScriptServiceV2)); } [Test] @@ -49,18 +45,31 @@ public async Task CapabilitiesServiceDoesNotReturnKubernetesScriptServiceForNonK var capabilities = (await clientAndTentacle.TentacleClient.CapabilitiesServiceV2.GetCapabilitiesAsync(new(CancellationToken))).SupportedCapabilities; - var expected = new List { nameof(IScriptService), nameof(IFileTransferService) }; + capabilities.Should().Contain(nameof(IScriptService)); + capabilities.Should().Contain(nameof(IFileTransferService)); if (version.HasScriptServiceV2()) - expected.Add(nameof(IScriptServiceV2)); - if (version.HasAbandonScript()) - expected.Add(nameof(ScriptServiceV2.AbandonScriptAsync)); + capabilities.Should().Contain(nameof(IScriptServiceV2)); - // Exact set, not just Contain: this also guards against over-advertising. In particular a - // non-Kubernetes Tentacle must never surface IKubernetesScriptServiceV1. - capabilities.Should().BeEquivalentTo(expected); capabilities.Should().NotContain(nameof(IKubernetesScriptServiceV1)); } + [Test] + [TentacleConfigurations(testCapabilitiesServiceVersions: true)] + public async Task AbandonScriptIsAdvertisedOnlyByVersionsThatSupportIt(TentacleConfigurationTestCase tentacleConfigurationTestCase) + { + var version = tentacleConfigurationTestCase.Version; + + await using var clientAndTentacle = await tentacleConfigurationTestCase.CreateLegacyBuilder().Build(CancellationToken); + + var capabilities = (await clientAndTentacle.TentacleClient.CapabilitiesServiceV2.GetCapabilitiesAsync(new(CancellationToken))).SupportedCapabilities; + + // Older Tentacles predate abandon and must not advertise it; this build (and later) must. + if (version.HasAbandonScript()) + capabilities.Should().Contain(nameof(ScriptServiceV2.AbandonScriptAsync)); + else + capabilities.Should().NotContain(nameof(ScriptServiceV2.AbandonScriptAsync)); + } + [Test] [TentacleConfigurations(testCapabilitiesServiceVersions: true)] public async Task CapabilitiesResponseShouldBeCached(TentacleConfigurationTestCase tentacleConfigurationTestCase) diff --git a/source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs b/source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs index a6c4a8fdf..d25c6b054 100644 --- a/source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs +++ b/source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs @@ -1,11 +1,14 @@ using System; +using System.Diagnostics; using System.IO; -using System.Linq; using System.Threading.Tasks; using FluentAssertions; using NUnit.Framework; using Octopus.Tentacle.Client; +using Octopus.Tentacle.Client.EventDriven; +using Octopus.Tentacle.Client.Scripts; using Octopus.Tentacle.Contracts; +using Octopus.Tentacle.Contracts.Logging; using Octopus.Tentacle.Core.Util; using Octopus.Tentacle.Tests.Integration.Support; using Octopus.Tentacle.Tests.Integration.Util; @@ -18,146 +21,152 @@ public class ClientScriptExecutionAbandon : IntegrationTest { [Test] [TentacleConfigurations(scriptServiceToTest: ScriptServiceVersionToTest.Version2)] - public async Task AbandonScript_WhenCancelFailsToKillProcess_ReturnsAbandonedExitCode(TentacleConfigurationTestCase tentacleConfigurationTestCase) + public async Task AbandonScript_WhenProcessCannotBeKilled_LeavesItRunningAndReturnsAbandonedExitCode(TentacleConfigurationTestCase tentacleConfigurationTestCase) { - // TentacleDebugDisableProcessKill_UNSAFE_FOR_PRODUCTION=1 makes Hitman a no-op, so - // CancelScript cannot actually terminate the underlying script process. The script - // becomes genuinely "stuck" from Tentacle's perspective. AbandonScript should then - // return promptly with AbandonedExitCode without waiting for the process to exit. + // TentacleDebugDisableProcessKill_UNSAFE_FOR_PRODUCTION=1 makes Hitman a no-op, so CancelScript + // cannot terminate the process — it is genuinely stuck. AbandonScript then returns promptly with + // AbandonedExitCode without waiting, and the un-killable process is left running. await using var clientTentacle = await tentacleConfigurationTestCase.CreateBuilder() .WithTentacle(x => x.WithRunTentacleEnvironmentVariable(EnvironmentVariables.TentacleDebugDisableProcessKill_UNSAFE_FOR_PRODUCTION, "1")) .Build(CancellationToken); var startFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "start"); var releaseFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "release"); + var pidFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "pid"); - var firstCommand = new TestExecuteShellScriptCommandBuilder() + var command = new TestExecuteShellScriptCommandBuilder() .SetScriptBody(new ScriptBuilder() + .WritePidToFile(pidFile) .CreateFile(startFile) .WaitForFileToExist(releaseFile)) .WithIsolationLevel(ScriptIsolationLevel.NoIsolation) .Build(); - var tentacleClient = clientTentacle.TentacleClient; + var client = clientTentacle.TentacleClient; + var log = Log(); - var scriptExecution = Task.Run(async () => await tentacleClient.ExecuteScript(firstCommand, CancellationToken)); + // Explicit StartScript (not ExecuteScript) so we drive cancel and abandon ourselves. + var startResult = await client.StartScript(command, StartScriptIsBeingReAttempted.FirstAttempt, log, CancellationToken); await Wait.For(() => File.Exists(startFile), TimeSpan.FromSeconds(30), () => throw new Exception("Script did not start"), CancellationToken); - // Cancel: Hitman is a no-op so the process keeps running. - await tentacleClient.CancelScript(firstCommand.ScriptTicket); - await Task.Delay(TimeSpan.FromSeconds(1)); + // Cancel: Hitman is a no-op, so the process keeps running. + var afterCancel = await client.CancelScript(startResult.ContextForNextCommand, log); + + // Abandon: returns the abandoned terminal state without waiting for the still-running process. + await client.AbandonScript(command.ScriptTicket, log, CancellationToken); - // Abandon: fires the abandon token. ExecuteScript observes the abandoned terminal state and - // returns it. We assert on its result rather than a separate GetStatus poll — the poll would - // race the orchestrator's own CompleteScript cleanup (which removes the script and deletes the - // workspace), and on slower transports lands after it, seeing UnknownScriptExitCode. - await tentacleClient.AbandonScript(firstCommand.ScriptTicket, CancellationToken); + var finalResult = await RunStatusUntilComplete(client, afterCancel.ContextForNextCommand, log); + finalResult.ScriptStatus.ExitCode.Should().Be(ScriptExitCodes.AbandonedExitCode); - var (result, _) = await scriptExecution; - result.ExitCode.Should().Be(ScriptExitCodes.AbandonedExitCode); + // The process was never killed (kill disabled), so abandon left it running. + AssertProcessIsRunning(pidFile); - // The process is still alive (kill was disabled); release it so it stops leaking. + // Release the still-running script, then complete it so the workspace is cleaned up. File.WriteAllText(releaseFile, ""); + await client.CompleteScript(finalResult.ContextForNextCommand, log, CancellationToken); } [Test] [TentacleConfigurations(scriptServiceToTest: ScriptServiceVersionToTest.Version2)] - public async Task AbandonScript_ReleasesIsolationMutexEvenWhileProcessIsStillRunning(TentacleConfigurationTestCase tentacleConfigurationTestCase) + public async Task AbandonScript_WithNoPriorCancel_KillsTheProcessAndReturnsAbandonedExitCode(TentacleConfigurationTestCase tentacleConfigurationTestCase) { - // The whole reason Tentacle needs an abandon RPC is to release the isolation mutex - // when CancelScript can't unstick the script. This test proves that contract: a - // FullIsolation script gets stuck (because TentacleDebugDisableProcessKill_UNSAFE_FOR_PRODUCTION - // makes cancel a no-op), abandon is called, and a second FullIsolation script with - // the same mutex name must then be able to acquire the mutex and run. + // Anti-abuse: a direct AbandonScript with no prior CancelScript must still attempt the kill. Kill is + // NOT disabled here, so the abandon branch's best-effort kill actually terminates the process. await using var clientTentacle = await tentacleConfigurationTestCase.CreateBuilder() - .WithTentacle(x => x.WithRunTentacleEnvironmentVariable(EnvironmentVariables.TentacleDebugDisableProcessKill_UNSAFE_FOR_PRODUCTION, "1")) .Build(CancellationToken); var startFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "start"); var releaseFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "release"); + var pidFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "pid"); - const string sharedMutex = "abandon-test-mutex"; - - var firstCommand = new TestExecuteShellScriptCommandBuilder() + var command = new TestExecuteShellScriptCommandBuilder() .SetScriptBody(new ScriptBuilder() + .WritePidToFile(pidFile) .CreateFile(startFile) .WaitForFileToExist(releaseFile)) - .WithIsolationLevel(ScriptIsolationLevel.FullIsolation) - .WithIsolationMutexName(sharedMutex) + .WithIsolationLevel(ScriptIsolationLevel.NoIsolation) .Build(); - var tentacleClient = clientTentacle.TentacleClient; + var client = clientTentacle.TentacleClient; + var log = Log(); - var firstScriptExecution = Task.Run(async () => await tentacleClient.ExecuteScript(firstCommand, CancellationToken)); + var startResult = await client.StartScript(command, StartScriptIsBeingReAttempted.FirstAttempt, log, CancellationToken); await Wait.For(() => File.Exists(startFile), TimeSpan.FromSeconds(30), - () => throw new Exception("First script did not start"), + () => throw new Exception("Script did not start"), CancellationToken); - await tentacleClient.CancelScript(firstCommand.ScriptTicket); - await Task.Delay(TimeSpan.FromSeconds(1)); - - await tentacleClient.AbandonScript(firstCommand.ScriptTicket, CancellationToken); + // Direct abandon, NO prior cancel. We never write releaseFile, so the script only completes + // because the abandon branch killed it. + await client.AbandonScript(command.ScriptTicket, log, CancellationToken); - // Second FullIsolation script with the SAME mutex name. If the abandon released - // the mutex, this script can acquire it and run to completion. Otherwise it would - // block waiting for the (still-alive) first script's mutex hold. - var secondStartFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "second-start"); - var secondCommand = new TestExecuteShellScriptCommandBuilder() - .SetScriptBody(new ScriptBuilder().CreateFile(secondStartFile)) - .WithIsolationLevel(ScriptIsolationLevel.FullIsolation) - .WithIsolationMutexName(sharedMutex) - .Build(); + var finalResult = await RunStatusUntilComplete(client, startResult.ContextForNextCommand, log); + finalResult.ScriptStatus.ExitCode.Should().Be(ScriptExitCodes.AbandonedExitCode); - var (secondResult, _) = await tentacleClient.ExecuteScript(secondCommand, CancellationToken); - secondResult.ExitCode.Should().Be(0); - File.Exists(secondStartFile).Should().BeTrue("second script should have run after the mutex was released"); + // Kill was enabled, so abandon's best-effort kill landed and the process is gone. + AssertProcessExits(pidFile, TimeSpan.FromSeconds(10)); - File.WriteAllText(releaseFile, ""); - await firstScriptExecution; + await client.CompleteScript(finalResult.ContextForNextCommand, log, CancellationToken); } [Test] [TentacleConfigurations(scriptServiceToTest: ScriptServiceVersionToTest.Version2)] - public async Task AbandonScript_WithNoPriorCancel_KillsTheProcess(TentacleConfigurationTestCase tentacleConfigurationTestCase) + public async Task AbandonScript_ReleasesIsolationMutexEvenWhileProcessIsStillRunning(TentacleConfigurationTestCase tentacleConfigurationTestCase) { - // Anti-abuse: a direct AbandonScript with no prior CancelScript must still attempt the - // kill. The runner's abandon branch best-effort-kills (DoOurBestToCleanUp) before returning - // -48. Kill is NOT disabled here, so the underlying process must actually die (the execution drains). + // The whole reason Tentacle needs an abandon RPC is to release the isolation mutex when CancelScript + // can't unstick the script. A FullIsolation script gets stuck (kill disabled), abandon is called, and + // a second FullIsolation script with the same mutex name must then be able to acquire the mutex and run. await using var clientTentacle = await tentacleConfigurationTestCase.CreateBuilder() + .WithTentacle(x => x.WithRunTentacleEnvironmentVariable(EnvironmentVariables.TentacleDebugDisableProcessKill_UNSAFE_FOR_PRODUCTION, "1")) .Build(CancellationToken); var startFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "start"); var releaseFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "release"); + const string sharedMutex = "abandon-test-mutex"; - var command = new TestExecuteShellScriptCommandBuilder() + var firstCommand = new TestExecuteShellScriptCommandBuilder() .SetScriptBody(new ScriptBuilder() .CreateFile(startFile) .WaitForFileToExist(releaseFile)) - .WithIsolationLevel(ScriptIsolationLevel.NoIsolation) + .WithIsolationLevel(ScriptIsolationLevel.FullIsolation) + .WithIsolationMutexName(sharedMutex) .Build(); - var tentacleClient = clientTentacle.TentacleClient; - var scriptExecution = Task.Run(async () => await tentacleClient.ExecuteScript(command, CancellationToken)); + var client = clientTentacle.TentacleClient; + var log = Log(); + + var startResult = await client.StartScript(firstCommand, StartScriptIsBeingReAttempted.FirstAttempt, log, CancellationToken); await Wait.For(() => File.Exists(startFile), TimeSpan.FromSeconds(30), - () => throw new Exception("Script did not start"), + () => throw new Exception("First script did not start"), CancellationToken); - // Direct abandon, NO prior cancel. Assert on ExecuteScript's own result rather than a separate - // GetStatus poll, which would race the orchestrator's CompleteScript cleanup (see - // AbandonScript_WhenCancelFailsToKillProcess for the same reasoning). - await tentacleClient.AbandonScript(command.ScriptTicket, CancellationToken); + var afterCancel = await client.CancelScript(startResult.ContextForNextCommand, log); + await client.AbandonScript(firstCommand.ScriptTicket, log, CancellationToken); - // We never wrote releaseFile, so the script only completes because the abandon branch killed it. - var (result, _) = await scriptExecution; - result.ExitCode.Should().Be(ScriptExitCodes.AbandonedExitCode); + // Second FullIsolation script with the SAME mutex name. If abandon released the mutex, this script + // can acquire it and run to completion. Otherwise it blocks behind the still-alive first script. + var secondStartFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "second-start"); + var secondCommand = new TestExecuteShellScriptCommandBuilder() + .SetScriptBody(new ScriptBuilder().CreateFile(secondStartFile)) + .WithIsolationLevel(ScriptIsolationLevel.FullIsolation) + .WithIsolationMutexName(sharedMutex) + .Build(); + + var (secondResult, _) = await client.ExecuteScript(secondCommand, CancellationToken); + secondResult.ExitCode.Should().Be(0); + File.Exists(secondStartFile).Should().BeTrue("second script should have run after the mutex was released"); + + // Release the still-running first script, then complete it. + File.WriteAllText(releaseFile, ""); + var firstFinal = await RunStatusUntilComplete(client, afterCancel.ContextForNextCommand, log); + await client.CompleteScript(firstFinal.ContextForNextCommand, log, CancellationToken); } [Test] @@ -165,9 +174,10 @@ await Wait.For(() => File.Exists(startFile), public async Task ExecuteScript_WhenCancellationStaysPending_EscalatesToAbandon_AndReleasesMutex(TentacleConfigurationTestCase tentacleConfigurationTestCase) { // Kill is disabled, so the script is genuinely stuck and CancelScript can't end it. With - // abandonAfterCancellationPendingFor set short, the orchestrator escalates from cancel to - // abandon, which releases the isolation mutex. We prove that by the outcome: a second - // FullIsolation script with the same mutex name can only run once the mutex is released. + // abandonAfterCancellationPendingFor set short, the orchestrator escalates from cancel to abandon, + // which releases the isolation mutex. We prove that by the outcome: a second FullIsolation script + // with the same mutex name can only run once the mutex is released. This is the ExecuteScript + // (orchestrator) path; the explicit-AbandonScript path is AbandonScript_ReleasesIsolationMutex... above. await using var clientTentacle = await tentacleConfigurationTestCase.CreateBuilder() .WithTentacle(x => x.WithRunTentacleEnvironmentVariable(EnvironmentVariables.TentacleDebugDisableProcessKill_UNSAFE_FOR_PRODUCTION, "1")) .Build(CancellationToken); @@ -184,14 +194,14 @@ public async Task ExecuteScript_WhenCancellationStaysPending_EscalatesToAbandon_ .WithIsolationMutexName(sharedMutex) .Build(); - var tentacleClient = clientTentacle.TentacleClient; + var client = clientTentacle.TentacleClient; using var executionCts = new System.Threading.CancellationTokenSource(); - var stuckExecution = Task.Run(async () => await tentacleClient.ExecuteScript( + var stuckExecution = Task.Run(async () => await client.ExecuteScript( stuckCommand, _ => { }, _ => Task.CompletedTask, - new SerilogLoggerBuilder().Build().ForContext().ToITentacleTaskLog(), + Log(), executionCts.Token, abandonAfterCancellationPendingFor: TimeSpan.FromSeconds(2))); @@ -212,7 +222,7 @@ await Wait.For(() => File.Exists(startFile), .WithIsolationMutexName(sharedMutex) .Build(); - var (secondResult, _) = await tentacleClient.ExecuteScript(secondCommand, CancellationToken); + var (secondResult, _) = await client.ExecuteScript(secondCommand, CancellationToken); secondResult.ExitCode.Should().Be(0); File.Exists(secondStartFile).Should().BeTrue("escalation to abandon should have released the mutex"); @@ -224,11 +234,9 @@ await Wait.For(() => File.Exists(startFile), [TentacleConfigurations(scriptServiceToTest: ScriptServiceVersionToTest.Version1)] public async Task ExecuteScript_OnV1Tentacle_DoesNotEscalateToAbandon(TentacleConfigurationTestCase tentacleConfigurationTestCase) { - // ScriptServiceV1 has no abandon verb, so the orchestrator must keep cancelling and never - // escalate. If it wrongly escalated it would call the V1 executor's AbandonScript, which - // throws NotSupportedException. We prove the gating by the outcome: even with - // abandonAfterCancellationPendingFor set, the run ends in OperationCanceledException (the - // cancel path) — never NotSupportedException. + // A V1 Tentacle has no abandon verb. If the orchestrator wrongly escalated, the V1 executor's + // AbandonScript throws NotSupportedException. Asserting the cancelled run throws OperationCanceledException + // (and therefore NOT NotSupportedException) proves the gate held and we never escalated. await using var clientTentacle = await tentacleConfigurationTestCase.CreateBuilder() .WithTentacle(x => x.WithRunTentacleEnvironmentVariable(EnvironmentVariables.TentacleDebugDisableProcessKill_UNSAFE_FOR_PRODUCTION, "1")) .Build(CancellationToken); @@ -243,14 +251,14 @@ public async Task ExecuteScript_OnV1Tentacle_DoesNotEscalateToAbandon(TentacleCo .WithIsolationLevel(ScriptIsolationLevel.NoIsolation) .Build(); - var tentacleClient = clientTentacle.TentacleClient; + var client = clientTentacle.TentacleClient; using var executionCts = new System.Threading.CancellationTokenSource(); - var execution = Task.Run(async () => await tentacleClient.ExecuteScript( + var execution = Task.Run(async () => await client.ExecuteScript( command, _ => { }, _ => Task.CompletedTask, - new SerilogLoggerBuilder().Build().ForContext().ToITentacleTaskLog(), + Log(), executionCts.Token, abandonAfterCancellationPendingFor: TimeSpan.FromSeconds(1))); @@ -261,30 +269,24 @@ await Wait.For(() => File.Exists(startFile), executionCts.Cancel(); - // Well past the 1s threshold: if the V1 gating were broken the orchestrator would have - // escalated to AbandonScript (which throws NotSupportedException) by now. + // Well past the 1s threshold: if the V1 gate were broken the orchestrator would have escalated to + // AbandonScript (which throws NotSupportedException) by now. await Task.Delay(TimeSpan.FromSeconds(3)); // Release so the stuck script can finish and the run can unwind. File.WriteAllText(releaseFile, ""); - // Cancel path only — never NotSupportedException. await FluentActions.Invoking(async () => await execution).Should().ThrowAsync(); } [Test] [TentacleConfigurations(scriptServiceToTest: ScriptServiceVersionToTest.Version2)] - public async Task ExecuteScript_WhenAQueuedScriptIsCancelled_ExitsViaCancelNotAbandon(TentacleConfigurationTestCase tentacleConfigurationTestCase) + public async Task AbandonScript_WhenAScriptIsWaitingOnAMutex_AndIsCancelled_ReturnsCancelledNotAbandoned(TentacleConfigurationTestCase tentacleConfigurationTestCase) { // End-to-end counterpart to ScriptServiceV2Fixture.AbandonScript_WhileWaitingOnTheIsolationMutex_ExitsCancelled. - // A script still waiting on the isolation mutex never started a process, so when its execution is - // cancelled (with abandon escalation armed) it ends via the cancel path (OperationCanceledException) - // and never runs. Abandon has nothing to act on for a queued script, so it cannot change the outcome. - // If we ever want abandon to win for a queued script, RunningScript.Execute can catch - // (OperationCanceledException) when (abandonToken.IsCancellationRequested) around the mutex Acquire - // and return AbandonedExitCode instead. Deliberately not doing that today. + // A script still waiting on the isolation mutex never started a process, so when it is cancelled (and + // abandoned) it exits cancelled, not abandoned. Abandon has nothing to act on for a queued script. await using var clientTentacle = await tentacleConfigurationTestCase.CreateBuilder() - .WithTentacle(x => x.WithRunTentacleEnvironmentVariable(EnvironmentVariables.TentacleDebugDisableProcessKill_UNSAFE_FOR_PRODUCTION, "1")) .Build(CancellationToken); var holderStartFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "holder-start"); @@ -292,7 +294,7 @@ public async Task ExecuteScript_WhenAQueuedScriptIsCancelled_ExitsViaCancelNotAb var waiterRanFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "waiter-ran"); const string sharedMutex = "queued-cancel-test-mutex"; - // Holder takes the mutex and stays stuck (kill disabled), so the waiter must queue behind it. + // Holder takes the mutex and holds it (it doesn't need to be stuck), so the waiter must queue behind it. var holderCommand = new TestExecuteShellScriptCommandBuilder() .SetScriptBody(new ScriptBuilder() .CreateFile(holderStartFile) @@ -301,8 +303,10 @@ public async Task ExecuteScript_WhenAQueuedScriptIsCancelled_ExitsViaCancelNotAb .WithIsolationMutexName(sharedMutex) .Build(); - var tentacleClient = clientTentacle.TentacleClient; - var holderExecution = Task.Run(async () => await tentacleClient.ExecuteScript(holderCommand, CancellationToken)); + var client = clientTentacle.TentacleClient; + var log = Log(); + + var holderStart = await client.StartScript(holderCommand, StartScriptIsBeingReAttempted.FirstAttempt, log, CancellationToken); await Wait.For(() => File.Exists(holderStartFile), TimeSpan.FromSeconds(30), @@ -316,26 +320,69 @@ await Wait.For(() => File.Exists(holderStartFile), .WithIsolationMutexName(sharedMutex) .Build(); - using var waiterCts = new System.Threading.CancellationTokenSource(); - var waiterExecution = Task.Run(async () => await tentacleClient.ExecuteScript( - waiterCommand, - _ => { }, - _ => Task.CompletedTask, - new SerilogLoggerBuilder().Build().ForContext().ToITentacleTaskLog(), - waiterCts.Token, - abandonAfterCancellationPendingFor: TimeSpan.FromSeconds(2))); + var waiterStart = await client.StartScript(waiterCommand, StartScriptIsBeingReAttempted.FirstAttempt, log, CancellationToken); - // Let the waiter reach the mutex queue, then cancel it. With escalation armed the orchestrator may - // also try to abandon — but abandon can't act on a queued script, so it still ends via cancel. + // Give the waiter time to reach the mutex queue, then cancel and abandon it. await Task.Delay(TimeSpan.FromSeconds(2)); - waiterCts.Cancel(); + var waiterAfterCancel = await client.CancelScript(waiterStart.ContextForNextCommand, log); + await client.AbandonScript(waiterCommand.ScriptTicket, log, CancellationToken); - await FluentActions.Invoking(async () => await waiterExecution).Should().ThrowAsync(); + var waiterFinal = await RunStatusUntilComplete(client, waiterAfterCancel.ContextForNextCommand, log); + waiterFinal.ScriptStatus.ExitCode.Should().Be(ScriptExitCodes.CanceledExitCode); File.Exists(waiterRanFile).Should().BeFalse("a script blocked on the isolation mutex never starts its process"); - // Cleanup: release the holder. + await client.CompleteScript(waiterFinal.ContextForNextCommand, log, CancellationToken); + + // Cleanup: release the holder, then complete it. File.WriteAllText(holderReleaseFile, ""); - await holderExecution; + var holderFinal = await RunStatusUntilComplete(client, holderStart.ContextForNextCommand, log); + await client.CompleteScript(holderFinal.ContextForNextCommand, log, CancellationToken); + } + + static ITentacleClientTaskLog Log() + => new SerilogLoggerBuilder().Build().ForContext().ToITentacleTaskLog(); + + async Task RunStatusUntilComplete(ITentacleClient client, CommandContext context, ITentacleClientTaskLog log) + { + var result = await client.GetStatus(context, log, CancellationToken); + while (result.ScriptStatus.State != ProcessState.Complete) + { + await Task.Delay(TimeSpan.FromMilliseconds(200)); + result = await client.GetStatus(result.ContextForNextCommand, log, CancellationToken); + } + + return result; + } + + static void AssertProcessIsRunning(string pidFile) + { + File.Exists(pidFile).Should().BeTrue($"the script should have written its PID to '{pidFile}'"); + int.TryParse(File.ReadAllText(pidFile).Trim(), out var pid).Should().BeTrue("a valid PID should have been written"); + IsProcessRunning(pid).Should().BeTrue($"abandon leaves an un-killable process running, so PID {pid} should still be alive"); + } + + static void AssertProcessExits(string pidFile, TimeSpan timeout) + { + File.Exists(pidFile).Should().BeTrue($"the script should have written its PID to '{pidFile}'"); + int.TryParse(File.ReadAllText(pidFile).Trim(), out var pid).Should().BeTrue("a valid PID should have been written"); + + // abandon's best-effort kill is asynchronous, so give it a moment to actually terminate. + var deadline = DateTime.UtcNow + timeout; + while (DateTime.UtcNow < deadline && IsProcessRunning(pid)) + System.Threading.Thread.Sleep(100); + + IsProcessRunning(pid).Should().BeFalse($"abandon best-effort-kills a killable process, so PID {pid} should be gone"); + } + + static bool IsProcessRunning(int pid) + { + try + { + using var process = Process.GetProcessById(pid); + return !process.HasExited; + } + catch (ArgumentException) { return false; } // not running + catch (InvalidOperationException) { return false; } // exited } } } diff --git a/source/Octopus.Tentacle.Tests.Integration/Support/TentacleVersions.cs b/source/Octopus.Tentacle.Tests.Integration/Support/TentacleVersions.cs index dc7d09723..806a436b9 100644 --- a/source/Octopus.Tentacle.Tests.Integration/Support/TentacleVersions.cs +++ b/source/Octopus.Tentacle.Tests.Integration/Support/TentacleVersions.cs @@ -81,7 +81,9 @@ public static bool HasScriptServiceV2(this Version? version) public static bool HasAbandonScript(this Version? version) { - // AbandonScriptAsync is advertised by ScriptServiceV2 from this build onward; no released version has it yet. + // Abandon ships in this build and no released version has it yet, so only Current advertises it. + // When it ships in a release, pin that version in TentacleVersions and make this + // `version == Current || version >= v`, the same shape as HasScriptServiceV2. return version == TentacleVersions.Current; } } diff --git a/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs b/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs index c0752f08d..2c0ae51a6 100644 --- a/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs +++ b/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs @@ -277,14 +277,10 @@ public async Task CancellationToken_WhenUnixGrandchildHoldsRedirectedPipes_Shoul } [Test] - public async Task AbandonToken_ReturnsAbandonedExitCode() + public async Task AbandonToken_WhenProcessCanBeKilled_KillsTheProcessAndReturnsAbandonedExitCode() { - // Abandon stops waiting and returns the distinct AbandonedExitCode. Abandon also - // best-effort-kills the process — the kill itself is asserted by - // ClientScriptExecutionAbandon.AbandonScript_WithNoPriorCancel_KillsTheProcess, and the - // un-killable (survives) case by AbandonScript_WhenCancelFailsToKillProcess. (We do NOT - // disable kill here: setting that env var is process-wide and leaks into the Tentacle - // subprocesses other integration tests spawn.) + // Abandon returns AbandonedExitCode and best-effort-kills the process. Here the process is + // killable, so the kill lands and the process is gone. using var tempDir = new TemporaryDirectory(); var pidFile = Path.Combine(tempDir.DirectoryPath, "process.pid"); @@ -318,6 +314,7 @@ public async Task AbandonToken_ReturnsAbandonedExitCode() var exitCode = await task; exitCode.Should().Be(ScriptExitCodes.AbandonedExitCode); infoMessages.ToString().Should().Contain("Tentacle has abandoned this script"); + AssertProcessExitsWithin(pidFile, TimeSpan.FromSeconds(10)); } finally { @@ -365,6 +362,30 @@ static void EnsureProcessKilled(string pidFile) catch (InvalidOperationException) { /* exited between lookup and kill */ } } + static void AssertProcessExitsWithin(string pidFile, TimeSpan timeout) + { + File.Exists(pidFile).Should().BeTrue($"the test should have written a PID to '{pidFile}'"); + int.TryParse(SafelyReadAllText(pidFile).Trim(), out var pid).Should().BeTrue("a valid PID should have been written"); + + // abandon's best-effort kill is asynchronous, so give it a moment to actually terminate. + var deadline = DateTime.UtcNow + timeout; + while (DateTime.UtcNow < deadline && IsProcessRunning(pid)) + Thread.Sleep(100); + + IsProcessRunning(pid).Should().BeFalse($"abandon best-effort-kills a killable process, so PID {pid} should be gone"); + } + + static bool IsProcessRunning(int pid) + { + try + { + using var process = Process.GetProcessById(pid); + return !process.HasExited; + } + catch (ArgumentException) { return false; } // not running + catch (InvalidOperationException) { return false; } // exited + } + static void TryKillGrandchild(string pidFile) { // Best-effort only. The grandchild is reparented to init, so it is NOT our child: Process.WaitForExit From d4e85bacea4ae73f88f630e16f5e15dd2b3d7314 Mon Sep 17 00:00:00 2001 From: Jim Pelletier Date: Fri, 5 Jun 2026 13:00:27 +1000 Subject: [PATCH 12/15] Restore @jimmyp's applied suggestions clobbered by my force-push A force-push from a stale base overwrote 4 'Apply suggestion from @jimmyp' commits (025b768f). Restoring them, reconciled with the capability-gating rewrite: - SilentProcessRunner: restore the swallow/abandon-wait comment wording verbatim. - KubernetesScriptServiceV1Executor: drop 'Hung pods are recovered by deleting the pod instead.' - ClientScriptExecutionAbandon: reword the no-prior-cancel releaseFile comment. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../Scripts/KubernetesScriptServiceV1Executor.cs | 2 +- .../Util/CommandLine/SilentProcessRunner.cs | 9 ++++----- .../ClientScriptExecutionAbandon.cs | 4 ++-- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/source/Octopus.Tentacle.Client/Scripts/KubernetesScriptServiceV1Executor.cs b/source/Octopus.Tentacle.Client/Scripts/KubernetesScriptServiceV1Executor.cs index f575d7892..42e984359 100644 --- a/source/Octopus.Tentacle.Client/Scripts/KubernetesScriptServiceV1Executor.cs +++ b/source/Octopus.Tentacle.Client/Scripts/KubernetesScriptServiceV1Executor.cs @@ -187,7 +187,7 @@ async Task CancelScriptAction(CancellationToke public Task AbandonScript(CommandContext commandContext) // KubernetesScriptServiceV1 has no abandon verb. The orchestrator only escalates to abandon // when the Tentacle advertised the abandon capability (K8s agents never do), so it won't - // escalate here; reaching this is a bug. Hung pods are recovered by deleting the pod instead. + // escalate here; reaching this is a bug. => throw new NotSupportedException("KubernetesScriptServiceV1 cannot abandon a script; it has no abandon verb. Cancel the script instead."); public async Task CompleteScript(CommandContext lastStatusResponse, CancellationToken scriptExecutionCancellationToken) diff --git a/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs b/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs index cb75b4add..e12992b3e 100644 --- a/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs +++ b/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs @@ -148,14 +148,13 @@ void WriteData(Action action, ManualResetEventSlim resetEvent, DataRecei var waitForExit = Task.Run(() => { try { process.WaitForExit(); } - catch { /* swalloe exceptions thrown when released by Process.Dispose in DoOurBestToCleanUp */ } + catch { /* swallow exceptions thrown when released by Process.Dispose in DoOurBestToCleanUp */ } }); // Wait for the process to exit, but break if abandon cancallation token fires. - // Cancel kills then Close()s the process (via cancel.Register) so this returns. - // If the script is not actually stuck when abandon fires, abandoning will be in - // a race with the script to complete, if it wins, we return -48 and leave - // the process running. + // Cancel kills then Close()s the process (via DoOurBestToCleanUp) so this + // then returns. If the script is not actually stuck when abandon fires, + // abandoning will be ina race with the script to complete await Task.WhenAny(waitForExit, WaitForAbandon(abandon)).ConfigureAwait(false); if (abandon.IsCancellationRequested) diff --git a/source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs b/source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs index d25c6b054..70ccddc1f 100644 --- a/source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs +++ b/source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs @@ -101,8 +101,8 @@ await Wait.For(() => File.Exists(startFile), () => throw new Exception("Script did not start"), CancellationToken); - // Direct abandon, NO prior cancel. We never write releaseFile, so the script only completes - // because the abandon branch killed it. + // Direct abandon, NO prior cancel. We never write releaseFile to allow the script to complete, + // so it only completes because the abandon branch killed it. await client.AbandonScript(command.ScriptTicket, log, CancellationToken); var finalResult = await RunStatusUntilComplete(client, startResult.ContextForNextCommand, log); From 5fd82a5a92455b76e037f197921402f2a143e6fd Mon Sep 17 00:00:00 2001 From: Jim Pelletier Date: Fri, 5 Jun 2026 15:44:31 +1000 Subject: [PATCH 13/15] Put SupportsAbandon on ScriptServiceVersion (ScriptServiceVersion2WithAbandon) + review fixes #1 Move abandon-support back onto the version (option A). The selector picks ScriptServiceVersion2WithAbandon when the Tentacle advertised AbandonScript, else ScriptServiceVersion2; the factory routes both to the V2 executor; the executor stamps the selected version onto the CommandContext, so the orchestrator gates on ScripServiceVersionUsed.SupportsAbandon. Reverts the CommandContext flag / capture-once / selector tuple. Capability string is nameof(IScriptServiceV2.AbandonScript) on both sides (server advertise + client check, unit fixture, integration test). Deleted the mocked selector unit test (redundant with the version approach). #2 CapabilitiesServiceV2Test: restore the count assertions (abandon-aware). #3 ClientScriptExecutionAbandon: capture each spawned PID and force-kill in TearDown so nothing leaks on CI. #4 SilentProcessRunnerFixture: add AbandonToken_WhenProcessCanNotBeKilled (asserts the process is still running, then cleans it up). Co-Authored-By: Claude Opus 4.8 (1M context) --- .../ScriptServiceVersionSelectorTests.cs | 73 ------------------- .../CapabilitiesResponseV2ExtensionMethods.cs | 7 +- .../EventDriven/CommandContext.cs | 9 +-- .../Octopus.Tentacle.Client/ScriptExecutor.cs | 12 +-- .../Scripts/ObservingScriptOrchestrator.cs | 3 +- .../Scripts/ScriptExecutorFactory.cs | 5 +- .../Scripts/ScriptServiceV2Executor.cs | 9 ++- .../Scripts/ScriptServiceVersion.cs | 5 ++ .../Scripts/ScriptServiceVersionSelector.cs | 15 ++-- .../CapabilitiesServiceV2Test.cs | 19 ++++- .../ClientScriptExecutionAbandon.cs | 44 +++++++++++ .../Util/SilentProcessRunnerFixture.cs | 52 ++++++++++++- .../CapabilitiesServiceV2Fixture.cs | 2 +- .../Capabilities/CapabilitiesServiceV2.cs | 2 +- 14 files changed, 144 insertions(+), 113 deletions(-) delete mode 100644 source/Octopus.Tentacle.Client.Tests/ScriptServiceVersionSelectorTests.cs diff --git a/source/Octopus.Tentacle.Client.Tests/ScriptServiceVersionSelectorTests.cs b/source/Octopus.Tentacle.Client.Tests/ScriptServiceVersionSelectorTests.cs deleted file mode 100644 index 1eb73556f..000000000 --- a/source/Octopus.Tentacle.Client.Tests/ScriptServiceVersionSelectorTests.cs +++ /dev/null @@ -1,73 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Threading; -using System.Threading.Tasks; -using FluentAssertions; -using Halibut.ServiceModel; -using NSubstitute; -using NUnit.Framework; -using Octopus.Tentacle.Client.Execution; -using Octopus.Tentacle.Client.Observability; -using Octopus.Tentacle.Client.Retries; -using Octopus.Tentacle.Client.Scripts; -using Octopus.Tentacle.Contracts; -using Octopus.Tentacle.Contracts.Capabilities; -using Octopus.Tentacle.Contracts.ClientServices; -using Octopus.Tentacle.Contracts.Logging; -using Octopus.Tentacle.Contracts.Observability; -using Octopus.Tentacle.Contracts.ScriptServiceV2; - -namespace Octopus.Tentacle.Client.Tests -{ - [TestFixture] - public class ScriptServiceVersionSelectorTests - { - [Test] - public async Task WhenTheTentacleAdvertisesAbandon_SupportsAbandonIsTrue() - { - var (version, supportsAbandon) = await SelectFor( - nameof(IScriptService), nameof(IFileTransferService), nameof(IScriptServiceV2), nameof(IAsyncClientScriptServiceV2.AbandonScriptAsync)); - - version.Should().Be(ScriptServiceVersion.ScriptServiceVersion2); - supportsAbandon.Should().BeTrue(); - } - - [Test] - public async Task WhenAnOlderV2TentacleDoesNotAdvertiseAbandon_SupportsAbandonIsFalse() - { - // Old V2 Tentacles are V2 but predate the abandon verb. Gating on "is V2" would wrongly let the - // orchestrator call AbandonScript on them; gating on the advertised capability does not. - var (version, supportsAbandon) = await SelectFor( - nameof(IScriptService), nameof(IFileTransferService), nameof(IScriptServiceV2)); - - version.Should().Be(ScriptServiceVersion.ScriptServiceVersion2); - supportsAbandon.Should().BeFalse(); - } - - [Test] - public async Task WhenTheTentacleOnlyHasV1_SupportsAbandonIsFalse() - { - var (version, supportsAbandon) = await SelectFor( - nameof(IScriptService), nameof(IFileTransferService)); - - version.Should().Be(ScriptServiceVersion.ScriptServiceVersion1); - supportsAbandon.Should().BeFalse(); - } - - static async Task<(ScriptServiceVersion Version, bool SupportsAbandon)> SelectFor(params string[] capabilities) - { - var capabilitiesService = Substitute.For(); - capabilitiesService.GetCapabilitiesAsync(Arg.Any()) - .Returns(Task.FromResult(new CapabilitiesResponseV2(new List(capabilities)))); - - var selector = new ScriptServiceVersionSelector( - capabilitiesService, - Substitute.For(), - RpcCallExecutorFactory.Create(TimeSpan.Zero, Substitute.For()), - new TentacleClientOptions(new RpcRetrySettings(RetriesEnabled: false, RetryDuration: TimeSpan.Zero)), - ClientOperationMetricsBuilder.Start()); - - return await selector.DetermineScriptServiceVersionToUse(CancellationToken.None); - } - } -} diff --git a/source/Octopus.Tentacle.Client/Capabilities/CapabilitiesResponseV2ExtensionMethods.cs b/source/Octopus.Tentacle.Client/Capabilities/CapabilitiesResponseV2ExtensionMethods.cs index 3132dcf5e..353073db7 100644 --- a/source/Octopus.Tentacle.Client/Capabilities/CapabilitiesResponseV2ExtensionMethods.cs +++ b/source/Octopus.Tentacle.Client/Capabilities/CapabilitiesResponseV2ExtensionMethods.cs @@ -1,6 +1,5 @@ using System.Linq; using Octopus.Tentacle.Contracts.Capabilities; -using Octopus.Tentacle.Contracts.ClientServices; using Octopus.Tentacle.Contracts.ScriptServiceV2; namespace Octopus.Tentacle.Client.Capabilities @@ -24,9 +23,9 @@ public static bool HasAbandonScript(this CapabilitiesResponseV2 capabilities) return false; } - // Both sides nameof their own AbandonScriptAsync. The strings match, and a rename on - // either side can't silently drift the capability check. - return capabilities.SupportedCapabilities.Contains(nameof(IAsyncClientScriptServiceV2.AbandonScriptAsync)); + // Both sides nameof IScriptServiceV2.AbandonScript, so the strings match and a rename + // on either side can't silently drift the capability check. + return capabilities.SupportedCapabilities.Contains(nameof(IScriptServiceV2.AbandonScript)); } } } \ No newline at end of file diff --git a/source/Octopus.Tentacle.Client/EventDriven/CommandContext.cs b/source/Octopus.Tentacle.Client/EventDriven/CommandContext.cs index 7a6139772..0f34b8108 100644 --- a/source/Octopus.Tentacle.Client/EventDriven/CommandContext.cs +++ b/source/Octopus.Tentacle.Client/EventDriven/CommandContext.cs @@ -12,22 +12,15 @@ public class CommandContext { public CommandContext(ScriptTicket scriptTicket, long nextLogSequence, - ScriptServiceVersion scripServiceVersionUsed, - bool supportsAbandon = false) + ScriptServiceVersion scripServiceVersionUsed) { ScriptTicket = scriptTicket; NextLogSequence = nextLogSequence; ScripServiceVersionUsed = scripServiceVersionUsed; - SupportsAbandon = supportsAbandon; } public ScriptTicket ScriptTicket { get; } public long NextLogSequence { get; } public ScriptServiceVersion ScripServiceVersionUsed { get; } - - // Whether the Tentacle advertised the AbandonScript capability. Determined once from - // capabilities when the script starts and carried on the start result so the orchestrator - // can gate abandon escalation on it. - public bool SupportsAbandon { get; } } } \ No newline at end of file diff --git a/source/Octopus.Tentacle.Client/ScriptExecutor.cs b/source/Octopus.Tentacle.Client/ScriptExecutor.cs index 51ce32771..1ce5e1f2c 100644 --- a/source/Octopus.Tentacle.Client/ScriptExecutor.cs +++ b/source/Octopus.Tentacle.Client/ScriptExecutor.cs @@ -46,18 +46,12 @@ public async Task StartScript(ExecuteScriptComma { // Note: This class deliberately does not create OpenTelemetry Trace activities. // It is a facade over other ScriptExecutor services, and the facade doesn't do anything interesting - var (scriptServiceVersionToUse, supportsAbandon) = await DetermineScriptServiceVersionToUse(cancellationToken); + var scriptServiceVersionToUse = await DetermineScriptServiceVersionToUse(cancellationToken); var scriptExecutorFactory = CreateScriptExecutorFactory(); var scriptExecutor = scriptExecutorFactory.CreateScriptExecutor(scriptServiceVersionToUse); - var result = await scriptExecutor.StartScript(executeScriptCommand, startScriptIsBeingReAttempted, cancellationToken); - - // Carry whether the Tentacle advertised abandon onto the context the orchestrator reads, so it can - // gate escalation on the real capability rather than just "is V2". - var context = result.ContextForNextCommand; - var contextWithAbandon = new CommandContext(context.ScriptTicket, context.NextLogSequence, context.ScripServiceVersionUsed, supportsAbandon); - return new ScriptOperationExecutionResult(result.ScriptStatus, contextWithAbandon); + return await scriptExecutor.StartScript(executeScriptCommand, startScriptIsBeingReAttempted, cancellationToken); } public async Task GetStatus(CommandContext commandContext, CancellationToken cancellationToken) @@ -102,7 +96,7 @@ ScriptExecutorFactory CreateScriptExecutorFactory() logger); } - async Task<(ScriptServiceVersion Version, bool SupportsAbandon)> DetermineScriptServiceVersionToUse(CancellationToken cancellationToken) + async Task DetermineScriptServiceVersionToUse(CancellationToken cancellationToken) { try { diff --git a/source/Octopus.Tentacle.Client/Scripts/ObservingScriptOrchestrator.cs b/source/Octopus.Tentacle.Client/Scripts/ObservingScriptOrchestrator.cs index b55e5bd6d..9be49b13f 100644 --- a/source/Octopus.Tentacle.Client/Scripts/ObservingScriptOrchestrator.cs +++ b/source/Octopus.Tentacle.Client/Scripts/ObservingScriptOrchestrator.cs @@ -83,7 +83,6 @@ async Task ObserveUntilComplete( var iteration = 0; var cancellationIteration = 0; var lastResult = startScriptResult; - var supportsAbandon = startScriptResult.ContextForNextCommand.SupportsAbandon; var stopwatch = new Stopwatch(); while (lastResult.ScriptStatus.State != ProcessState.Complete) @@ -99,7 +98,7 @@ async Task ObserveUntilComplete( // Only escalate to abandon when the Tentacle advertised the abandon capability. Old V2 // Tentacles (pre-abandon) and V1/Kubernetes don't, so we keep cancelling rather than // calling a verb they don't have. - var shouldAbandon = supportsAbandon + var shouldAbandon = lastResult.ContextForNextCommand.ScripServiceVersionUsed.SupportsAbandon && abandonAfterCancellationPendingFor.HasValue && stopwatch.Elapsed >= abandonAfterCancellationPendingFor.Value; diff --git a/source/Octopus.Tentacle.Client/Scripts/ScriptExecutorFactory.cs b/source/Octopus.Tentacle.Client/Scripts/ScriptExecutorFactory.cs index a67824d6f..af46e33af 100644 --- a/source/Octopus.Tentacle.Client/Scripts/ScriptExecutorFactory.cs +++ b/source/Octopus.Tentacle.Client/Scripts/ScriptExecutorFactory.cs @@ -43,9 +43,12 @@ public IScriptExecutor CreateScriptExecutor(ScriptServiceVersion scriptServiceTo logger); } - if (scriptServiceToUse == ScriptServiceVersion.ScriptServiceVersion2) + if (scriptServiceToUse == ScriptServiceVersion.ScriptServiceVersion2 || scriptServiceToUse == ScriptServiceVersion.ScriptServiceVersion2WithAbandon) { + // Both variants talk to the same V2 service; pass the selected version so it's stamped + // onto the CommandContext and the orchestrator can read SupportsAbandon from it. return new ScriptServiceV2Executor( + scriptServiceToUse, allClients.ScriptServiceV2, rpcCallExecutor, clientOperationMetricsBuilder, diff --git a/source/Octopus.Tentacle.Client/Scripts/ScriptServiceV2Executor.cs b/source/Octopus.Tentacle.Client/Scripts/ScriptServiceV2Executor.cs index 229e917ad..a90338e7d 100644 --- a/source/Octopus.Tentacle.Client/Scripts/ScriptServiceV2Executor.cs +++ b/source/Octopus.Tentacle.Client/Scripts/ScriptServiceV2Executor.cs @@ -24,8 +24,10 @@ class ScriptServiceV2Executor : IScriptExecutor readonly TimeSpan onCancellationAbandonCompleteScriptAfter; readonly ITentacleClientTaskLog logger; readonly TentacleClientOptions clientOptions; + readonly ScriptServiceVersion scriptServiceVersion; public ScriptServiceV2Executor( + ScriptServiceVersion scriptServiceVersion, IAsyncClientScriptServiceV2 clientScriptServiceV2, RpcCallExecutor rpcCallExecutor, ClientOperationMetricsBuilder clientOperationMetricsBuilder, @@ -33,6 +35,7 @@ public ScriptServiceV2Executor( TentacleClientOptions clientOptions, ITentacleClientTaskLog logger) { + this.scriptServiceVersion = scriptServiceVersion; this.clientScriptServiceV2 = clientScriptServiceV2; this.rpcCallExecutor = rpcCallExecutor; this.clientOperationMetricsBuilder = clientOperationMetricsBuilder; @@ -59,11 +62,11 @@ StartScriptCommandV2 Map(ExecuteScriptCommand command) shellScriptCommand.Files.ToArray()); } - static ScriptOperationExecutionResult Map(ScriptStatusResponseV2 scriptStatusResponse) + ScriptOperationExecutionResult Map(ScriptStatusResponseV2 scriptStatusResponse) { return new ( new ScriptStatus(scriptStatusResponse.State, scriptStatusResponse.ExitCode, scriptStatusResponse.Logs), - new CommandContext(scriptStatusResponse.Ticket, scriptStatusResponse.NextLogSequence, ScriptServiceVersion.ScriptServiceVersion2)); + new CommandContext(scriptStatusResponse.Ticket, scriptStatusResponse.NextLogSequence, scriptServiceVersion)); } public async Task StartScript(ExecuteScriptCommand executeScriptCommand, @@ -116,7 +119,7 @@ void OnErrorAction(Exception ex) if (!startScriptCallIsConnecting || startScriptCallIsBeingRetried) { // We want to cancel the potentially started script, and wait till it finishes. By returning a result, the outer orchestration will take care of this. - return ScriptOperationExecutionResult.CreateScriptStartedResult(command.ScriptTicket, ScriptServiceVersion.ScriptServiceVersion2); + return ScriptOperationExecutionResult.CreateScriptStartedResult(command.ScriptTicket, scriptServiceVersion); } // If the StartScript call was not in-flight or being retries then we know the script has not started executing on Tentacle diff --git a/source/Octopus.Tentacle.Client/Scripts/ScriptServiceVersion.cs b/source/Octopus.Tentacle.Client/Scripts/ScriptServiceVersion.cs index ad8cfa420..84534100f 100644 --- a/source/Octopus.Tentacle.Client/Scripts/ScriptServiceVersion.cs +++ b/source/Octopus.Tentacle.Client/Scripts/ScriptServiceVersion.cs @@ -4,8 +4,13 @@ public record ScriptServiceVersion(string Value) { public static ScriptServiceVersion ScriptServiceVersion1 = new(nameof(ScriptServiceVersion1)); public static ScriptServiceVersion ScriptServiceVersion2 = new(nameof(ScriptServiceVersion2)); + public static ScriptServiceVersion ScriptServiceVersion2WithAbandon = new(nameof(ScriptServiceVersion2WithAbandon)); public static ScriptServiceVersion KubernetesScriptServiceVersion1 = new(nameof(KubernetesScriptServiceVersion1)); + // Only a V2 Tentacle that advertised the AbandonScript capability supports abandon. Old V2 + // Tentacles predate the verb, so they select ScriptServiceVersion2, not ...WithAbandon. + public bool SupportsAbandon => Value == nameof(ScriptServiceVersion2WithAbandon); + public override string ToString() => Value; } } diff --git a/source/Octopus.Tentacle.Client/Scripts/ScriptServiceVersionSelector.cs b/source/Octopus.Tentacle.Client/Scripts/ScriptServiceVersionSelector.cs index f449d63e4..674ab88e7 100644 --- a/source/Octopus.Tentacle.Client/Scripts/ScriptServiceVersionSelector.cs +++ b/source/Octopus.Tentacle.Client/Scripts/ScriptServiceVersionSelector.cs @@ -34,7 +34,7 @@ internal ScriptServiceVersionSelector( this.clientOperationMetricsBuilder = clientOperationMetricsBuilder; } - public async Task<(ScriptServiceVersion Version, bool SupportsAbandon)> DetermineScriptServiceVersionToUse(CancellationToken cancellationToken) + public async Task DetermineScriptServiceVersionToUse(CancellationToken cancellationToken) { logger.Verbose("Determining ScriptService version to use"); @@ -59,13 +59,10 @@ async Task GetCapabilitiesFunc(CancellationToken ct) // It's implied (and tested) that GetCapabilities will only return Kubernetes or non-Kubernetes script services, never a mix if (tentacleCapabilities.HasAnyKubernetesScriptService()) { - // Kubernetes agents never advertise abandon. - return (DetermineKubernetesScriptServiceVersionToUse(), false); + return DetermineKubernetesScriptServiceVersionToUse(); } - // Abandon support is whether the Tentacle actually advertised the capability, not just that it - // is V2 — old V2 Tentacles (pre-abandon) are V2 but have no abandon verb. - return (DetermineShellScriptServiceVersionToUse(tentacleCapabilities), tentacleCapabilities.HasAbandonScript()); + return DetermineShellScriptServiceVersionToUse(tentacleCapabilities); } ScriptServiceVersion DetermineShellScriptServiceVersionToUse(CapabilitiesResponseV2 tentacleCapabilities) @@ -76,7 +73,11 @@ ScriptServiceVersion DetermineShellScriptServiceVersionToUse(CapabilitiesRespons logger.Verbose(clientOptions.RpcRetrySettings.RetriesEnabled ? $"RPC call retries are enabled. Retry timeout {rpcCallExecutor.RetryTimeout.TotalSeconds} seconds" : "RPC call retries are disabled."); - return ScriptServiceVersion.ScriptServiceVersion2; + // Old V2 Tentacles are V2 but predate the abandon verb; only pick the abandon-capable + // variant when the Tentacle actually advertised AbandonScript. + return tentacleCapabilities.HasAbandonScript() + ? ScriptServiceVersion.ScriptServiceVersion2WithAbandon + : ScriptServiceVersion.ScriptServiceVersion2; } logger.Verbose("RPC call retries are enabled but will not be used for Script Execution as a compatible ScriptService was not found. Please upgrade Tentacle to enable this feature."); diff --git a/source/Octopus.Tentacle.Tests.Integration/CapabilitiesServiceV2Test.cs b/source/Octopus.Tentacle.Tests.Integration/CapabilitiesServiceV2Test.cs index dbc17cf75..208fe0dec 100644 --- a/source/Octopus.Tentacle.Tests.Integration/CapabilitiesServiceV2Test.cs +++ b/source/Octopus.Tentacle.Tests.Integration/CapabilitiesServiceV2Test.cs @@ -9,7 +9,6 @@ using Octopus.Tentacle.Contracts.Capabilities; using Octopus.Tentacle.Contracts.KubernetesScriptServiceV1; using Octopus.Tentacle.Contracts.ScriptServiceV2; -using Octopus.Tentacle.Core.Services.Scripts; using Octopus.Tentacle.Tests.Integration.Common.Builders.Decorators; using Octopus.Tentacle.Tests.Integration.Support; using Octopus.Tentacle.Tests.Integration.Util.Builders; @@ -31,8 +30,15 @@ public async Task CapabilitiesFromAnOlderTentacleWhichHasNoCapabilitiesService_W capabilities.Should().Contain(nameof(IScriptService)); capabilities.Should().Contain(nameof(IFileTransferService)); + var expectedCount = 2; if (version.HasScriptServiceV2()) + { capabilities.Should().Contain(nameof(IScriptServiceV2)); + expectedCount++; + } + if (version.HasAbandonScript()) + expectedCount++; + capabilities.Count.Should().Be(expectedCount); } [Test] @@ -47,10 +53,17 @@ public async Task CapabilitiesServiceDoesNotReturnKubernetesScriptServiceForNonK capabilities.Should().Contain(nameof(IScriptService)); capabilities.Should().Contain(nameof(IFileTransferService)); + var expectedCount = 2; if (version.HasScriptServiceV2()) + { capabilities.Should().Contain(nameof(IScriptServiceV2)); + expectedCount++; + } + if (version.HasAbandonScript()) + expectedCount++; capabilities.Should().NotContain(nameof(IKubernetesScriptServiceV1)); + capabilities.Count.Should().Be(expectedCount); } [Test] @@ -65,9 +78,9 @@ public async Task AbandonScriptIsAdvertisedOnlyByVersionsThatSupportIt(TentacleC // Older Tentacles predate abandon and must not advertise it; this build (and later) must. if (version.HasAbandonScript()) - capabilities.Should().Contain(nameof(ScriptServiceV2.AbandonScriptAsync)); + capabilities.Should().Contain(nameof(IScriptServiceV2.AbandonScript)); else - capabilities.Should().NotContain(nameof(ScriptServiceV2.AbandonScriptAsync)); + capabilities.Should().NotContain(nameof(IScriptServiceV2.AbandonScript)); } [Test] diff --git a/source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs b/source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs index 70ccddc1f..959e80d20 100644 --- a/source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs +++ b/source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs @@ -1,4 +1,5 @@ using System; +using System.Collections.Generic; using System.Diagnostics; using System.IO; using System.Threading.Tasks; @@ -19,6 +20,23 @@ namespace Octopus.Tentacle.Tests.Integration [IntegrationTestTimeout] public class ClientScriptExecutionAbandon : IntegrationTest { + // PIDs of script processes a test spawned. Captured during the test (while the Tentacle's temp dir + // still exists) and force-killed in TearDown so nothing leaks onto the CI agent — including the + // un-killable (kill-disabled) processes, and any process left behind when a test fails early. + readonly List spawnedProcessPids = new(); + + [TearDown] + public void ForceKillSpawnedProcesses() + { + foreach (var pid in spawnedProcessPids) + { + try { using var process = Process.GetProcessById(pid); process.Kill(); } + catch { /* already gone — best effort */ } + } + + spawnedProcessPids.Clear(); + } + [Test] [TentacleConfigurations(scriptServiceToTest: ScriptServiceVersionToTest.Version2)] public async Task AbandonScript_WhenProcessCannotBeKilled_LeavesItRunningAndReturnsAbandonedExitCode(TentacleConfigurationTestCase tentacleConfigurationTestCase) @@ -53,6 +71,8 @@ await Wait.For(() => File.Exists(startFile), () => throw new Exception("Script did not start"), CancellationToken); + TrackSpawnedProcess(pidFile); + // Cancel: Hitman is a no-op, so the process keeps running. var afterCancel = await client.CancelScript(startResult.ContextForNextCommand, log); @@ -101,6 +121,8 @@ await Wait.For(() => File.Exists(startFile), () => throw new Exception("Script did not start"), CancellationToken); + TrackSpawnedProcess(pidFile); + // Direct abandon, NO prior cancel. We never write releaseFile to allow the script to complete, // so it only completes because the abandon branch killed it. await client.AbandonScript(command.ScriptTicket, log, CancellationToken); @@ -127,10 +149,12 @@ public async Task AbandonScript_ReleasesIsolationMutexEvenWhileProcessIsStillRun var startFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "start"); var releaseFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "release"); + var pidFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "pid"); const string sharedMutex = "abandon-test-mutex"; var firstCommand = new TestExecuteShellScriptCommandBuilder() .SetScriptBody(new ScriptBuilder() + .WritePidToFile(pidFile) .CreateFile(startFile) .WaitForFileToExist(releaseFile)) .WithIsolationLevel(ScriptIsolationLevel.FullIsolation) @@ -147,6 +171,8 @@ await Wait.For(() => File.Exists(startFile), () => throw new Exception("First script did not start"), CancellationToken); + TrackSpawnedProcess(pidFile); + var afterCancel = await client.CancelScript(startResult.ContextForNextCommand, log); await client.AbandonScript(firstCommand.ScriptTicket, log, CancellationToken); @@ -184,10 +210,12 @@ public async Task ExecuteScript_WhenCancellationStaysPending_EscalatesToAbandon_ var startFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "start"); var releaseFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "release"); + var pidFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "pid"); const string sharedMutex = "escalation-test-mutex"; var stuckCommand = new TestExecuteShellScriptCommandBuilder() .SetScriptBody(new ScriptBuilder() + .WritePidToFile(pidFile) .CreateFile(startFile) .WaitForFileToExist(releaseFile)) .WithIsolationLevel(ScriptIsolationLevel.FullIsolation) @@ -210,6 +238,8 @@ await Wait.For(() => File.Exists(startFile), () => throw new Exception("Script did not start"), CancellationToken); + TrackSpawnedProcess(pidFile); + // Cancel stays pending (kill disabled), so after the threshold the orchestrator escalates to abandon. executionCts.Cancel(); await FluentActions.Invoking(async () => await stuckExecution).Should().ThrowAsync(); @@ -243,9 +273,11 @@ public async Task ExecuteScript_OnV1Tentacle_DoesNotEscalateToAbandon(TentacleCo var startFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "start"); var releaseFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "release"); + var pidFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "pid"); var command = new TestExecuteShellScriptCommandBuilder() .SetScriptBody(new ScriptBuilder() + .WritePidToFile(pidFile) .CreateFile(startFile) .WaitForFileToExist(releaseFile)) .WithIsolationLevel(ScriptIsolationLevel.NoIsolation) @@ -267,6 +299,8 @@ await Wait.For(() => File.Exists(startFile), () => throw new Exception("Script did not start"), CancellationToken); + TrackSpawnedProcess(pidFile); + executionCts.Cancel(); // Well past the 1s threshold: if the V1 gate were broken the orchestrator would have escalated to @@ -291,12 +325,14 @@ public async Task AbandonScript_WhenAScriptIsWaitingOnAMutex_AndIsCancelled_Retu var holderStartFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "holder-start"); var holderReleaseFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "holder-release"); + var holderPidFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "holder-pid"); var waiterRanFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "waiter-ran"); const string sharedMutex = "queued-cancel-test-mutex"; // Holder takes the mutex and holds it (it doesn't need to be stuck), so the waiter must queue behind it. var holderCommand = new TestExecuteShellScriptCommandBuilder() .SetScriptBody(new ScriptBuilder() + .WritePidToFile(holderPidFile) .CreateFile(holderStartFile) .WaitForFileToExist(holderReleaseFile)) .WithIsolationLevel(ScriptIsolationLevel.FullIsolation) @@ -313,6 +349,8 @@ await Wait.For(() => File.Exists(holderStartFile), () => throw new Exception("Holder script did not start"), CancellationToken); + TrackSpawnedProcess(holderPidFile); + // Waiter wants the same mutex, so it blocks acquiring it — its process never starts. var waiterCommand = new TestExecuteShellScriptCommandBuilder() .SetScriptBody(new ScriptBuilder().CreateFile(waiterRanFile)) @@ -339,6 +377,12 @@ await Wait.For(() => File.Exists(holderStartFile), await client.CompleteScript(holderFinal.ContextForNextCommand, log, CancellationToken); } + void TrackSpawnedProcess(string pidFile) + { + if (File.Exists(pidFile) && int.TryParse(File.ReadAllText(pidFile).Trim(), out var pid) && pid > 0) + spawnedProcessPids.Add(pid); + } + static ITentacleClientTaskLog Log() => new SerilogLoggerBuilder().Build().ForContext().ToITentacleTaskLog(); diff --git a/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs b/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs index 2c0ae51a6..71937ed4f 100644 --- a/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs +++ b/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs @@ -309,15 +309,58 @@ public async Task AbandonToken_WhenProcessCanBeKilled_KillsTheProcessAndReturnsA await WaitForPidFileAsync(pidFile, TimeSpan.FromSeconds(30)); abandonCts.Cancel(); + var exitCode = await task; + exitCode.Should().Be(ScriptExitCodes.AbandonedExitCode); + infoMessages.ToString().Should().Contain("Tentacle has abandoned this script"); + } + + [Test] + [NonParallelizable] + public async Task AbandonToken_WhenProcessCanNotBeKilled_ReturnsAbandonedExitCode() + { + // Simulate a stuck process: TentacleDebugDisableProcessKill makes Hitman a no-op, so abandon's + // best-effort kill can't terminate it. Abandon still returns AbandonedExitCode without waiting and + // the process is left running. That env var is process-wide, so the test is NonParallelizable and + // resets it (and force-kills the leftover process) in finally. + using var tempDir = new TemporaryDirectory(); + var pidFile = Path.Combine(tempDir.DirectoryPath, "process.pid"); + + var abandonCommand = PlatformDetection.IsRunningOnWindows ? "powershell.exe" : "/bin/bash"; + var arguments = PlatformDetection.IsRunningOnWindows + ? $"-NoProfile -NonInteractive -Command \"$PID | Out-File -FilePath '{pidFile}' -Encoding ASCII; Start-Sleep -Seconds 300\"" + : $"-c \"echo $$ > '{pidFile}' && sleep 300\""; + + using var cancelCts = new CancellationTokenSource(); + using var abandonCts = new CancellationTokenSource(); + var infoMessages = new StringBuilder(); + + Environment.SetEnvironmentVariable(Octopus.Tentacle.Core.Util.EnvironmentVariables.TentacleDebugDisableProcessKill_UNSAFE_FOR_PRODUCTION, "1"); try { + var task = Task.Run(async () => await SilentProcessRunner.ExecuteCommandAsync( + abandonCommand, + arguments, + Environment.CurrentDirectory, + debug: _ => { }, + info: msg => { lock (infoMessages) infoMessages.AppendLine(msg); }, + error: _ => { }, + customEnvironmentVariables: null, + cancel: cancelCts.Token, + abandon: abandonCts.Token)); + + await WaitForPidFileAsync(pidFile, TimeSpan.FromSeconds(30)); + abandonCts.Cancel(); + var exitCode = await task; exitCode.Should().Be(ScriptExitCodes.AbandonedExitCode); infoMessages.ToString().Should().Contain("Tentacle has abandoned this script"); - AssertProcessExitsWithin(pidFile, TimeSpan.FromSeconds(10)); + + // Kill was a no-op, so abandon left the process running. + AssertProcessIsRunning(pidFile); } finally { + Environment.SetEnvironmentVariable(Octopus.Tentacle.Core.Util.EnvironmentVariables.TentacleDebugDisableProcessKill_UNSAFE_FOR_PRODUCTION, null); EnsureProcessKilled(pidFile); } } @@ -375,6 +418,13 @@ static void AssertProcessExitsWithin(string pidFile, TimeSpan timeout) IsProcessRunning(pid).Should().BeFalse($"abandon best-effort-kills a killable process, so PID {pid} should be gone"); } + static void AssertProcessIsRunning(string pidFile) + { + File.Exists(pidFile).Should().BeTrue($"the test should have written a PID to '{pidFile}'"); + int.TryParse(SafelyReadAllText(pidFile).Trim(), out var pid).Should().BeTrue("a valid PID should have been written"); + IsProcessRunning(pid).Should().BeTrue($"abandon leaves an un-killable process running, so PID {pid} should still be alive"); + } + static bool IsProcessRunning(int pid) { try diff --git a/source/Octopus.Tentacle.Tests/Capabilities/CapabilitiesServiceV2Fixture.cs b/source/Octopus.Tentacle.Tests/Capabilities/CapabilitiesServiceV2Fixture.cs index 3f6b03be6..07df373d8 100644 --- a/source/Octopus.Tentacle.Tests/Capabilities/CapabilitiesServiceV2Fixture.cs +++ b/source/Octopus.Tentacle.Tests/Capabilities/CapabilitiesServiceV2Fixture.cs @@ -21,7 +21,7 @@ public async Task CapabilitiesAreReturned() .GetCapabilitiesAsync(CancellationToken.None)) .SupportedCapabilities; - capabilities.Should().BeEquivalentTo(nameof(IScriptService), nameof(IFileTransferService), nameof(IScriptServiceV2), nameof(ScriptServiceV2.AbandonScriptAsync)); + capabilities.Should().BeEquivalentTo(nameof(IScriptService), nameof(IFileTransferService), nameof(IScriptServiceV2), nameof(IScriptServiceV2.AbandonScript)); capabilities.Should().NotContainMatch("IKubernetesScriptService*"); } diff --git a/source/Octopus.Tentacle/Services/Capabilities/CapabilitiesServiceV2.cs b/source/Octopus.Tentacle/Services/Capabilities/CapabilitiesServiceV2.cs index a4537f26a..a831d54a2 100644 --- a/source/Octopus.Tentacle/Services/Capabilities/CapabilitiesServiceV2.cs +++ b/source/Octopus.Tentacle/Services/Capabilities/CapabilitiesServiceV2.cs @@ -25,7 +25,7 @@ public async Task GetCapabilitiesAsync(CancellationToken } //non-kubernetes agent tentacles only support the standard script services - return new CapabilitiesResponseV2(new List { nameof(IScriptService), nameof(IFileTransferService), nameof(IScriptServiceV2), nameof(ScriptServiceV2.AbandonScriptAsync) }); + return new CapabilitiesResponseV2(new List { nameof(IScriptService), nameof(IFileTransferService), nameof(IScriptServiceV2), nameof(IScriptServiceV2.AbandonScript) }); } } } \ No newline at end of file From 21b8d2a86e695b65f0e744301e98b63801ede188 Mon Sep 17 00:00:00 2001 From: Jim Pelletier Date: Fri, 5 Jun 2026 16:08:37 +1000 Subject: [PATCH 14/15] Address review: ScriptServiceVersion.IsV2, split escalation test, V1 NotSupported assertion - Add ScriptServiceVersion.IsV2 (true for both V2 variants); ScriptExecutorFactory dispatches on it and the explanatory comment is gone. - Split the escalation test: ExecuteScript_WhenCancellationStaysPending_EscalatesToAbandon now just proves the run terminates; the mutex-release outcome is its own test, ..._AndReleasesTheIsolationMutex. - V1 test: assert the run does NOT throw NotSupportedException (rather than asserting it throws OCE) and rename to ExecuteScript_OnV1Tentacle_DoesNotEscalateToNotSupportedAbandon. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../Scripts/ScriptExecutorFactory.cs | 4 +- .../Scripts/ScriptServiceVersion.cs | 4 + .../ClientScriptExecutionAbandon.cs | 74 +++++++++++++++---- 3 files changed, 65 insertions(+), 17 deletions(-) diff --git a/source/Octopus.Tentacle.Client/Scripts/ScriptExecutorFactory.cs b/source/Octopus.Tentacle.Client/Scripts/ScriptExecutorFactory.cs index af46e33af..fa8a2c7b6 100644 --- a/source/Octopus.Tentacle.Client/Scripts/ScriptExecutorFactory.cs +++ b/source/Octopus.Tentacle.Client/Scripts/ScriptExecutorFactory.cs @@ -43,10 +43,8 @@ public IScriptExecutor CreateScriptExecutor(ScriptServiceVersion scriptServiceTo logger); } - if (scriptServiceToUse == ScriptServiceVersion.ScriptServiceVersion2 || scriptServiceToUse == ScriptServiceVersion.ScriptServiceVersion2WithAbandon) + if (scriptServiceToUse.IsV2) { - // Both variants talk to the same V2 service; pass the selected version so it's stamped - // onto the CommandContext and the orchestrator can read SupportsAbandon from it. return new ScriptServiceV2Executor( scriptServiceToUse, allClients.ScriptServiceV2, diff --git a/source/Octopus.Tentacle.Client/Scripts/ScriptServiceVersion.cs b/source/Octopus.Tentacle.Client/Scripts/ScriptServiceVersion.cs index 84534100f..9091daf3f 100644 --- a/source/Octopus.Tentacle.Client/Scripts/ScriptServiceVersion.cs +++ b/source/Octopus.Tentacle.Client/Scripts/ScriptServiceVersion.cs @@ -7,6 +7,10 @@ public record ScriptServiceVersion(string Value) public static ScriptServiceVersion ScriptServiceVersion2WithAbandon = new(nameof(ScriptServiceVersion2WithAbandon)); public static ScriptServiceVersion KubernetesScriptServiceVersion1 = new(nameof(KubernetesScriptServiceVersion1)); + // True for both V2 variants — they talk to the same ScriptServiceV2; ...WithAbandon just also + // advertised the abandon verb. + public bool IsV2 => Value == nameof(ScriptServiceVersion2) || Value == nameof(ScriptServiceVersion2WithAbandon); + // Only a V2 Tentacle that advertised the AbandonScript capability supports abandon. Old V2 // Tentacles predate the verb, so they select ScriptServiceVersion2, not ...WithAbandon. public bool SupportsAbandon => Value == nameof(ScriptServiceVersion2WithAbandon); diff --git a/source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs b/source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs index 959e80d20..8e1db4a63 100644 --- a/source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs +++ b/source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs @@ -197,13 +197,12 @@ await Wait.For(() => File.Exists(startFile), [Test] [TentacleConfigurations(scriptServiceToTest: ScriptServiceVersionToTest.Version2)] - public async Task ExecuteScript_WhenCancellationStaysPending_EscalatesToAbandon_AndReleasesMutex(TentacleConfigurationTestCase tentacleConfigurationTestCase) + public async Task ExecuteScript_WhenCancellationStaysPending_EscalatesToAbandon(TentacleConfigurationTestCase tentacleConfigurationTestCase) { // Kill is disabled, so the script is genuinely stuck and CancelScript can't end it. With - // abandonAfterCancellationPendingFor set short, the orchestrator escalates from cancel to abandon, - // which releases the isolation mutex. We prove that by the outcome: a second FullIsolation script - // with the same mutex name can only run once the mutex is released. This is the ExecuteScript - // (orchestrator) path; the explicit-AbandonScript path is AbandonScript_ReleasesIsolationMutex... above. + // abandonAfterCancellationPendingFor set short, the orchestrator escalates from cancel to abandon and + // the run terminates (throws OperationCanceledException) instead of hanging on the stuck script. The + // mutex-release outcome of that escalation is its own test, ..._AndReleasesTheIsolationMutex below. await using var clientTentacle = await tentacleConfigurationTestCase.CreateBuilder() .WithTentacle(x => x.WithRunTentacleEnvironmentVariable(EnvironmentVariables.TentacleDebugDisableProcessKill_UNSAFE_FOR_PRODUCTION, "1")) .Build(CancellationToken); @@ -211,7 +210,57 @@ public async Task ExecuteScript_WhenCancellationStaysPending_EscalatesToAbandon_ var startFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "start"); var releaseFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "release"); var pidFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "pid"); - const string sharedMutex = "escalation-test-mutex"; + + var stuckCommand = new TestExecuteShellScriptCommandBuilder() + .SetScriptBody(new ScriptBuilder() + .WritePidToFile(pidFile) + .CreateFile(startFile) + .WaitForFileToExist(releaseFile)) + .WithIsolationLevel(ScriptIsolationLevel.NoIsolation) + .Build(); + + var client = clientTentacle.TentacleClient; + + using var executionCts = new System.Threading.CancellationTokenSource(); + var stuckExecution = Task.Run(async () => await client.ExecuteScript( + stuckCommand, + _ => { }, + _ => Task.CompletedTask, + Log(), + executionCts.Token, + abandonAfterCancellationPendingFor: TimeSpan.FromSeconds(2))); + + await Wait.For(() => File.Exists(startFile), + TimeSpan.FromSeconds(30), + () => throw new Exception("Script did not start"), + CancellationToken); + + TrackSpawnedProcess(pidFile); + + // Cancel stays pending (kill disabled), so after the threshold the orchestrator escalates to abandon + // and the run ends. + executionCts.Cancel(); + await FluentActions.Invoking(async () => await stuckExecution).Should().ThrowAsync(); + + File.WriteAllText(releaseFile, ""); + } + + [Test] + [TentacleConfigurations(scriptServiceToTest: ScriptServiceVersionToTest.Version2)] + public async Task ExecuteScript_WhenCancellationStaysPending_EscalatesToAbandon_AndReleasesTheIsolationMutex(TentacleConfigurationTestCase tentacleConfigurationTestCase) + { + // The mutex-release outcome of the escalation in ..._EscalatesToAbandon above: once the orchestrator + // escalates a stuck FullIsolation script to abandon, the isolation mutex is freed, so a second + // FullIsolation script with the same mutex name can run. (Explicit-AbandonScript equivalent: + // AbandonScript_ReleasesIsolationMutexEvenWhileProcessIsStillRunning.) + await using var clientTentacle = await tentacleConfigurationTestCase.CreateBuilder() + .WithTentacle(x => x.WithRunTentacleEnvironmentVariable(EnvironmentVariables.TentacleDebugDisableProcessKill_UNSAFE_FOR_PRODUCTION, "1")) + .Build(CancellationToken); + + var startFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "start"); + var releaseFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "release"); + var pidFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "pid"); + const string sharedMutex = "escalation-mutex-release-test"; var stuckCommand = new TestExecuteShellScriptCommandBuilder() .SetScriptBody(new ScriptBuilder() @@ -239,12 +288,10 @@ await Wait.For(() => File.Exists(startFile), CancellationToken); TrackSpawnedProcess(pidFile); - - // Cancel stays pending (kill disabled), so after the threshold the orchestrator escalates to abandon. executionCts.Cancel(); await FluentActions.Invoking(async () => await stuckExecution).Should().ThrowAsync(); - // Outcome: a second FullIsolation script with the same mutex now runs — only possible if abandon released it. + // A second FullIsolation script with the same mutex now runs — only possible if abandon released it. var secondStartFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "second-start"); var secondCommand = new TestExecuteShellScriptCommandBuilder() .SetScriptBody(new ScriptBuilder().CreateFile(secondStartFile)) @@ -256,17 +303,16 @@ await Wait.For(() => File.Exists(startFile), secondResult.ExitCode.Should().Be(0); File.Exists(secondStartFile).Should().BeTrue("escalation to abandon should have released the mutex"); - // Cleanup: release the still-alive first script. File.WriteAllText(releaseFile, ""); } [Test] [TentacleConfigurations(scriptServiceToTest: ScriptServiceVersionToTest.Version1)] - public async Task ExecuteScript_OnV1Tentacle_DoesNotEscalateToAbandon(TentacleConfigurationTestCase tentacleConfigurationTestCase) + public async Task ExecuteScript_OnV1Tentacle_DoesNotEscalateToNotSupportedAbandon(TentacleConfigurationTestCase tentacleConfigurationTestCase) { // A V1 Tentacle has no abandon verb. If the orchestrator wrongly escalated, the V1 executor's - // AbandonScript throws NotSupportedException. Asserting the cancelled run throws OperationCanceledException - // (and therefore NOT NotSupportedException) proves the gate held and we never escalated. + // AbandonScript throws NotSupportedException. Asserting the run does NOT throw NotSupportedException + // proves the gate held and we never escalated. await using var clientTentacle = await tentacleConfigurationTestCase.CreateBuilder() .WithTentacle(x => x.WithRunTentacleEnvironmentVariable(EnvironmentVariables.TentacleDebugDisableProcessKill_UNSAFE_FOR_PRODUCTION, "1")) .Build(CancellationToken); @@ -310,7 +356,7 @@ await Wait.For(() => File.Exists(startFile), // Release so the stuck script can finish and the run can unwind. File.WriteAllText(releaseFile, ""); - await FluentActions.Invoking(async () => await execution).Should().ThrowAsync(); + await FluentActions.Invoking(async () => await execution).Should().NotThrowAsync(); } [Test] From d195d65222ff60f4e3c9edde5b633deaff7dfd8a Mon Sep 17 00:00:00 2001 From: Jim Pelletier Date: Fri, 5 Jun 2026 16:18:25 +1000 Subject: [PATCH 15/15] Make the escalation split meaningful: each test asserts a distinct effect The mutex-release test was a strict superset of _EscalatesToAbandon (both asserted OCE). Drop the redundant OCE assertion from _AndReleasesTheIsolationMutex so it asserts only the mutex release (second script runs); _EscalatesToAbandon keeps the run-terminates (OCE) assertion. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../ClientScriptExecutionAbandon.cs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs b/source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs index 8e1db4a63..608dc8b72 100644 --- a/source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs +++ b/source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs @@ -288,10 +288,13 @@ await Wait.For(() => File.Exists(startFile), CancellationToken); TrackSpawnedProcess(pidFile); + + // Cancel; this stays pending (kill disabled), so after the threshold the orchestrator escalates to + // abandon. We don't assert the run's outcome here — that's _EscalatesToAbandon's job. This test only + // proves the mutex was released, by the second script being able to run. executionCts.Cancel(); - await FluentActions.Invoking(async () => await stuckExecution).Should().ThrowAsync(); - // A second FullIsolation script with the same mutex now runs — only possible if abandon released it. + // A second FullIsolation script with the same mutex can only run once abandon released the mutex. var secondStartFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "second-start"); var secondCommand = new TestExecuteShellScriptCommandBuilder() .SetScriptBody(new ScriptBuilder().CreateFile(secondStartFile)) @@ -303,7 +306,9 @@ await Wait.For(() => File.Exists(startFile), secondResult.ExitCode.Should().Be(0); File.Exists(secondStartFile).Should().BeTrue("escalation to abandon should have released the mutex"); + // Cleanup: release the stuck script and observe the cancelled run. File.WriteAllText(releaseFile, ""); + try { await stuckExecution; } catch (OperationCanceledException) { } } [Test]