diff --git a/docs/superpowers/plans/2026-06-01-abandon-best-effort-kill-and-execute-script-timeout.md b/docs/superpowers/plans/2026-06-01-abandon-best-effort-kill-and-execute-script-timeout.md new file mode 100644 index 000000000..7d76e7af2 --- /dev/null +++ b/docs/superpowers/plans/2026-06-01-abandon-best-effort-kill-and-execute-script-timeout.md @@ -0,0 +1,1000 @@ +# Abandon best-effort-kill + ExecuteScript abandon-timeout — Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax. + +**Goal:** Make Tentacle's `AbandonScript` best-effort-kill the process before it stops waiting (anti-abuse), make the PR1 runner deterministically return `AbandonedExitCode (-48)` keyed on the abandon token rather than on which task won the `Task.WhenAny` race, and give the client an optional `abandonAfterCancellationPendingFor` timeout that escalates a stuck cancel to an abandon when the Tentacle advertises the capability. + +**Architecture:** Two changes across server (Tentacle) and client. +- **Change 1 (server, PR1 + later PR2).** `ScriptServiceV2.AbandonScriptAsync` calls `runningScript.Cancel()` *then* `runningScript.Abandon()`, so the existing `cancel.Register → DoOurBestToCleanUp → Hitman.Kill` machinery attempts the kill on *every* abandon — including a direct RPC with no prior cancel. The PR1 runner in `SilentProcessRunner.ExecuteCommandAsync` keys its abandoned-result branch on `abandon.IsCancellationRequested` instead of `== abandoned.Task`. After `Cancel()→Close()` the process is detached, so the abandon branch must NOT read `process.HasExited`/`process.ExitCode`; it just returns `-48`. The "abandon was unnecessary because the script already finished" case is handled one layer up: the script's `RunningScript` is already `Complete` with its recorded exit code, so `GetResponse` returns that real code, never reaching the runner's abandon branch. +- **Change 2 (client, PR1 only).** Optional `TimeSpan? abandonAfterCancellationPendingFor = null` threaded through `ITentacleClient.ExecuteScript` → `TentacleClient.ExecuteScript` → `ObservingScriptOrchestrator`. The orchestrator's `ObserveUntilComplete` already calls `CancelScript` each poll while the token is cancelled; it records when cancellation first fired and, once elapsed crosses the threshold AND the abandon capability is advertised, calls `AbandonScript` instead. A new `IScriptExecutor.AbandonScript` is implemented on `ScriptServiceV2Executor` (with a capability check) and as a no-op fallback on `ScriptServiceV1Executor`/`KubernetesScriptServiceV1Executor`. `ScriptExecutionResult.ExitCode (-48)` is the source of truth. NO shared `Octopus.Tentacle.Contracts` constant for the message. + +**Tech Stack:** C#, NUnit, FluentAssertions, NSubstitute, Halibut RPC. + +> **State of the tree at plan time (read before starting).** The contract surface is already merged on this branch: `AbandonScriptCommandV2`, `IScriptServiceV2.AbandonScript`, `ScriptExitCodes.AbandonedExitCode = -48`, `ITentacleClient.AbandonScript`/`TentacleClient.AbandonScript`, `RunningScript.CreateAbandonable` with `abandonToken`, the `RunningScript.Execute` abandon catch, `ScriptServiceV2.AbandonScriptAsync` (currently calls only `Abandon()`), and the PR1 runner `Task.WhenAny` block (currently keyed on `== abandoned.Task`). Capabilities advertise `nameof(ScriptServiceV2.AbandonScriptAsync)` → the literal string `"AbandonScriptAsync"`, NOT `"AbandonScriptV2"`. This plan ONLY makes the two deltas above; it does not re-create the contract. + +--- + +## Task 1 — PR1 runner: key the abandoned branch on `abandon.IsCancellationRequested` + +The PR1 `Task.WhenAny` currently returns `AbandonedExitCode` only when `abandoned.Task` literally won the race (`== abandoned.Task`). The spec requires the result keyed on the *token*, so that when both cancel and abandon fire and `waitForExit` happens to win (e.g. the abandon TCS callback is still queued), we still return `-48` deterministically. The abandon branch must NOT touch `process.HasExited`/`ExitCode` — after `Cancel()→Close()` the Process is detached. + +**Files** +- Test: `source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs` (add new `[Test]` after `AbandonToken_ShouldReturnAbandonedExitCodeWithoutKillingProcess` which ends at line ~372) +- Modify: `source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs` lines 161–177 (the `abandoned`/`Task.WhenAny` block) + +- [ ] Write a failing test `AbandonToken_WhenWaitTaskWinsRace_StillReturnsAbandonedExitCode` in `SilentProcessRunnerFixture.cs`. This drives both tokens: cancel fires first (no-op kill via the env var), then abandon, and asserts the return is `-48` regardless of race ordering. Add this method inside the class: + +```csharp + [Test] + public async Task AbandonToken_WhenBothTokensFire_ReturnsAbandonedExitCodeKeyedOnToken() + { + // Simulate an un-killable script: DisableProcessKill makes cancel's Hitman.Kill a no-op, + // so cancel does NOT make the process exit. Then abandon fires. The return must be + // AbandonedExitCode because abandon.IsCancellationRequested is set — not because of which + // task won the WhenAny race. This is the deterministic-(-48) guarantee from the spec. + Environment.SetEnvironmentVariable(EnvironmentVariables.TentacleDebugDisableProcessKill_UNSAFE_FOR_PRODUCTION, "1"); + try + { + using var tempDir = new TemporaryDirectory(); + var pidFile = Path.Combine(tempDir.DirectoryPath, "process.pid"); + + var executable = PlatformDetection.IsRunningOnWindows ? "powershell.exe" : "/bin/bash"; + var arguments = PlatformDetection.IsRunningOnWindows + ? $"-NoProfile -NonInteractive -Command \"$PID | Out-File -FilePath '{pidFile}' -Encoding ASCII; Start-Sleep -Seconds 300\"" + : $"-c \"echo $$ > '{pidFile}' && sleep 300\""; + + using var cancelCts = new CancellationTokenSource(); + using var abandonCts = new CancellationTokenSource(); + + var task = Task.Run(async () => await SilentProcessRunner.ExecuteCommandAsync( + executable, + arguments, + Environment.CurrentDirectory, + debug: _ => { }, + info: _ => { }, + error: _ => { }, + customEnvironmentVariables: null, + cancel: cancelCts.Token, + abandon: abandonCts.Token)); + + await WaitForPidFileAsync(pidFile, TimeSpan.FromSeconds(30)); + + // Cancel first (kill is no-op'd so the process keeps running), then abandon. + cancelCts.Cancel(); + abandonCts.Cancel(); + + var exitCode = await task; + exitCode.Should().Be(ScriptExitCodes.AbandonedExitCode); + + // Cleanup: the real process is still alive because kill was no-op'd. + if (File.Exists(pidFile) && int.TryParse(SafelyReadAllText(pidFile).Trim(), out var pid) && pid > 0) + { + try { Process.GetProcessById(pid).Kill(); } catch { /* already gone */ } + } + } + finally + { + Environment.SetEnvironmentVariable(EnvironmentVariables.TentacleDebugDisableProcessKill_UNSAFE_FOR_PRODUCTION, null); + } + } +``` + +- [ ] Run it on net48 + net8.0 and watch it FAIL (today the `== abandoned.Task` branch can be skipped when `waitForExit` wins, falling through to `SafelyGetExitCode` on a detached process → returns `-1`, not `-48`): + +```bash +dotnet test source/Octopus.Tentacle.Tests.Integration --framework net48 \ + --filter "FullyQualifiedName~SilentProcessRunnerFixture.AbandonToken_WhenBothTokensFire_ReturnsAbandonedExitCodeKeyedOnToken" +dotnet test source/Octopus.Tentacle.Tests.Integration --framework net8.0 \ + --filter "FullyQualifiedName~SilentProcessRunnerFixture.AbandonToken_WhenBothTokensFire_ReturnsAbandonedExitCodeKeyedOnToken" +``` +Expected: FAIL (returns -1 or hangs). + +- [ ] Make the minimal implementation change in `SilentProcessRunner.cs`. Replace the `Task.WhenAny`/`== abandoned.Task` block (lines 161–177) so the abandoned branch is keyed on the token and runs after the race regardless of winner: + +```csharp + var abandoned = new TaskCompletionSource(); + using (abandon.Register(() => abandoned.TrySetResult(null))) + { + var waitForExit = Task.Run(() => + { + try { process.WaitForExit(); } + catch { /* released by Process.Dispose on the abandon path */ } + }); + + await Task.WhenAny(waitForExit, abandoned.Task).ConfigureAwait(false); + + // Key the abandoned result on the TOKEN, not on which task won the race. + // When both cancel and abandon fire, cancel's Kill is no-op'd / can't land, so + // waitForExit may win even though we must still return AbandonedExitCode. + // After Cancel()->Close() the Process is detached, so do NOT read + // process.HasExited / process.ExitCode here — just return -48. The + // "abandon was unnecessary" case never reaches here: the script's + // RunningScript is already Complete with its real code one layer up. + if (abandon.IsCancellationRequested) + { + info("Tentacle has abandoned this script. The underlying script process may still be running on this host."); + SafelyCancelOutputAndErrorRead(process, debug); + running = false; + return ScriptExitCodes.AbandonedExitCode; + } + } +``` + +- [ ] Re-run the same two `dotnet test` commands from above. Expected: PASS on both net48 and net8.0. + +- [ ] Run the existing abandon + grandchild regression tests to confirm no regression: + +```bash +dotnet test source/Octopus.Tentacle.Tests.Integration --framework net8.0 \ + --filter "FullyQualifiedName~SilentProcessRunnerFixture.AbandonToken_ShouldReturnAbandonedExitCodeWithoutKillingProcess|FullyQualifiedName~SilentProcessRunnerFixture.CancellationToken_WhenGrandchildHoldsRedirectedPipes_ShouldNotHang|FullyQualifiedName~SilentProcessRunnerFixture.CancellationToken_WhenUnixGrandchildHoldsRedirectedPipes_ShouldNotHang" +``` +Expected: PASS. + +- [ ] Commit: `git commit -am "PR1 runner: key abandoned result on abandon token, not WhenAny winner"` + +--- + +## Task 2 — `AbandonScriptAsync` best-effort-kills (Cancel then Abandon) — service-layer test + +`ScriptServiceV2.AbandonScriptAsync` currently calls only `runningScript.Abandon()`. The spec requires `runningScript.Cancel()` first so `Hitman.Kill` runs on every abandon (anti-abuse). We prove this through the integration layer because `ScriptServiceV2` wiring (workspace factory, state store, shell) is awkward to stub; the existing abandon integration tests already exercise this service with a real Tentacle. + +**Files** +- Test: `source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs` (add a third `[Test]` after `AbandonScript_ReleasesIsolationMutexEvenWhileProcessIsStillRunning`, which ends at line ~129) + +- [ ] Write a failing test `AbandonScript_WithNoPriorCancel_KillsTheProcess` in `ClientScriptExecutionAbandon.cs`. Kill is NOT disabled here, so abandon's internal `Cancel()` must terminate the process. Add this method inside the class: + +```csharp + [Test] + [TentacleConfigurations(scriptServiceToTest: ScriptServiceVersionToTest.Version2)] + public async Task AbandonScript_WithNoPriorCancel_KillsTheProcess(TentacleConfigurationTestCase tentacleConfigurationTestCase) + { + // Anti-abuse: a direct AbandonScript with no prior CancelScript must still attempt the + // kill. AbandonScriptAsync calls Cancel() then Abandon(), so Hitman.Kill runs. Kill is + // NOT disabled here, so the underlying process must actually die. + await using var clientTentacle = await tentacleConfigurationTestCase.CreateBuilder() + .Build(CancellationToken); + + var startFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "start"); + var releaseFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "release"); + + var command = new TestExecuteShellScriptCommandBuilder() + .SetScriptBody(new ScriptBuilder() + .CreateFile(startFile) + .WaitForFileToExist(releaseFile)) + .WithIsolationLevel(ScriptIsolationLevel.NoIsolation) + .Build(); + + var tentacleClient = clientTentacle.TentacleClient; + var scriptExecution = Task.Run(async () => await tentacleClient.ExecuteScript(command, CancellationToken)); + + await Wait.For(() => File.Exists(startFile), + TimeSpan.FromSeconds(30), + () => throw new Exception("Script did not start"), + CancellationToken); + + // Direct abandon, NO prior cancel. + await tentacleClient.AbandonScript(command.ScriptTicket, CancellationToken); + + ScriptStatus abandonResponse = null!; + await Wait.For(async () => + { + abandonResponse = await tentacleClient.GetStatus(command.ScriptTicket, CancellationToken); + return abandonResponse.State == ProcessState.Complete; + }, + TimeSpan.FromSeconds(30), + () => throw new Exception("Abandoned script did not reach Complete state within 30s"), + CancellationToken); + + abandonResponse.ExitCode.Should().Be(ScriptExitCodes.AbandonedExitCode); + + // The kill landed because Hitman was NOT disabled: the script never saw the release file, + // so if it is still waiting the file would be absent and the process alive. The script + // reaching Complete with AbandonedExitCode while we never wrote releaseFile proves the + // wait was broken by abandon; the kill having run is asserted by the script process being + // gone — drain the execution task, which completes once the process is dead. + await scriptExecution; + } +``` + +- [ ] Run it and watch it FAIL (today abandon does NOT call `Cancel()`, so `Hitman.Kill` never runs; the process keeps waiting on `releaseFile`, and `scriptExecution` never drains within the test timeout): + +```bash +dotnet test source/Octopus.Tentacle.Tests.Integration --framework net8.0 \ + --filter "FullyQualifiedName~ClientScriptExecutionAbandon.AbandonScript_WithNoPriorCancel_KillsTheProcess" +``` +Expected: FAIL (test times out because the process is never killed). + +- [ ] Make the minimal implementation change in `source/Octopus.Tentacle.Core/Services/Scripts/ScriptServiceV2.cs`. Modify `AbandonScriptAsync` (lines 144–156) to call `Cancel()` then `Abandon()`: + +```csharp + public Task AbandonScriptAsync(AbandonScriptCommandV2 command, CancellationToken cancellationToken) + { + // Abandon best-effort-kills first (anti-abuse): Cancel() runs the existing + // cancel.Register -> DoOurBestToCleanUp -> Hitman.Kill machinery so ANY abandon — + // including a direct RPC with no prior cancel — attempts the kill. Abandon() then + // layers "stop waiting + release the mutex + return -48" on top. A process survives + // abandon only when the kill genuinely can't land (stuck / re-parented grandchild). + if (runningScripts.TryGetValue(command.Ticket, out var runningScript)) + { + runningScript.Cancel(); + runningScript.Abandon(); + } + + return Task.FromResult(GetResponse(command.Ticket, command.LastLogSequence, runningScript?.Process)); + } +``` + +- [ ] Re-run the same `dotnet test` command. Expected: PASS. + +- [ ] Run the two existing abandon integration tests to confirm no regression (kill-disabled path still returns -48 and releases the mutex): + +```bash +dotnet test source/Octopus.Tentacle.Tests.Integration --framework net8.0 \ + --filter "FullyQualifiedName~ClientScriptExecutionAbandon" +``` +Expected: PASS (all three tests). + +- [ ] Commit: `git commit -am "AbandonScriptAsync: Cancel() then Abandon() so abandon best-effort-kills"` + +--- + +## Task 3 — Client: add `AbandonScript` to `IScriptExecutor` and capability extension + +Change 2 needs the orchestrator to call abandon through the executor abstraction. Add `AbandonScript(CommandContext)` to `IScriptExecutor`, a capability-checking implementation on `ScriptServiceV2Executor`, and no-op fallbacks on V1/Kubernetes executors. The capability check uses a new `HasAbandonScriptV2()` extension that matches the capability string Tentacle actually advertises today (`"AbandonScriptAsync"`). + +> **Spec gap (resolved here):** the spec names the capability `AbandonScriptV2`, but `CapabilitiesServiceV2` already advertises `nameof(ScriptServiceV2.AbandonScriptAsync)` = the string `"AbandonScriptAsync"`. Changing the advertised string would break the contract already shipped on this branch. This plan matches the *actual* advertised string. If the team wants the spec's `AbandonScriptV2` literal, that is a separate contract change with its own backward-compat story — flagged, not silently done. + +**Files** +- Test: `source/Octopus.Tentacle.Client.Tests/CapabilitiesResponseV2ExtensionMethodsTests.cs` (CREATE) +- Modify: `source/Octopus.Tentacle.Client/Capabilities/CapabilitiesResponseV2ExtensionMethods.cs` (add method after `HasScriptServiceV2`, line 17) +- Modify: `source/Octopus.Tentacle.Client/Scripts/IScriptExecutor.cs` (add method after `CancelScript`, line 33) + +- [ ] Write a failing test file `source/Octopus.Tentacle.Client.Tests/CapabilitiesResponseV2ExtensionMethodsTests.cs`: + +```csharp +using System.Collections.Generic; +using FluentAssertions; +using NUnit.Framework; +using Octopus.Tentacle.Client.Capabilities; +using Octopus.Tentacle.Contracts.Capabilities; + +namespace Octopus.Tentacle.Client.Tests +{ + [TestFixture] + public class CapabilitiesResponseV2ExtensionMethodsTests + { + [Test] + public void HasAbandonScriptV2_WhenAdvertised_ReturnsTrue() + { + var capabilities = new CapabilitiesResponseV2(new List { "IScriptServiceV2", "AbandonScriptAsync" }); + capabilities.HasAbandonScriptV2().Should().BeTrue(); + } + + [Test] + public void HasAbandonScriptV2_WhenNotAdvertised_ReturnsFalse() + { + var capabilities = new CapabilitiesResponseV2(new List { "IScriptServiceV2" }); + capabilities.HasAbandonScriptV2().Should().BeFalse(); + } + + [Test] + public void HasAbandonScriptV2_WhenEmpty_ReturnsFalse() + { + var capabilities = new CapabilitiesResponseV2(new List()); + capabilities.HasAbandonScriptV2().Should().BeFalse(); + } + } +} +``` + +- [ ] Run it and watch it FAIL (the extension does not exist → compile error): + +```bash +dotnet test source/Octopus.Tentacle.Client.Tests --framework net8.0 \ + --filter "FullyQualifiedName~CapabilitiesResponseV2ExtensionMethodsTests" +``` +Expected: FAIL (does not compile). + +- [ ] Add the extension method to `CapabilitiesResponseV2ExtensionMethods.cs` after `HasScriptServiceV2` (the closing brace of that method is at line 17). Insert: + +```csharp + + public static bool HasAbandonScriptV2(this CapabilitiesResponseV2 capabilities) + { + if (capabilities?.SupportedCapabilities?.Any() != true) + { + return false; + } + + // Tentacle advertises this as nameof(ScriptServiceV2.AbandonScriptAsync). Keep the + // literal in sync with CapabilitiesServiceV2.GetCapabilitiesAsync on the Tentacle side. + return capabilities.SupportedCapabilities.Contains("AbandonScriptAsync"); + } +``` + +- [ ] Re-run the extension test. Expected: PASS. + +- [ ] Add `AbandonScript` to `IScriptExecutor.cs`. Insert after the `CancelScript` declaration (line 33, before `CompleteScript`'s doc comment): + +```csharp + + /// + /// Abandon the script: signal Tentacle to stop waiting and release the isolation mutex. + /// Returns the abandoned status when the Tentacle advertises the abandon capability; + /// otherwise falls back to cancelling and returns that result. + /// + /// The CommandContext from the previous command + Task AbandonScript(CommandContext commandContext); +``` + +- [ ] Build the client to confirm the interface compiles (implementations come in Task 4 — expect implementor errors are fine to defer only if you build implementors together; here build will FAIL until Task 4, so just verify the interface file itself parses by building the project after Task 4). Skip a standalone build here; commit the interface + extension together with implementations is cleaner — but to keep commits frequent, commit the extension + its test now: + +```bash +dotnet test source/Octopus.Tentacle.Client.Tests --framework net8.0 \ + --filter "FullyQualifiedName~CapabilitiesResponseV2ExtensionMethodsTests" +``` +Expected: PASS. + +- [ ] Commit: `git commit -am "Client: add HasAbandonScriptV2 capability check + IScriptExecutor.AbandonScript"` + +--- + +## Task 4 — Implement `AbandonScript` on the three executors + +`ScriptServiceV2Executor` does the real work with a capability check; V1 and Kubernetes executors fall back to `CancelScript` (they have no abandon RPC). The V2 executor needs the capabilities client; add it as a constructor dependency and wire it through `ScriptExecutorFactory`. + +**Files** +- Modify: `source/Octopus.Tentacle.Client/Scripts/ScriptServiceV2Executor.cs` (ctor lines 27–41; add `AbandonScript` after `CancelScript`, line 174) +- Modify: `source/Octopus.Tentacle.Client/Scripts/ScriptServiceV1Executor.cs` (add `AbandonScript` fallback) +- Modify: `source/Octopus.Tentacle.Client/Scripts/KubernetesScriptServiceV1Executor.cs` (add `AbandonScript` fallback) +- Modify: `source/Octopus.Tentacle.Client/Scripts/ScriptExecutorFactory.cs` (pass capabilities client to V2 executor, line 48) +- Test: `source/Octopus.Tentacle.Client.Tests/ScriptServiceV2ExecutorAbandonTests.cs` (CREATE) + +- [ ] Write a failing test file `source/Octopus.Tentacle.Client.Tests/ScriptServiceV2ExecutorAbandonTests.cs`: + +```csharp +using System; +using System.Collections.Generic; +using System.Threading; +using System.Threading.Tasks; +using FluentAssertions; +using Halibut.ServiceModel; +using NSubstitute; +using NUnit.Framework; +using Octopus.Tentacle.Client; +using Octopus.Tentacle.Client.EventDriven; +using Octopus.Tentacle.Client.Execution; +using Octopus.Tentacle.Client.Observability; +using Octopus.Tentacle.Client.Retries; +using Octopus.Tentacle.Client.Scripts; +using Octopus.Tentacle.Contracts; +using Octopus.Tentacle.Contracts.Capabilities; +using Octopus.Tentacle.Contracts.ClientServices; +using Octopus.Tentacle.Contracts.Logging; +using Octopus.Tentacle.Contracts.Observability; +using Octopus.Tentacle.Contracts.ScriptServiceV2; + +namespace Octopus.Tentacle.Client.Tests +{ + [TestFixture] + public class ScriptServiceV2ExecutorAbandonTests + { + static ScriptServiceV2Executor CreateExecutor(IAsyncClientScriptServiceV2 scriptService, IAsyncClientCapabilitiesServiceV2 capabilities) + => new( + scriptService, + capabilities, + RpcCallExecutorFactory.Create(TimeSpan.Zero, Substitute.For()), + ClientOperationMetricsBuilder.Start(), + TimeSpan.Zero, + new TentacleClientOptions(new RpcRetrySettings(retriesEnabled: false, retryDuration: TimeSpan.Zero)), + Substitute.For()); + + static CommandContext Context() => new(new ScriptTicket("TestTicket"), 0, ScriptServiceVersion.ScriptServiceVersion2); + + [Test] + public async Task AbandonScript_WhenCapabilityAdvertised_CallsAbandonScriptAsync() + { + var scriptService = Substitute.For(); + scriptService.AbandonScriptAsync(Arg.Any(), Arg.Any()) + .Returns(x => Task.FromResult(new ScriptStatusResponseV2( + x.Arg().Ticket, ProcessState.Complete, + ScriptExitCodes.AbandonedExitCode, new List(), 1))); + + var capabilities = Substitute.For(); + capabilities.GetCapabilitiesAsync(Arg.Any()) + .Returns(Task.FromResult(new CapabilitiesResponseV2(new List { "IScriptServiceV2", "AbandonScriptAsync" }))); + + var executor = CreateExecutor(scriptService, capabilities); + + var result = await executor.AbandonScript(Context()); + + await scriptService.Received(1).AbandonScriptAsync(Arg.Any(), Arg.Any()); + await scriptService.DidNotReceive().CancelScriptAsync(Arg.Any(), Arg.Any()); + result.ScriptStatus.ExitCode.Should().Be(ScriptExitCodes.AbandonedExitCode); + } + + [Test] + public async Task AbandonScript_WhenCapabilityAbsent_FallsBackToCancelScriptAsync() + { + var scriptService = Substitute.For(); + scriptService.CancelScriptAsync(Arg.Any(), Arg.Any()) + .Returns(x => Task.FromResult(new ScriptStatusResponseV2( + x.Arg().Ticket, ProcessState.Running, + 0, new List(), 1))); + + var capabilities = Substitute.For(); + capabilities.GetCapabilitiesAsync(Arg.Any()) + .Returns(Task.FromResult(new CapabilitiesResponseV2(new List { "IScriptServiceV2" }))); + + var executor = CreateExecutor(scriptService, capabilities); + + await executor.AbandonScript(Context()); + + await scriptService.DidNotReceive().AbandonScriptAsync(Arg.Any(), Arg.Any()); + await scriptService.Received(1).CancelScriptAsync(Arg.Any(), Arg.Any()); + } + } +} +``` + +- [ ] Run it and watch it FAIL (V2 executor has no capabilities ctor param and no `AbandonScript` → compile error): + +```bash +dotnet test source/Octopus.Tentacle.Client.Tests --framework net8.0 \ + --filter "FullyQualifiedName~ScriptServiceV2ExecutorAbandonTests" +``` +Expected: FAIL (does not compile). + +- [ ] Add the capabilities dependency to `ScriptServiceV2Executor`. Modify the field block (after line 20 `readonly IAsyncClientScriptServiceV2 clientScriptServiceV2;`) and the ctor (lines 27–41): + +```csharp + readonly IAsyncClientScriptServiceV2 clientScriptServiceV2; + readonly IAsyncClientCapabilitiesServiceV2 clientCapabilitiesServiceV2; + readonly RpcCallExecutor rpcCallExecutor; + readonly ClientOperationMetricsBuilder clientOperationMetricsBuilder; + readonly TimeSpan onCancellationAbandonCompleteScriptAfter; + readonly ITentacleClientTaskLog logger; + readonly TentacleClientOptions clientOptions; + + public ScriptServiceV2Executor( + IAsyncClientScriptServiceV2 clientScriptServiceV2, + IAsyncClientCapabilitiesServiceV2 clientCapabilitiesServiceV2, + RpcCallExecutor rpcCallExecutor, + ClientOperationMetricsBuilder clientOperationMetricsBuilder, + TimeSpan onCancellationAbandonCompleteScriptAfter, + TentacleClientOptions clientOptions, + ITentacleClientTaskLog logger) + { + this.clientScriptServiceV2 = clientScriptServiceV2; + this.clientCapabilitiesServiceV2 = clientCapabilitiesServiceV2; + this.rpcCallExecutor = rpcCallExecutor; + this.clientOperationMetricsBuilder = clientOperationMetricsBuilder; + this.onCancellationAbandonCompleteScriptAfter = onCancellationAbandonCompleteScriptAfter; + this.clientOptions = clientOptions; + this.logger = logger; + } +``` + +- [ ] Add the `AbandonScript` method to `ScriptServiceV2Executor.cs` immediately after `CancelScript` (after line 174, before `CompleteScript`). Add `using Octopus.Tentacle.Client.Capabilities;` at the top if absent: + +```csharp + public async Task AbandonScript(CommandContext commandContext) + { + using var activity = TentacleClient.ActivitySource.StartActivity($"{nameof(ScriptServiceV2Executor)}.{nameof(AbandonScript)}"); + + var capabilities = await rpcCallExecutor.Execute( + retriesEnabled: clientOptions.RpcRetrySettings.RetriesEnabled, + RpcCall.Create(nameof(Contracts.Capabilities.ICapabilitiesServiceV2.GetCapabilities)), + async ct => await clientCapabilitiesServiceV2.GetCapabilitiesAsync(new HalibutProxyRequestOptions(ct)), + logger, + clientOperationMetricsBuilder, + CancellationToken.None).ConfigureAwait(false); + + // Capability absent → the Tentacle is too old to abandon. Keep cancelling cleanly. + if (!capabilities.HasAbandonScriptV2()) + { + logger.Verbose("Tentacle does not advertise AbandonScript; falling back to CancelScript."); + return await CancelScript(commandContext).ConfigureAwait(false); + } + + async Task AbandonScriptAction(CancellationToken ct) + { + var request = new AbandonScriptCommandV2(commandContext.ScriptTicket, commandContext.NextLogSequence); + return await clientScriptServiceV2.AbandonScriptAsync(request, new HalibutProxyRequestOptions(ct)); + } + + var scriptStatusResponseV2 = await rpcCallExecutor.Execute( + retriesEnabled: clientOptions.RpcRetrySettings.RetriesEnabled, + RpcCall.Create(nameof(IScriptServiceV2.AbandonScript)), + AbandonScriptAction, + logger, + clientOperationMetricsBuilder, + // Like CancelScript, abandon must not be cancelled — it stops the script on Tentacle. + CancellationToken.None).ConfigureAwait(false); + return Map(scriptStatusResponseV2); + } +``` + +- [ ] Add the no-op fallback to `ScriptServiceV1Executor.cs` (mirror its `CancelScript`; V1 has no abandon RPC, so abandon degrades to cancel). Add after that executor's `CancelScript` method: + +```csharp + public Task AbandonScript(CommandContext commandContext) + { + // ScriptServiceV1 has no abandon verb; degrade to cancel. + return CancelScript(commandContext); + } +``` + +- [ ] Add the same fallback to `KubernetesScriptServiceV1Executor.cs` after its `CancelScript`: + +```csharp + public Task AbandonScript(CommandContext commandContext) + { + // KubernetesScriptServiceV1 has no abandon verb; degrade to cancel. + return CancelScript(commandContext); + } +``` + +- [ ] Wire the capabilities client through `ScriptExecutorFactory.cs`. Change the V2 construction (line 48) to pass `allClients.CapabilitiesServiceV2`: + +```csharp + return new ScriptServiceV2Executor( + allClients.ScriptServiceV2, + allClients.CapabilitiesServiceV2, + rpcCallExecutor, + clientOperationMetricsBuilder, + onCancellationAbandonCompleteScriptAfter, + clientOptions, + logger); +``` + +- [ ] Add `AbandonScript` to the `ScriptExecutor` facade (`source/Octopus.Tentacle.Client/ScriptExecutor.cs`) after its `CancelScript` method (line ~70), mirroring the existing delegation pattern: + +```csharp + public async Task AbandonScript(CommandContext commandContext) + { + var scriptExecutorFactory = CreateScriptExecutorFactory(); + var scriptExecutor = scriptExecutorFactory.CreateScriptExecutor(commandContext.ScripServiceVersionUsed); + + return await scriptExecutor.AbandonScript(commandContext); + } +``` + +- [ ] Re-run the V2 executor abandon tests. Expected: PASS. + +```bash +dotnet test source/Octopus.Tentacle.Client.Tests --framework net8.0 \ + --filter "FullyQualifiedName~ScriptServiceV2ExecutorAbandonTests" +``` + +- [ ] Build the client + run the full client test project to confirm no implementor was missed: + +```bash +dotnet build source/Octopus.Tentacle.Client --framework net8.0 +dotnet test source/Octopus.Tentacle.Client.Tests --framework net8.0 +``` +Expected: build succeeds; tests PASS. + +- [ ] Commit: `git commit -am "Client executors: implement AbandonScript with capability check + cancel fallback"` + +--- + +## Task 5 — Orchestrator: escalate to abandon after the cancellation-pending threshold + +`ObservingScriptOrchestrator.ObserveUntilComplete` calls `CancelScript` each poll while the token is cancelled. Add `abandonAfterCancellationPendingFor`; record when cancellation first fired; once elapsed crosses the threshold call `AbandonScript` (the executor handles the capability check + fallback). When the param is `null`, behaviour is unchanged. + +**Files** +- Modify: `source/Octopus.Tentacle.Client/Scripts/ObservingScriptOrchestrator.cs` (ctor lines 18–28; `ExecuteScript` lines 30–45; `ObserveUntilComplete` lines 76–144) +- Test: `source/Octopus.Tentacle.Client.Tests/ObservingScriptOrchestratorAbandonTests.cs` (CREATE) + +- [ ] First, make `ObserveUntilComplete` testable by changing its access modifier from `private` (implicit) to `internal` in `ObservingScriptOrchestrator.cs` (the class is already `internal`; `Octopus.Tentacle.Client.Tests` has `InternalsVisibleTo`). Change line 76 from `async Task ObserveUntilComplete(` to `internal async Task ObserveUntilComplete(`. (`ExecuteScriptCommand` is abstract, so driving the full `ExecuteScript` from a unit test is awkward; calling `ObserveUntilComplete` directly is the deterministic seam.) + +- [ ] Write a failing test file `source/Octopus.Tentacle.Client.Tests/ObservingScriptOrchestratorAbandonTests.cs`. It drives `ObserveUntilComplete` directly with an NSubstitute `IScriptExecutor`, a pre-cancelled token, and asserts cancel-vs-abandon based on the threshold: + +```csharp +using System; +using System.Collections.Generic; +using System.Threading; +using System.Threading.Tasks; +using FluentAssertions; +using NSubstitute; +using NUnit.Framework; +using Octopus.Tentacle.Client.EventDriven; +using Octopus.Tentacle.Client.Scripts; +using Octopus.Tentacle.Client.Scripts.Models; +using Octopus.Tentacle.Contracts; + +namespace Octopus.Tentacle.Client.Tests +{ + [TestFixture] + public class ObservingScriptOrchestratorAbandonTests + { + static CommandContext Context() => new(new ScriptTicket("T"), 0, ScriptServiceVersion.ScriptServiceVersion2); + + static ScriptOperationExecutionResult Running() + => new(new ScriptStatus(ProcessState.Running, 0, new List()), Context()); + + static ScriptOperationExecutionResult Complete(int exitCode) + => new(new ScriptStatus(ProcessState.Complete, exitCode, new List()), Context()); + + static ObservingScriptOrchestrator CreateOrchestrator(IScriptExecutor executor, TimeSpan? abandonAfter) + => new( + new ImmediateBackoff(), + _ => { }, + _ => Task.CompletedTask, + executor, + abandonAfter); + + sealed class ImmediateBackoff : IScriptObserverBackoffStrategy + { + public TimeSpan GetBackoff(int iteration) => TimeSpan.Zero; + } + + [Test] + public async Task ParamUnset_CancelsOnly_NeverAbandons() + { + var executor = Substitute.For(); + executor.CancelScript(Arg.Any()) + .Returns(Running(), Complete(ScriptExitCodes.CanceledExitCode)); + + var orchestrator = CreateOrchestrator(executor, abandonAfter: null); + using var cts = new CancellationTokenSource(); + cts.Cancel(); + + await orchestrator.ObserveUntilComplete(Running(), cts.Token); + + await executor.DidNotReceive().AbandonScript(Arg.Any()); + await executor.Received().CancelScript(Arg.Any()); + } + + [Test] + public async Task ThresholdCrossed_SwitchesFromCancelToAbandon() + { + var executor = Substitute.For(); + // abandonAfter = Zero means the threshold is crossed on the first cancelled iteration, + // so the orchestrator abandons immediately. AbandonScript returns Complete to end the loop. + executor.AbandonScript(Arg.Any()) + .Returns(Complete(ScriptExitCodes.AbandonedExitCode)); + + var orchestrator = CreateOrchestrator(executor, abandonAfter: TimeSpan.Zero); + using var cts = new CancellationTokenSource(); + cts.Cancel(); + + var result = await orchestrator.ObserveUntilComplete(Running(), cts.Token); + + await executor.Received().AbandonScript(Arg.Any()); + result.ScriptStatus.ExitCode.Should().Be(ScriptExitCodes.AbandonedExitCode); + } + } +} +``` + +- [ ] Run it and watch it FAIL (orchestrator ctor has no `abandonAfter` param and `ObserveUntilComplete` is not yet `internal`/escalating → compile error on the 5-arg ctor): + +```bash +dotnet test source/Octopus.Tentacle.Client.Tests --framework net8.0 \ + --filter "FullyQualifiedName~ObservingScriptOrchestratorAbandonTests" +``` +Expected: FAIL (does not compile). + +- [ ] Add the field + ctor param to `ObservingScriptOrchestrator.cs`. After `readonly IScriptExecutor scriptExecutor;` (line 16) add `readonly TimeSpan? abandonAfterCancellationPendingFor;`. Change the ctor signature (line 18) to append `TimeSpan? abandonAfterCancellationPendingFor` and assign it: + +```csharp + public ObservingScriptOrchestrator( + IScriptObserverBackoffStrategy scriptObserverBackOffStrategy, + OnScriptStatusResponseReceived onScriptStatusResponseReceived, + OnScriptCompleted onScriptCompleted, + IScriptExecutor scriptExecutor, + TimeSpan? abandonAfterCancellationPendingFor = null) + { + this.scriptExecutor = scriptExecutor; + this.scriptObserverBackOffStrategy = scriptObserverBackOffStrategy; + this.onScriptStatusResponseReceived = onScriptStatusResponseReceived; + this.onScriptCompleted = onScriptCompleted; + this.abandonAfterCancellationPendingFor = abandonAfterCancellationPendingFor; + } +``` + +- [ ] Change `ObserveUntilComplete` (lines 76–144) to record first-cancel time and escalate. Replace the cancellation branch (lines 86–89): + +```csharp + var iteration = 0; + var cancellationIteration = 0; + var lastResult = startScriptResult; + var stopwatch = new Stopwatch(); + + while (lastResult.ScriptStatus.State != ProcessState.Complete) + { + if (scriptExecutionCancellationToken.IsCancellationRequested) + { + // Record when cancellation first fired so we can escalate to abandon after the threshold. + if (!stopwatch.IsRunning) + { + stopwatch.Start(); + } + + var shouldAbandon = abandonAfterCancellationPendingFor is { } threshold + && stopwatch.Elapsed >= threshold; + + lastResult = shouldAbandon + ? await scriptExecutor.AbandonScript(lastResult.ContextForNextCommand).ConfigureAwait(false) + : await scriptExecutor.CancelScript(lastResult.ContextForNextCommand).ConfigureAwait(false); + } + else + { +``` + +(`System.Diagnostics` is already imported for `Stopwatch` at line 2.) + +- [ ] Re-run the orchestrator abandon tests. Expected: PASS. + +```bash +dotnet test source/Octopus.Tentacle.Client.Tests --framework net8.0 \ + --filter "FullyQualifiedName~ObservingScriptOrchestratorAbandonTests" +``` + +- [ ] Commit: `git commit -am "Orchestrator: escalate cancel to abandon after abandonAfterCancellationPendingFor"` + +--- + +## Task 6 — Thread `abandonAfterCancellationPendingFor` through `TentacleClient` + `ITentacleClient` + +The orchestrator now accepts the timeout; expose it on the public `ExecuteScript` surface as an optional parameter (default `null`, so all existing callers are unchanged). + +**Files** +- Modify: `source/Octopus.Tentacle.Client/ITentacleClient.cs` (`ExecuteScript` declaration lines 30–35) +- Modify: `source/Octopus.Tentacle.Client/TentacleClient.cs` (`ExecuteScript` lines 168–208; orchestrator construction line 189) +- Test: `source/Octopus.Tentacle.Client.Tests/ObservingScriptOrchestratorAbandonTests.cs` (no change — covered) + a compile-only assertion in `source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs` (Task 7 covers behavioural) + +- [ ] Add the optional parameter to `ITentacleClient.ExecuteScript` (after `scriptExecutionCancellationToken`, line 35): + +```csharp + Task ExecuteScript( + ExecuteScriptCommand executeScriptCommand, + OnScriptStatusResponseReceived onScriptStatusResponseReceived, + OnScriptCompleted onScriptCompleted, + ITentacleClientTaskLog logger, + CancellationToken scriptExecutionCancellationToken, + TimeSpan? abandonAfterCancellationPendingFor = null); +``` + +- [ ] Add the matching parameter to `TentacleClient.ExecuteScript` (line 168) and pass it to the orchestrator (line 189): + +```csharp + public async Task ExecuteScript(ExecuteScriptCommand executeScriptCommand, + OnScriptStatusResponseReceived onScriptStatusResponseReceived, + OnScriptCompleted onScriptCompleted, + ITentacleClientTaskLog logger, + CancellationToken scriptExecutionCancellationToken, + TimeSpan? abandonAfterCancellationPendingFor = null) + { +``` + +and: + +```csharp + var orchestrator = new ObservingScriptOrchestrator(scriptObserverBackOffStrategy, + onScriptStatusResponseReceived, + onScriptCompleted, + scriptExecutor, + abandonAfterCancellationPendingFor); +``` + +- [ ] Build the client and the integration test project (which references the client) to confirm the optional param did not break any caller: + +```bash +dotnet build source/Octopus.Tentacle.Client --framework net8.0 +dotnet build source/Octopus.Tentacle.Tests.Integration --framework net8.0 +``` +Expected: both build succeed (optional param = no caller changes required). + +- [ ] Run the full client test project to confirm green: + +```bash +dotnet test source/Octopus.Tentacle.Client.Tests --framework net8.0 +``` +Expected: PASS. + +- [ ] Commit: `git commit -am "TentacleClient: expose optional abandonAfterCancellationPendingFor on ExecuteScript"` + +--- + +## Task 7 — End-to-end integration test: ExecuteScript escalates to abandon after the threshold + +Prove the whole client→Tentacle path: a stuck (kill-disabled) script with `abandonAfterCancellationPendingFor` set short, cancelled mid-flight, escalates to `AbandonScript` and returns `AbandonedExitCode`. Uses `TentacleServiceDecoratorBuilder.RecordMethodUsages` to assert the `AbandonScript` verb was called. + +**Files** +- Modify: `source/Octopus.Tentacle.Tests.Integration.Common/Builders/Decorators/Proxies/RecordMethodUsagesExtensionMethods.cs` (add `ForAbandonScriptAsync`) +- Test: `source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs` (add a `[Test]` after Task 2's test) + +- [ ] Add a `ForAbandonScriptAsync` extension to `RecordMethodUsagesExtensionMethods.cs` (mirrors the existing `ForCancelScriptAsync` at line 22; the recorded method name is the proxied async client method `AbandonScriptAsync`). Insert after `ForCancelScriptAsync` (line 25): + +```csharp + + public static IRecordedMethodUsage ForAbandonScriptAsync(this IRecordedMethodUsages o) + { + return o.For("AbandonScriptAsync"); + } +``` + +- [ ] Write a failing test `ExecuteScript_WhenCancellationStaysPending_EscalatesToAbandon` in `ClientScriptExecutionAbandon.cs`. This uses the exact recorded-usages API (`IRecordedMethodUsages recordedUsages = new MethodUsages();` then `.RecordMethodUsages(out recordedUsages)`, queried via `recordedUsages.ForAbandonScriptAsync().Started`). Add the `using Octopus.Tentacle.Tests.Integration.Common.Builders.Decorators.Proxies;` and `using Octopus.Tentacle.Contracts.ClientServices;` imports at the top of the file if absent: + +```csharp + [Test] + [TentacleConfigurations(scriptServiceToTest: ScriptServiceVersionToTest.Version2)] + public async Task ExecuteScript_WhenCancellationStaysPending_EscalatesToAbandon(TentacleConfigurationTestCase tentacleConfigurationTestCase) + { + // Kill is disabled, so the script is genuinely stuck and CancelScript can't end it. + // With abandonAfterCancellationPendingFor set short, the orchestrator must escalate to + // AbandonScript, which returns AbandonedExitCode and releases the script. + IRecordedMethodUsages recordedUsages = new MethodUsages(); + await using var clientTentacle = await tentacleConfigurationTestCase.CreateBuilder() + .WithTentacle(x => x.WithRunTentacleEnvironmentVariable(EnvironmentVariables.TentacleDebugDisableProcessKill_UNSAFE_FOR_PRODUCTION, "1")) + .WithTentacleServiceDecorator(new TentacleServiceDecoratorBuilder() + .RecordMethodUsages(out recordedUsages) + .Build()) + .Build(CancellationToken); + + var startFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "start"); + var releaseFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "release"); + + var command = new TestExecuteShellScriptCommandBuilder() + .SetScriptBody(new ScriptBuilder() + .CreateFile(startFile) + .WaitForFileToExist(releaseFile)) + .WithIsolationLevel(ScriptIsolationLevel.NoIsolation) + .Build(); + + var tentacleClient = clientTentacle.TentacleClient; + + using var executionCts = new CancellationTokenSource(); + var logs = new System.Collections.Generic.List(); + + var execution = Task.Run(async () => await tentacleClient.ExecuteScript( + command, + onScriptStatusResponseReceived => logs.AddRange(onScriptStatusResponseReceived.Logs), + _ => Task.CompletedTask, + new SerilogLoggerBuilder().Build().ForContext().ToITentacleTaskLog(), + executionCts.Token, + abandonAfterCancellationPendingFor: TimeSpan.FromSeconds(2))); + + await Wait.For(() => File.Exists(startFile), + TimeSpan.FromSeconds(30), + () => throw new Exception("Script did not start"), + CancellationToken); + + executionCts.Cancel(); + + // ExecuteScript throws OperationCanceledException once the token is cancelled. + await FluentActions.Invoking(async () => await execution).Should().ThrowAsync(); + + // The orchestrator must have escalated to AbandonScript after the 2s threshold. + recordedUsages.ForAbandonScriptAsync().Started.Should().BeGreaterThan(0); + + // Cleanup: release + reap the still-alive process (kill was disabled). + File.WriteAllText(releaseFile, ""); + } +``` + +- [ ] Run it. With Tasks 5–6 implemented it should PASS; if abandon was never called it FAILs on the `.Started.Should().BeGreaterThan(0)` assertion (debug Task 5): + +```bash +dotnet test source/Octopus.Tentacle.Tests.Integration --framework net8.0 \ + --filter "FullyQualifiedName~ClientScriptExecutionAbandon.ExecuteScript_WhenCancellationStaysPending_EscalatesToAbandon" +``` +Expected after correct wiring: PASS. (If it fails because abandon was not called, the orchestrator escalation is wrong — debug Task 5.) + +- [ ] Run the whole abandon integration fixture once more for regression: + +```bash +dotnet test source/Octopus.Tentacle.Tests.Integration --framework net8.0 \ + --filter "FullyQualifiedName~ClientScriptExecutionAbandon" +``` +Expected: PASS (all tests). + +- [ ] Commit: `git commit -am "Integration: ExecuteScript escalates stuck cancel to abandon after threshold"` + +--- + +## Task 8 — Full build across all TFMs (net48 trap guard) + +The Core project multi-targets `net48;net8.0;net8.0-windows`. The runner change in Task 1 keeps the generic `TaskCompletionSource` (the non-generic `TaskCompletionSource` does NOT exist on net48), so this must still build on net48. + +**Files** +- (no source change) + +- [ ] Build the Core project on all three TFMs: + +```bash +dotnet build source/Octopus.Tentacle.Core/Octopus.Tentacle.Core.csproj --framework net48 +dotnet build source/Octopus.Tentacle.Core/Octopus.Tentacle.Core.csproj --framework net8.0 +dotnet build source/Octopus.Tentacle.Core/Octopus.Tentacle.Core.csproj --framework net8.0-windows +``` +Expected: all three succeed. + +- [ ] Build the integration test project on net48 + net8.0 (it runs the runner tests on both): + +```bash +dotnet build source/Octopus.Tentacle.Tests.Integration --framework net48 +dotnet build source/Octopus.Tentacle.Tests.Integration --framework net8.0 +``` +Expected: both succeed. + +- [ ] Run the runner fixture on net48 to confirm the keyed-on-token change behaves there too: + +```bash +dotnet test source/Octopus.Tentacle.Tests.Integration --framework net48 \ + --filter "FullyQualifiedName~SilentProcessRunnerFixture.AbandonToken" +``` +Expected: PASS. + +- [ ] No commit needed (no source change). If any TFM failed, fix inline and commit `git commit -am "Fix net48 build for abandon runner change"`. + +--- + +## Task 9 (final) — Apply Change 1 to PR2 branch (#1244) + +PR2 (`jimpelletier/eft-3295-pr2-async-unlink`, #1244) already has the async `WaitForExitAsync` migration and already keys its abandon-catch on `abandon.IsCancellationRequested` — so the *runner* half of Change 1 is already correct there. What PR2 is still missing is the `AbandonScriptAsync` best-effort-kill: its `AbandonScriptAsync` calls only `Abandon()`. Apply the same Cancel-then-Abandon change. PR2's grandchild tests must STAY as "cancel then abandon" (do NOT flip them back to "cancel alone"). + +**Files (on branch `jimpelletier/eft-3295-pr2-async-unlink`)** +- Modify: `source/Octopus.Tentacle.Core/Services/Scripts/ScriptServiceV2.cs` (`AbandonScriptAsync`) +- Test: `source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs` (cherry-pick Task 2's `AbandonScript_WithNoPriorCancel_KillsTheProcess`) + +- [ ] Switch to PR2 and confirm the runner is already token-keyed (no runner change needed): + +```bash +git switch jimpelletier/eft-3295-pr2-async-unlink +grep -n "abandon.IsCancellationRequested\|runningScript.Cancel\|runningScript.Abandon" \ + source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs \ + source/Octopus.Tentacle.Core/Services/Scripts/ScriptServiceV2.cs +``` +Expected: the runner shows `catch (OperationCanceledException) when (abandon.IsCancellationRequested)`; `AbandonScriptAsync` shows only `runningScript.Abandon()` (the gap to fix). + +- [ ] Cherry-pick Task 2's failing test onto PR2: add `AbandonScript_WithNoPriorCancel_KillsTheProcess` (identical body to Task 2) to `ClientScriptExecutionAbandon.cs`. Run it and watch it FAIL: + +```bash +dotnet test source/Octopus.Tentacle.Tests.Integration --framework net8.0 \ + --filter "FullyQualifiedName~ClientScriptExecutionAbandon.AbandonScript_WithNoPriorCancel_KillsTheProcess" +``` +Expected: FAIL (no kill, test times out). + +- [ ] Apply the same `AbandonScriptAsync` change as Task 2 (Cancel then Abandon) to PR2's `ScriptServiceV2.cs`: + +```csharp + if (runningScripts.TryGetValue(command.Ticket, out var runningScript)) + { + runningScript.Cancel(); + runningScript.Abandon(); + } +``` + +- [ ] Re-run the cherry-picked test. Expected: PASS. + +- [ ] Confirm PR2's grandchild tests STAY as "cancel then abandon" — do NOT modify them. Run them to confirm they still pass with the async runner + best-effort-kill abandon: + +```bash +dotnet test source/Octopus.Tentacle.Tests.Integration --framework net8.0 \ + --filter "FullyQualifiedName~SilentProcessRunnerFixture" +``` +Expected: PASS (grandchild tests require cancel-then-abandon on PR2 by design — do not regress them to PR1's "cancel alone" shape). + +- [ ] Build PR2's Core on all TFMs (net48 guard for PR2's `WaitForExitAsync` polyfill): + +```bash +dotnet build source/Octopus.Tentacle.Core/Octopus.Tentacle.Core.csproj --framework net48 +dotnet build source/Octopus.Tentacle.Core/Octopus.Tentacle.Core.csproj --framework net8.0 +dotnet build source/Octopus.Tentacle.Core/Octopus.Tentacle.Core.csproj --framework net8.0-windows +``` +Expected: all succeed. + +- [ ] Commit on PR2: `git commit -am "PR2: AbandonScriptAsync Cancel() then Abandon() so abandon best-effort-kills"` + +- [ ] Switch back to PR1: `git switch jimpelletier/eft-3295-tentacle-script-abandonment-to-release-the-mutex` + +--- + +## Notes / spec gaps surfaced during planning + +- **Capability string mismatch.** Spec Section 1 names the capability `AbandonScriptV2`. The code on this branch advertises `nameof(ScriptServiceV2.AbandonScriptAsync)` = `"AbandonScriptAsync"`. This plan matches the *deployed* string in both the Tentacle advertisement and the client `HasAbandonScriptV2()` check. Renaming to `AbandonScriptV2` is a separate contract decision and is intentionally NOT done here. +- **Most of the contract is already merged.** `AbandonScriptCommandV2`, `IScriptServiceV2.AbandonScript`, `AbandonedExitCode = -48`, `ITentacleClient.AbandonScript`, `RunningScript` abandon catch, and the PR1 runner block already exist. The two deltas in this plan are the only behavioural changes Change 1 and Change 2 require. +- **`process.HasExited` is deliberately NOT read in the PR1 abandon branch.** After `Cancel()→Close()` detaches the Process, reading `HasExited`/`ExitCode` would throw or lie. The "abandon unnecessary" race is handled one layer up by `GetResponse` returning the already-`Complete` `RunningScript`'s real code. diff --git a/docs/superpowers/specs/2026-05-21-tentacle-script-abandon-design.md b/docs/superpowers/specs/2026-05-21-tentacle-script-abandon-design.md new file mode 100644 index 000000000..7c07c1722 --- /dev/null +++ b/docs/superpowers/specs/2026-05-21-tentacle-script-abandon-design.md @@ -0,0 +1,477 @@ +# Tentacle script abandon — design + +**Status:** Draft, ready for implementation planning. Contract aligned with the parallel server-side session. +**Ticket:** [EFT-3295](https://linear.app/octopus/issue/EFT-3295/tentacle-script-abandonment-to-release-the-mutex) +**ADR:** [ADR-042 — Defer server-task Abandoned state](https://github.com/OctopusDeploy/adr/pull/226) +**Parallel work:** Server-side (ProcessExecution layer) is being designed in a separate session and consumes the contract proposed here. + +--- + +## Problem + +When a Tentacle script is hung in a way that resists `Process.Kill` (Philips' case: PowerShell stuck inside CrowdStrike + Rapid7 fighting over the same process; kernel-level uninterruptible wait), today's flow ends with: + +- `ScriptIsolationMutex` stays held → subsequent deployments to that Tentacle queue forever. +- The .NET threadpool thread inside `RunningScript.Execute()` stays parked on `process.WaitForExit()` (synchronous). +- The customer's only recovery is RDP-in-and-kill or reboot. Not acceptable for Philips. + +Server-side detects that cancellation hasn't propagated within its own timeout and tells Tentacle to **abandon** the script. Tentacle stops waiting, releases the mutex, logs honestly, and accepts new work. + +**Abandon now best-effort-kills the process** (see ADR-42 reversal below). The runaway OS process survives only when the kill genuinely can't land — exactly the stuck / re-parented-grandchild case the ticket is about. + +## Abandon semantics (the agreed model) + +- **Cancel** = best-effort kill, then **WAIT** for the script to finish and report its real outcome. +- **Abandon** = best-effort kill, then **DON'T wait** — release the isolation mutex and return `AbandonedExitCode` (-48) even if the script didn't die. + +Both attempt the kill. Only abandon stops waiting and releases the mutex. A process survives abandon only when the kill genuinely can't land (stuck / re-parented grandchild). This closes the abuse vector where someone could use abandon to leave a perfectly killable process running unmanaged. + +## Scope + +In scope: +- `IScriptServiceV2` only (Listening + Polling Tentacles). +- New Halibut RPC verb `AbandonScript`, new exit code `AbandonedExitCode = -48`. +- New optional client parameter `abandonAfterCancellationPendingFor` on `TentacleClient.ExecuteScript`. +- Gated by server-side feature flag (`AbandonTentacleScriptOnCancellationTimeoutFeatureToggle`) for the first release. No Tentacle-side flag — capability advertisement is binary on build version. + +Out of scope: +- SSH targets (different lock model; ticket explicitly defers). +- Kubernetes agent (`IKubernetesScriptServiceV1`): different mechanism, separate stuck-pod work already in flight (`KubernetesPendingPodWatchDog`). Server's capability negotiation handles "don't try abandon on Kubernetes targets" cleanly. +- Old `IScriptService` (V1): no signal that any active Tentacle still negotiates V1. +- Server-task Abandoned UI state — deferred by ADR-042; task continues to surface as Cancelled. + +## Section 0 — Two PRs, sequenced + +The work ships as two PRs in sequence so the async migration is reviewable and mergeable independently of the abandon feature, and so the behavioural change to grandchild handling is gated behind universal server abandon support. + +``` +main ← PR1 (#1226) ← PR2 (#1244, draft) +``` + +### PR1 (#1226) — ships first + +The wait in `SilentProcessRunner` is a **synchronous `process.WaitForExit()` run on a `Task`**, raced against an abandon signal via `Task.WhenAny`. The abandon signal is a `TaskCompletionSource` completed when the abandon token fires. (The non-generic `TaskCompletionSource` does NOT exist on net48, so the generic form is required.) + +**Cancel behaves exactly like main.** `cancel.Register` runs `DoOurBestToCleanUp`, which does best-effort `Hitman.Kill` **and `process.Close()`**. `Close` releases the redirected pipe handles, so the synchronous wait returns even when a re-parented grandchild holds stdout/stderr open. So cancel — including the grandchild case — is handled exactly as it is today, **with no server dependency**. Cancel of a genuinely un-killable script hangs the wait until abandon fires. + +`Close` is safe here precisely because the wait is synchronous; it would race the `Exited` event of the async `WaitForExitAsync`, which PR1 does not use. + +PR1 also adds the full abandon contract surface, the client parameter, capability advertisement, and tests. + +### PR2 (#1244) — draft, ships only after all servers support abandon + +Switches the wait to async `await process.WaitForExitAsync(...)` for performance: no thread is blocked per running script. With the async wait, `Close()` can no longer be used to unblock the grandchild case — it races the `Exited` event the async wait depends on — so `process.Close()` is removed from `DoOurBestToCleanUp`. Consequence: **cancel no longer unsticks a re-parented grandchild on its own; grandchildren require abandon (cancel-then-abandon)**. Cancel returns the real kill exit code instead of -1. + +PR2 must ship only after every server in the fleet supports the abandon escalation, because it makes server-side abandon the only recovery path for the grandchild case. + +The two grandchild tests flip from "cancel alone does not hang" (PR1) to "cancel then abandon" (PR2). + +## Section 1 — Contract surface + +Add a method to existing `IScriptServiceV2`. Do NOT introduce V3 — the convention here is method-addition + capability negotiation. The server-side RPC contract below is **unchanged** by the ADR-42 reversal. + +```csharp +// source/Octopus.Tentacle.Contracts/ScriptServiceV2/IScriptServiceV2.cs +public interface IScriptServiceV2 +{ + ScriptStatusResponseV2 StartScript(StartScriptCommandV2 command); + ScriptStatusResponseV2 GetStatus(ScriptStatusRequestV2 request); + ScriptStatusResponseV2 CancelScript(CancelScriptCommandV2 command); + ScriptStatusResponseV2 AbandonScript(AbandonScriptCommandV2 command); // NEW + void CompleteScript(CompleteScriptCommandV2 command); +} + +// NEW: source/Octopus.Tentacle.Contracts/ScriptServiceV2/AbandonScriptCommandV2.cs +public class AbandonScriptCommandV2 +{ + public AbandonScriptCommandV2(ScriptTicket ticket, long lastLogSequence) { /* … */ } + public ScriptTicket Ticket { get; } + public long LastLogSequence { get; } +} +``` + +**Capability advertisement.** Tentacle's `CapabilitiesServiceV2` advertises `AbandonScriptV2` once the build supports it. Binary on build version, no Tentacle-side toggle. Server's existing `BackwardsCompatibleAsyncCapabilitiesV2Decorator` handles "Tentacle doesn't advertise it → don't call it" for older Tentacles. Server-side `AbandonTentacleScriptOnCancellationTimeoutFeatureToggle` is the only feature-flag off-switch. + +**Why a new verb (not a "force" flag on Cancel).** Different semantics: Cancel = "best-effort kill, then wait for the real outcome". Abandon = "best-effort kill, then stop waiting; release the mutex; return -48". Two verbs map cleanly to ProcessExecution's two-step escalation (cancel first, abandon if cancel doesn't propagate). + +## Section 2 — Where abandon's kill lives, and mutex release + +**The core constraint.** `RunningScript.Execute()` acquires `ScriptIsolationMutex` inside a `using` block that wraps the call into `SilentProcessRunner`. That call blocks (PR1) or awaits (PR2) on the process wait. When the wait never returns: +1. The mutex is welded shut (the `using`'s Dispose never runs). +2. In PR1 the threadpool thread is parked; in PR2 no thread is parked, but the awaiter never completes. + +Abandon solves both by completing the wait via the abandon signal regardless of whether the OS process exited. + +**Where the kill lives.** `ScriptServiceV2.AbandonScriptAsync` calls only `runningScript.Abandon()`. The kill happens in `SilentProcessRunner`'s **abandon branch**: when the abandon token is observed it calls `DoOurBestToCleanUp` (best-effort `Hitman.Kill`, idempotent if cancel already ran it) and then returns `AbandonedExitCode`. Doing the kill there — in the runner, sequentially in the abandon branch — closes the abuse vector for ANY abandon (including a direct RPC with no prior cancel), and is **race-free**. + +> **Why not fire the cancel token from `AbandonScriptAsync` (the originally-proposed shape)?** Firing `Cancel()` then `Abandon()` races: cancel's `Hitman.Kill` makes a killable process exit, which resolves the runner's wait to the *cancel* exit code (`-1`) before the abandon token is observed — the runner returns `-1` instead of `-48`. Reversing the order makes `-48` deterministic but then the kill races the runner disposing the process. Doing the kill sequentially inside the abandon branch avoids both races. (Proven by `AbandonScript_WithNoPriorCancel_KillsTheProcess`.) + +**Exit code when both tokens fire.** The runner returns `-48` whenever the **abandon path resolves the wait** — i.e. the process is still blocking the wait when abandon fires. This is the case the feature exists for: cancel's kill could not unstick the script (genuinely stuck / re-parented grandchild), so the wait is still pending when abandon arrives. + +It is NOT a goal to return `-48` when cancel and abandon race on a *killable* process. There, cancel's kill can resolve the wait to the killed exit code before abandon is observed, and returning that code is correct — the script is being torn down either way. This race does not occur in production: cancel and abandon arrive as separate, sequential RPCs, and the server only sends abandon **after** cancel has failed to unstick the script. By the time abandon's RPC lands, a killable script has already completed (its later abandon RPC is a no-op against an already-`Complete` script); only a stuck script is still waiting, and that path is deterministically `-48`. Both PR1 (`Task.WhenAny`) and PR2 (async abandon-catch) behave this way — neither tries to override cancel's exit code when cancel wins the kill race. + +**PR1 wait shape (synchronous wait raced against abandon):** + +```csharp +using (cancel.Register(() => DoOurBestToCleanUp(process, error))) // Hitman.Kill + process.Close() +{ + process.BeginOutputReadLine(); + process.BeginErrorReadLine(); + + var abandonTcs = new TaskCompletionSource(); + using (abandon.Register(() => abandonTcs.TrySetResult(null))) + { + var waitTask = Task.Run(() => process.WaitForExit()); + await Task.WhenAny(waitTask, abandonTcs.Task).ConfigureAwait(false); + + if (abandon.IsCancellationRequested && !process.HasExited) + { + info("Tentacle has abandoned this script. The underlying script process may still be running on this host."); + SafelyCancelRead(process.CancelErrorRead, debug); + SafelyCancelRead(process.CancelOutputRead, debug); + return ScriptExitCodes.AbandonedExitCode; + } + + // process exited (naturally, or via cancel-triggered Kill+Close releasing the pipes) + SafelyCancelRead(process.CancelErrorRead, debug); + SafelyCancelRead(process.CancelOutputRead, debug); + return SafelyGetExitCode(process); + } +} +``` + +**PR2 wait shape (async wait, no `process.Close()`):** the `Task.Run(WaitForExit)` + `WhenAny` is replaced by `await process.WaitForExitAsync(abandon)`, and `process.Close()` is removed from `DoOurBestToCleanUp`. The abandon-catch keys on `abandon.IsCancellationRequested && !process.HasExited` exactly as above. Cancel returns the real kill exit code via the natural `Exited`-event completion. + +**Diff shape.** The wait method and its callers migrate to async (`ExecuteCommand` → `ExecuteCommandAsync`, returning `Task`). A search across the repo found ~20 call sites; every one migrates. PR1 introduces the async signature with the synchronous-wait-on-a-Task body and a net48 polyfill where `WaitForExitAsync` is unavailable; PR2 swaps the body to the true async wait. + +Production code: +- `source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs` — the method itself. Async signature, two-token (`cancel`, `abandon`). +- `source/Octopus.Tentacle/Util/ISilentProcessRunner.cs` — interface and in-process wrapper become async. +- `source/Octopus.Tentacle/Util/CommandLineRunner.cs` — caller migration. +- `source/Octopus.Tentacle.Core/Services/Scripts/RunningScript.cs` — `RunScript` → `RunScriptAsync`; ctor takes `abandonToken` alongside `runningScriptToken`; `Execute()` awaits the new path. +- `source/Octopus.Tentacle.Core/Services/Scripts/ScriptServiceV2.cs` — `LaunchShell` passes `abandonToken` from the wrapper. `RunningScriptWrapper` gains `abandonTokenSource`. `AbandonScriptAsync` calls `Abandon()` (the best-effort kill lives in the runner's abandon branch — see above). +- `source/Octopus.Tentacle.Contracts/ScriptServiceV2/` — new `AbandonScriptCommandV2.cs`, interface method on `IScriptServiceV2.cs` (per Section 1). +- `source/Octopus.Tentacle.Contracts/ScriptExitCodes.cs` — add `AbandonedExitCode = -48`. +- Capabilities advertisement (`AbandonScriptV2`). + +Immediate sync callers migrated with `.GetAwaiter().GetResult()` and a sync-boundary comment ("We're on a plain thread-pool worker — when the async work finishes it can resume on any free thread, so the block resolves normally"): +- `PowerShellPrerequisite.Check()`, `KubernetesDirectoryInformationProvider.GetDriveBytesUsingDu()`, `SystemCtlHelper.RunServiceCommand()` (×2), `LinuxServiceConfigurator` (×3), `WindowsServiceConfigurator.Sc()`, `CommandLineRunner.Execute(CommandLineInvocation, ...)`. + +Kubernetes integration test scaffolding (all caller-migration, no logic change): +- `KubeCtlTool.cs`, `DockerImageLoader.cs` (×2), `KubernetesAgentInstaller.cs` (×3), `KubernetesClusterInstaller.cs` (×4), `HelmDownloader.cs`, `ToolDownloader.cs` (all under `source/Octopus.Tentacle.Kubernetes.Tests.Integration/`). + +Tentacle integration test scaffolding (caller migration): +- `PowerShellStartupDetectionTests.cs` (×3), `Util/SilentProcessRunnerFixture.cs`, `Support/TentacleFetchers/LinuxTentacleFetcher.cs` (all under `source/Octopus.Tentacle.Tests.Integration/`). + +**What happens to stdout/stderr after abandon.** Returning `AbandonedExitCode` unwinds the method. The outer `using (var process = new Process())` disposes the Process, which closes our end of the redirected pipes. The OS process may get EPIPE on its next stdout/stderr write. The script's runtime keeps doing whatever it's doing; many scripts ignore broken-pipe errors, and scripts that fail on them already had nowhere to log anyway. The alternative — leaving the Process and its pipes pinned in memory indefinitely — is a resource-accumulation problem. + +**Async correctness watch-outs for the implementation plan:** +- Every new async method gets `.ConfigureAwait(false)`. +- No `.Result` / `.Wait()` on the new path; surface any caller that can't easily go async rather than block-on-async. +- Verify no deadlock under the Tentacle's synchronisation context (none expected, but worth confirming). + +**Rejected alternatives** (documented for the reviewer's benefit): +- **Orphan the Task + release mutex via external Dispose.** Releases the mutex but leaks a threadpool worker per abandon. Tentacle eventually starves the threadpool. +- **Manual `Thread` instead of `Task`.** Same leak problem, just trades threadpool for kernel thread handles + stack memory. +- **`Thread.Abort` / `Thread.Interrupt` / `TerminateThread` P/Invoke.** No safe managed mechanism to release a thread parked in unmanaged code; can corrupt Tentacle's own state. +- **Out-of-process script worker.** Cleanly isolates the stuck-process problem but is a massive refactor far outside EFT-3295's scope. Worth a separate proposal someday. + +## Section 3 — Client parameter (PR1) + +Add one optional parameter, threaded through the client orchestration: + +```csharp +TimeSpan? abandonAfterCancellationPendingFor = null +``` + +It is added to `TentacleClient.ExecuteScript`, `ITentacleClient.ExecuteScript`, and `ObservingScriptOrchestrator`. + +The orchestrator's poll loop (`ObservingScriptOrchestrator.ObserveUntilComplete`) already calls `CancelScript` every iteration while the `CancellationToken` is cancelled. When the parameter is set: +1. Record when cancellation first fired. +2. Once elapsed crosses the threshold AND the `AbandonScriptV2` capability is advertised, call `AbandonScript` instead of `CancelScript`. +3. If the capability is absent, keep cancelling (skip the abandon attempt cleanly). + +Add an `AbandonScript` method to the V2 script executor (`ScriptServiceV2Executor`) with the capability check. `ScriptExecutionResult.ExitCode` is the source of truth (-48 = abandoned). + +**No shared constant in `Octopus.Tentacle.Contracts`.** An earlier ask for a shared abandoned-message constant was retracted; the Tentacle's own script-log line is the only customer-facing message. + +## Section 4 — State, exit code, log wording + +- **Exit code:** `ScriptExitCodes.AbandonedExitCode = -48`. Distinct from `CanceledExitCode (-43)`. Server-side telemetry can tell abandoned from cancelled even though task UI surfaces both as "Cancelled" per ADR-042. +- **State on GetStatus after abandon:** `(ProcessState.Complete, AbandonedExitCode, latestLogs)`. Same shape as Cancel returns today. +- **Honest log line:** `"Tentacle has abandoned this script. The underlying script process may still be running on this host."` Written once, into the workspace script log, near the end of the abandon path. Still accurate after the ADR-42 reversal: the process survives only when the best-effort kill couldn't land. +- **Workspace cleanup on subsequent `CompleteScript`:** targeted best-effort. `CompleteScript` reads the stateStore and checks the persisted exit code. If `AbandonedExitCode`, wrap `workspace.Delete` in try/catch, log a `Warn` to systemLog naming the leaked directory, return success. For any other exit code, `workspace.Delete` is called as today and exceptions propagate. The relaxed-deletion policy applies only to the rare abandon case; bugs that leak handles on normal-completion paths can't hide under a blanket try/catch. No janitor — OS-level state on the host is the customer's problem per the ticket. +- **Idempotency — actual-status return (NOT silent no-op):** + - Abandon called twice on the same already-abandoned ticket → returns the cached `(Complete, AbandonedExitCode, logs)` response. + - Abandon called on a ticket that completed naturally before the abandon arrived (race case the server-side session flagged) → returns `(Complete, realExitCode, logs)` with the **real exit code**, distinct from `AbandonedExitCode`. The server uses this distinction to log *"Script had already completed before abandon was needed"* instead of *"Tentacle abandoned the script"*. + - Abandon called on an unknown ticket (never started, or already cleaned up via `CompleteScript`) → returns `(Complete, UnknownScriptExitCode, [])`, matching Cancel's behaviour for the same case. +- **Race with natural completion:** the wrapper's existing `StartScriptMutex` (or a new dedicated lock) serialises abandon entry. If state is already Complete, abandon returns the cached status per the rules above. + +## Section 5 — Automated test strategy + +### 5.1 `SilentProcessRunner` unit tests + +Style: matches existing `SilentProcessRunnerFixture.cs`. Use short-lived helper scripts/exes as process subjects. + +| Test | Trigger | Verify | +|---|---|---| +| Normal exit | Run a process that exits 0 | Returns 0; no abandon log line captured by the `info` callback spy. | +| Cancel kills process | Long-running process; fire cancel token | Within 1s: process is killed (`process.HasExited == true`). PR1: return is the kill-induced exit code (Linux 137; Windows process-defined) via the `Close`-released pipe path. PR2: return is the real kill exit code. No abandon log line. | +| Abandon while running | Long-running process; fire abandon token | Within ~100ms: returns `AbandonedExitCode`, `info` callback received exactly one call containing "Tentacle has abandoned this script". Assert `process.HasExited == false`; clean up by killing externally. | +| Abandon AFTER natural exit (race) | Process that exits in ~50ms; fire abandon token at the moment exit fires | Return value is the process's real exit code, not `AbandonedExitCode`. No abandon log line. Verifies the `abandon.IsCancellationRequested && !process.HasExited` guard. | +| Cancel also requested (abandon first) | Long-running process; fire abandon, then also fire cancel | Returns `AbandonedExitCode`. Abandon resolves the wait deterministically; the additional cancel does not change the result. (Firing abandon first avoids the killable-process race — see "Exit code when both tokens fire" above.) The stuck both-tokens case is covered end-to-end by the integration test `AbandonScript_WhenCancelFailsToKillProcess`. | + +**Grandchild (re-parented) tests — differ by PR.** A child process spawns a grandchild that inherits stdout/stderr and outlives the child. +- **PR1:** "cancel alone does not hang" — `cancel.Register`'s `process.Close()` releases the inherited pipe handles, so the synchronous wait returns even though the grandchild holds them open. No abandon needed. +- **PR2:** "cancel then abandon" — `process.Close()` is gone, the async wait depends on the `Exited` event, so cancel alone leaves the wait pending; abandon is required to complete it and return -48. + +**Async-specific timing assertion (PR2):** `WaitForExitAsync(token)` returns within ~50ms of cancellation. Wrap the await in `Stopwatch.StartNew()`; assert elapsed < 100ms. Proves the async wait is independent of process exit. + +**Thread-leak regression test (PR2):** start 50 stuck processes via `ExecuteCommandAsync` (all `await`ed in parallel), fire abandon on all; capture `Process.GetCurrentProcess().Threads.Count` before and 1s after; assert delta ≤ 5 (allow for threadpool jitter). The async path should produce zero parked threads at steady state. (Under PR1 the synchronous-wait-on-a-Task body still parks a worker per running script, so this assertion is PR2-only.) + +### 5.2 `ScriptServiceV2` service-layer tests + +Style: matches existing service-layer fixtures using in-memory script shells and stub workspace factories. + +| Test | Trigger | Verify | +|---|---|---| +| **Mutex release (load-bearing)** | Start `FullIsolation` script; abandon it; immediately start second `FullIsolation` script | Second `StartScript` returns with `State == Running` within 1s. Reading `ScriptIsolationMutex.TaskLock.Report()` between abandon and second-start shows the lock free in that window. | +| Abandon before StartScript | Call AbandonScript with a ticket never seen | Returns `(Complete, UnknownScriptExitCode)`. Matches existing Cancel behaviour for unknown ticket. | +| Abandon after CompleteScript | Start → Complete → Abandon | Returns `(Complete, UnknownScriptExitCode)` (wrapper already removed; stateStore gone). | +| Abandon then Cancel | Abandon, then Cancel same ticket | Cancel returns the cached abandoned response unchanged. Asserts via response equality. | +| **Cancel then Abandon (real flow)** | Long-running script; cancel; `cancel.Register` no-op'd to simulate un-killable; abandon | Final GetStatus returns `(Complete, AbandonedExitCode, logs)`. Log content includes the honest line. Subsequent same-ticket StartScript returns the cached state. | +| **Abandon best-effort-kills (anti-abuse)** | Long-running but killable script; call AbandonScript directly with no prior cancel | The process is killed (the runner's abandon branch ran `DoOurBestToCleanUp → Hitman.Kill`). Final state is `(Complete, AbandonedExitCode, logs)`. Confirms a direct abandon does not leave a killable process running unmanaged. | +| Abandon during StartScript launch | Concurrent: StartScript holding `StartScriptMutex`, AbandonScript called | Abandon serialises behind StartScript via the existing wrapper mutex. Final state is consistent (no half-abandoned wrapper). | +| Capability advertisement | Tentacle build with the abandon feature; query `CapabilitiesServiceV2.GetCapabilities()` | Response includes `AbandonScriptV2`. Builds without the feature do not advertise it. | + +### 5.3 Client orchestrator tests + +| Test | Trigger | Verify | +|---|---|---| +| Param unset → cancel only | `abandonAfterCancellationPendingFor = null`; cancel the token mid-execution | Orchestrator calls `CancelScript` each poll; never calls `AbandonScript`. | +| Threshold crossed + capability present | Param set to a short span; capability advertised; cancellation pending past the span | Orchestrator switches from `CancelScript` to `AbandonScript` after the threshold. `RecordMethodUsages` shows the `AbandonScript` call. | +| Threshold crossed + capability absent | Param set; `AbandonScriptV2` NOT advertised; cancellation pending past the span | Orchestrator keeps calling `CancelScript`; never calls `AbandonScript`; no error. | + +### 5.4 Integration tests (real shells, real processes) + +Style: matches `Octopus.Tentacle.Tests.Integration/ClientScriptExecutionIsolationMutex.cs` (real Tentacle, real script, mutex semantics under test). + +**Timing flakiness: use the existing builders, not raw shell + `Thread.Sleep`.** +- `ScriptBuilder` (`Octopus.Tentacle.CommonTestUtils/Builders/ScriptBuilder.cs`): `.CreateFile(path)` signals "script reached this line"; `.WaitForFileToExist(path)` blocks the script on an event, not a sleep race. +- `TestExecuteShellScriptCommandBuilder` (`Octopus.Tentacle.Tests.Integration/Util/Builders/`): `.SetScriptBody(ScriptBuilder)`, `.WithIsolationLevel(...)`, `.WithIsolationMutexName(...)`, `.Build()`. +- `TentacleConfigurationTestCase.CreateBuilder()` and `ClientAndTentacleBuilder` set up real Tentacle + Halibut. +- `TentacleServiceDecoratorBuilder.RecordMethodUsages(...)` decorates the script service so the test can assert call counts for the new `AbandonScript` verb and capability negotiation. +- `Wait.For(condition, timeout, onFail, ct)` is the event-driven polling helper. Always preferred over `Task.Delay`. + +**Pattern to follow:** mirror `ClientScriptExecutionIsolationMutex.cs`. Stuck-script tests use `ScriptBuilder.WaitForFileToExist(...)` as the "kernel-blocked" simulant rather than `sleep 600`. For the un-killable variant, combine the file-wait with the `Tentacle.Debug.DisableProcessKill` flag so `Hitman` becomes a no-op for the test's duration. + +| Test | Trigger | Verify | +|---|---|---| +| PowerShell + cancel (kill works) | Real PowerShell, `Start-Sleep -Seconds 600`, fire Cancel, normal kill path | Final response is `(Complete, CanceledExitCode)` via the existing path. **Negative check:** abandon log line NOT present. Confirms Cancel isn't regressed into the abandon path. | +| PowerShell + abandon (kill mocked off) | Real PowerShell, sleep; `Hitman` mocked to no-op; fire Cancel; wait; fire AbandonScript | Within 2s of abandon: response is `(Complete, AbandonedExitCode, [...honest log line...])`; mutex is free (start a second `FullIsolation` script that acquires within 1s); the real PowerShell process is still alive (verified via `Process.GetProcessById`). Test cleanup: kill the leftover PowerShell. | +| **Multi-level-deep hang (ticket-mandated)** | bootstrap → Calamari-shim → user script, with `Hitman` no-op flag set | All verifications from the previous row pass through the multi-level launch chain. Confirms abandon works when the stuck process is not the immediate child of Tentacle. | +| **Re-parented grandchild — PR1** | Child spawns a grandchild that inherits stdout/stderr and outlives it; fire Cancel | Cancel alone returns (does NOT hang): `process.Close()` releases the inherited pipes. No abandon needed. | +| **Re-parented grandchild — PR2** | Same setup; fire Cancel, then AbandonScript | Cancel alone leaves the wait pending; abandon completes it and returns `(Complete, AbandonedExitCode)`. The two grandchild tests flip from PR1's "cancel alone does not hang" to PR2's "cancel then abandon". | +| Windows workspace cleanup with open handles | Run the abandon path; leave the simulated zombie holding the workspace log file open; call CompleteScript | CompleteScript returns without exception. Tentacle systemLog contains a `Warn` naming the leaked workspace directory. Workspace dir on disk still exists (assert via `Directory.Exists`). No exception bubbles up. | +| Polling Tentacle variant | Configure test fixture as Polling | All verifications from the kill-mocked-off row pass against a Polling Tentacle. | + +**End-to-end async thread audit (PR2).** Capture `Process.GetCurrentProcess().Threads.Count` 5s into a stuck-script scenario; assert no thread parked attributable to the script pipeline. Most reliable proxy: total thread count not higher than baseline + epsilon. + +**Normal-path timing regression check.** Run a 100-iteration benchmark of normal short-script execution (`Write-Host "x"`); compare median wall-clock time vs. a baseline build. **Verify:** median delta within margin of error. + +## Section 6 — Manual testing plan + +Manual scenarios on a real test Tentacle. All scenarios assume the parallel server-side build is deployed. + +### Setup + +- Test Octopus Server with EFT-3295 server-side build. +- Windows Tentacle (primary) + Linux Tentacle (smoke). +- Debug Tentacle build with `Tentacle.Debug.DisableProcessKill=true` making `Hitman.TryKillProcessAndChildrenRecursively` a no-op — simulant for "kill doesn't work" without engineering real kernel-level waits. +- Server-side feature flag `AbandonTentacleScriptOnCancellationTimeoutFeatureToggle` (default ON, configured on the test Octopus Server). + +### Where to find things (reference for verification steps below) + +- **Tentacle systemLog (Windows):** `C:\Octopus\Logs\OctopusTentacle.txt` (confirm via `Tentacle show-configuration`). +- **Tentacle systemLog (Linux):** `/etc/octopus//Logs/OctopusTentacle.txt`. +- **Tentacle workspace root:** `/Work/`. Each script gets a subdirectory named after its `ScriptTicket`. Inside: `bootstrapRunner.log`, `Output.log`, `script.ps1`/`Bootstrap.sh`, the state store file. +- **Script log in UI:** Octopus Server → the task → expand the deployment step. This is what the customer sees and what gets the honest abandon line. +- **Thread count (Windows):** PowerShell `(Get-Process Tentacle).Threads.Count`, or Process Explorer's Threads tab. +- **Thread count (Linux):** `ps -o nlwp= -p $(pgrep -f Tentacle)`. +- **Capability advertisement:** Tentacle systemLog at startup contains `Negotiated capabilities: [...]`. Or enable Halibut verbose tracing server-side and inspect the `CapabilitiesResponseV2` payload. +- **Mutex state in Tentacle log:** grep for `acquiring isolation mutex` / `Lock acquired` / `Releasing lock` with the relevant task ID. + +### M1 — Regression smoke (flag ON, normal script) + +Deploy `Write-Host "hello"; Start-Sleep 5; Write-Host "done"`. + +**Verify (all must pass):** +1. Octopus UI task status → **Success** (green tick). +2. Script log in UI shows `hello` and `done`; no abandon line. +3. Tentacle systemLog: `grep "abandon" OctopusTentacle.txt` → zero matches for this task ID. +4. Tentacle systemLog shows the normal acquire/release pair for this task ID. +5. Thread count (sampled 5s after task completes) → within ±2 of pre-test baseline. + +### M2 — Cancel still works (flag ON, killable script) + +`DisableProcessKill=false`. Deploy `Start-Sleep -Seconds 300`. Wait ~10s. Click **Cancel** in Octopus UI. + +**Verify:** +1. UI task status transitions to **Cancelled** within 30s. +2. Tentacle systemLog shows the kill attempt followed by mutex release for this task ID. +3. PowerShell process is gone (match by PID captured from Tentacle log at script start). +4. `grep "abandon" OctopusTentacle.txt` → zero matches for this task ID. Cancel path was used, not abandon. +5. Deploy a second project to the same Tentacle → starts immediately (mutex released by the normal Cancel path). + +### M3 — The Philips scenario (flag ON, unkillable script) + +`Tentacle.Debug.DisableProcessKill=true`. Restart Tentacle. Capture thread-count baseline. Deploy `Start-Sleep -Seconds 600`. Note the PowerShell PID from the Tentacle log. Click **Cancel** after ~10s. Wait for the server-side abandon timeout (2 min for first release). + +**Verify (all must pass; this is the load-bearing scenario):** + +1. **Server side called Abandon.** Server log shows an `AbandonScript` call for this task's ticket, timestamped after the Cancel attempt + the abandon timeout. +2. **Honest log line in the customer-visible task log.** Confirm `Tentacle has abandoned this script. The underlying script process may still be running on this host.` is present in the script log section. +3. **Tentacle systemLog records the abandon path.** Shows: AbandonScript invocation received, best-effort kill attempted (no-op'd here), abandon signal fired, mutex released, wrapper removed. +4. **Mutex released — load-bearing check.** Immediately deploy a second trivial project. **Pass:** starts within 5s. **Fail:** queues with "Waiting for the script in task...". +5. **Task UI status = Cancelled** (no separate "Abandoned" state — per ADR-042). +6. **Thread count returned to baseline** (PR2). Sample 10s after the abandon. **Pass:** within ±2 of baseline. (Under PR1 a worker is parked per running script in the normal case, so use the PR2 build for this check.) +7. **The PowerShell process is still alive on the host.** `Get-Process -Id ` returns the process — because the best-effort kill was no-op'd. Kill it manually at end of test for cleanup. (With `DisableProcessKill=false` abandon would have killed it; this scenario specifically simulates the kill not landing.) +8. **Exit code in the task log = -48 (AbandonedExitCode)**, distinct from `-43` (CanceledExitCode). + +### M4 — Repeated abandon (thread/handle-leak check under repetition) + +Capture baseline thread count and Tentacle working-set memory. Run M3 ten times back-to-back (script the loop). + +**Verify:** +1. Sample thread count after each iteration (PR2 build). **Pass:** within ±5 of baseline across all ten runs. **Fail:** monotonic growth. +2. Sample working-set memory after each iteration. **Pass:** within ~50MB of baseline. **Fail:** grows by more than ~10MB per iteration. +3. After all ten runs, deploy a normal project. **Pass:** runs normally. +4. Kill leftover `powershell.exe` / `sleep` processes manually at end of test. + +### M5 — Server-side flag off (Tentacle behaves as today) + +Set `AbandonTentacleScriptOnCancellationTimeoutFeatureToggle` to OFF in the test Octopus Server. Restart Server. Leave Tentacle untouched. + +**Verify:** +1. **Server doesn't dispatch Abandon.** Repeat the M3 setup. Wait past the 2-minute point. Server log: zero `AbandonScript` matches for this task ID. +2. **Tentacle still advertises the capability.** `CapabilitiesResponseV2` still contains `AbandonScriptV2`. The flag lives on the Server, not on Tentacle. +3. **Tentacle stays wedged.** Subsequent deployment queues with "Waiting for the script in task...". Confirms today's behaviour is preserved when Server has the feature off. +4. Recovery: restart Tentacle (the existing workaround). + +### M6 — Workspace cleanup with open handles (Windows-specific) + +Run M3 to completion. Note the `ScriptTicket` from the Tentacle log. + +**Verify:** +1. **Workspace dir still exists.** Listing shows log files present; open handles prevent deletion. +2. **systemLog records the failure.** A `Warn`-level entry names the directory that could not be deleted, with the underlying I/O exception message. +3. **No propagated exception to Server.** `CompleteScript` returns normally; Server log shows successful completion. +4. **Tentacle continues to function.** Deploy a third project (not to the wedged workspace). **Pass:** runs normally. +5. **Manual cleanup works after the zombie process is killed.** Confirms the leak is bounded. + +### M7 — Polling Tentacle variant + +Register a Polling Tentacle. Repeat M3 setup and execution. + +**Verify:** +1. All M3 verification points pass with no Polling-specific differences. The Halibut RPC path is the same — only connection initiation direction differs. +2. **Polling-specific check:** during the abandon, Tentacle's polling loop continues. **Pass:** polling not blocked by the abandon flow. +3. After abandon, the Polling Tentacle picks up the next deployment. **Pass:** new deployment dispatched and runs (mutex released). + +### M8 — Linux smoke + +On a Linux Tentacle, deploy a Bash script: `sleep 600`. Repeat M2 (kill works) and M3 (kill mocked off). + +**Verify:** +1. **M2 on Linux:** `ps -p ` shows the bash/sleep process gone after Cancel. Same outcomes as Windows M2. +2. **M3 on Linux:** all M3 verification points pass. Thread count via `ps -o nlwp= -p $(pgrep -f Tentacle)`. Workspace location: `/etc/octopus//Work//`. +3. **Linux file-handle behaviour differs:** Linux generally allows deletion of files held open by other processes. For M6's analogue on Linux, workspace deletion is more likely to succeed even with the zombie running. Note in test result. +4. Confirms the implementation isn't accidentally Windows-only. + +### M9 — Server escalation ordering + +Server escalation is hardcoded at **2 minutes** post-Cancel for the first release. Ask the server-side session for a debug-build override constant to run this faster. + +**Verify the killable case (no escalation expected):** +1. Run M2 (killable + cancel). Wait at least 3 minutes. +2. Server log: zero `AbandonScript` matches for this task ID. Cancel succeeded inside the 2-minute window. +3. Tentacle log: zero abandon entries for this task ID. + +**Verify the unkillable case (escalation expected):** +4. Run M3 (kill mocked off + cancel). Wait through the 2-minute timeout. +5. Server log: exactly one `AbandonScript` match, timestamped ~2 minutes after the Cancel. +6. Tentacle log: one abandon entry for this task ID. + +**Verify the actual-status race case** (server-side session's idempotency concern): +7. Set up M3, but let the script complete naturally just before the 2-minute timer fires (~110-second script). +8. Server fires AbandonScript anyway because the completion event hasn't reached it yet. +9. Tentacle returns `(Complete, realExitCode, logs)` — NOT `AbandonedExitCode`. +10. Server task log entry: *Script had already completed before abandon was needed.* + +**Bug indicators to flag back to the server session:** +- Server calls AbandonScript on every Cancel (even killable cases) → server's escalation predicate is wrong. +- Server retries AbandonScript multiple times for the same ticket → server idempotency broken. +- Server calls AbandonScript before the 2-minute window → timer is wrong. +- Server calls AbandonScript with the Tentacle capability missing → capability gating broken. + +### Sign-off criteria + +To turn the feature flag on by default in a future release: M1–M5 pass on Windows; M3 + M4 pass on Linux; M7 passes on Polling; M9 confirms server escalation policy; M6 confirms workspace leak is bounded and logged. + +## ADR-42 / "not killed" reversal + +The original ticket and ADR-42 said the runaway process is **not** killed. This design **reverses** that: abandon now best-effort-kills the process (anti-abuse — see Abandon semantics). The Tentacle script-log line remains accurate because the process survives only when the kill genuinely couldn't land. + +Follow-ups required outside this Tentacle work: +- **ADR-42** — update to reflect best-effort-kill-on-abandon. +- **EFT-3295 ticket scope** — update the "do not kill" statement. +- **Docs PR #3175** — update the customer-facing wording to match best-effort-kill-on-abandon. + +**Unchanged:** the server-side RPC contract — `AbandonScript(AbandonScriptCommandV2) → ScriptStatusResponseV2`, exit code -48 — is identical to what was already agreed. + +## Risks and rollout + +- **Feature flag off by default** for the first release. Customer-by-customer opt-in. +- **PR2 sequencing:** PR2 (#1244) must ship only after every server in the fleet supports the abandon escalation, because PR2 removes `process.Close()` and makes server abandon the only recovery for the re-parented-grandchild case. +- **Sequence:** after EFT V1 cleanup closes (target end May 2026), before Task Cap 320, targeting Philips' July self-host release. +- **Telemetry:** count of AbandonScript calls per Tentacle per day. Spike = signal that either Cancel is broken or this feature is masking a different bug. +- **Soak test pre-release:** 1000 normal scripts with the server-side flag ON, verify no resource leak vs. flag OFF baseline. + +## Open questions for external reviewer + +(None remaining. Workspace cleanup policy resolved 2026-05-21 — targeted best-effort gated on `AbandonedExitCode` in the stateStore. No janitor; OS-level state on the host is the customer's responsibility per the ticket.) + +## Coordination — locked with the server-side session (2026-05-21) + +Aligned via Linear thread on EFT-3295 (commenter Jim, both sessions). Items below are locked unless explicitly noted. + +**Contract (final shape):** + +- `ScriptStatusResponseV2 AbandonScript(AbandonScriptCommandV2 command)` on `IScriptServiceV2`. +- `AbandonScriptCommandV2 { ScriptTicket Ticket; long LastLogSequence; }` — same shape as `CancelScriptCommandV2`. Server-side dropped its initial `ServerTaskId` and "cancellation correlation id" proposal; `ScriptTicket` is sufficient. +- Capability name: `AbandonScriptV2`. + +**Idempotency (final):** Tentacle returns actual current status. Already-completed script returns `(Complete, realExitCode, logs)` — distinct from `AbandonedExitCode`, so the server's task log entry can record that the abandon was unnecessary. Unknown/already-cleaned-up ticket returns `(Complete, UnknownScriptExitCode, [])`, matching Cancel's existing shape. + +**Capability check is the primary gate.** Server uses `BackwardsCompatibleAsyncCapabilitiesV2Decorator` to query `AbandonScriptV2` once per session. Capability absent → server does not schedule the abandon dispatch at all. The RPC-fail-then-log path stays as a defensive fallback for capability-cache staleness. + +**One off-switch, server-side:** `AbandonTentacleScriptOnCancellationTimeoutFeatureToggle` (default ON). Governs whether server escalates to AbandonScript at all. No Tentacle-side flag — Tentacle's capability advertisement is binary on build version. + +**Escalation timing (locked for first release):** 2 minutes. Both V1 and V2 execution pipelines escalate to AbandonScript on their next status-poll once cancellation has been pending that long. Hardcoded on the server toggle class, not configurable. Server-side trigger is a polling-loop check, not a delayed NSB message; no new server-side timers. The Tentacle-side contract is unchanged either way. + +**Execution-pipeline scope (server-side, 2026-05-21):** V1 *and* V2 server-side execution pipelines call AbandonScript via the same contract. Philips is V1 self-host so V1 is the urgent path. Doesn't change anything Tentacle is building. + +**Post-abandon flow:** + +1. Server calls `AbandonScript` → gets `ScriptStatusResponseV2`. +2. Server publishes `TentacleScriptAbandonedEvent`. +3. Existing post-cancel path proceeds (eventually calls `CompleteScript` downstream). + +**Task log wording:** + +- Tentacle script log (this doc's Section 4): *Tentacle has abandoned this script. The underlying script process may still be running on this host.* +- Server task log (server session's surface). Server session's working proposal: + - On dispatch: *Cancellation hasn't taken effect on Tentacle after 2 minutes. Abandoning the script to release the script-isolation mutex.* + - On Tentacle returning `AbandonedExitCode`: *Tentacle abandoned the script.* + - On Tentacle returning a real exit code (abandon unnecessary): *Script had already completed before abandon was needed.* +- I pushed back on the dispatch wording — "script-isolation mutex" exposes internal terminology to the customer. Suggested rewrite: *Cancellation hasn't taken effect on Tentacle after 2 minutes. Abandoning the script so this target can accept new deployments.* Server session's call which to ship with. diff --git a/source/Octopus.Tentacle.Client/Capabilities/CapabilitiesResponseV2ExtensionMethods.cs b/source/Octopus.Tentacle.Client/Capabilities/CapabilitiesResponseV2ExtensionMethods.cs index bc49a1e95..353073db7 100644 --- a/source/Octopus.Tentacle.Client/Capabilities/CapabilitiesResponseV2ExtensionMethods.cs +++ b/source/Octopus.Tentacle.Client/Capabilities/CapabilitiesResponseV2ExtensionMethods.cs @@ -15,5 +15,17 @@ public static bool HasScriptServiceV2(this CapabilitiesResponseV2 capabilities) return capabilities.SupportedCapabilities.Contains(nameof(IScriptServiceV2)); } + + public static bool HasAbandonScript(this CapabilitiesResponseV2 capabilities) + { + if (capabilities?.SupportedCapabilities?.Any() != true) + { + return false; + } + + // Both sides nameof IScriptServiceV2.AbandonScript, so the strings match and a rename + // on either side can't silently drift the capability check. + return capabilities.SupportedCapabilities.Contains(nameof(IScriptServiceV2.AbandonScript)); + } } } \ No newline at end of file diff --git a/source/Octopus.Tentacle.Client/ITentacleClient.cs b/source/Octopus.Tentacle.Client/ITentacleClient.cs index c79a285fc..9f3d28413 100644 --- a/source/Octopus.Tentacle.Client/ITentacleClient.cs +++ b/source/Octopus.Tentacle.Client/ITentacleClient.cs @@ -8,6 +8,7 @@ using Octopus.Tentacle.Client.Scripts.Models; using Octopus.Tentacle.Contracts; using Octopus.Tentacle.Contracts.Logging; +using Octopus.Tentacle.Contracts.ScriptServiceV2; namespace Octopus.Tentacle.Client { @@ -31,7 +32,8 @@ Task ExecuteScript( OnScriptStatusResponseReceived onScriptStatusResponseReceived, OnScriptCompleted onScriptCompleted, ITentacleClientTaskLog logger, - CancellationToken scriptExecutionCancellationToken); + CancellationToken scriptExecutionCancellationToken, + TimeSpan? abandonAfterCancellationPendingFor = null); /// /// Start the script. @@ -59,6 +61,16 @@ Task StartScript(ExecuteScriptCommand command, /// The result, which includes the CommandContext for the next command Task CancelScript(CommandContext commandContext, ITentacleClientTaskLog logger); + /// + /// Abandon a running script. This attempts cancellation, but if necessary leaves the script + /// running in the OS but no longer has Tentacle watching or managing it. + /// + /// The ticket of the script to abandon + /// Used to output user orientated log messages + /// Cancels the RPC call + /// The current status snapshot of the script at the time abandon was processed + Task AbandonScript(ScriptTicket scriptTicket, ITentacleClientTaskLog logger, CancellationToken cancellationToken); + /// /// Complete the script. /// diff --git a/source/Octopus.Tentacle.Client/ScriptExecutor.cs b/source/Octopus.Tentacle.Client/ScriptExecutor.cs index 572db403a..1ce5e1f2c 100644 --- a/source/Octopus.Tentacle.Client/ScriptExecutor.cs +++ b/source/Octopus.Tentacle.Client/ScriptExecutor.cs @@ -69,7 +69,15 @@ public async Task CancelScript(CommandContext co return await scriptExecutor.CancelScript(commandContext); } - + + public async Task AbandonScript(CommandContext commandContext) + { + var scriptExecutorFactory = CreateScriptExecutorFactory(); + var scriptExecutor = scriptExecutorFactory.CreateScriptExecutor(commandContext.ScripServiceVersionUsed); + + return await scriptExecutor.AbandonScript(commandContext); + } + public async Task CompleteScript(CommandContext commandContext, CancellationToken cancellationToken) { var scriptExecutorFactory = CreateScriptExecutorFactory(); diff --git a/source/Octopus.Tentacle.Client/Scripts/IScriptExecutor.cs b/source/Octopus.Tentacle.Client/Scripts/IScriptExecutor.cs index 0544ef9ed..752b5bbbc 100644 --- a/source/Octopus.Tentacle.Client/Scripts/IScriptExecutor.cs +++ b/source/Octopus.Tentacle.Client/Scripts/IScriptExecutor.cs @@ -32,6 +32,13 @@ Task StartScript(ExecuteScriptCommand command, /// The result, which includes the CommandContext for the next command Task CancelScript(CommandContext commandContext); + /// + /// Abandon the script. Signals Tentacle to stop waiting for the script to cancel and make the tentacle + /// available to run more scripts with the same isolation mutex. + /// + /// The CommandContext from the previous command + Task AbandonScript(CommandContext commandContext); + /// /// Complete the script. /// diff --git a/source/Octopus.Tentacle.Client/Scripts/KubernetesScriptServiceV1Executor.cs b/source/Octopus.Tentacle.Client/Scripts/KubernetesScriptServiceV1Executor.cs index 374084511..77990132a 100644 --- a/source/Octopus.Tentacle.Client/Scripts/KubernetesScriptServiceV1Executor.cs +++ b/source/Octopus.Tentacle.Client/Scripts/KubernetesScriptServiceV1Executor.cs @@ -185,6 +185,12 @@ async Task CancelScriptAction(CancellationToke return Map(kubernetesScriptStatusResponseV1); } + public Task AbandonScript(CommandContext commandContext) + // KubernetesScriptServiceV1 has no abandon verb. The orchestrator only escalates to abandon + // when the Tentacle advertised the abandon capability (K8s agents never do), so it won't + // escalate here; reaching this is a bug. + => throw new NotSupportedException("KubernetesScriptServiceV1 cannot abandon a script; it has no abandon verb. Cancel the script instead."); + public async Task CompleteScript(CommandContext lastStatusResponse, CancellationToken scriptExecutionCancellationToken) { using var activity = TentacleClient.ActivitySource.StartActivity($"{nameof(KubernetesScriptServiceV1Executor)}.{nameof(CompleteScript)}"); diff --git a/source/Octopus.Tentacle.Client/Scripts/ObservingScriptOrchestrator.cs b/source/Octopus.Tentacle.Client/Scripts/ObservingScriptOrchestrator.cs index 209941900..9be49b13f 100644 --- a/source/Octopus.Tentacle.Client/Scripts/ObservingScriptOrchestrator.cs +++ b/source/Octopus.Tentacle.Client/Scripts/ObservingScriptOrchestrator.cs @@ -14,17 +14,20 @@ sealed class ObservingScriptOrchestrator readonly OnScriptStatusResponseReceived onScriptStatusResponseReceived; readonly OnScriptCompleted onScriptCompleted; readonly IScriptExecutor scriptExecutor; + readonly TimeSpan? abandonAfterCancellationPendingFor; public ObservingScriptOrchestrator( IScriptObserverBackoffStrategy scriptObserverBackOffStrategy, OnScriptStatusResponseReceived onScriptStatusResponseReceived, OnScriptCompleted onScriptCompleted, - IScriptExecutor scriptExecutor) + IScriptExecutor scriptExecutor, + TimeSpan? abandonAfterCancellationPendingFor = null) { this.scriptExecutor = scriptExecutor; this.scriptObserverBackOffStrategy = scriptObserverBackOffStrategy; this.onScriptStatusResponseReceived = onScriptStatusResponseReceived; this.onScriptCompleted = onScriptCompleted; + this.abandonAfterCancellationPendingFor = abandonAfterCancellationPendingFor; } public async Task ExecuteScript(ExecuteScriptCommand command, CancellationToken scriptExecutionCancellationToken) @@ -80,12 +83,28 @@ async Task ObserveUntilComplete( var iteration = 0; var cancellationIteration = 0; var lastResult = startScriptResult; + var stopwatch = new Stopwatch(); while (lastResult.ScriptStatus.State != ProcessState.Complete) { if (scriptExecutionCancellationToken.IsCancellationRequested) { - lastResult = await scriptExecutor.CancelScript(lastResult.ContextForNextCommand).ConfigureAwait(false); + // Record when cancellation first fired so we can escalate to abandon after the threshold. + if (!stopwatch.IsRunning) + { + stopwatch.Start(); + } + + // Only escalate to abandon when the Tentacle advertised the abandon capability. Old V2 + // Tentacles (pre-abandon) and V1/Kubernetes don't, so we keep cancelling rather than + // calling a verb they don't have. + var shouldAbandon = lastResult.ContextForNextCommand.ScripServiceVersionUsed.SupportsAbandon + && abandonAfterCancellationPendingFor.HasValue + && stopwatch.Elapsed >= abandonAfterCancellationPendingFor.Value; + + lastResult = shouldAbandon + ? await scriptExecutor.AbandonScript(lastResult.ContextForNextCommand).ConfigureAwait(false) + : await scriptExecutor.CancelScript(lastResult.ContextForNextCommand).ConfigureAwait(false); } else { diff --git a/source/Octopus.Tentacle.Client/Scripts/ScriptExecutorFactory.cs b/source/Octopus.Tentacle.Client/Scripts/ScriptExecutorFactory.cs index a67824d6f..fa8a2c7b6 100644 --- a/source/Octopus.Tentacle.Client/Scripts/ScriptExecutorFactory.cs +++ b/source/Octopus.Tentacle.Client/Scripts/ScriptExecutorFactory.cs @@ -43,9 +43,10 @@ public IScriptExecutor CreateScriptExecutor(ScriptServiceVersion scriptServiceTo logger); } - if (scriptServiceToUse == ScriptServiceVersion.ScriptServiceVersion2) + if (scriptServiceToUse.IsV2) { return new ScriptServiceV2Executor( + scriptServiceToUse, allClients.ScriptServiceV2, rpcCallExecutor, clientOperationMetricsBuilder, diff --git a/source/Octopus.Tentacle.Client/Scripts/ScriptServiceV1Executor.cs b/source/Octopus.Tentacle.Client/Scripts/ScriptServiceV1Executor.cs index b777b8e1d..0213bcb32 100644 --- a/source/Octopus.Tentacle.Client/Scripts/ScriptServiceV1Executor.cs +++ b/source/Octopus.Tentacle.Client/Scripts/ScriptServiceV1Executor.cs @@ -164,6 +164,11 @@ public async Task CancelScript(CommandContext co return Map(response); } + public Task AbandonScript(CommandContext commandContext) + // ScriptServiceV1 has no abandon verb. The orchestrator only escalates to abandon when the + // Tentacle advertised the abandon capability, so it won't escalate here; reaching this is a bug. + => throw new NotSupportedException("ScriptServiceV1 cannot abandon a script; it has no abandon verb. Cancel the script instead."); + public async Task CompleteScript(CommandContext lastStatusResponse, CancellationToken scriptExecutionCancellationToken) { try diff --git a/source/Octopus.Tentacle.Client/Scripts/ScriptServiceV2Executor.cs b/source/Octopus.Tentacle.Client/Scripts/ScriptServiceV2Executor.cs index c3fc68898..a90338e7d 100644 --- a/source/Octopus.Tentacle.Client/Scripts/ScriptServiceV2Executor.cs +++ b/source/Octopus.Tentacle.Client/Scripts/ScriptServiceV2Executor.cs @@ -3,6 +3,7 @@ using System.Threading.Tasks; using Halibut; using Halibut.ServiceModel; +using Octopus.Tentacle.Client.Capabilities; using Octopus.Tentacle.Client.EventDriven; using Octopus.Tentacle.Client.Execution; using Octopus.Tentacle.Client.Observability; @@ -23,8 +24,10 @@ class ScriptServiceV2Executor : IScriptExecutor readonly TimeSpan onCancellationAbandonCompleteScriptAfter; readonly ITentacleClientTaskLog logger; readonly TentacleClientOptions clientOptions; + readonly ScriptServiceVersion scriptServiceVersion; public ScriptServiceV2Executor( + ScriptServiceVersion scriptServiceVersion, IAsyncClientScriptServiceV2 clientScriptServiceV2, RpcCallExecutor rpcCallExecutor, ClientOperationMetricsBuilder clientOperationMetricsBuilder, @@ -32,6 +35,7 @@ public ScriptServiceV2Executor( TentacleClientOptions clientOptions, ITentacleClientTaskLog logger) { + this.scriptServiceVersion = scriptServiceVersion; this.clientScriptServiceV2 = clientScriptServiceV2; this.rpcCallExecutor = rpcCallExecutor; this.clientOperationMetricsBuilder = clientOperationMetricsBuilder; @@ -58,11 +62,11 @@ StartScriptCommandV2 Map(ExecuteScriptCommand command) shellScriptCommand.Files.ToArray()); } - static ScriptOperationExecutionResult Map(ScriptStatusResponseV2 scriptStatusResponse) + ScriptOperationExecutionResult Map(ScriptStatusResponseV2 scriptStatusResponse) { return new ( new ScriptStatus(scriptStatusResponse.State, scriptStatusResponse.ExitCode, scriptStatusResponse.Logs), - new CommandContext(scriptStatusResponse.Ticket, scriptStatusResponse.NextLogSequence, ScriptServiceVersion.ScriptServiceVersion2)); + new CommandContext(scriptStatusResponse.Ticket, scriptStatusResponse.NextLogSequence, scriptServiceVersion)); } public async Task StartScript(ExecuteScriptCommand executeScriptCommand, @@ -115,7 +119,7 @@ void OnErrorAction(Exception ex) if (!startScriptCallIsConnecting || startScriptCallIsBeingRetried) { // We want to cancel the potentially started script, and wait till it finishes. By returning a result, the outer orchestration will take care of this. - return ScriptOperationExecutionResult.CreateScriptStartedResult(command.ScriptTicket, ScriptServiceVersion.ScriptServiceVersion2); + return ScriptOperationExecutionResult.CreateScriptStartedResult(command.ScriptTicket, scriptServiceVersion); } // If the StartScript call was not in-flight or being retries then we know the script has not started executing on Tentacle @@ -173,6 +177,27 @@ async Task CancelScriptAction(CancellationToken ct) return Map(scriptStatusResponseV2); } + public async Task AbandonScript(CommandContext commandContext) + { + using var activity = TentacleClient.ActivitySource.StartActivity($"{nameof(ScriptServiceV2Executor)}.{nameof(AbandonScript)}"); + + async Task AbandonScriptAction(CancellationToken ct) + { + var request = new AbandonScriptCommandV2(commandContext.ScriptTicket, commandContext.NextLogSequence); + return await clientScriptServiceV2.AbandonScriptAsync(request, new HalibutProxyRequestOptions(ct)); + } + + var scriptStatusResponseV2 = await rpcCallExecutor.Execute( + retriesEnabled: clientOptions.RpcRetrySettings.RetriesEnabled, + RpcCall.Create(nameof(IScriptServiceV2.AbandonScript)), + AbandonScriptAction, + logger, + clientOperationMetricsBuilder, + // Like CancelScript, abandon must not be cancelled — it stops the script on Tentacle. + CancellationToken.None).ConfigureAwait(false); + return Map(scriptStatusResponseV2); + } + public async Task CompleteScript(CommandContext lastStatusResponse, CancellationToken scriptExecutionCancellationToken) { using var activity = TentacleClient.ActivitySource.StartActivity($"{nameof(ScriptServiceV2Executor)}.{nameof(CompleteScript)}"); diff --git a/source/Octopus.Tentacle.Client/Scripts/ScriptServiceVersion.cs b/source/Octopus.Tentacle.Client/Scripts/ScriptServiceVersion.cs index 972256b9b..9091daf3f 100644 --- a/source/Octopus.Tentacle.Client/Scripts/ScriptServiceVersion.cs +++ b/source/Octopus.Tentacle.Client/Scripts/ScriptServiceVersion.cs @@ -1,11 +1,20 @@ -namespace Octopus.Tentacle.Client.Scripts +namespace Octopus.Tentacle.Client.Scripts { public record ScriptServiceVersion(string Value) { public static ScriptServiceVersion ScriptServiceVersion1 = new(nameof(ScriptServiceVersion1)); public static ScriptServiceVersion ScriptServiceVersion2 = new(nameof(ScriptServiceVersion2)); + public static ScriptServiceVersion ScriptServiceVersion2WithAbandon = new(nameof(ScriptServiceVersion2WithAbandon)); public static ScriptServiceVersion KubernetesScriptServiceVersion1 = new(nameof(KubernetesScriptServiceVersion1)); + // True for both V2 variants — they talk to the same ScriptServiceV2; ...WithAbandon just also + // advertised the abandon verb. + public bool IsV2 => Value == nameof(ScriptServiceVersion2) || Value == nameof(ScriptServiceVersion2WithAbandon); + + // Only a V2 Tentacle that advertised the AbandonScript capability supports abandon. Old V2 + // Tentacles predate the verb, so they select ScriptServiceVersion2, not ...WithAbandon. + public bool SupportsAbandon => Value == nameof(ScriptServiceVersion2WithAbandon); + public override string ToString() => Value; } } diff --git a/source/Octopus.Tentacle.Client/Scripts/ScriptServiceVersionSelector.cs b/source/Octopus.Tentacle.Client/Scripts/ScriptServiceVersionSelector.cs index c8d7c57e2..674ab88e7 100644 --- a/source/Octopus.Tentacle.Client/Scripts/ScriptServiceVersionSelector.cs +++ b/source/Octopus.Tentacle.Client/Scripts/ScriptServiceVersionSelector.cs @@ -73,7 +73,11 @@ ScriptServiceVersion DetermineShellScriptServiceVersionToUse(CapabilitiesRespons logger.Verbose(clientOptions.RpcRetrySettings.RetriesEnabled ? $"RPC call retries are enabled. Retry timeout {rpcCallExecutor.RetryTimeout.TotalSeconds} seconds" : "RPC call retries are disabled."); - return ScriptServiceVersion.ScriptServiceVersion2; + // Old V2 Tentacles are V2 but predate the abandon verb; only pick the abandon-capable + // variant when the Tentacle actually advertised AbandonScript. + return tentacleCapabilities.HasAbandonScript() + ? ScriptServiceVersion.ScriptServiceVersion2WithAbandon + : ScriptServiceVersion.ScriptServiceVersion2; } logger.Verbose("RPC call retries are enabled but will not be used for Script Execution as a compatible ScriptService was not found. Please upgrade Tentacle to enable this feature."); diff --git a/source/Octopus.Tentacle.Client/TentacleClient.cs b/source/Octopus.Tentacle.Client/TentacleClient.cs index 8176cccc0..188165cf2 100644 --- a/source/Octopus.Tentacle.Client/TentacleClient.cs +++ b/source/Octopus.Tentacle.Client/TentacleClient.cs @@ -16,6 +16,7 @@ using Octopus.Tentacle.Contracts.Capabilities; using Octopus.Tentacle.Contracts.Logging; using Octopus.Tentacle.Contracts.Observability; +using Octopus.Tentacle.Contracts.ScriptServiceV2; using ITentacleClientObserver = Octopus.Tentacle.Contracts.Observability.ITentacleClientObserver; namespace Octopus.Tentacle.Client @@ -168,7 +169,8 @@ public async Task ExecuteScript(ExecuteScriptCommand exec OnScriptStatusResponseReceived onScriptStatusResponseReceived, OnScriptCompleted onScriptCompleted, ITentacleClientTaskLog logger, - CancellationToken scriptExecutionCancellationToken) + CancellationToken scriptExecutionCancellationToken, + TimeSpan? abandonAfterCancellationPendingFor = null) { using var activity = ActivitySource.StartActivity($"{nameof(TentacleClient)}.{nameof(ExecuteScript)}"); activity?.AddTag("octopus.tentacle.script.files", string.Join(",", executeScriptCommand.Files.Select(f => f.Name))); @@ -188,7 +190,8 @@ public async Task ExecuteScript(ExecuteScriptCommand exec var orchestrator = new ObservingScriptOrchestrator(scriptObserverBackOffStrategy, onScriptStatusResponseReceived, onScriptCompleted, - scriptExecutor); + scriptExecutor, + abandonAfterCancellationPendingFor); var result = await orchestrator.ExecuteScript(executeScriptCommand, scriptExecutionCancellationToken); @@ -260,6 +263,27 @@ public async Task CancelScript(CommandContext co return await scriptExecutor.CancelScript(commandContext); } + public async Task AbandonScript(ScriptTicket scriptTicket, ITentacleClientTaskLog logger, CancellationToken cancellationToken) + { + using var activity = ActivitySource.StartActivity($"{nameof(TentacleClient)}.{nameof(AbandonScript)}"); + activity?.AddTag("octopus.tentacle.script.ticket", scriptTicket.TaskId); + + async Task AbandonScriptAction(CancellationToken ct) + { + var request = new AbandonScriptCommandV2(scriptTicket, lastLogSequence: 0); + return await allClients.ScriptServiceV2.AbandonScriptAsync(request, new HalibutProxyRequestOptions(ct)); + } + + return await rpcCallExecutor.Execute( + retriesEnabled: clientOptions.RpcRetrySettings.RetriesEnabled, + RpcCall.Create(nameof(IScriptServiceV2.AbandonScript)), + AbandonScriptAction, + logger, + // Abandon is a one-shot RPC; like CancelScript we don't track operation metrics for it. + ClientOperationMetricsBuilder.Start(), + cancellationToken).ConfigureAwait(false); + } + public async Task CompleteScript(CommandContext commandContext, ITentacleClientTaskLog logger, CancellationToken scriptExecutionCancellationToken) { using var activity = ActivitySource.StartActivity($"{nameof(TentacleClient)}.{nameof(CompleteScript)}"); diff --git a/source/Octopus.Tentacle.CommonTestUtils/Builders/ScriptBuilder.cs b/source/Octopus.Tentacle.CommonTestUtils/Builders/ScriptBuilder.cs index b238ce154..9b7303d03 100644 --- a/source/Octopus.Tentacle.CommonTestUtils/Builders/ScriptBuilder.cs +++ b/source/Octopus.Tentacle.CommonTestUtils/Builders/ScriptBuilder.cs @@ -58,6 +58,17 @@ public ScriptBuilder CreateFile(string file) return this; } + public ScriptBuilder WritePidToFile(string file) + { + bashScript.AppendLine($@" +echo $$ > '{file}' +"); + windowsScript.AppendLine($@" +$PID | Out-File -FilePath '{file}' -Encoding ASCII +"); + return this; + } + public ScriptBuilder WaitForFileToExist(string fileToWaitFor) { bashScript.AppendLine($@" diff --git a/source/Octopus.Tentacle.Contracts/ClientServices/IAsyncClientScriptServiceV2.cs b/source/Octopus.Tentacle.Contracts/ClientServices/IAsyncClientScriptServiceV2.cs index 996f915c1..0c89048e4 100644 --- a/source/Octopus.Tentacle.Contracts/ClientServices/IAsyncClientScriptServiceV2.cs +++ b/source/Octopus.Tentacle.Contracts/ClientServices/IAsyncClientScriptServiceV2.cs @@ -10,6 +10,7 @@ public interface IAsyncClientScriptServiceV2 Task StartScriptAsync(StartScriptCommandV2 command, HalibutProxyRequestOptions proxyRequestOptions); Task GetStatusAsync(ScriptStatusRequestV2 request, HalibutProxyRequestOptions proxyRequestOptions); Task CancelScriptAsync(CancelScriptCommandV2 command, HalibutProxyRequestOptions proxyRequestOptions); + Task AbandonScriptAsync(AbandonScriptCommandV2 command, HalibutProxyRequestOptions proxyRequestOptions); Task CompleteScriptAsync(CompleteScriptCommandV2 command, HalibutProxyRequestOptions proxyRequestOptions); } } \ No newline at end of file diff --git a/source/Octopus.Tentacle.Contracts/ScriptExitCodes.cs b/source/Octopus.Tentacle.Contracts/ScriptExitCodes.cs index 2e0ce1b15..2319410c4 100644 --- a/source/Octopus.Tentacle.Contracts/ScriptExitCodes.cs +++ b/source/Octopus.Tentacle.Contracts/ScriptExitCodes.cs @@ -12,6 +12,7 @@ public static class ScriptExitCodes public const int UnknownScriptExitCode = -45; public const int UnknownResultExitCode = -46; public const int PowerShellNeverStartedExitCode = -47; + public const int AbandonedExitCode = -48; //Kubernetes Agent public const int KubernetesScriptPodNotFound = -81; diff --git a/source/Octopus.Tentacle.Contracts/ScriptServiceV2/AbandonScriptCommandV2.cs b/source/Octopus.Tentacle.Contracts/ScriptServiceV2/AbandonScriptCommandV2.cs new file mode 100644 index 000000000..66efba446 --- /dev/null +++ b/source/Octopus.Tentacle.Contracts/ScriptServiceV2/AbandonScriptCommandV2.cs @@ -0,0 +1,17 @@ +using System; + +namespace Octopus.Tentacle.Contracts.ScriptServiceV2 +{ + public class AbandonScriptCommandV2 + { + public AbandonScriptCommandV2(ScriptTicket ticket, long lastLogSequence) + { + Ticket = ticket; + LastLogSequence = lastLogSequence; + } + + public ScriptTicket Ticket { get; } + + public long LastLogSequence { get; } + } +} diff --git a/source/Octopus.Tentacle.Contracts/ScriptServiceV2/IScriptServiceV2.cs b/source/Octopus.Tentacle.Contracts/ScriptServiceV2/IScriptServiceV2.cs index 3effc17b8..9858111fa 100644 --- a/source/Octopus.Tentacle.Contracts/ScriptServiceV2/IScriptServiceV2.cs +++ b/source/Octopus.Tentacle.Contracts/ScriptServiceV2/IScriptServiceV2.cs @@ -7,6 +7,7 @@ public interface IScriptServiceV2 ScriptStatusResponseV2 StartScript(StartScriptCommandV2 command); ScriptStatusResponseV2 GetStatus(ScriptStatusRequestV2 request); ScriptStatusResponseV2 CancelScript(CancelScriptCommandV2 command); + ScriptStatusResponseV2 AbandonScript(AbandonScriptCommandV2 command); void CompleteScript(CompleteScriptCommandV2 command); } } \ No newline at end of file diff --git a/source/Octopus.Tentacle.Contracts/Services/Scripts/IAsyncScriptServiceV2.cs b/source/Octopus.Tentacle.Contracts/Services/Scripts/IAsyncScriptServiceV2.cs index c1c9d3118..10b24a752 100644 --- a/source/Octopus.Tentacle.Contracts/Services/Scripts/IAsyncScriptServiceV2.cs +++ b/source/Octopus.Tentacle.Contracts/Services/Scripts/IAsyncScriptServiceV2.cs @@ -12,6 +12,7 @@ public interface IAsyncScriptServiceV2 Task StartScriptAsync(StartScriptCommandV2 command, CancellationToken cancellationToken); Task GetStatusAsync(ScriptStatusRequestV2 request, CancellationToken cancellationToken); Task CancelScriptAsync(CancelScriptCommandV2 command, CancellationToken cancellationToken); + Task AbandonScriptAsync(AbandonScriptCommandV2 command, CancellationToken cancellationToken); Task CompleteScriptAsync(CompleteScriptCommandV2 command, CancellationToken cancellationToken); } } \ No newline at end of file diff --git a/source/Octopus.Tentacle.Core/Services/Scripts/RunningScript.cs b/source/Octopus.Tentacle.Core/Services/Scripts/RunningScript.cs index 412a415e3..bfe4982e4 100644 --- a/source/Octopus.Tentacle.Core/Services/Scripts/RunningScript.cs +++ b/source/Octopus.Tentacle.Core/Services/Scripts/RunningScript.cs @@ -22,18 +22,20 @@ public class RunningScript: IRunningScript readonly IShell shell; readonly string taskId; readonly CancellationToken runningScriptToken; + readonly CancellationToken abandonToken; readonly IReadOnlyDictionary environmentVariables; readonly ILog log; readonly ScriptIsolationMutex scriptIsolationMutex; readonly TimeSpan powerShellStartupTimeout; - public RunningScript(IShell shell, + RunningScript(IShell shell, IScriptWorkspace workspace, IScriptStateStore? stateStore, IScriptLog scriptLog, string taskId, ScriptIsolationMutex scriptIsolationMutex, CancellationToken runningScriptToken, + CancellationToken abandonToken, IReadOnlyDictionary environmentVariables, TimeSpan powerShellStartupTimeout, ILog log @@ -44,6 +46,7 @@ ILog log this.stateStore = stateStore; this.taskId = taskId; this.runningScriptToken = runningScriptToken; + this.abandonToken = abandonToken; this.environmentVariables = environmentVariables; this.log = log; this.scriptIsolationMutex = scriptIsolationMutex; @@ -52,7 +55,7 @@ ILog log this.powerShellStartupTimeout = powerShellStartupTimeout; } - public RunningScript(IShell shell, + RunningScript(IShell shell, IScriptWorkspace workspace, IScriptLog scriptLog, string taskId, @@ -60,10 +63,34 @@ public RunningScript(IShell shell, CancellationToken runningScriptToken, IReadOnlyDictionary environmentVariables, TimeSpan powerShellStartupTimeout, - ILog log) : this(shell, workspace, null, scriptLog, taskId, scriptIsolationMutex, runningScriptToken, environmentVariables, powerShellStartupTimeout, log) + ILog log) : this(shell, workspace, null, scriptLog, taskId, scriptIsolationMutex, runningScriptToken, CancellationToken.None, environmentVariables, powerShellStartupTimeout, log) { } + public static RunningScript Create(IShell shell, + IScriptWorkspace workspace, + IScriptLog scriptLog, + string taskId, + ScriptIsolationMutex scriptIsolationMutex, + CancellationToken runningScriptToken, + IReadOnlyDictionary environmentVariables, + TimeSpan powerShellStartupTimeout, + ILog log) + => new RunningScript(shell, workspace, null, scriptLog, taskId, scriptIsolationMutex, runningScriptToken, CancellationToken.None, environmentVariables, powerShellStartupTimeout, log); + + public static RunningScript CreateAbandonable(IShell shell, + IScriptWorkspace workspace, + IScriptStateStore? stateStore, + IScriptLog scriptLog, + string taskId, + ScriptIsolationMutex scriptIsolationMutex, + CancellationToken runningScriptToken, + CancellationToken abandonToken, + IReadOnlyDictionary environmentVariables, + TimeSpan powerShellStartupTimeout, + ILog log) + => new RunningScript(shell, workspace, stateStore, scriptLog, taskId, scriptIsolationMutex, runningScriptToken, abandonToken, environmentVariables, powerShellStartupTimeout, log); + public ProcessState State { get; private set; } public int ExitCode { get; private set; } @@ -95,15 +122,18 @@ public async Task Execute() RecordScriptHasStarted(writer); exitCode = workspace.ShouldMonitorPowerShellStartup() - ? await RunPowershellScriptWithMonitoring(shellPath, writer, runningScriptToken) - : await RunScriptAsync(shellPath, writer, runningScriptToken); + ? await RunPowershellScriptWithMonitoring(shellPath, writer, runningScriptToken, abandonToken) + : await RunScriptAsync(shellPath, writer, runningScriptToken, abandonToken); } } + // We get here when cancel fires before the script starts running. scriptIsolationMutex.Acquire + // watches runningScriptToken and throws while we're still waiting for the mutex. catch (OperationCanceledException) { writer.WriteOutput(ProcessOutputSource.StdOut, "Script execution canceled."); exitCode = ScriptExitCodes.CanceledExitCode; } + // Fires when acquiring the isolation mutex timed out before the script could start. catch (TimeoutException) { writer.WriteOutput(ProcessOutputSource.StdOut, "Script execution timed out."); @@ -130,7 +160,7 @@ public async Task Execute() } } - async Task RunPowershellScriptWithMonitoring(string shellPath, IScriptLogWriter writer, CancellationToken runningScriptToken) + async Task RunPowershellScriptWithMonitoring(string shellPath, IScriptLogWriter writer, CancellationToken runningScriptToken, CancellationToken abandonToken) { // We want to be able to make some effort to cancel the running script, if the monitor task detects it as hung. // Hence, we make a linked cancellation token with the runningScriptToken @@ -147,7 +177,7 @@ async Task RunPowershellScriptWithMonitoring(string shellPath, IScriptLogWr var monitor = new PowerShellStartupMonitor(workspace.WorkingDirectory, powerShellStartupTimeout, log, taskId); var monitoringTask = monitor.WaitForStartup(monitoringTaskCts.Token); - var scriptTask = Task.Run(async () => await RunScriptAsync(shellPath, writer, scriptTaskCts.Token), scriptTaskCts.Token); + var scriptTask = Task.Run(async () => await RunScriptAsync(shellPath, writer, scriptTaskCts.Token, abandonToken), scriptTaskCts.Token); var completedTask = await Task.WhenAny(monitoringTask, scriptTask); @@ -222,7 +252,7 @@ void RecordScriptHasCompleted(int exitCode) } } - async Task RunScriptAsync(string shellPath, IScriptLogWriter writer, CancellationToken cancellationToken) + async Task RunScriptAsync(string shellPath, IScriptLogWriter writer, CancellationToken cancellationToken, CancellationToken abandon) { try { @@ -234,7 +264,8 @@ async Task RunScriptAsync(string shellPath, IScriptLogWriter writer, Cancel LogScriptOutputTo(writer, ProcessOutputSource.StdOut), LogScriptOutputTo(writer, ProcessOutputSource.StdErr), environmentVariables, - cancel: cancellationToken); + cancellationToken, + abandon); return exitCode; } diff --git a/source/Octopus.Tentacle.Core/Services/Scripts/ScriptServiceV2.cs b/source/Octopus.Tentacle.Core/Services/Scripts/ScriptServiceV2.cs index 0a200fc19..791750c8d 100644 --- a/source/Octopus.Tentacle.Core/Services/Scripts/ScriptServiceV2.cs +++ b/source/Octopus.Tentacle.Core/Services/Scripts/ScriptServiceV2.cs @@ -72,7 +72,7 @@ public async Task StartScriptAsync(StartScriptCommandV2 { IScriptWorkspace workspace; - // If the state already exists then this runningScript is already running/has already run and we should not run it again + // StartScript may be called multiple times for the same ticket (e.g. if server retries the tentacle command), so we must guard against actually starting the script twice. if (runningScript.ScriptStateStore.Exists()) { var state = runningScript.ScriptStateStore.Load(); @@ -103,7 +103,8 @@ public async Task StartScriptAsync(StartScriptCommandV2 command.TaskId, workspace, runningScript.ScriptStateStore, - runningScript.CancellationToken); + runningScript.CancellationToken, + runningScript.AbandonToken); runningScript.Process = process; @@ -140,22 +141,66 @@ public async Task CancelScriptAsync(CancelScriptCommandV return GetResponse(command.Ticket, command.LastLogSequence, runningScript?.Process); } - public async Task CompleteScriptAsync(CompleteScriptCommandV2 command, CancellationToken cancellationToken) + public Task AbandonScriptAsync(AbandonScriptCommandV2 command, CancellationToken cancellationToken) { - await Task.CompletedTask; + // runningScript.Abandon() cancels AbandonToken. That token is awaited only in + // SilentProcessRunner.ExecuteCommandAsync, via Task.WhenAny(waitForExit, WaitForAbandon(abandon)): + // cancelling it resolves that wait, so the runner best-effort-kills the process, returns -48, and + // RunningScript's isolation-mutex `using` unwinds to release the mutex. + if (runningScripts.TryGetValue(command.Ticket, out var runningScript)) + { + runningScript.Abandon(); + } + + return Task.FromResult(GetResponse(command.Ticket, command.LastLogSequence, runningScript?.Process)); + } + public async Task CompleteScriptAsync(CompleteScriptCommandV2 command, CancellationToken cancellationToken) + { if (runningScripts.TryRemove(command.Ticket, out var runningScript)) { runningScript.Dispose(); } var workspace = workspaceFactory.GetWorkspace(command.Ticket, WorkspaceReadinessCheck.Skip); - await workspace.Delete(cancellationToken); + + // For abandoned scripts (see AbandonScriptCommandV2 and + // https://octopus.com/docs/infrastructure/deployment-targets/tentacle/tentacle-script-abandonment) + // the underlying OS process is, by design, still alive + // and unable to be killed by Tentacle. It may still hold open file handles inside + // the workspace (logs being written to, working files, etc.). workspace.Delete() + // will fail in that case on Windows due to sharing violations and may partially + // delete on Linux. We need to tolerate the failure, which will leave the workspace + // on disk to hopefully be cleaned up by another mechanism (manual cleanup, + // instance restart) etc. This is the best we can do. For all other completion paths + // the process has exited and Delete should succeed; surface any failure there. + if (WasAbandoned(workspace)) + { + try + { + await workspace.Delete(cancellationToken); + } + catch (Exception ex) + { + log.Warn(ex, $"Could not delete abandoned workspace at {workspace.WorkingDirectory}. Leaving on disk; the underlying script process may still hold open file handles."); + } + } + else + { + await workspace.Delete(cancellationToken); + } } - RunningScript LaunchShell(ScriptTicket ticket, string serverTaskId, IScriptWorkspace workspace, IScriptStateStore stateStore, CancellationToken cancellationToken) + bool WasAbandoned(IScriptWorkspace workspace) { - var runningScript = new RunningScript(shell, workspace, stateStore, workspace.CreateLog(), serverTaskId, scriptIsolationMutex, cancellationToken, environmentVariables, powerShellStartupTimeout, log); + var stateStore = scriptStateStoreFactory.Create(workspace); + return stateStore.Exists() + && stateStore.Load().ExitCode == ScriptExitCodes.AbandonedExitCode; + } + + RunningScript LaunchShell(ScriptTicket ticket, string serverTaskId, IScriptWorkspace workspace, IScriptStateStore stateStore, CancellationToken cancellationToken, CancellationToken abandonToken) + { + var runningScript = RunningScript.CreateAbandonable(shell, workspace, stateStore, workspace.CreateLog(), serverTaskId, scriptIsolationMutex, cancellationToken, abandonToken, environmentVariables, powerShellStartupTimeout, log); _ = Task.Run(async () => await runningScript.Execute()); return runningScript; } @@ -204,13 +249,14 @@ public bool IsRunningScript(ScriptTicket ticket) class RunningScriptWrapper : IDisposable { - readonly CancellationTokenSource cancellationTokenSource = new (); + readonly CancellationTokenSource cancellationTokenSource = new(); + readonly CancellationTokenSource abandonTokenSource = new(); public RunningScriptWrapper(ScriptStateStore scriptStateStore) { ScriptStateStore = scriptStateStore; - CancellationToken = cancellationTokenSource.Token; + AbandonToken = abandonTokenSource.Token; } public RunningScript? Process { get; set; } @@ -218,15 +264,15 @@ public RunningScriptWrapper(ScriptStateStore scriptStateStore) public SemaphoreSlim StartScriptMutex { get; } = new(1, 1); public CancellationToken CancellationToken { get; } + public CancellationToken AbandonToken { get; } - public void Cancel() - { - cancellationTokenSource.Cancel(); - } + public void Cancel() => cancellationTokenSource.Cancel(); + public void Abandon() => abandonTokenSource.Cancel(); public void Dispose() { cancellationTokenSource.Dispose(); + abandonTokenSource.Dispose(); } } } diff --git a/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs b/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs index 0e57221c6..e12992b3e 100644 --- a/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs +++ b/source/Octopus.Tentacle.Core/Util/CommandLine/SilentProcessRunner.cs @@ -9,7 +9,9 @@ using System.Text; using System.Threading; using System.Threading.Tasks; +using Octopus.Tentacle.Contracts; using Octopus.Tentacle.Core.Diagnostics; +using Octopus.Tentacle.Core.Util; namespace Octopus.Tentacle.Util { @@ -22,12 +24,13 @@ public static Task ExecuteCommandAsync( Action debug, Action info, Action error, - CancellationToken cancel) + CancellationToken cancel, + CancellationToken abandon) { - return ExecuteCommandAsync(executable, arguments, workingDirectory, debug, info, error, customEnvironmentVariables: null, cancel: cancel); + return ExecuteCommandAsync(executable, arguments, workingDirectory, debug, info, error, customEnvironmentVariables: null, cancel: cancel, abandon: abandon); } - public static Task ExecuteCommandAsync( + public static async Task ExecuteCommandAsync( string executable, string arguments, string workingDirectory, @@ -35,7 +38,8 @@ public static Task ExecuteCommandAsync( Action info, Action error, IReadOnlyDictionary? customEnvironmentVariables = null, - CancellationToken cancel = default) + CancellationToken cancel = default, + CancellationToken abandon = default) { if (executable == null) throw new ArgumentNullException(nameof(executable)); @@ -136,15 +140,33 @@ void WriteData(Action action, ManualResetEventSlim resetEvent, DataRecei })) { if (cancel.IsCancellationRequested) - DoOurBestToCleanUp(process, error); + DoOurBestToCleanUp(process, error); process.BeginOutputReadLine(); process.BeginErrorReadLine(); - process.WaitForExit(); + var waitForExit = Task.Run(() => + { + try { process.WaitForExit(); } + catch { /* swallow exceptions thrown when released by Process.Dispose in DoOurBestToCleanUp */ } + }); - SafelyCancelRead(process.CancelErrorRead, debug); - SafelyCancelRead(process.CancelOutputRead, debug); + // Wait for the process to exit, but break if abandon cancallation token fires. + // Cancel kills then Close()s the process (via DoOurBestToCleanUp) so this + // then returns. If the script is not actually stuck when abandon fires, + // abandoning will be ina race with the script to complete + await Task.WhenAny(waitForExit, WaitForAbandon(abandon)).ConfigureAwait(false); + + if (abandon.IsCancellationRequested) + { + DoOurBestToCleanUp(process, error); + info("Tentacle has abandoned this script. The underlying script process may still be running on this host."); + SafelyCancelOutputAndErrorRead(process, debug); + running = false; + return ScriptExitCodes.AbandonedExitCode; + } + + SafelyCancelOutputAndErrorRead(process, debug); SafelyWaitForAllOutput(outputResetEvent, cancel, debug); SafelyWaitForAllOutput(errorResetEvent, cancel, debug); @@ -153,7 +175,7 @@ void WriteData(Action action, ManualResetEventSlim resetEvent, DataRecei debug($"Process {exeFileNameOrFullPath} in {workingDirectory} exited with code {exitCode}"); running = false; - return Task.FromResult(exitCode); + return exitCode; } } } @@ -177,13 +199,20 @@ static int SafelyGetExitCode(Process process) } } + static Task WaitForAbandon(CancellationToken token) + { + var tcs = new TaskCompletionSource(); + token.Register(() => tcs.TrySetResult(null)); + return tcs.Task; + } + static void SafelyWaitForAllOutput(ManualResetEventSlim outputResetEvent, CancellationToken cancel, Action debug) { + //5 seconds is a bit arbitrary, but the process should have already exited by now, so unwise to wait too long try { - //5 seconds is a bit arbitrary, but the process should have already exited by now, so unwise to wait too long outputResetEvent.Wait(TimeSpan.FromSeconds(5), cancel); } catch (OperationCanceledException ex) @@ -192,6 +221,14 @@ static void SafelyWaitForAllOutput(ManualResetEventSlim outputResetEvent, } } + static void SafelyCancelOutputAndErrorRead(Process process, Action debug) + { + // Cancel the readers so a late callback can't write to the workspace log after it's + // disposed (that would throw). Used by both the normal and abandon paths. + SafelyCancelRead(process.CancelErrorRead, debug); + SafelyCancelRead(process.CancelOutputRead, debug); + } + static void SafelyCancelRead(Action action, Action debug) { try @@ -226,15 +263,13 @@ static void DoOurBestToCleanUp(Process process, Action error) { try { - // When cancelling, close the file handles. - // If the child finishes, but the grandchild holds stdout/stderr and never completes, - // THEN we won't issue a kill to the grandchild but will wait for the grandchild to - // close the handles. + // Close the handles after the kill, on this thread. Releases the redirected pipes + // so the WaitForExit above returns even when a re-parented grandchild holds them open. process.Close(); } - catch (Exception ex) + catch (Exception closeException) { - error($"Failed to close process resources: {ex.Message}"); + error($"Failed to close process resources: {closeException.Message}"); } } } @@ -252,11 +287,18 @@ class Hitman { public static void TryKillProcessAndChildrenRecursively(Process process) { + if (!string.IsNullOrEmpty(Environment.GetEnvironmentVariable(EnvironmentVariables.TentacleDebugDisableProcessKill_UNSAFE_FOR_PRODUCTION))) + { + // Test-only no-op: simulate "kill was attempted but didn't terminate the process". + // Only activated when the test harness sets this env var on the Tentacle process. + return; + } + #if NETFRAMEWORK TryKillWindowsProcessAndChildrenRecursively(process.Id); #endif #if !NETFRAMEWORK - // Since .NET Core 3.0 there is support for killing a process and it's children + // Since .NET Core 3.0 there is support for killing a process and it's children process.Kill(true); #endif } diff --git a/source/Octopus.Tentacle.Core/Util/EnvironmentVariables.cs b/source/Octopus.Tentacle.Core/Util/EnvironmentVariables.cs index 2b5f83a59..faf01eb00 100644 --- a/source/Octopus.Tentacle.Core/Util/EnvironmentVariables.cs +++ b/source/Octopus.Tentacle.Core/Util/EnvironmentVariables.cs @@ -29,6 +29,7 @@ public static class EnvironmentVariables public const string TentacleMachineConfigurationHomeDirectory = "TentacleMachineConfigurationHomeDirectory"; public const string TentaclePollingConnectionCount = "TentaclePollingConnectionCount"; public const string TentaclePowerShellStartupTimeout = "TentaclePowerShellStartupTimeout"; + public const string TentacleDebugDisableProcessKill_UNSAFE_FOR_PRODUCTION = "TentacleDebugDisableProcessKill_UNSAFE_FOR_PRODUCTION"; public const string NfsWatchdogDirectory = "watchdog_directory"; public static string TentacleUseTcpNoDelay = "TentacleUseTcpNoDelay"; public static string TentacleUseAsyncListener = "TentacleUseAsyncListener"; diff --git a/source/Octopus.Tentacle.Tests.Integration.Common/Builders/Decorators/ScriptServiceV2DecoratorBuilder.cs b/source/Octopus.Tentacle.Tests.Integration.Common/Builders/Decorators/ScriptServiceV2DecoratorBuilder.cs index 17b499a06..5cd747cf0 100644 --- a/source/Octopus.Tentacle.Tests.Integration.Common/Builders/Decorators/ScriptServiceV2DecoratorBuilder.cs +++ b/source/Octopus.Tentacle.Tests.Integration.Common/Builders/Decorators/ScriptServiceV2DecoratorBuilder.cs @@ -188,6 +188,11 @@ public async Task CancelScriptAsync(CancelScriptCommandV return await cancelScriptFunc(inner, command, options); } + public async Task AbandonScriptAsync(AbandonScriptCommandV2 command, HalibutProxyRequestOptions options) + { + return await inner.AbandonScriptAsync(command, options); + } + public async Task CompleteScriptAsync(CompleteScriptCommandV2 command, HalibutProxyRequestOptions options) { await completeScriptAction(inner, command, options); diff --git a/source/Octopus.Tentacle.Tests.Integration/CapabilitiesServiceV2Test.cs b/source/Octopus.Tentacle.Tests.Integration/CapabilitiesServiceV2Test.cs index 6f9b7300a..208fe0dec 100644 --- a/source/Octopus.Tentacle.Tests.Integration/CapabilitiesServiceV2Test.cs +++ b/source/Octopus.Tentacle.Tests.Integration/CapabilitiesServiceV2Test.cs @@ -30,16 +30,15 @@ public async Task CapabilitiesFromAnOlderTentacleWhichHasNoCapabilitiesService_W capabilities.Should().Contain(nameof(IScriptService)); capabilities.Should().Contain(nameof(IFileTransferService)); - - //all versions have ScriptServiceV1 & IFileTransferService - var expectedCapabilitiesCount = 2; + var expectedCount = 2; if (version.HasScriptServiceV2()) { capabilities.Should().Contain(nameof(IScriptServiceV2)); - expectedCapabilitiesCount++; + expectedCount++; } - - capabilities.Count.Should().Be(expectedCapabilitiesCount); + if (version.HasAbandonScript()) + expectedCount++; + capabilities.Count.Should().Be(expectedCount); } [Test] @@ -54,18 +53,34 @@ public async Task CapabilitiesServiceDoesNotReturnKubernetesScriptServiceForNonK capabilities.Should().Contain(nameof(IScriptService)); capabilities.Should().Contain(nameof(IFileTransferService)); - - //all versions have ScriptServiceV1 & IFileTransferService - var expectedCapabilitiesCount = 2; + var expectedCount = 2; if (version.HasScriptServiceV2()) { capabilities.Should().Contain(nameof(IScriptServiceV2)); - expectedCapabilitiesCount++; + expectedCount++; } + if (version.HasAbandonScript()) + expectedCount++; capabilities.Should().NotContain(nameof(IKubernetesScriptServiceV1)); + capabilities.Count.Should().Be(expectedCount); + } + + [Test] + [TentacleConfigurations(testCapabilitiesServiceVersions: true)] + public async Task AbandonScriptIsAdvertisedOnlyByVersionsThatSupportIt(TentacleConfigurationTestCase tentacleConfigurationTestCase) + { + var version = tentacleConfigurationTestCase.Version; + + await using var clientAndTentacle = await tentacleConfigurationTestCase.CreateLegacyBuilder().Build(CancellationToken); + + var capabilities = (await clientAndTentacle.TentacleClient.CapabilitiesServiceV2.GetCapabilitiesAsync(new(CancellationToken))).SupportedCapabilities; - capabilities.Count.Should().Be(expectedCapabilitiesCount); + // Older Tentacles predate abandon and must not advertise it; this build (and later) must. + if (version.HasAbandonScript()) + capabilities.Should().Contain(nameof(IScriptServiceV2.AbandonScript)); + else + capabilities.Should().NotContain(nameof(IScriptServiceV2.AbandonScript)); } [Test] diff --git a/source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs b/source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs new file mode 100644 index 000000000..608dc8b72 --- /dev/null +++ b/source/Octopus.Tentacle.Tests.Integration/ClientScriptExecutionAbandon.cs @@ -0,0 +1,483 @@ +using System; +using System.Collections.Generic; +using System.Diagnostics; +using System.IO; +using System.Threading.Tasks; +using FluentAssertions; +using NUnit.Framework; +using Octopus.Tentacle.Client; +using Octopus.Tentacle.Client.EventDriven; +using Octopus.Tentacle.Client.Scripts; +using Octopus.Tentacle.Contracts; +using Octopus.Tentacle.Contracts.Logging; +using Octopus.Tentacle.Core.Util; +using Octopus.Tentacle.Tests.Integration.Support; +using Octopus.Tentacle.Tests.Integration.Util; +using Octopus.Tentacle.Tests.Integration.Util.Builders; + +namespace Octopus.Tentacle.Tests.Integration +{ + [IntegrationTestTimeout] + public class ClientScriptExecutionAbandon : IntegrationTest + { + // PIDs of script processes a test spawned. Captured during the test (while the Tentacle's temp dir + // still exists) and force-killed in TearDown so nothing leaks onto the CI agent — including the + // un-killable (kill-disabled) processes, and any process left behind when a test fails early. + readonly List spawnedProcessPids = new(); + + [TearDown] + public void ForceKillSpawnedProcesses() + { + foreach (var pid in spawnedProcessPids) + { + try { using var process = Process.GetProcessById(pid); process.Kill(); } + catch { /* already gone — best effort */ } + } + + spawnedProcessPids.Clear(); + } + + [Test] + [TentacleConfigurations(scriptServiceToTest: ScriptServiceVersionToTest.Version2)] + public async Task AbandonScript_WhenProcessCannotBeKilled_LeavesItRunningAndReturnsAbandonedExitCode(TentacleConfigurationTestCase tentacleConfigurationTestCase) + { + // TentacleDebugDisableProcessKill_UNSAFE_FOR_PRODUCTION=1 makes Hitman a no-op, so CancelScript + // cannot terminate the process — it is genuinely stuck. AbandonScript then returns promptly with + // AbandonedExitCode without waiting, and the un-killable process is left running. + await using var clientTentacle = await tentacleConfigurationTestCase.CreateBuilder() + .WithTentacle(x => x.WithRunTentacleEnvironmentVariable(EnvironmentVariables.TentacleDebugDisableProcessKill_UNSAFE_FOR_PRODUCTION, "1")) + .Build(CancellationToken); + + var startFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "start"); + var releaseFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "release"); + var pidFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "pid"); + + var command = new TestExecuteShellScriptCommandBuilder() + .SetScriptBody(new ScriptBuilder() + .WritePidToFile(pidFile) + .CreateFile(startFile) + .WaitForFileToExist(releaseFile)) + .WithIsolationLevel(ScriptIsolationLevel.NoIsolation) + .Build(); + + var client = clientTentacle.TentacleClient; + var log = Log(); + + // Explicit StartScript (not ExecuteScript) so we drive cancel and abandon ourselves. + var startResult = await client.StartScript(command, StartScriptIsBeingReAttempted.FirstAttempt, log, CancellationToken); + + await Wait.For(() => File.Exists(startFile), + TimeSpan.FromSeconds(30), + () => throw new Exception("Script did not start"), + CancellationToken); + + TrackSpawnedProcess(pidFile); + + // Cancel: Hitman is a no-op, so the process keeps running. + var afterCancel = await client.CancelScript(startResult.ContextForNextCommand, log); + + // Abandon: returns the abandoned terminal state without waiting for the still-running process. + await client.AbandonScript(command.ScriptTicket, log, CancellationToken); + + var finalResult = await RunStatusUntilComplete(client, afterCancel.ContextForNextCommand, log); + finalResult.ScriptStatus.ExitCode.Should().Be(ScriptExitCodes.AbandonedExitCode); + + // The process was never killed (kill disabled), so abandon left it running. + AssertProcessIsRunning(pidFile); + + // Release the still-running script, then complete it so the workspace is cleaned up. + File.WriteAllText(releaseFile, ""); + await client.CompleteScript(finalResult.ContextForNextCommand, log, CancellationToken); + } + + [Test] + [TentacleConfigurations(scriptServiceToTest: ScriptServiceVersionToTest.Version2)] + public async Task AbandonScript_WithNoPriorCancel_KillsTheProcessAndReturnsAbandonedExitCode(TentacleConfigurationTestCase tentacleConfigurationTestCase) + { + // Anti-abuse: a direct AbandonScript with no prior CancelScript must still attempt the kill. Kill is + // NOT disabled here, so the abandon branch's best-effort kill actually terminates the process. + await using var clientTentacle = await tentacleConfigurationTestCase.CreateBuilder() + .Build(CancellationToken); + + var startFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "start"); + var releaseFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "release"); + var pidFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "pid"); + + var command = new TestExecuteShellScriptCommandBuilder() + .SetScriptBody(new ScriptBuilder() + .WritePidToFile(pidFile) + .CreateFile(startFile) + .WaitForFileToExist(releaseFile)) + .WithIsolationLevel(ScriptIsolationLevel.NoIsolation) + .Build(); + + var client = clientTentacle.TentacleClient; + var log = Log(); + + var startResult = await client.StartScript(command, StartScriptIsBeingReAttempted.FirstAttempt, log, CancellationToken); + + await Wait.For(() => File.Exists(startFile), + TimeSpan.FromSeconds(30), + () => throw new Exception("Script did not start"), + CancellationToken); + + TrackSpawnedProcess(pidFile); + + // Direct abandon, NO prior cancel. We never write releaseFile to allow the script to complete, + // so it only completes because the abandon branch killed it. + await client.AbandonScript(command.ScriptTicket, log, CancellationToken); + + var finalResult = await RunStatusUntilComplete(client, startResult.ContextForNextCommand, log); + finalResult.ScriptStatus.ExitCode.Should().Be(ScriptExitCodes.AbandonedExitCode); + + // Kill was enabled, so abandon's best-effort kill landed and the process is gone. + AssertProcessExits(pidFile, TimeSpan.FromSeconds(10)); + + await client.CompleteScript(finalResult.ContextForNextCommand, log, CancellationToken); + } + + [Test] + [TentacleConfigurations(scriptServiceToTest: ScriptServiceVersionToTest.Version2)] + public async Task AbandonScript_ReleasesIsolationMutexEvenWhileProcessIsStillRunning(TentacleConfigurationTestCase tentacleConfigurationTestCase) + { + // The whole reason Tentacle needs an abandon RPC is to release the isolation mutex when CancelScript + // can't unstick the script. A FullIsolation script gets stuck (kill disabled), abandon is called, and + // a second FullIsolation script with the same mutex name must then be able to acquire the mutex and run. + await using var clientTentacle = await tentacleConfigurationTestCase.CreateBuilder() + .WithTentacle(x => x.WithRunTentacleEnvironmentVariable(EnvironmentVariables.TentacleDebugDisableProcessKill_UNSAFE_FOR_PRODUCTION, "1")) + .Build(CancellationToken); + + var startFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "start"); + var releaseFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "release"); + var pidFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "pid"); + const string sharedMutex = "abandon-test-mutex"; + + var firstCommand = new TestExecuteShellScriptCommandBuilder() + .SetScriptBody(new ScriptBuilder() + .WritePidToFile(pidFile) + .CreateFile(startFile) + .WaitForFileToExist(releaseFile)) + .WithIsolationLevel(ScriptIsolationLevel.FullIsolation) + .WithIsolationMutexName(sharedMutex) + .Build(); + + var client = clientTentacle.TentacleClient; + var log = Log(); + + var startResult = await client.StartScript(firstCommand, StartScriptIsBeingReAttempted.FirstAttempt, log, CancellationToken); + + await Wait.For(() => File.Exists(startFile), + TimeSpan.FromSeconds(30), + () => throw new Exception("First script did not start"), + CancellationToken); + + TrackSpawnedProcess(pidFile); + + var afterCancel = await client.CancelScript(startResult.ContextForNextCommand, log); + await client.AbandonScript(firstCommand.ScriptTicket, log, CancellationToken); + + // Second FullIsolation script with the SAME mutex name. If abandon released the mutex, this script + // can acquire it and run to completion. Otherwise it blocks behind the still-alive first script. + var secondStartFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "second-start"); + var secondCommand = new TestExecuteShellScriptCommandBuilder() + .SetScriptBody(new ScriptBuilder().CreateFile(secondStartFile)) + .WithIsolationLevel(ScriptIsolationLevel.FullIsolation) + .WithIsolationMutexName(sharedMutex) + .Build(); + + var (secondResult, _) = await client.ExecuteScript(secondCommand, CancellationToken); + secondResult.ExitCode.Should().Be(0); + File.Exists(secondStartFile).Should().BeTrue("second script should have run after the mutex was released"); + + // Release the still-running first script, then complete it. + File.WriteAllText(releaseFile, ""); + var firstFinal = await RunStatusUntilComplete(client, afterCancel.ContextForNextCommand, log); + await client.CompleteScript(firstFinal.ContextForNextCommand, log, CancellationToken); + } + + [Test] + [TentacleConfigurations(scriptServiceToTest: ScriptServiceVersionToTest.Version2)] + public async Task ExecuteScript_WhenCancellationStaysPending_EscalatesToAbandon(TentacleConfigurationTestCase tentacleConfigurationTestCase) + { + // Kill is disabled, so the script is genuinely stuck and CancelScript can't end it. With + // abandonAfterCancellationPendingFor set short, the orchestrator escalates from cancel to abandon and + // the run terminates (throws OperationCanceledException) instead of hanging on the stuck script. The + // mutex-release outcome of that escalation is its own test, ..._AndReleasesTheIsolationMutex below. + await using var clientTentacle = await tentacleConfigurationTestCase.CreateBuilder() + .WithTentacle(x => x.WithRunTentacleEnvironmentVariable(EnvironmentVariables.TentacleDebugDisableProcessKill_UNSAFE_FOR_PRODUCTION, "1")) + .Build(CancellationToken); + + var startFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "start"); + var releaseFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "release"); + var pidFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "pid"); + + var stuckCommand = new TestExecuteShellScriptCommandBuilder() + .SetScriptBody(new ScriptBuilder() + .WritePidToFile(pidFile) + .CreateFile(startFile) + .WaitForFileToExist(releaseFile)) + .WithIsolationLevel(ScriptIsolationLevel.NoIsolation) + .Build(); + + var client = clientTentacle.TentacleClient; + + using var executionCts = new System.Threading.CancellationTokenSource(); + var stuckExecution = Task.Run(async () => await client.ExecuteScript( + stuckCommand, + _ => { }, + _ => Task.CompletedTask, + Log(), + executionCts.Token, + abandonAfterCancellationPendingFor: TimeSpan.FromSeconds(2))); + + await Wait.For(() => File.Exists(startFile), + TimeSpan.FromSeconds(30), + () => throw new Exception("Script did not start"), + CancellationToken); + + TrackSpawnedProcess(pidFile); + + // Cancel stays pending (kill disabled), so after the threshold the orchestrator escalates to abandon + // and the run ends. + executionCts.Cancel(); + await FluentActions.Invoking(async () => await stuckExecution).Should().ThrowAsync(); + + File.WriteAllText(releaseFile, ""); + } + + [Test] + [TentacleConfigurations(scriptServiceToTest: ScriptServiceVersionToTest.Version2)] + public async Task ExecuteScript_WhenCancellationStaysPending_EscalatesToAbandon_AndReleasesTheIsolationMutex(TentacleConfigurationTestCase tentacleConfigurationTestCase) + { + // The mutex-release outcome of the escalation in ..._EscalatesToAbandon above: once the orchestrator + // escalates a stuck FullIsolation script to abandon, the isolation mutex is freed, so a second + // FullIsolation script with the same mutex name can run. (Explicit-AbandonScript equivalent: + // AbandonScript_ReleasesIsolationMutexEvenWhileProcessIsStillRunning.) + await using var clientTentacle = await tentacleConfigurationTestCase.CreateBuilder() + .WithTentacle(x => x.WithRunTentacleEnvironmentVariable(EnvironmentVariables.TentacleDebugDisableProcessKill_UNSAFE_FOR_PRODUCTION, "1")) + .Build(CancellationToken); + + var startFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "start"); + var releaseFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "release"); + var pidFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "pid"); + const string sharedMutex = "escalation-mutex-release-test"; + + var stuckCommand = new TestExecuteShellScriptCommandBuilder() + .SetScriptBody(new ScriptBuilder() + .WritePidToFile(pidFile) + .CreateFile(startFile) + .WaitForFileToExist(releaseFile)) + .WithIsolationLevel(ScriptIsolationLevel.FullIsolation) + .WithIsolationMutexName(sharedMutex) + .Build(); + + var client = clientTentacle.TentacleClient; + + using var executionCts = new System.Threading.CancellationTokenSource(); + var stuckExecution = Task.Run(async () => await client.ExecuteScript( + stuckCommand, + _ => { }, + _ => Task.CompletedTask, + Log(), + executionCts.Token, + abandonAfterCancellationPendingFor: TimeSpan.FromSeconds(2))); + + await Wait.For(() => File.Exists(startFile), + TimeSpan.FromSeconds(30), + () => throw new Exception("Script did not start"), + CancellationToken); + + TrackSpawnedProcess(pidFile); + + // Cancel; this stays pending (kill disabled), so after the threshold the orchestrator escalates to + // abandon. We don't assert the run's outcome here — that's _EscalatesToAbandon's job. This test only + // proves the mutex was released, by the second script being able to run. + executionCts.Cancel(); + + // A second FullIsolation script with the same mutex can only run once abandon released the mutex. + var secondStartFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "second-start"); + var secondCommand = new TestExecuteShellScriptCommandBuilder() + .SetScriptBody(new ScriptBuilder().CreateFile(secondStartFile)) + .WithIsolationLevel(ScriptIsolationLevel.FullIsolation) + .WithIsolationMutexName(sharedMutex) + .Build(); + + var (secondResult, _) = await client.ExecuteScript(secondCommand, CancellationToken); + secondResult.ExitCode.Should().Be(0); + File.Exists(secondStartFile).Should().BeTrue("escalation to abandon should have released the mutex"); + + // Cleanup: release the stuck script and observe the cancelled run. + File.WriteAllText(releaseFile, ""); + try { await stuckExecution; } catch (OperationCanceledException) { } + } + + [Test] + [TentacleConfigurations(scriptServiceToTest: ScriptServiceVersionToTest.Version1)] + public async Task ExecuteScript_OnV1Tentacle_DoesNotEscalateToNotSupportedAbandon(TentacleConfigurationTestCase tentacleConfigurationTestCase) + { + // A V1 Tentacle has no abandon verb. If the orchestrator wrongly escalated, the V1 executor's + // AbandonScript throws NotSupportedException. Asserting the run does NOT throw NotSupportedException + // proves the gate held and we never escalated. + await using var clientTentacle = await tentacleConfigurationTestCase.CreateBuilder() + .WithTentacle(x => x.WithRunTentacleEnvironmentVariable(EnvironmentVariables.TentacleDebugDisableProcessKill_UNSAFE_FOR_PRODUCTION, "1")) + .Build(CancellationToken); + + var startFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "start"); + var releaseFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "release"); + var pidFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "pid"); + + var command = new TestExecuteShellScriptCommandBuilder() + .SetScriptBody(new ScriptBuilder() + .WritePidToFile(pidFile) + .CreateFile(startFile) + .WaitForFileToExist(releaseFile)) + .WithIsolationLevel(ScriptIsolationLevel.NoIsolation) + .Build(); + + var client = clientTentacle.TentacleClient; + + using var executionCts = new System.Threading.CancellationTokenSource(); + var execution = Task.Run(async () => await client.ExecuteScript( + command, + _ => { }, + _ => Task.CompletedTask, + Log(), + executionCts.Token, + abandonAfterCancellationPendingFor: TimeSpan.FromSeconds(1))); + + await Wait.For(() => File.Exists(startFile), + TimeSpan.FromSeconds(30), + () => throw new Exception("Script did not start"), + CancellationToken); + + TrackSpawnedProcess(pidFile); + + executionCts.Cancel(); + + // Well past the 1s threshold: if the V1 gate were broken the orchestrator would have escalated to + // AbandonScript (which throws NotSupportedException) by now. + await Task.Delay(TimeSpan.FromSeconds(3)); + + // Release so the stuck script can finish and the run can unwind. + File.WriteAllText(releaseFile, ""); + + await FluentActions.Invoking(async () => await execution).Should().NotThrowAsync(); + } + + [Test] + [TentacleConfigurations(scriptServiceToTest: ScriptServiceVersionToTest.Version2)] + public async Task AbandonScript_WhenAScriptIsWaitingOnAMutex_AndIsCancelled_ReturnsCancelledNotAbandoned(TentacleConfigurationTestCase tentacleConfigurationTestCase) + { + // End-to-end counterpart to ScriptServiceV2Fixture.AbandonScript_WhileWaitingOnTheIsolationMutex_ExitsCancelled. + // A script still waiting on the isolation mutex never started a process, so when it is cancelled (and + // abandoned) it exits cancelled, not abandoned. Abandon has nothing to act on for a queued script. + await using var clientTentacle = await tentacleConfigurationTestCase.CreateBuilder() + .Build(CancellationToken); + + var holderStartFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "holder-start"); + var holderReleaseFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "holder-release"); + var holderPidFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "holder-pid"); + var waiterRanFile = Path.Combine(clientTentacle.TemporaryDirectory.DirectoryPath, "waiter-ran"); + const string sharedMutex = "queued-cancel-test-mutex"; + + // Holder takes the mutex and holds it (it doesn't need to be stuck), so the waiter must queue behind it. + var holderCommand = new TestExecuteShellScriptCommandBuilder() + .SetScriptBody(new ScriptBuilder() + .WritePidToFile(holderPidFile) + .CreateFile(holderStartFile) + .WaitForFileToExist(holderReleaseFile)) + .WithIsolationLevel(ScriptIsolationLevel.FullIsolation) + .WithIsolationMutexName(sharedMutex) + .Build(); + + var client = clientTentacle.TentacleClient; + var log = Log(); + + var holderStart = await client.StartScript(holderCommand, StartScriptIsBeingReAttempted.FirstAttempt, log, CancellationToken); + + await Wait.For(() => File.Exists(holderStartFile), + TimeSpan.FromSeconds(30), + () => throw new Exception("Holder script did not start"), + CancellationToken); + + TrackSpawnedProcess(holderPidFile); + + // Waiter wants the same mutex, so it blocks acquiring it — its process never starts. + var waiterCommand = new TestExecuteShellScriptCommandBuilder() + .SetScriptBody(new ScriptBuilder().CreateFile(waiterRanFile)) + .WithIsolationLevel(ScriptIsolationLevel.FullIsolation) + .WithIsolationMutexName(sharedMutex) + .Build(); + + var waiterStart = await client.StartScript(waiterCommand, StartScriptIsBeingReAttempted.FirstAttempt, log, CancellationToken); + + // Give the waiter time to reach the mutex queue, then cancel and abandon it. + await Task.Delay(TimeSpan.FromSeconds(2)); + var waiterAfterCancel = await client.CancelScript(waiterStart.ContextForNextCommand, log); + await client.AbandonScript(waiterCommand.ScriptTicket, log, CancellationToken); + + var waiterFinal = await RunStatusUntilComplete(client, waiterAfterCancel.ContextForNextCommand, log); + waiterFinal.ScriptStatus.ExitCode.Should().Be(ScriptExitCodes.CanceledExitCode); + File.Exists(waiterRanFile).Should().BeFalse("a script blocked on the isolation mutex never starts its process"); + + await client.CompleteScript(waiterFinal.ContextForNextCommand, log, CancellationToken); + + // Cleanup: release the holder, then complete it. + File.WriteAllText(holderReleaseFile, ""); + var holderFinal = await RunStatusUntilComplete(client, holderStart.ContextForNextCommand, log); + await client.CompleteScript(holderFinal.ContextForNextCommand, log, CancellationToken); + } + + void TrackSpawnedProcess(string pidFile) + { + if (File.Exists(pidFile) && int.TryParse(File.ReadAllText(pidFile).Trim(), out var pid) && pid > 0) + spawnedProcessPids.Add(pid); + } + + static ITentacleClientTaskLog Log() + => new SerilogLoggerBuilder().Build().ForContext().ToITentacleTaskLog(); + + async Task RunStatusUntilComplete(ITentacleClient client, CommandContext context, ITentacleClientTaskLog log) + { + var result = await client.GetStatus(context, log, CancellationToken); + while (result.ScriptStatus.State != ProcessState.Complete) + { + await Task.Delay(TimeSpan.FromMilliseconds(200)); + result = await client.GetStatus(result.ContextForNextCommand, log, CancellationToken); + } + + return result; + } + + static void AssertProcessIsRunning(string pidFile) + { + File.Exists(pidFile).Should().BeTrue($"the script should have written its PID to '{pidFile}'"); + int.TryParse(File.ReadAllText(pidFile).Trim(), out var pid).Should().BeTrue("a valid PID should have been written"); + IsProcessRunning(pid).Should().BeTrue($"abandon leaves an un-killable process running, so PID {pid} should still be alive"); + } + + static void AssertProcessExits(string pidFile, TimeSpan timeout) + { + File.Exists(pidFile).Should().BeTrue($"the script should have written its PID to '{pidFile}'"); + int.TryParse(File.ReadAllText(pidFile).Trim(), out var pid).Should().BeTrue("a valid PID should have been written"); + + // abandon's best-effort kill is asynchronous, so give it a moment to actually terminate. + var deadline = DateTime.UtcNow + timeout; + while (DateTime.UtcNow < deadline && IsProcessRunning(pid)) + System.Threading.Thread.Sleep(100); + + IsProcessRunning(pid).Should().BeFalse($"abandon best-effort-kills a killable process, so PID {pid} should be gone"); + } + + static bool IsProcessRunning(int pid) + { + try + { + using var process = Process.GetProcessById(pid); + return !process.HasExited; + } + catch (ArgumentException) { return false; } // not running + catch (InvalidOperationException) { return false; } // exited + } + } +} diff --git a/source/Octopus.Tentacle.Tests.Integration/Support/TentacleVersions.cs b/source/Octopus.Tentacle.Tests.Integration/Support/TentacleVersions.cs index 3edf7984d..806a436b9 100644 --- a/source/Octopus.Tentacle.Tests.Integration/Support/TentacleVersions.cs +++ b/source/Octopus.Tentacle.Tests.Integration/Support/TentacleVersions.cs @@ -78,5 +78,13 @@ public static bool HasScriptServiceV2(this Version? version) { return version == TentacleVersions.Current || version >= TentacleVersions.v7_1_189_SyncHalibutAndScriptServiceV2; } + + public static bool HasAbandonScript(this Version? version) + { + // Abandon ships in this build and no released version has it yet, so only Current advertises it. + // When it ships in a release, pin that version in TentacleVersions and make this + // `version == Current || version >= v`, the same shape as HasScriptServiceV2. + return version == TentacleVersions.Current; + } } } \ No newline at end of file diff --git a/source/Octopus.Tentacle.Tests.Integration/Support/TestDecoratorsAreCalledInTheCorrectOrder.cs b/source/Octopus.Tentacle.Tests.Integration/Support/TestDecoratorsAreCalledInTheCorrectOrder.cs index 09cbc58fb..fc3fd090b 100644 --- a/source/Octopus.Tentacle.Tests.Integration/Support/TestDecoratorsAreCalledInTheCorrectOrder.cs +++ b/source/Octopus.Tentacle.Tests.Integration/Support/TestDecoratorsAreCalledInTheCorrectOrder.cs @@ -105,6 +105,11 @@ public Task CancelScriptAsync(CancelScriptCommandV2 comm throw new NotImplementedException(); } + public Task AbandonScriptAsync(AbandonScriptCommandV2 command, HalibutProxyRequestOptions proxyRequestOptions) + { + throw new NotImplementedException(); + } + public async Task CompleteScriptAsync(CompleteScriptCommandV2 command, HalibutProxyRequestOptions proxyRequestOptions) { await Task.CompletedTask; diff --git a/source/Octopus.Tentacle.Tests.Integration/Util/Builders/TentacleClientExtensionMethods.cs b/source/Octopus.Tentacle.Tests.Integration/Util/Builders/TentacleClientExtensionMethods.cs index c0975c788..bf6ba7343 100644 --- a/source/Octopus.Tentacle.Tests.Integration/Util/Builders/TentacleClientExtensionMethods.cs +++ b/source/Octopus.Tentacle.Tests.Integration/Util/Builders/TentacleClientExtensionMethods.cs @@ -5,10 +5,12 @@ using System.Threading.Tasks; using Halibut; using Octopus.Tentacle.Client; +using Octopus.Tentacle.Client.EventDriven; using Octopus.Tentacle.Client.Scripts; using Octopus.Tentacle.Client.Scripts.Models; using Octopus.Tentacle.Contracts; using Octopus.Tentacle.Contracts.Logging; +using Octopus.Tentacle.Contracts.ScriptServiceV2; using Octopus.Tentacle.Tests.Integration.Support; using Octopus.Tentacle.Tests.Integration.Support.ExtensionMethods; @@ -56,6 +58,42 @@ public static async Task UploadFile( return result; } + public static async Task AbandonScript( + this TentacleClient tentacleClient, + ScriptTicket scriptTicket, + CancellationToken token, + ITentacleClientTaskLog? log = null) + { + return await tentacleClient.AbandonScript(scriptTicket, + new SerilogLoggerBuilder().Build().ForContext().ToITentacleTaskLog().Chain(log), + token).ConfigureAwait(false); + } + + public static async Task CancelScript( + this TentacleClient tentacleClient, + ScriptTicket scriptTicket, + ITentacleClientTaskLog? log = null) + { + var commandContext = new CommandContext(scriptTicket, 0, ScriptServiceVersion.ScriptServiceVersion2); + var result = await tentacleClient.CancelScript(commandContext, + new SerilogLoggerBuilder().Build().ForContext().ToITentacleTaskLog().Chain(log)) + .ConfigureAwait(false); + return result.ScriptStatus; + } + + public static async Task GetStatus( + this TentacleClient tentacleClient, + ScriptTicket scriptTicket, + CancellationToken token, + ITentacleClientTaskLog? log = null) + { + var commandContext = new CommandContext(scriptTicket, 0, ScriptServiceVersion.ScriptServiceVersion2); + var result = await tentacleClient.GetStatus(commandContext, + new SerilogLoggerBuilder().Build().ForContext().ToITentacleTaskLog().Chain(log), + token).ConfigureAwait(false); + return result.ScriptStatus; + } + public static async Task DownloadFile( this TentacleClient tentacleClient, string remotePath, diff --git a/source/Octopus.Tentacle.Tests.Integration/Util/RunningScriptFixture.cs b/source/Octopus.Tentacle.Tests.Integration/Util/RunningScriptFixture.cs index c0e3b05ad..2c6b9ceec 100644 --- a/source/Octopus.Tentacle.Tests.Integration/Util/RunningScriptFixture.cs +++ b/source/Octopus.Tentacle.Tests.Integration/Util/RunningScriptFixture.cs @@ -67,7 +67,7 @@ public void SetUpLocal() scriptLog = new TestScriptLog(); cancellationTokenSource = new CancellationTokenSource(TimeSpan.FromSeconds(10)); scriptIsolationMutex = new ScriptIsolationMutex(); - runningScript = new RunningScript(shell, + runningScript = RunningScript.Create(shell, workspace, scriptLog, taskId, @@ -169,7 +169,7 @@ public async Task CancellationToken_ShouldKillTheProcess() ? (new PowerShell(), "Start-Sleep -seconds") : (new Bash() as IShell, "sleep"); - var script = new RunningScript(shell, + var script = RunningScript.Create(shell, workspace, scriptLog, taskId, diff --git a/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs b/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs index 44125a578..71937ed4f 100644 --- a/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs +++ b/source/Octopus.Tentacle.Tests.Integration/Util/SilentProcessRunnerFixture.cs @@ -7,6 +7,7 @@ using FluentAssertions; using NUnit.Framework; using Octopus.Tentacle.CommonTestUtils; +using Octopus.Tentacle.Contracts; using Octopus.Tentacle.Tests.Integration.Support; using Octopus.Tentacle.Tests.Integration.Support.TestAttributes; using Octopus.Tentacle.Util; @@ -196,7 +197,7 @@ public async Task CancellationToken_WhenGrandchildHoldsRedirectedPipes_ShouldNot cts.Token)); // Wait for the grandchild to actually be spawned before cancelling - await WaitForGrandchildSpawnAsync(grandchildPidFile, TimeSpan.FromSeconds(60)); + await WaitForPidFileAsync(grandchildPidFile, TimeSpan.FromSeconds(60)); var sw = Stopwatch.StartNew(); cts.Cancel(); @@ -253,7 +254,7 @@ public async Task CancellationToken_WhenUnixGrandchildHoldsRedirectedPipes_Shoul out _, cts.Token)); - await WaitForGrandchildSpawnAsync(grandchildPidFile, TimeSpan.FromSeconds(30)); + await WaitForPidFileAsync(grandchildPidFile, TimeSpan.FromSeconds(30)); var sw = Stopwatch.StartNew(); cts.Cancel(); @@ -275,7 +276,96 @@ public async Task CancellationToken_WhenUnixGrandchildHoldsRedirectedPipes_Shoul } } - static async Task WaitForGrandchildSpawnAsync(string pidFile, TimeSpan timeout) + [Test] + public async Task AbandonToken_WhenProcessCanBeKilled_KillsTheProcessAndReturnsAbandonedExitCode() + { + // Abandon returns AbandonedExitCode and best-effort-kills the process. Here the process is + // killable, so the kill lands and the process is gone. + using var tempDir = new TemporaryDirectory(); + var pidFile = Path.Combine(tempDir.DirectoryPath, "process.pid"); + + var abandonCommand = PlatformDetection.IsRunningOnWindows ? "powershell.exe" : "/bin/bash"; + var arguments = PlatformDetection.IsRunningOnWindows + ? $"-NoProfile -NonInteractive -Command \"$PID | Out-File -FilePath '{pidFile}' -Encoding ASCII; Start-Sleep -Seconds 300\"" + : $"-c \"echo $$ > '{pidFile}' && sleep 300\""; + + using var cancelCts = new CancellationTokenSource(); + using var abandonCts = new CancellationTokenSource(); + + var infoMessages = new StringBuilder(); + + var task = Task.Run(async () => await SilentProcessRunner.ExecuteCommandAsync( + abandonCommand, + arguments, + Environment.CurrentDirectory, + debug: _ => { }, + info: msg => { lock (infoMessages) infoMessages.AppendLine(msg); }, + error: _ => { }, + customEnvironmentVariables: null, + cancel: cancelCts.Token, + abandon: abandonCts.Token)); + + // Wait deterministically for the process to write its PID before we abandon + await WaitForPidFileAsync(pidFile, TimeSpan.FromSeconds(30)); + abandonCts.Cancel(); + + var exitCode = await task; + exitCode.Should().Be(ScriptExitCodes.AbandonedExitCode); + infoMessages.ToString().Should().Contain("Tentacle has abandoned this script"); + } + + [Test] + [NonParallelizable] + public async Task AbandonToken_WhenProcessCanNotBeKilled_ReturnsAbandonedExitCode() + { + // Simulate a stuck process: TentacleDebugDisableProcessKill makes Hitman a no-op, so abandon's + // best-effort kill can't terminate it. Abandon still returns AbandonedExitCode without waiting and + // the process is left running. That env var is process-wide, so the test is NonParallelizable and + // resets it (and force-kills the leftover process) in finally. + using var tempDir = new TemporaryDirectory(); + var pidFile = Path.Combine(tempDir.DirectoryPath, "process.pid"); + + var abandonCommand = PlatformDetection.IsRunningOnWindows ? "powershell.exe" : "/bin/bash"; + var arguments = PlatformDetection.IsRunningOnWindows + ? $"-NoProfile -NonInteractive -Command \"$PID | Out-File -FilePath '{pidFile}' -Encoding ASCII; Start-Sleep -Seconds 300\"" + : $"-c \"echo $$ > '{pidFile}' && sleep 300\""; + + using var cancelCts = new CancellationTokenSource(); + using var abandonCts = new CancellationTokenSource(); + var infoMessages = new StringBuilder(); + + Environment.SetEnvironmentVariable(Octopus.Tentacle.Core.Util.EnvironmentVariables.TentacleDebugDisableProcessKill_UNSAFE_FOR_PRODUCTION, "1"); + try + { + var task = Task.Run(async () => await SilentProcessRunner.ExecuteCommandAsync( + abandonCommand, + arguments, + Environment.CurrentDirectory, + debug: _ => { }, + info: msg => { lock (infoMessages) infoMessages.AppendLine(msg); }, + error: _ => { }, + customEnvironmentVariables: null, + cancel: cancelCts.Token, + abandon: abandonCts.Token)); + + await WaitForPidFileAsync(pidFile, TimeSpan.FromSeconds(30)); + abandonCts.Cancel(); + + var exitCode = await task; + exitCode.Should().Be(ScriptExitCodes.AbandonedExitCode); + infoMessages.ToString().Should().Contain("Tentacle has abandoned this script"); + + // Kill was a no-op, so abandon left the process running. + AssertProcessIsRunning(pidFile); + } + finally + { + Environment.SetEnvironmentVariable(Octopus.Tentacle.Core.Util.EnvironmentVariables.TentacleDebugDisableProcessKill_UNSAFE_FOR_PRODUCTION, null); + EnsureProcessKilled(pidFile); + } + } + + static async Task WaitForPidFileAsync(string pidFile, TimeSpan timeout) { var deadline = DateTime.UtcNow + timeout; while (DateTime.UtcNow < deadline) @@ -285,8 +375,8 @@ static async Task WaitForGrandchildSpawnAsync(string pidFile, TimeSpan timeout) await Task.Delay(50); } throw new TimeoutException( - $"Test setup failed: the grandchild PID was never written to '{pidFile}'. " + - $"The grandchild-pipe scenario is not being exercised."); + $"Test setup failed: a valid PID was never written to '{pidFile}'. " + + $"The scenario under test is not being exercised."); } static string SafelyReadAllText(string path) @@ -294,16 +384,69 @@ static string SafelyReadAllText(string path) try { return File.ReadAllText(path); } catch { return string.Empty; } } - static void TryKillGrandchild(string pidFile) + static void EnsureProcessKilled(string pidFile) { + // Safety net for tests that spawn a long-lived process. It should already be gone (abandon's + // best-effort kill, or the grandchild reaped). If it's somehow still alive, kill it and confirm + // it died — we'd rather flake loudly here than leak the process onto a CI agent. + if (!File.Exists(pidFile) || !int.TryParse(SafelyReadAllText(pidFile).Trim(), out var pid) || pid <= 0) + return; + try { - if (File.Exists(pidFile) && int.TryParse(SafelyReadAllText(pidFile).Trim(), out var pid)) - { - try { Process.GetProcessById(pid).Kill(); } catch { /* already gone */ } - } + using var process = Process.GetProcessById(pid); + if (process.HasExited) + return; + + process.Kill(); + process.WaitForExit(10_000).Should().BeTrue($"cleanup must terminate PID {pid} so it isn't leaked onto the CI agent"); } - catch { /* ignore */ } + catch (ArgumentException) { /* not running — already gone */ } + catch (InvalidOperationException) { /* exited between lookup and kill */ } + } + + static void AssertProcessExitsWithin(string pidFile, TimeSpan timeout) + { + File.Exists(pidFile).Should().BeTrue($"the test should have written a PID to '{pidFile}'"); + int.TryParse(SafelyReadAllText(pidFile).Trim(), out var pid).Should().BeTrue("a valid PID should have been written"); + + // abandon's best-effort kill is asynchronous, so give it a moment to actually terminate. + var deadline = DateTime.UtcNow + timeout; + while (DateTime.UtcNow < deadline && IsProcessRunning(pid)) + Thread.Sleep(100); + + IsProcessRunning(pid).Should().BeFalse($"abandon best-effort-kills a killable process, so PID {pid} should be gone"); + } + + static void AssertProcessIsRunning(string pidFile) + { + File.Exists(pidFile).Should().BeTrue($"the test should have written a PID to '{pidFile}'"); + int.TryParse(SafelyReadAllText(pidFile).Trim(), out var pid).Should().BeTrue("a valid PID should have been written"); + IsProcessRunning(pid).Should().BeTrue($"abandon leaves an un-killable process running, so PID {pid} should still be alive"); + } + + static bool IsProcessRunning(int pid) + { + try + { + using var process = Process.GetProcessById(pid); + return !process.HasExited; + } + catch (ArgumentException) { return false; } // not running + catch (InvalidOperationException) { return false; } // exited + } + + static void TryKillGrandchild(string pidFile) + { + // Best-effort only. The grandchild is reparented to init, so it is NOT our child: Process.WaitForExit + // can't reliably observe a non-child exiting (and a CI container's PID 1 may not reap the zombie), + // so we can't assert the kill the way EnsureProcessKilled does. The test's real assertion is that + // cancellation returns promptly, not that this cleanup succeeded. + if (!File.Exists(pidFile) || !int.TryParse(SafelyReadAllText(pidFile).Trim(), out var pid) || pid <= 0) + return; + + try { using var process = Process.GetProcessById(pid); process.Kill(); } + catch { /* already gone, or can't be reliably killed/observed — best effort */ } } [Test] diff --git a/source/Octopus.Tentacle.Tests/Capabilities/CapabilitiesServiceV2Fixture.cs b/source/Octopus.Tentacle.Tests/Capabilities/CapabilitiesServiceV2Fixture.cs index 0c6326a05..07df373d8 100644 --- a/source/Octopus.Tentacle.Tests/Capabilities/CapabilitiesServiceV2Fixture.cs +++ b/source/Octopus.Tentacle.Tests/Capabilities/CapabilitiesServiceV2Fixture.cs @@ -6,6 +6,7 @@ using Octopus.Tentacle.Contracts; using Octopus.Tentacle.Contracts.KubernetesScriptServiceV1; using Octopus.Tentacle.Contracts.ScriptServiceV2; +using Octopus.Tentacle.Core.Services.Scripts; using Octopus.Tentacle.Kubernetes; using Octopus.Tentacle.Services.Capabilities; @@ -20,8 +21,7 @@ public async Task CapabilitiesAreReturned() .GetCapabilitiesAsync(CancellationToken.None)) .SupportedCapabilities; - capabilities.Should().BeEquivalentTo(nameof(IScriptService), nameof(IFileTransferService), nameof(IScriptServiceV2)); - capabilities.Count.Should().Be(3); + capabilities.Should().BeEquivalentTo(nameof(IScriptService), nameof(IFileTransferService), nameof(IScriptServiceV2), nameof(IScriptServiceV2.AbandonScript)); capabilities.Should().NotContainMatch("IKubernetesScriptService*"); } @@ -36,7 +36,6 @@ public async Task OnlyKubernetesScriptServicesAreReturnedWhenRunningAsKubernetes .SupportedCapabilities; capabilities.Should().BeEquivalentTo(nameof(IFileTransferService), nameof(IKubernetesScriptServiceV1)); - capabilities.Count.Should().Be(2); capabilities.Should().NotContainMatch("IScriptService*"); diff --git a/source/Octopus.Tentacle.Tests/Integration/ScriptServiceV2Fixture.cs b/source/Octopus.Tentacle.Tests/Integration/ScriptServiceV2Fixture.cs index d5c771eef..b8c56e28e 100644 --- a/source/Octopus.Tentacle.Tests/Integration/ScriptServiceV2Fixture.cs +++ b/source/Octopus.Tentacle.Tests/Integration/ScriptServiceV2Fixture.cs @@ -1,4 +1,4 @@ -using System; +using System; using System.Collections.Concurrent; using System.Collections.Generic; using System.Diagnostics; @@ -16,6 +16,7 @@ using Octopus.Tentacle.Core.Diagnostics; using Octopus.Tentacle.Core.Services.Scripts; using Octopus.Tentacle.Core.Services.Scripts.Locking; +using Octopus.Tentacle.Core.Services.Scripts.Logging; using Octopus.Tentacle.Core.Services.Scripts.Security.Masking; using Octopus.Tentacle.Core.Services.Scripts.Shell; using Octopus.Tentacle.Core.Services.Scripts.StateStore; @@ -29,30 +30,12 @@ namespace Octopus.Tentacle.Tests.Integration [TestFixture] public class ScriptServiceV2Fixture { - ScriptServiceV2 service = null!; - ScriptWorkspaceFactory workspaceFactory = null!; - ScriptStateStoreFactory stateStoreFactory = null!; - - [SetUp] - public void SetUp() - { - var homeConfiguration = Substitute.For(); - homeConfiguration.HomeDirectory.Returns(Environment.CurrentDirectory); - - var octopusPhysicalFileSystem = new OctopusPhysicalFileSystem(Substitute.For()); - workspaceFactory = new ScriptWorkspaceFactory(octopusPhysicalFileSystem, homeConfiguration, new SensitiveValueMasker()); - stateStoreFactory = new ScriptStateStoreFactory(octopusPhysicalFileSystem); - service = new ScriptServiceV2( - PlatformDetection.IsRunningOnWindows ? (IShell)new PowerShell() : new Bash(), - workspaceFactory, - stateStoreFactory, - new ScriptIsolationMutex(), - Substitute.For()); - } - [Test] public async Task ShouldExecuteAScriptSuccessfully() { + var builder = new ScriptServiceV2Builder(); + var service = builder.Build(); + var windowsScript = "& ping.exe localhost -n 1"; var bashScript = "ping localhost -c 1"; @@ -63,7 +46,7 @@ public async Task ShouldExecuteAScriptSuccessfully() .Build(); var startScriptResponse = await service.StartScriptAsync(startScriptCommand, CancellationToken.None); - var (logs, finalResponse) = await RunUntilScriptCompletes(startScriptCommand, startScriptResponse); + var (logs, finalResponse) = await RunUntilScriptCompletes(service, startScriptCommand, startScriptResponse); finalResponse.State.Should().Be(ProcessState.Complete); finalResponse.ExitCode.Should().Be(0); @@ -73,6 +56,9 @@ public async Task ShouldExecuteAScriptSuccessfully() [Test] public async Task ShouldReturnANonZeroExitCodeForAFailingScript() { + var builder = new ScriptServiceV2Builder(); + var service = builder.Build(); + var windowsScript = "& ping.exe nope -n 1"; var bashScript = "ping nope -c 1"; @@ -83,7 +69,7 @@ public async Task ShouldReturnANonZeroExitCodeForAFailingScript() .Build(); var startScriptResponse = await service.StartScriptAsync(startScriptCommand, CancellationToken.None); - var (logs, finalResponse) = await RunUntilScriptCompletes(startScriptCommand, startScriptResponse); + var (logs, finalResponse) = await RunUntilScriptCompletes(service, startScriptCommand, startScriptResponse); finalResponse.State.Should().Be(ProcessState.Complete); finalResponse.ExitCode.Should().NotBe(0); @@ -93,6 +79,9 @@ public async Task ShouldReturnANonZeroExitCodeForAFailingScript() [Test] public async Task ShouldExecuteMultipleScriptsConcurrently() { + var builder = new ScriptServiceV2Builder(); + var service = builder.Build(); + var bashScript = "sleep 10"; var windowsScript = "Start-Sleep -Seconds 10"; @@ -114,7 +103,7 @@ public async Task ShouldExecuteMultipleScriptsConcurrently() }); await Task.WhenAll(tasks); - + var startDuration = started.Elapsed; startDuration.Should().BeLessThan(TimeSpan.FromSeconds(5)); @@ -122,7 +111,7 @@ public async Task ShouldExecuteMultipleScriptsConcurrently() foreach (var script in scripts) { - var (logs, finalResponse) = await RunUntilScriptCompletes(script.Command, script.Response); + var (logs, finalResponse) = await RunUntilScriptCompletes(service, script.Command, script.Response); finalResponse.State.Should().Be(ProcessState.Complete); finalResponse.ExitCode.Should().Be(0); @@ -136,6 +125,9 @@ public async Task ShouldExecuteMultipleScriptsConcurrently() [Test] public async Task ShouldStartExecuteAScriptQuickly() { + var builder = new ScriptServiceV2Builder(); + var service = builder.Build(); + var script = "echo \"finished\""; var startScriptCommand = new StartScriptCommandV2Builder() @@ -170,6 +162,9 @@ public async Task ShouldStartExecuteAScriptQuickly() [Test] public async Task ShouldExecuteALongRunningScriptSuccessfully() { + var builder = new ScriptServiceV2Builder(); + var service = builder.Build(); + var bashScript = "sleep 10"; var windowsScript = "Start-Sleep -Seconds 10"; @@ -180,7 +175,7 @@ public async Task ShouldExecuteALongRunningScriptSuccessfully() .Build(); var startScriptResponse = await service.StartScriptAsync(startScriptCommand, CancellationToken.None); - var (_, finalResponse) = await RunUntilScriptCompletes(startScriptCommand, startScriptResponse); + var (_, finalResponse) = await RunUntilScriptCompletes(service, startScriptCommand, startScriptResponse); finalResponse.State.Should().Be(ProcessState.Complete); finalResponse.ExitCode.Should().Be(0); @@ -189,6 +184,9 @@ public async Task ShouldExecuteALongRunningScriptSuccessfully() [Test] public async Task StartScriptShouldWaitForAShortScriptToFinish() { + var builder = new ScriptServiceV2Builder(); + var service = builder.Build(); + var startScriptCommand = new StartScriptCommandV2Builder() .WithScriptBody("echo \"finished\"") .WithDurationStartScriptCanWaitForScriptToFinish(TimeSpan.FromSeconds(5)) @@ -207,6 +205,9 @@ public async Task StartScriptShouldWaitForAShortScriptToFinish() [Test] public async Task StartScriptShouldNotWaitForALongScriptToFinish() { + var builder = new ScriptServiceV2Builder(); + var service = builder.Build(); + var bashScript = "sleep 10"; var windowsScript = "Start-Sleep -Seconds 10"; @@ -217,7 +218,7 @@ public async Task StartScriptShouldNotWaitForALongScriptToFinish() .Build(); var startScriptResponse = await service.StartScriptAsync(startScriptCommand, CancellationToken.None); - await RunUntilScriptCompletes(startScriptCommand, startScriptResponse); + await RunUntilScriptCompletes(service, startScriptCommand, startScriptResponse); startScriptResponse.State.Should().Be(ProcessState.Running); startScriptResponse.ExitCode.Should().Be(0); @@ -227,6 +228,9 @@ public async Task StartScriptShouldNotWaitForALongScriptToFinish() [Test] public async Task StartScriptShouldNotStartTheScriptForTheSameScriptTicketMoreThanOnce_SequentialRequests() { + var builder = new ScriptServiceV2Builder(); + var service = builder.Build(); + // Arrange var scriptTicket = new ScriptTicket(Guid.NewGuid().ToString()); var script1 = GetStartScriptCommandForScriptThatCreatesAFile(scriptTicket); @@ -236,7 +240,7 @@ public async Task StartScriptShouldNotStartTheScriptForTheSameScriptTicketMoreTh await service.StartScriptAsync(script1.StartScriptCommand, CancellationToken.None); var startScriptResponse = await service.StartScriptAsync(script2.StartScriptCommand, CancellationToken.None); - var (_, finalResponse) = await RunUntilScriptCompletes(script2.StartScriptCommand, startScriptResponse); + var (_, finalResponse) = await RunUntilScriptCompletes(service, script2.StartScriptCommand, startScriptResponse); // Assert finalResponse.State.Should().Be(ProcessState.Complete); @@ -249,6 +253,9 @@ public async Task StartScriptShouldNotStartTheScriptForTheSameScriptTicketMoreTh [Test] public async Task StartScriptShouldNotStartTheScriptForTheSameScriptTicketMoreThanOnce_ConcurrentRequests() { + var builder = new ScriptServiceV2Builder(); + var service = builder.Build(); + // Arrange var scriptTicket = new ScriptTicket(Guid.NewGuid().ToString()); @@ -271,6 +278,7 @@ public async Task StartScriptShouldNotStartTheScriptForTheSameScriptTicketMoreTh await Task.WhenAll(tasks); var (_, finalResponse) = await RunUntilScriptCompletes( + service, scripts[0].StartScriptCommand, new ScriptStatusResponseV2(scripts[0].StartScriptCommand.ScriptTicket, ProcessState.Pending, 0, new List(), 0)); @@ -292,6 +300,9 @@ public async Task StartScriptShouldNotStartTheScriptForTheSameScriptTicketMoreTh [Test] public async Task CancelScriptShouldCancelAnExecutingScript() { + var builder = new ScriptServiceV2Builder(); + var service = builder.Build(); + var bashScript = "sleep 60"; var windowsScript = "Start-Sleep -Seconds 60"; @@ -334,9 +345,73 @@ public async Task CancelScriptShouldCancelAnExecutingScript() cancellationTimer.Elapsed.Should().BeLessOrEqualTo(TimeSpan.FromSeconds(5)); } + [Test] + public async Task AbandonScript_WhileWaitingOnTheIsolationMutex_ExitsCancelled() + { + var builder = new ScriptServiceV2Builder(); + var service = builder.Build(); + + const string mutexName = "abandon-while-acquiring-mutex"; + + // The holder takes the full-isolation mutex and keeps it for the duration of the test. + var holderTicket = new ScriptTicket(Guid.NewGuid().ToString()); + var holderCommand = new StartScriptCommandV2Builder() + .WithScriptBodyForCurrentOs("Start-Sleep -Seconds 60", "sleep 60") + .WithIsolation(ScriptIsolationLevel.FullIsolation) + .WithMutexName(mutexName) + .WithScriptTicket(holderTicket) + .WithDurationStartScriptCanWaitForScriptToFinish(null) + .Build(); + + // The waiter wants the same mutex, so it blocks in Acquire and never starts its process. + var waiterTicket = new ScriptTicket(Guid.NewGuid().ToString()); + var waiterCommand = new StartScriptCommandV2Builder() + .WithScriptBodyForCurrentOs("Start-Sleep -Seconds 1", "sleep 1") + .WithIsolation(ScriptIsolationLevel.FullIsolation) + .WithMutexName(mutexName) + .WithScriptTicket(waiterTicket) + .WithDurationStartScriptCanWaitForScriptToFinish(null) + .Build(); + + try + { + var holderResponse = await service.StartScriptAsync(holderCommand, CancellationToken.None); + holderResponse = await WaitForState(service, holderTicket, holderResponse, ProcessState.Running); + + var waiterResponse = await service.StartScriptAsync(waiterCommand, CancellationToken.None); + + // Give the waiter time to reach (and block in) the mutex Acquire. It cannot complete while the + // holder owns the mutex, so it must still be waiting. + await Task.Delay(TimeSpan.FromSeconds(2)); + waiterResponse = await service.GetStatusAsync(new ScriptStatusRequestV2(waiterTicket, waiterResponse.NextLogSequence), CancellationToken.None); + waiterResponse.State.Should().NotBe(ProcessState.Complete, "the waiter should be blocked acquiring the isolation mutex"); + + // A script still waiting on the isolation mutex never started a process, so we report it as + // cancelled (-43), not abandoned. Abandon is a no-op here (Acquire doesn't observe the abandon + // token); cancel unblocks Acquire and wins. If we ever want abandon to win in this case, + // RunningScript.Execute can catch (OperationCanceledException) when (abandonToken.IsCancellationRequested) + // around the mutex Acquire and return AbandonedExitCode (-48) instead. Deliberately not doing that today. + await service.AbandonScriptAsync(new AbandonScriptCommandV2(waiterTicket, waiterResponse.NextLogSequence), CancellationToken.None); + var afterCancel = await service.CancelScriptAsync(new CancelScriptCommandV2(waiterTicket, waiterResponse.NextLogSequence), CancellationToken.None); + + var (_, waiterFinal) = await RunUntilScriptCompletes(service, waiterCommand, afterCancel); + waiterFinal.ExitCode.Should().Be(ScriptExitCodes.CanceledExitCode); + } + finally + { + // Release the holder so we don't sit on the mutex (and its 60s sleep gets killed). + await service.CancelScriptAsync(new CancelScriptCommandV2(holderTicket, 0), CancellationToken.None); + await service.CompleteScriptAsync(new CompleteScriptCommandV2(holderTicket), CancellationToken.None); + } + } + [Test] public async Task CompleteScriptShouldCleanupTheWorkspace() { + var builder = new ScriptServiceV2Builder(); + var service = builder.Build(); + var workspaceFactory = (ScriptWorkspaceFactory)builder.WorkspaceFactory; + var script = "echo \"finished\""; var startScriptCommand = new StartScriptCommandV2Builder() @@ -351,13 +426,14 @@ public async Task CompleteScriptShouldCleanupTheWorkspace() var startScriptResponse = await service.StartScriptAsync(startScriptCommand, CancellationToken.None); Directory.Exists(workspaceDirectory).Should().BeTrue(); - await RunUntilScriptCompletes(startScriptCommand, startScriptResponse); + await RunUntilScriptCompletes(service, startScriptCommand, startScriptResponse); Directory.Exists(workspaceDirectory).Should().BeFalse(); } [Test] public async Task GetStatusShouldReturnAnExitCodeOf45ForAnUnknownScriptTicket() { + var service = new ScriptServiceV2Builder().Build(); var response = await service.GetStatusAsync(new ScriptStatusRequestV2(new ScriptTicket("nope"), 0), CancellationToken.None); response.ExitCode.Should().Be(-45); @@ -366,6 +442,7 @@ public async Task GetStatusShouldReturnAnExitCodeOf45ForAnUnknownScriptTicket() [Test] public async Task CancelScriptShouldReturnAnExitCodeOf45ForAnUnknownScriptTicket() { + var service = new ScriptServiceV2Builder().Build(); var response = await service.CancelScriptAsync(new CancelScriptCommandV2(new ScriptTicket("nope"), 0), CancellationToken.None); response.ExitCode.Should().Be(-45); @@ -374,43 +451,53 @@ public async Task CancelScriptShouldReturnAnExitCodeOf45ForAnUnknownScriptTicket [Test] public async Task CompleteScriptShouldNotErrorForAnUnknownScriptTicket() { + var service = new ScriptServiceV2Builder().Build(); await service.CompleteScriptAsync(new CompleteScriptCommandV2(new ScriptTicket("nope")), CancellationToken.None); } [Test] public async Task GetStatusShouldReturnAnExitCodeOf46ForAScriptWithAnUnknownResult() { + var builder = new ScriptServiceV2Builder(); + var service = builder.Build(); + var request = new ScriptStatusRequestV2(new ScriptTicket($"did-not-finish-{Guid.NewGuid()}"), 0); var ticket = request.Ticket; - SetupScriptState(ticket); + SetupScriptState(builder.WorkspaceFactory, builder.StateStoreFactory, ticket); var response = await service.GetStatusAsync(request, CancellationToken.None); response.ExitCode.Should().Be(-46); - await CleanupWorkspace(ticket, CancellationToken.None); + await CleanupWorkspace(builder.WorkspaceFactory, ticket, CancellationToken.None); } [Test] public async Task CancelScriptShouldReturnAnExitCodeOf46ForAScriptWithAnUnknownResult() { + var builder = new ScriptServiceV2Builder(); + var service = builder.Build(); + var request = new CancelScriptCommandV2(new ScriptTicket($"did-not-finish-{Guid.NewGuid()}"), 0); var ticket = request.Ticket; - SetupScriptState(ticket); + SetupScriptState(builder.WorkspaceFactory, builder.StateStoreFactory, ticket); var response = await service.CancelScriptAsync(request, CancellationToken.None); response.ExitCode.Should().Be(-46); - await CleanupWorkspace(ticket, CancellationToken.None); + await CleanupWorkspace(builder.WorkspaceFactory, ticket, CancellationToken.None); } [Test] public async Task CompleteScriptShouldNotErrorForAScriptWithAnUnknownResult() { + var builder = new ScriptServiceV2Builder(); + var service = builder.Build(); + var request = new CompleteScriptCommandV2(new ScriptTicket($"did-not-finish-{Guid.NewGuid()}")); var ticket = request.Ticket; - SetupScriptState(ticket); + SetupScriptState(builder.WorkspaceFactory, builder.StateStoreFactory, ticket); await service.CompleteScriptAsync(request, CancellationToken.None); } @@ -418,6 +505,9 @@ public async Task CompleteScriptShouldNotErrorForAScriptWithAnUnknownResult() [Test] public async Task ShouldStoreTheStateOfTheScriptInTheScriptStateStore() { + var builder = new ScriptServiceV2Builder(); + var service = builder.Build(); + var testStarted = DateTimeOffset.UtcNow; var bashScript = "sleep 10"; @@ -429,13 +519,13 @@ public async Task ShouldStoreTheStateOfTheScriptInTheScriptStateStore() .WithDurationStartScriptCanWaitForScriptToFinish(null) .Build(); - var scriptStateStore = SetupScriptStateStore(startScriptCommand.ScriptTicket); + var scriptStateStore = SetupScriptStateStore(builder.WorkspaceFactory, builder.StateStoreFactory, startScriptCommand.ScriptTicket); var startScriptResponse = await service.StartScriptAsync(startScriptCommand, CancellationToken.None); var runningScriptState = scriptStateStore.Load(); - var (logs, finalResponse) = await RunUntilScriptFinishes(startScriptCommand, startScriptResponse); + var (logs, finalResponse) = await RunUntilScriptFinishes(service, startScriptCommand, startScriptResponse); var testFinished = DateTimeOffset.UtcNow; var finishedScriptState = scriptStateStore.Load(); @@ -461,6 +551,8 @@ public async Task ShouldStoreTheStateOfTheScriptInTheScriptStateStore() [Test] public async Task ScriptTicketCasingShouldNotAffectCommands() { + var service = new ScriptServiceV2Builder().Build(); + // Arrange var startScriptCommand = new StartScriptCommandV2Builder() .WithScriptBody("echo \"finished\"") @@ -483,22 +575,364 @@ public async Task ScriptTicketCasingShouldNotAffectCommands() lowerCaseResponse.ExitCode.Should().Be(0); } + [Test] + public async Task AbandonScript_OnUnknownTicket_ReturnsCompleteWithUnknownScriptExitCode() + { + var service = new ScriptServiceV2Builder().Build(); + + var ticket = new ScriptTicket("unknown-ticket-" + Guid.NewGuid().ToString("N")); + var response = await service.AbandonScriptAsync(new AbandonScriptCommandV2(ticket, 0), CancellationToken.None); + + response.State.Should().Be(ProcessState.Complete); + response.ExitCode.Should().Be(ScriptExitCodes.UnknownScriptExitCode); + } + + [Test] + public async Task AbandonScript_OnRunningScript_FiresAbandonToken_ReturnsAbandonedExitCode() + { + var service = new ScriptServiceV2Builder().Build(); + + var startCommand = new StartScriptCommandV2Builder() + .WithScriptBodyForCurrentOs("Start-Sleep -Seconds 60", "sleep 60") + .WithIsolation(ScriptIsolationLevel.NoIsolation) + .WithDurationStartScriptCanWaitForScriptToFinish(null) + .Build(); + + await service.StartScriptAsync(startCommand, CancellationToken.None); + + // Wait for the script to reach Running state + ScriptStatusResponseV2 status; + var deadline = DateTime.UtcNow.AddSeconds(30); + do + { + status = await service.GetStatusAsync(new ScriptStatusRequestV2(startCommand.ScriptTicket, 0), CancellationToken.None); + if (status.State == ProcessState.Running) break; + await Task.Delay(50); + } while (DateTime.UtcNow < deadline); + status.State.Should().Be(ProcessState.Running, "script should have reached Running state within 30 seconds"); + + // Fire abandon + await service.AbandonScriptAsync(new AbandonScriptCommandV2(startCommand.ScriptTicket, 0), CancellationToken.None); + + // Poll until the script completes (the abandon token causes the process runner to return AbandonedExitCode) + ScriptStatusResponseV2 finalResponse; + var completionDeadline = DateTime.UtcNow.AddSeconds(30); + do + { + finalResponse = await service.GetStatusAsync(new ScriptStatusRequestV2(startCommand.ScriptTicket, 0), CancellationToken.None); + if (finalResponse.State == ProcessState.Complete) break; + await Task.Delay(100); + } while (DateTime.UtcNow < completionDeadline); + + finalResponse.State.Should().Be(ProcessState.Complete); + finalResponse.ExitCode.Should().Be(ScriptExitCodes.AbandonedExitCode); + } + + [Test] + public async Task AbandonScript_OnAlreadyCompletedScript_ReturnsRealExitCode() + { + var service = new ScriptServiceV2Builder().Build(); + + var startCommand = new StartScriptCommandV2Builder() + .WithScriptBody("echo \"finished\"") + .WithIsolation(ScriptIsolationLevel.NoIsolation) + .WithDurationStartScriptCanWaitForScriptToFinish(null) + .Build(); + + await service.StartScriptAsync(startCommand, CancellationToken.None); + + // Wait for the script to complete + ScriptStatusResponseV2 status; + var deadline = DateTime.UtcNow.AddSeconds(30); + do + { + status = await service.GetStatusAsync(new ScriptStatusRequestV2(startCommand.ScriptTicket, 0), CancellationToken.None); + if (status.State == ProcessState.Complete) break; + await Task.Delay(50); + } while (DateTime.UtcNow < deadline); + status.State.Should().Be(ProcessState.Complete); + + var abandonResponse = await service.AbandonScriptAsync(new AbandonScriptCommandV2(startCommand.ScriptTicket, 0), CancellationToken.None); + abandonResponse.ExitCode.Should().Be(0, "real exit code should be returned, not AbandonedExitCode"); + } + + [Test] + public async Task CompleteScript_AfterAbandon_WhenWorkspaceDeleteFails_LogsWarnAndReturnsNormally() + { + var deleteException = new IOException("file in use"); + var builder = new ScriptServiceV2Builder(); + var (throwingFactory, mockLog) = BuildFactoryWithThrowingDelete(builder.WorkspaceFactory, deleteException); + var serviceUnderTest = builder + .WithWorkspaceFactory(throwingFactory) + .WithLog(mockLog) + .Build(); + + var startCommand = new StartScriptCommandV2Builder() + .WithScriptBodyForCurrentOs("Start-Sleep -Seconds 60", "sleep 60") + .WithIsolation(ScriptIsolationLevel.NoIsolation) + .WithDurationStartScriptCanWaitForScriptToFinish(null) + .Build(); + + await serviceUnderTest.StartScriptAsync(startCommand, CancellationToken.None); + + // Wait for Running + ScriptStatusResponseV2 status; + var runningDeadline = DateTime.UtcNow.AddSeconds(30); + do + { + status = await serviceUnderTest.GetStatusAsync(new ScriptStatusRequestV2(startCommand.ScriptTicket, 0), CancellationToken.None); + if (status.State == ProcessState.Running) break; + await Task.Delay(50); + } while (DateTime.UtcNow < runningDeadline); + status.State.Should().Be(ProcessState.Running, "script should have reached Running state within 30 seconds"); + + await serviceUnderTest.AbandonScriptAsync(new AbandonScriptCommandV2(startCommand.ScriptTicket, 0), CancellationToken.None); + + // Poll until Complete + var completeDeadline = DateTime.UtcNow.AddSeconds(30); + do + { + status = await serviceUnderTest.GetStatusAsync(new ScriptStatusRequestV2(startCommand.ScriptTicket, 0), CancellationToken.None); + if (status.State == ProcessState.Complete) break; + await Task.Delay(50); + } while (DateTime.UtcNow < completeDeadline); + status.State.Should().Be(ProcessState.Complete); + status.ExitCode.Should().Be(ScriptExitCodes.AbandonedExitCode); + + Func complete = async () => await serviceUnderTest.CompleteScriptAsync(new CompleteScriptCommandV2(startCommand.ScriptTicket), CancellationToken.None); + + await complete.Should().NotThrowAsync(); + mockLog.Received().Warn(deleteException, Arg.Is(m => m.Contains("Could not delete") && m.Contains(startCommand.ScriptTicket.TaskId))); + } + + [Test] + public async Task CompleteScript_AfterNormalCompletion_WhenWorkspaceDeleteFails_PropagatesException() + { + var deleteException = new IOException("file in use"); + var builder = new ScriptServiceV2Builder(); + var (throwingFactory, mockLog) = BuildFactoryWithThrowingDelete(builder.WorkspaceFactory, deleteException); + var serviceUnderTest = builder + .WithWorkspaceFactory(throwingFactory) + .WithLog(mockLog) + .Build(); + + var startCommand = new StartScriptCommandV2Builder() + .WithScriptBody("echo \"finished\"") + .WithIsolation(ScriptIsolationLevel.NoIsolation) + .WithDurationStartScriptCanWaitForScriptToFinish(null) + .Build(); + + await serviceUnderTest.StartScriptAsync(startCommand, CancellationToken.None); + + // Poll until natural completion + ScriptStatusResponseV2 status; + var deadline = DateTime.UtcNow.AddSeconds(30); + do + { + status = await serviceUnderTest.GetStatusAsync(new ScriptStatusRequestV2(startCommand.ScriptTicket, 0), CancellationToken.None); + if (status.State == ProcessState.Complete) break; + await Task.Delay(50); + } while (DateTime.UtcNow < deadline); + status.State.Should().Be(ProcessState.Complete); + status.ExitCode.Should().Be(0, "the script exited cleanly, not via abandon"); + + Func complete = async () => await serviceUnderTest.CompleteScriptAsync(new CompleteScriptCommandV2(startCommand.ScriptTicket), CancellationToken.None); + + await complete.Should().ThrowAsync(); + } + + /// + /// Builds an IScriptWorkspaceFactory decorator over the supplied workspaceFactory whose returned + /// workspaces forward every member except Delete(CancellationToken), which throws the supplied + /// exception. Also returns a mock ISystemLog for assertion. + /// + static (IScriptWorkspaceFactory factory, ISystemLog log) BuildFactoryWithThrowingDelete(IScriptWorkspaceFactory workspaceFactory, Exception deleteException) + { + var throwingFactory = new DeleteThrowingScriptWorkspaceFactory(workspaceFactory, deleteException); + var fakeLog = Substitute.For(); + return (throwingFactory, fakeLog); + } + + /// + /// Builder for ScriptServiceV2 SUT construction. Defaults match what the previous [SetUp] + /// produced; tests opt into mock overrides via the chainable With* methods. The + /// WorkspaceFactory and StateStoreFactory properties materialize on first access so tests + /// can grab them before Build() (e.g. to wrap the default factory in a decorator). + /// + class ScriptServiceV2Builder + { + IScriptWorkspaceFactory? workspaceFactory; + ScriptStateStoreFactory? stateStoreFactory; + ISystemLog? log; + IShell? shell; + ScriptIsolationMutex? mutex; + OctopusPhysicalFileSystem? cachedFileSystem; + + public IScriptWorkspaceFactory WorkspaceFactory => workspaceFactory ??= BuildDefaultWorkspaceFactory(); + public ScriptStateStoreFactory StateStoreFactory => stateStoreFactory ??= new ScriptStateStoreFactory(FileSystem); + + OctopusPhysicalFileSystem FileSystem => cachedFileSystem ??= new OctopusPhysicalFileSystem(Substitute.For()); + + ScriptWorkspaceFactory BuildDefaultWorkspaceFactory() + { + var homeConfiguration = Substitute.For(); + homeConfiguration.HomeDirectory.Returns(Environment.CurrentDirectory); + return new ScriptWorkspaceFactory(FileSystem, homeConfiguration, new SensitiveValueMasker()); + } + + public ScriptServiceV2Builder WithWorkspaceFactory(IScriptWorkspaceFactory factory) + { + workspaceFactory = factory; + return this; + } + + public ScriptServiceV2Builder WithStateStoreFactory(ScriptStateStoreFactory factory) + { + stateStoreFactory = factory; + return this; + } + + public ScriptServiceV2Builder WithLog(ISystemLog log) + { + this.log = log; + return this; + } + + public ScriptServiceV2Builder WithShell(IShell shell) + { + this.shell = shell; + return this; + } + + public ScriptServiceV2Builder WithMutex(ScriptIsolationMutex mutex) + { + this.mutex = mutex; + return this; + } + + public ScriptServiceV2 Build() + { + return new ScriptServiceV2( + shell ?? (PlatformDetection.IsRunningOnWindows ? (IShell)new PowerShell() : new Bash()), + WorkspaceFactory, + StateStoreFactory, + mutex ?? new ScriptIsolationMutex(), + log ?? Substitute.For()); + } + } + + /// + /// IScriptWorkspaceFactory decorator that wraps every workspace it returns in a + /// DeleteThrowingScriptWorkspace so that Delete throws the configured exception while all + /// other members forward to the real workspace. + /// + class DeleteThrowingScriptWorkspaceFactory : IScriptWorkspaceFactory + { + readonly IScriptWorkspaceFactory inner; + readonly Exception deleteException; + + public DeleteThrowingScriptWorkspaceFactory(IScriptWorkspaceFactory inner, Exception deleteException) + { + this.inner = inner; + this.deleteException = deleteException; + } + + public IScriptWorkspace GetWorkspace(ScriptTicket ticket, WorkspaceReadinessCheck readinessCheck) + => new DeleteThrowingScriptWorkspace(inner.GetWorkspace(ticket, readinessCheck), deleteException); + + public async Task PrepareWorkspace( + ScriptTicket ticket, + string scriptBody, + Dictionary scripts, + ScriptIsolationLevel isolationLevel, + TimeSpan scriptMutexAcquireTimeout, + string? scriptMutexName, + string[]? scriptArguments, + List files, + CancellationToken cancellationToken) + { + var workspace = await inner.PrepareWorkspace(ticket, scriptBody, scripts, isolationLevel, scriptMutexAcquireTimeout, scriptMutexName, scriptArguments, files, cancellationToken); + return new DeleteThrowingScriptWorkspace(workspace, deleteException); + } + + public List GetUncompletedWorkspaces() + => inner.GetUncompletedWorkspaces().Select(w => (IScriptWorkspace)new DeleteThrowingScriptWorkspace(w, deleteException)).ToList(); + } + + /// + /// IScriptWorkspace decorator that forwards every member to an inner real workspace, except + /// Delete(CancellationToken), which throws the configured exception. Used to exercise the + /// CompleteScript abandon-aware tolerance of Delete failures without disturbing anything else + /// StartScript / RunningScript may touch on the workspace. + /// + class DeleteThrowingScriptWorkspace : IScriptWorkspace + { + readonly IScriptWorkspace inner; + readonly Exception deleteException; + + public DeleteThrowingScriptWorkspace(IScriptWorkspace inner, Exception deleteException) + { + this.inner = inner; + this.deleteException = deleteException; + } + + public ScriptTicket ScriptTicket => inner.ScriptTicket; + public string WorkingDirectory => inner.WorkingDirectory; + public string BootstrapScriptFilePath => inner.BootstrapScriptFilePath; + public string LogFilePath => inner.LogFilePath; + + public string[]? ScriptArguments + { + get => inner.ScriptArguments; + set => inner.ScriptArguments = value; + } + + public ScriptIsolationLevel IsolationLevel + { + get => inner.IsolationLevel; + set => inner.IsolationLevel = value; + } + + public TimeSpan ScriptMutexAcquireTimeout + { + get => inner.ScriptMutexAcquireTimeout; + set => inner.ScriptMutexAcquireTimeout = value; + } + + public string? ScriptMutexName + { + get => inner.ScriptMutexName; + set => inner.ScriptMutexName = value; + } + + public bool ShouldMonitorPowerShellStartup() => inner.ShouldMonitorPowerShellStartup(); + public void BootstrapScript(string scriptBody) => inner.BootstrapScript(scriptBody); + public string ResolvePath(string fileName) => inner.ResolvePath(fileName); + public IScriptLog CreateLog() => inner.CreateLog(); + public void WriteFile(string filename, string contents) => inner.WriteFile(filename, contents); + public void CopyFile(string sourceFilePath, string destFileName, bool overwrite) => inner.CopyFile(sourceFilePath, destFileName, overwrite); + public void CheckReadiness() => inner.CheckReadiness(); + public string? TryReadFile(string filename) => inner.TryReadFile(filename); + + public Task Delete(CancellationToken cancellationToken) => throw deleteException; + } + // TODO - Test the stateStore is updated. - private void SetupScriptState(ScriptTicket ticket) + static void SetupScriptState(IScriptWorkspaceFactory workspaceFactory, ScriptStateStoreFactory stateStoreFactory, ScriptTicket ticket) { - var stateWorkspace = SetupScriptStateStore(ticket); + var stateWorkspace = SetupScriptStateStore(workspaceFactory, stateStoreFactory, ticket); stateWorkspace.Create(); } - private ScriptStateStore SetupScriptStateStore(ScriptTicket ticket) + static ScriptStateStore SetupScriptStateStore(IScriptWorkspaceFactory workspaceFactory, ScriptStateStoreFactory stateStoreFactory, ScriptTicket ticket) { var workspace = workspaceFactory.GetWorkspace(ticket, WorkspaceReadinessCheck.Perform); var stateWorkspace = stateStoreFactory.Create(workspace); return stateWorkspace; } - private async Task CleanupWorkspace(ScriptTicket ticket, CancellationToken cancellationToken) + static async Task CleanupWorkspace(IScriptWorkspaceFactory workspaceFactory, ScriptTicket ticket, CancellationToken cancellationToken) { var workspace = workspaceFactory.GetWorkspace(ticket, WorkspaceReadinessCheck.Skip); await workspace.Delete(cancellationToken); @@ -527,9 +961,9 @@ private async Task CleanupWorkspace(ScriptTicket ticket, CancellationToken cance return (startScriptCommand, new FileInfo(filePath)); } - async Task<(List, ScriptStatusResponseV2)> RunUntilScriptCompletes(StartScriptCommandV2 startScriptCommand, ScriptStatusResponseV2 response) + static async Task<(List, ScriptStatusResponseV2)> RunUntilScriptCompletes(ScriptServiceV2 service, StartScriptCommandV2 startScriptCommand, ScriptStatusResponseV2 response) { - var (logs, lastResponse) = await RunUntilScriptFinishes(startScriptCommand, response); + var (logs, lastResponse) = await RunUntilScriptFinishes(service, startScriptCommand, response); await service.CompleteScriptAsync(new CompleteScriptCommandV2(startScriptCommand.ScriptTicket), CancellationToken.None); @@ -538,7 +972,7 @@ private async Task CleanupWorkspace(ScriptTicket ticket, CancellationToken cance return (logs, lastResponse); } - async Task<(List logs, ScriptStatusResponseV2 response)> RunUntilScriptFinishes(StartScriptCommandV2 startScriptCommand, ScriptStatusResponseV2 response) + static async Task<(List logs, ScriptStatusResponseV2 response)> RunUntilScriptFinishes(ScriptServiceV2 service, StartScriptCommandV2 startScriptCommand, ScriptStatusResponseV2 response) { var logs = new List(response.Logs); @@ -557,7 +991,20 @@ private async Task CleanupWorkspace(ScriptTicket ticket, CancellationToken cance return (logs, response); } - void WriteLogsToConsole(List logs) + static async Task WaitForState(ScriptServiceV2 service, ScriptTicket ticket, ScriptStatusResponseV2 response, ProcessState desiredState) + { + var deadline = DateTime.UtcNow + TimeSpan.FromSeconds(30); + while (response.State != desiredState && DateTime.UtcNow < deadline) + { + await Task.Delay(TimeSpan.FromMilliseconds(100)); + response = await service.GetStatusAsync(new ScriptStatusRequestV2(ticket, response.NextLogSequence), CancellationToken.None); + } + + response.State.Should().Be(desiredState, $"the script should reach {desiredState} within the timeout"); + return response; + } + + static void WriteLogsToConsole(List logs) { foreach (var log in logs) { @@ -576,4 +1023,4 @@ public StartScriptCommandAndResponse(StartScriptCommandV2 command) public ScriptStatusResponseV2? Response { get; set; } } } -} \ No newline at end of file +} diff --git a/source/Octopus.Tentacle/Services/Capabilities/CapabilitiesServiceV2.cs b/source/Octopus.Tentacle/Services/Capabilities/CapabilitiesServiceV2.cs index 7dda62791..a831d54a2 100644 --- a/source/Octopus.Tentacle/Services/Capabilities/CapabilitiesServiceV2.cs +++ b/source/Octopus.Tentacle/Services/Capabilities/CapabilitiesServiceV2.cs @@ -6,6 +6,7 @@ using Octopus.Tentacle.Contracts.KubernetesScriptServiceV1; using Octopus.Tentacle.Contracts.ScriptServiceV2; using Octopus.Tentacle.Core.Services; +using Octopus.Tentacle.Core.Services.Scripts; using Octopus.Tentacle.Util; namespace Octopus.Tentacle.Services.Capabilities @@ -24,7 +25,7 @@ public async Task GetCapabilitiesAsync(CancellationToken } //non-kubernetes agent tentacles only support the standard script services - return new CapabilitiesResponseV2(new List { nameof(IScriptService), nameof(IFileTransferService), nameof(IScriptServiceV2) }); + return new CapabilitiesResponseV2(new List { nameof(IScriptService), nameof(IFileTransferService), nameof(IScriptServiceV2), nameof(IScriptServiceV2.AbandonScript) }); } } } \ No newline at end of file diff --git a/source/Octopus.Tentacle/Services/Scripts/ScriptService.cs b/source/Octopus.Tentacle/Services/Scripts/ScriptService.cs index e1b952202..82b482d6f 100644 --- a/source/Octopus.Tentacle/Services/Scripts/ScriptService.cs +++ b/source/Octopus.Tentacle/Services/Scripts/ScriptService.cs @@ -93,7 +93,7 @@ public async Task CompleteScriptAsync(CompleteScriptComman RunningScript LaunchShell(ScriptTicket ticket, string serverTaskId, IScriptWorkspace workspace, CancellationTokenSource cancel) { - var runningScript = new RunningScript(shell, workspace, workspace.CreateLog(), serverTaskId, scriptIsolationMutex, cancel.Token, new Dictionary(), PowerShellStartupDetection.PowerShellStartupTimeout, log); + var runningScript = RunningScript.Create(shell, workspace, workspace.CreateLog(), serverTaskId, scriptIsolationMutex, cancel.Token, new Dictionary(), PowerShellStartupDetection.PowerShellStartupTimeout, log); _ = Task.Run(async () => await runningScript.Execute()); return runningScript; } diff --git a/source/Octopus.Tentacle/Util/ISilentProcessRunner.cs b/source/Octopus.Tentacle/Util/ISilentProcessRunner.cs index 49c54fd6b..fa6d21f45 100644 --- a/source/Octopus.Tentacle/Util/ISilentProcessRunner.cs +++ b/source/Octopus.Tentacle/Util/ISilentProcessRunner.cs @@ -14,7 +14,8 @@ Task ExecuteCommandAsync( string workingDirectory, Action info, Action error, - CancellationToken cancel = default); + CancellationToken cancel = default, + CancellationToken abandon = default); Task ExecuteCommandAsync( string executable, @@ -23,19 +24,20 @@ Task ExecuteCommandAsync( Action debug, Action info, Action error, - CancellationToken cancel = default); + CancellationToken cancel = default, + CancellationToken abandon = default); } public class SilentProcessRunnerWrapper : ISilentProcessRunner { - public Task ExecuteCommandAsync(string executable, string arguments, string workingDirectory, Action info, Action error, CancellationToken cancel = default) + public Task ExecuteCommandAsync(string executable, string arguments, string workingDirectory, Action info, Action error, CancellationToken cancel = default, CancellationToken abandon = default) { - return SilentProcessRunnerExtended.ExecuteCommandAsync(executable, arguments, workingDirectory, info, error, cancel); + return SilentProcessRunnerExtended.ExecuteCommandAsync(executable, arguments, workingDirectory, info, error, cancel, abandon); } - public Task ExecuteCommandAsync(string executable, string arguments, string workingDirectory, Action debug, Action info, Action error, CancellationToken cancel = default) + public Task ExecuteCommandAsync(string executable, string arguments, string workingDirectory, Action debug, Action info, Action error, CancellationToken cancel = default, CancellationToken abandon = default) { - return SilentProcessRunner.ExecuteCommandAsync(executable, arguments, workingDirectory, debug, info, error, cancel: cancel); + return SilentProcessRunner.ExecuteCommandAsync(executable, arguments, workingDirectory, debug, info, error, cancel: cancel, abandon: abandon); } } @@ -70,7 +72,8 @@ public static Task ExecuteCommandAsync( string workingDirectory, Action info, Action error, - CancellationToken cancel = default) + CancellationToken cancel = default, + CancellationToken abandon = default) => SilentProcessRunner.ExecuteCommandAsync(executable, arguments, workingDirectory, @@ -78,6 +81,7 @@ public static Task ExecuteCommandAsync( info, error, customEnvironmentVariables: null, - cancel: cancel); + cancel: cancel, + abandon: abandon); } }