diff --git a/apps/mesh/src/api/routes/sandbox-proxy.ts b/apps/mesh/src/api/routes/sandbox-proxy.ts index cc0f96e333..7a225b233c 100644 --- a/apps/mesh/src/api/routes/sandbox-proxy.ts +++ b/apps/mesh/src/api/routes/sandbox-proxy.ts @@ -164,7 +164,7 @@ async function proxyDaemon( const runner = requireRunner(c); if (runner instanceof Response) return runner; - const { claimName } = c.get("vmClaim"); + const { claimName, userId, projectRef } = c.get("vmClaim"); const method = opts?.method ?? "POST"; let body: string | null = null; const headers = new Headers(); @@ -174,14 +174,20 @@ async function proxyDaemon( headers.set("content-type", "application/json"); } + const requestInit = { + method, + headers, + body, + ...(opts?.signal ? { signal: opts.signal } : {}), + }; + let upstream: Response; try { - upstream = await runner.proxyDaemonRequest(claimName, daemonPath, { - method, - headers, - body, - ...(opts?.signal ? { signal: opts.signal } : {}), - }); + upstream = await runner.proxyDaemonRequest( + claimName, + daemonPath, + requestInit, + ); } catch (err) { const message = err instanceof Error ? err.message : String(err); return c.json({ error: `Daemon unreachable: ${message}` }, 502); @@ -193,14 +199,32 @@ async function proxyDaemon( } catch { /* ignore */ } - return c.json( - { - error: - "Sandbox handle is gone. The sandbox needs to be re-provisioned.", - }, - 410, - SANDBOX_PROXY_CACHE_HEADERS, + const adopted = await runner.adoptLiveClaim?.( + { userId, projectRef }, + claimName, ); + if (adopted) { + try { + upstream = await runner.proxyDaemonRequest( + claimName, + daemonPath, + requestInit, + ); + } catch (err) { + const message = err instanceof Error ? err.message : String(err); + return c.json({ error: `Daemon unreachable: ${message}` }, 502); + } + } + if (upstream.status === 404) { + return c.json( + { + error: + "Sandbox handle is gone. The sandbox needs to be re-provisioned.", + }, + 410, + SANDBOX_PROXY_CACHE_HEADERS, + ); + } } const text = await upstream.text(); @@ -217,13 +241,30 @@ async function fetchDaemonJson( claimName: string, daemonPath: string, method: "GET" | "POST" = "GET", + sandboxId?: { userId: string; projectRef: string }, ): Promise { - const upstream = await runner.proxyDaemonRequest(claimName, daemonPath, { + let upstream = await runner.proxyDaemonRequest(claimName, daemonPath, { method, headers: new Headers(), body: null, }); + if (upstream.status === 404 && sandboxId) { + try { + await upstream.body?.cancel(); + } catch { + /* ignore */ + } + const adopted = await runner.adoptLiveClaim?.(sandboxId, claimName); + if (adopted) { + upstream = await runner.proxyDaemonRequest(claimName, daemonPath, { + method, + headers: new Headers(), + body: null, + }); + } + } + if (upstream.status === 404) { try { await upstream.body?.cancel(); @@ -423,7 +464,7 @@ export const createSandboxRoutes = () => { const runner = requireRunner(c); if (runner instanceof Response) return runner; - const { claimName } = c.get("vmClaim"); + const { claimName, userId, projectRef } = c.get("vmClaim"); const ctx = c.var.meshContext; let body: { status?: GitStatusLike; diff?: GitDiffLike } = {}; @@ -453,12 +494,14 @@ export const createSandboxRoutes = () => { claimName, "/_sandbox/git/status", "GET", + { userId, projectRef }, ), fetchDaemonJson( runner, claimName, "/_sandbox/git/diff", "GET", + { userId, projectRef }, ), ]); } diff --git a/apps/mesh/src/web/components/thread/github/publish-dialog.tsx b/apps/mesh/src/web/components/thread/github/publish-dialog.tsx index 9a7e1fbb41..95b2b2d749 100644 --- a/apps/mesh/src/web/components/thread/github/publish-dialog.tsx +++ b/apps/mesh/src/web/components/thread/github/publish-dialog.tsx @@ -1,4 +1,4 @@ -import { useMCPClient } from "@decocms/mesh-sdk"; +import { SELF_MCP_ALIAS_ID, useMCPClient } from "@decocms/mesh-sdk"; import { Button } from "@deco/ui/components/button.tsx"; import { Dialog, DialogContent } from "@deco/ui/components/dialog.tsx"; import { Input } from "@deco/ui/components/input.tsx"; @@ -31,6 +31,7 @@ import { type CreatedPullRequest, } from "./github-pr-api.ts"; import type { PrSummary } from "./use-pr-data.ts"; +import { useSandboxStart } from "@/web/components/sandbox/hooks/use-sandbox-start"; import { countGitChanges, discardGitFiles, @@ -39,6 +40,7 @@ import { fetchSuggestCommitMessage, hasUnpublishedWork, isDecoOnlyDiff, + isSandboxUnreachable, publishGitChanges, PUBLISH_REQUIRES_SUBMIT_TOOLTIP, readGitHeadBranch, @@ -119,6 +121,12 @@ function PublishDialogBody({ orgId, orgSlug, }); + const selfClient = useMCPClient({ + connectionId: SELF_MCP_ALIAS_ID, + orgId, + orgSlug, + }); + const startSandbox = useSandboxStart(selfClient); const commitToOpenPr = openPullRequest?.state === "open"; @@ -138,6 +146,32 @@ function PublishDialogBody({ const [isGeneratingSuggestion, setIsGeneratingSuggestion] = useState(false); const loadStartedRef = useRef(false); + const reprovisionAttemptedRef = useRef(false); + + const loadGitState = async () => { + const [status, diff] = await Promise.all([ + fetchGitStatus(orgSlug, virtualMcpId, branch), + fetchGitDiff(orgSlug, virtualMcpId, branch), + ]); + setGitStatus(status); + setGitDiff(diff); + setPublishTitle(`Changes from ${status.current ?? branch}`); + + setIsGeneratingSuggestion(true); + fetchSuggestCommitMessage(orgSlug, virtualMcpId, branch, { + status, + diff, + }) + .then((commitSuggestion) => { + setPublishTitle(commitSuggestion.title); + setPublishBody(commitSuggestion.body); + }) + .catch(() => { + /* best-effort */ + }) + .finally(() => setIsGeneratingSuggestion(false)); + }; + // oxlint-disable-next-line ban-ref-current-assignment/ban-ref-current-assignment -- one-shot load on dialog open if (!loadStartedRef.current) { // oxlint-disable-next-line ban-ref-current-assignment/ban-ref-current-assignment -- one-shot load on dialog open @@ -151,28 +185,26 @@ function PublishDialogBody({ setSubmitForReviewError(undefined); setSaveChangesError(undefined); try { - const [status, diff] = await Promise.all([ - fetchGitStatus(orgSlug, virtualMcpId, branch), - fetchGitDiff(orgSlug, virtualMcpId, branch), - ]); - setGitStatus(status); - setGitDiff(diff); - setPublishTitle(`Changes from ${status.current ?? branch}`); - - setIsGeneratingSuggestion(true); - fetchSuggestCommitMessage(orgSlug, virtualMcpId, branch, { - status, - diff, - }) - .then((commitSuggestion) => { - setPublishTitle(commitSuggestion.title); - setPublishBody(commitSuggestion.body); - }) - .catch(() => { - /* best-effort */ - }) - .finally(() => setIsGeneratingSuggestion(false)); + await loadGitState(); } catch (error) { + // Preview can stay live via the gateway while mesh's daemon proxy + // still holds a stale handle. Re-provision once, then retry — same + // self-heal path as preview.tsx's notFound → SANDBOX_START flow. + if (isSandboxUnreachable(error) && !reprovisionAttemptedRef.current) { + reprovisionAttemptedRef.current = true; + try { + await startSandbox.mutateAsync({ virtualMcpId, branch }); + await loadGitState(); + return; + } catch (retryErr) { + setPublishError( + retryErr instanceof Error + ? retryErr.message + : "Failed to load changes after re-provisioning the sandbox.", + ); + return; + } + } setPublishError( error instanceof Error ? error.message : "Failed to load changes.", ); diff --git a/packages/sandbox/server/provider/agent-sandbox/runner.ts b/packages/sandbox/server/provider/agent-sandbox/runner.ts index 6772ee5855..6af191f791 100644 --- a/packages/sandbox/server/provider/agent-sandbox/runner.ts +++ b/packages/sandbox/server/provider/agent-sandbox/runner.ts @@ -506,7 +506,7 @@ export class AgentSandboxProvider implements SandboxProvider { path: string, init: ProxyRequestInit, ): Promise { - const rec = await this.getRecord(handle); + let rec = await this.getRecord(handle); // rehydrate failed (port-forward is pod-local); route via in-cluster Service instead. if (!rec && this.previewUrlPattern && this.stateStore) { @@ -516,10 +516,31 @@ export class AgentSandboxProvider implements SandboxProvider { if (row && token) { const adoptedName = state?.adoptedSandboxName ?? handle; const daemonUrl = `http://${adoptedName}.${this.namespace}.svc.cluster.local:${DAEMON_CONTAINER_PORT}`; - return proxyDaemonRequest(daemonUrl, token, path, init); + try { + const resp = await proxyDaemonRequest(daemonUrl, token, path, init); + if (resp.status !== 404) { + return resp; + } + try { + await resp.body?.cancel(); + } catch { + /* ignore */ + } + } catch { + // Stale adoptedSandboxName after operator eviction — fall through + // to resurrection, same as `proxyPreviewRequest`. + } } } + // Preview traffic can resurrect autonomously (gateway fetch retry) while + // daemon/git/exec paths still hit a cold records map. `exec` already + // routes through `requireRecord`; mirror that here so Save-changes/git + // APIs don't 410 while the iframe preview is live. + if (!rec) { + rec = await this.resurrectByHandle(handle); + } + if (!rec) { return new Response(JSON.stringify({ error: "sandbox not found" }), { status: 404, @@ -529,6 +550,7 @@ export class AgentSandboxProvider implements SandboxProvider { let activeRec = rec; const start = performance.now(); let status = 0; + const canRetryBody = !(init.body instanceof ReadableStream); try { let resp = await proxyDaemonRequest(rec.daemonUrl, rec.token, path, init); // A 401 means the cached record is stale — the pool pod was recreated @@ -537,9 +559,9 @@ export class AgentSandboxProvider implements SandboxProvider { // body is re-sendable: of the BodyInit variants only a ReadableStream is // one-shot (consumed by the first fetch); strings, buffers, // URLSearchParams, FormData and Blobs are re-read from memory on retry. - if (resp.status === 401 && !(init.body instanceof ReadableStream)) { + if (resp.status === 401 && canRetryBody) { this.invalidateRecord(handle); - const fresh = await this.getRecord(handle).catch(() => null); + const fresh = await this.requireRecord(handle).catch(() => null); if (fresh) { activeRec = fresh; resp = await proxyDaemonRequest( @@ -552,6 +574,24 @@ export class AgentSandboxProvider implements SandboxProvider { } status = resp.status; return resp; + } catch (err) { + // Stale port-forward / dead pod after idle eviction — same recovery + // path as preview's fetch-retry arm. + if (!canRetryBody) throw err; + this.invalidateRecord(handle); + const fresh = + (await this.resurrectByHandle(handle)) ?? + (await this.getRecord(handle).catch(() => null)); + if (!fresh) throw err; + activeRec = fresh; + const resp = await proxyDaemonRequest( + fresh.daemonUrl, + fresh.token, + path, + init, + ); + status = resp.status; + return resp; } finally { this.recordProxyDuration( "daemon", @@ -562,6 +602,37 @@ export class AgentSandboxProvider implements SandboxProvider { } } + /** + * Repopulate mesh's records cache from a SandboxClaim that already exists + * in the cluster. Preview gateway traffic can keep serving while mesh's + * daemon/git proxy still holds a cold cache or missing state-store row. + */ + async adoptLiveClaim(id: SandboxId, handle: string): Promise { + if (this.records.has(handle)) return true; + const existing = await getSandboxClaim( + this.kubeConfig, + this.namespace, + handle, + ).catch(() => undefined); + if (!existing || existing.metadata?.deletionTimestamp) return false; + + return this.inflight.run(handle, async () => { + if (this.records.has(handle)) return true; + const adopted = await this.adopt(id, handle, existing).catch(() => null); + if (!adopted) return false; + await withSandboxLock(this.stateStore, id, RUNNER_KIND, async (ops) => { + await this.finish( + adopted, + ops, + /* persistNow */ true, + /* patchTtl */ true, + "adopt", + ); + }); + return true; + }); + } + /** * Resolves the HTTP base URL for a sandbox's daemon. Used by the preview * reverse-proxy at the mesh edge. diff --git a/packages/sandbox/server/provider/types.ts b/packages/sandbox/server/provider/types.ts index 9cd6f3f8d7..3107ce00d7 100644 --- a/packages/sandbox/server/provider/types.ts +++ b/packages/sandbox/server/provider/types.ts @@ -137,6 +137,13 @@ export interface SandboxProvider { init: ProxyRequestInit, ): Promise; + /** + * Repopulate in-process routing state from a claim that already exists in + * the cluster (preview gateway traffic can outlive mesh's records cache). + * Optional — only agent-sandbox implements this today. + */ + adoptLiveClaim?(id: SandboxId, handle: string): Promise; + /** * Stream of phase transitions for the pre-Ready lifecycle. Used by mesh's * unified `/api/vm-events` SSE so the UI can show meaningful progress