From 82f5c9d06577fcb7dbd7c3ee976611ea28a0c460 Mon Sep 17 00:00:00 2001 From: Nathan Flurry Date: Mon, 4 May 2026 01:25:39 -0700 Subject: [PATCH] fix(gateway): split actor response wait errors --- .../guard.actor_stopped_while_waiting.json | 5 +++ .../guard.gateway_response_start_timeout.json | 5 +++ .../errors/guard.tunnel_message_timeout.json | 5 +++ .../errors/guard.tunnel_request_aborted.json | 5 +++ .../errors/guard.tunnel_response_closed.json | 5 +++ engine/packages/guard-core/src/errors.rs | 40 +++++++++++++++++++ engine/packages/guard-core/src/utils.rs | 31 +++++++++++--- engine/packages/pegboard-gateway/src/lib.rs | 17 ++++---- engine/packages/pegboard-gateway2/src/lib.rs | 17 ++++---- .../rivetkit/src/client/lifecycle-errors.ts | 26 ++++++++++++ .../rivetkit/tests/lifecycle-errors.test.ts | 20 ++++++++++ .../src/content/docs/clients/javascript.mdx | 9 ++++- 12 files changed, 163 insertions(+), 22 deletions(-) create mode 100644 engine/artifacts/errors/guard.actor_stopped_while_waiting.json create mode 100644 engine/artifacts/errors/guard.gateway_response_start_timeout.json create mode 100644 engine/artifacts/errors/guard.tunnel_message_timeout.json create mode 100644 engine/artifacts/errors/guard.tunnel_request_aborted.json create mode 100644 engine/artifacts/errors/guard.tunnel_response_closed.json create mode 100644 rivetkit-typescript/packages/rivetkit/tests/lifecycle-errors.test.ts diff --git a/engine/artifacts/errors/guard.actor_stopped_while_waiting.json b/engine/artifacts/errors/guard.actor_stopped_while_waiting.json new file mode 100644 index 0000000000..78c7dfe82c --- /dev/null +++ b/engine/artifacts/errors/guard.actor_stopped_while_waiting.json @@ -0,0 +1,5 @@ +{ + "code": "actor_stopped_while_waiting", + "group": "guard", + "message": "Actor stopped while waiting for a response." +} \ No newline at end of file diff --git a/engine/artifacts/errors/guard.gateway_response_start_timeout.json b/engine/artifacts/errors/guard.gateway_response_start_timeout.json new file mode 100644 index 0000000000..640b2a1fbc --- /dev/null +++ b/engine/artifacts/errors/guard.gateway_response_start_timeout.json @@ -0,0 +1,5 @@ +{ + "code": "gateway_response_start_timeout", + "group": "guard", + "message": "Timed out waiting for actor response start." +} \ No newline at end of file diff --git a/engine/artifacts/errors/guard.tunnel_message_timeout.json b/engine/artifacts/errors/guard.tunnel_message_timeout.json new file mode 100644 index 0000000000..b70b1fdfd9 --- /dev/null +++ b/engine/artifacts/errors/guard.tunnel_message_timeout.json @@ -0,0 +1,5 @@ +{ + "code": "tunnel_message_timeout", + "group": "guard", + "message": "Actor tunnel message timed out." +} \ No newline at end of file diff --git a/engine/artifacts/errors/guard.tunnel_request_aborted.json b/engine/artifacts/errors/guard.tunnel_request_aborted.json new file mode 100644 index 0000000000..a3410d03ac --- /dev/null +++ b/engine/artifacts/errors/guard.tunnel_request_aborted.json @@ -0,0 +1,5 @@ +{ + "code": "tunnel_request_aborted", + "group": "guard", + "message": "Actor tunnel aborted the request." +} \ No newline at end of file diff --git a/engine/artifacts/errors/guard.tunnel_response_closed.json b/engine/artifacts/errors/guard.tunnel_response_closed.json new file mode 100644 index 0000000000..8065402994 --- /dev/null +++ b/engine/artifacts/errors/guard.tunnel_response_closed.json @@ -0,0 +1,5 @@ +{ + "code": "tunnel_response_closed", + "group": "guard", + "message": "Actor tunnel closed before sending a response." +} \ No newline at end of file diff --git a/engine/packages/guard-core/src/errors.rs b/engine/packages/guard-core/src/errors.rs index ec33b9c613..7774bb8aef 100644 --- a/engine/packages/guard-core/src/errors.rs +++ b/engine/packages/guard-core/src/errors.rs @@ -100,6 +100,46 @@ pub struct ConnectionError { #[error("guard", "service_unavailable", "Service unavailable.")] pub struct ServiceUnavailable; +#[derive(RivetError, Serialize, Deserialize)] +#[error( + "guard", + "actor_stopped_while_waiting", + "Actor stopped while waiting for a response." +)] +pub struct ActorStoppedWhileWaiting; + +#[derive(RivetError, Serialize, Deserialize)] +#[error( + "guard", + "tunnel_request_aborted", + "Actor tunnel aborted the request." +)] +pub struct TunnelRequestAborted; + +#[derive(RivetError, Serialize, Deserialize)] +#[error( + "guard", + "tunnel_message_timeout", + "Actor tunnel message timed out." +)] +pub struct TunnelMessageTimeout; + +#[derive(RivetError, Serialize, Deserialize)] +#[error( + "guard", + "tunnel_response_closed", + "Actor tunnel closed before sending a response." +)] +pub struct TunnelResponseClosed; + +#[derive(RivetError, Serialize, Deserialize)] +#[error( + "guard", + "gateway_response_start_timeout", + "Timed out waiting for actor response start." +)] +pub struct GatewayResponseStartTimeout; + #[derive(RivetError, Serialize, Deserialize)] #[error( "guard", diff --git a/engine/packages/guard-core/src/utils.rs b/engine/packages/guard-core/src/utils.rs index 348da297e5..0bdf6eee04 100644 --- a/engine/packages/guard-core/src/utils.rs +++ b/engine/packages/guard-core/src/utils.rs @@ -179,6 +179,11 @@ pub(crate) fn err_into_response(err: anyhow::Error) -> Result StatusCode::BAD_GATEWAY, ("actor", "not_found") => StatusCode::NOT_FOUND, ("guard", "service_unavailable") => StatusCode::SERVICE_UNAVAILABLE, + ("guard", "actor_stopped_while_waiting") => StatusCode::SERVICE_UNAVAILABLE, + ("guard", "tunnel_request_aborted") => StatusCode::SERVICE_UNAVAILABLE, + ("guard", "tunnel_message_timeout") => StatusCode::GATEWAY_TIMEOUT, + ("guard", "tunnel_response_closed") => StatusCode::SERVICE_UNAVAILABLE, + ("guard", "gateway_response_start_timeout") => StatusCode::GATEWAY_TIMEOUT, ("guard", "actor_ready_timeout") => StatusCode::SERVICE_UNAVAILABLE, ("guard", "no_route") => StatusCode::NOT_FOUND, ("guard", "invalid_request_body") => StatusCode::PAYLOAD_TOO_LARGE, @@ -218,7 +223,7 @@ pub(crate) fn should_retry_request(res: &Result>) -> bool Ok(resp) => should_retry_request_inner(resp.status(), resp.headers()), Err(err) => { if let Some(rivet_err) = err.chain().find_map(|x| x.downcast_ref::()) { - rivet_err.group() == "guard" && rivet_err.code() == "service_unavailable" + rivet_err.group() == "guard" && is_retryable_guard_http_error(rivet_err.code()) } else { false } @@ -226,11 +231,27 @@ pub(crate) fn should_retry_request(res: &Result>) -> bool } } -// Determine if a response should trigger a retry. Guard-specific actor startup -// failures, including guard.actor_ready_timeout, are signaled as 503 with -// x-rivet-error and should be retried against a freshly resolved target. +fn is_retryable_guard_http_error(code: &str) -> bool { + matches!( + code, + "service_unavailable" + | "actor_ready_timeout" + | "actor_stopped_while_waiting" + | "tunnel_request_aborted" + | "tunnel_message_timeout" + | "tunnel_response_closed" + | "gateway_response_start_timeout" + ) +} + +// Determine if a response should trigger a retry: transient status and x-rivet-error. pub(crate) fn should_retry_request_inner(status: StatusCode, headers: &hyper::HeaderMap) -> bool { - status == StatusCode::SERVICE_UNAVAILABLE && headers.contains_key(X_RIVET_ERROR) + (status == StatusCode::SERVICE_UNAVAILABLE || status == StatusCode::GATEWAY_TIMEOUT) + && headers + .get(X_RIVET_ERROR) + .and_then(|value| value.to_str().ok()) + .and_then(|value| value.split_once('.')) + .is_some_and(|(group, code)| group == "guard" && is_retryable_guard_http_error(code)) } // Determine if a websocket error is retryable (e.g., transient UPS/tunnel issues) diff --git a/engine/packages/pegboard-gateway/src/lib.rs b/engine/packages/pegboard-gateway/src/lib.rs index 13ee000334..44f36b256b 100644 --- a/engine/packages/pegboard-gateway/src/lib.rs +++ b/engine/packages/pegboard-gateway/src/lib.rs @@ -9,7 +9,10 @@ use rivet_error::*; use rivet_guard_core::{ ResponseBody, WebSocketHandle, custom_serve::{CustomServeTrait, HibernationResult}, - errors::{ServiceUnavailable, WebSocketServiceUnavailable}, + errors::{ + ActorStoppedWhileWaiting, GatewayResponseStartTimeout, TunnelMessageTimeout, + TunnelRequestAborted, TunnelResponseClosed, WebSocketServiceUnavailable, + }, request_context::RequestContext, utils::is_ws_hibernate, websocket_handle::WebSocketReceiver, @@ -168,7 +171,7 @@ impl PegboardGateway { } protocol::mk2::ToServerTunnelMessageKind::ToServerResponseAbort => { tracing::warn!("request aborted"); - return Err(ServiceUnavailable.build()); + return Err(TunnelRequestAborted.build()); } _ => { tracing::warn!("received non-response message from pubsub"); @@ -179,21 +182,19 @@ impl PegboardGateway { request_id=%protocol::util::id_to_string(&request_id), "received no message response during request init", ); - break; + return Err(TunnelResponseClosed.build()); } } _ = stopped_sub.next() => { tracing::debug!("actor stopped while waiting for request response"); - return Err(ServiceUnavailable.build()); + return Err(ActorStoppedWhileWaiting.build()); } _ = drop_rx.changed() => { tracing::warn!(reason=?drop_rx.borrow(), "tunnel message timeout"); - return Err(ServiceUnavailable.build()); + return Err(TunnelMessageTimeout.build()); } } } - - Err(ServiceUnavailable.build()) }; let response_start_timeout = Duration::from_millis( self.ctx @@ -206,7 +207,7 @@ impl PegboardGateway { .map_err(|_| { tracing::warn!("timed out waiting for response start from runner"); - ServiceUnavailable.build() + GatewayResponseStartTimeout.build() })??; tracing::debug!("response handler task ended"); diff --git a/engine/packages/pegboard-gateway2/src/lib.rs b/engine/packages/pegboard-gateway2/src/lib.rs index 67392244f6..1969f38150 100644 --- a/engine/packages/pegboard-gateway2/src/lib.rs +++ b/engine/packages/pegboard-gateway2/src/lib.rs @@ -9,7 +9,10 @@ use rivet_error::*; use rivet_guard_core::{ ResponseBody, WebSocketHandle, custom_serve::{CustomServeTrait, HibernationResult}, - errors::{ServiceUnavailable, WebSocketServiceUnavailable}, + errors::{ + ActorStoppedWhileWaiting, GatewayResponseStartTimeout, TunnelMessageTimeout, + TunnelRequestAborted, TunnelResponseClosed, WebSocketServiceUnavailable, + }, request_context::RequestContext, utils::is_ws_hibernate, }; @@ -171,7 +174,7 @@ impl PegboardGateway2 { } protocol::ToRivetTunnelMessageKind::ToRivetResponseAbort => { tracing::warn!("request aborted"); - return Err(ServiceUnavailable.build()); + return Err(TunnelRequestAborted.build()); } _ => { tracing::warn!("received non-response message from pubsub"); @@ -182,21 +185,19 @@ impl PegboardGateway2 { request_id=%protocol::util::id_to_string(&request_id), "received no message response during request init", ); - break; + return Err(TunnelResponseClosed.build()); } } _ = stopped_sub.next() => { tracing::debug!("actor stopped while waiting for request response"); - return Err(ServiceUnavailable.build()); + return Err(ActorStoppedWhileWaiting.build()); } _ = drop_rx.changed() => { tracing::warn!(reason=?drop_rx.borrow(), "tunnel message timeout"); - return Err(ServiceUnavailable.build()); + return Err(TunnelMessageTimeout.build()); } } } - - Err(ServiceUnavailable.build()) } .instrument(tracing::info_span!("wait_for_tunnel_response")); let response_start_timeout = Duration::from_millis( @@ -210,7 +211,7 @@ impl PegboardGateway2 { .map_err(|_| { tracing::warn!("timed out waiting for response start from envoy"); - ServiceUnavailable.build() + GatewayResponseStartTimeout.build() })??; tracing::debug!("response handler task ended"); diff --git a/rivetkit-typescript/packages/rivetkit/src/client/lifecycle-errors.ts b/rivetkit-typescript/packages/rivetkit/src/client/lifecycle-errors.ts index 5fc5507984..8bc004d8c3 100644 --- a/rivetkit-typescript/packages/rivetkit/src/client/lifecycle-errors.ts +++ b/rivetkit-typescript/packages/rivetkit/src/client/lifecycle-errors.ts @@ -76,6 +76,21 @@ function classifyActorError( ); } + if ( + error.group === "guard" && + isRetryableGuardGatewayHttpError(error.code) + ) { + return buildLifecycleBoundaryInfo( + "request_retry", + "actor_error", + error.message, + { + group: error.group, + code: error.code, + }, + ); + } + // TODO(RVT-6193): Remove this legacy match after structured restart errors // are authoritative everywhere. if ( @@ -144,6 +159,17 @@ function classifyActorError( return undefined; } +function isRetryableGuardGatewayHttpError(code: string): boolean { + return ( + code === "service_unavailable" || + code === "actor_stopped_while_waiting" || + code === "tunnel_request_aborted" || + code === "tunnel_message_timeout" || + code === "tunnel_response_closed" || + code === "gateway_response_start_timeout" + ); +} + function classifyTransportError( error: Error, ): LifecycleBoundaryInfo | undefined { diff --git a/rivetkit-typescript/packages/rivetkit/tests/lifecycle-errors.test.ts b/rivetkit-typescript/packages/rivetkit/tests/lifecycle-errors.test.ts new file mode 100644 index 0000000000..55fcc48c8c --- /dev/null +++ b/rivetkit-typescript/packages/rivetkit/tests/lifecycle-errors.test.ts @@ -0,0 +1,20 @@ +import { describe, expect, test } from "vitest"; +import { ActorError } from "../src/client/errors"; +import { isRetryableLifecycleRequestError } from "../src/client/lifecycle-errors"; + +describe("lifecycle error retry classification", () => { + test.each([ + "service_unavailable", + "actor_stopped_while_waiting", + "tunnel_request_aborted", + "tunnel_message_timeout", + "tunnel_response_closed", + "gateway_response_start_timeout", + ])("classifies guard.%s as retryable", (code) => { + expect( + isRetryableLifecycleRequestError( + new ActorError("guard", code, "transient gateway error"), + ), + ).toBe(true); + }); +}); diff --git a/website/src/content/docs/clients/javascript.mdx b/website/src/content/docs/clients/javascript.mdx index c7cd99c169..ce2d2d7eaf 100644 --- a/website/src/content/docs/clients/javascript.mdx +++ b/website/src/content/docs/clients/javascript.mdx @@ -276,7 +276,14 @@ const ws = await handle.webSocket("probe", undefined, { }); ``` -Requests still return a transient `actor.stopping` lifecycle error (`{"group":"actor","code":"stopping","message":"Actor is stopping."}`) if the actor has fully stopped, i.e. the sleep grace period has ended but it has not yet restarted. Retry once the actor is available again. +Requests can still return transient lifecycle or gateway errors. Retry once the actor is available again. + +- `actor.stopping`: the actor has fully stopped, i.e. the sleep grace period has ended but it has not yet restarted. +- `guard.actor_stopped_while_waiting`: the request reached the actor tunnel, but the actor stopped before the gateway received a response. +- `guard.tunnel_request_aborted`: the actor tunnel aborted the request before a response started. +- `guard.tunnel_message_timeout`: the gateway dropped the in-flight tunnel request after its tunnel message timeout. +- `guard.tunnel_response_closed`: the actor tunnel closed before sending a response. +- `guard.gateway_response_start_timeout`: the gateway timed out waiting for the actor response to start. ## API Reference