From 9b6e7eefbb10458106278c0a708b71857ae299ba Mon Sep 17 00:00:00 2001 From: Tim Diekmann Date: Wed, 29 Apr 2026 10:32:08 +0200 Subject: [PATCH 01/17] BE-519: Set up OpenTelemetry across hash-api and Temporal workers --- Cargo.lock | 3 + apps/hash-ai-worker-ts/package.json | 6 + apps/hash-ai-worker-ts/src/instrument.ts | 59 +++++ apps/hash-ai-worker-ts/src/main.ts | 113 ++++++++- .../src/ensure-system-graph-is-initialized.ts | 2 +- apps/hash-api/src/graphql/opentelemetry.ts | 91 -------- apps/hash-api/src/index.ts | 47 +++- apps/hash-api/src/instrument.mjs | 72 +++++- .../src/integrations/linear/webhook.ts | 2 +- .../hash-external-services/docker-compose.yml | 3 + .../otel-collector-config.yaml | 30 ++- apps/hash-integration-worker/package.json | 6 + .../hash-integration-worker/src/instrument.ts | 58 +++++ apps/hash-integration-worker/src/main.ts | 125 ++++++++-- libs/@local/hash-backend-utils/package.json | 12 + .../hash-backend-utils/src/opentelemetry.ts | 218 ++++++++++++++++++ .../@local/hash-backend-utils/src/temporal.ts | 53 ++++- .../interceptors/activities/opentelemetry.ts | 16 ++ .../interceptors/workflows/opentelemetry.ts | 21 ++ .../src/temporal/workflow-span-adapter.ts | 85 +++++++ libs/@local/temporal-client/Cargo.toml | 19 +- libs/@local/temporal-client/src/ai.rs | 134 ++++++++--- yarn.lock | 118 +++++++++- 23 files changed, 1111 insertions(+), 182 deletions(-) create mode 100644 apps/hash-ai-worker-ts/src/instrument.ts delete mode 100644 apps/hash-api/src/graphql/opentelemetry.ts create mode 100644 apps/hash-integration-worker/src/instrument.ts create mode 100644 libs/@local/hash-backend-utils/src/opentelemetry.ts create mode 100644 libs/@local/hash-backend-utils/src/temporal/interceptors/activities/opentelemetry.ts create mode 100644 libs/@local/hash-backend-utils/src/temporal/interceptors/workflows/opentelemetry.ts create mode 100644 libs/@local/hash-backend-utils/src/temporal/workflow-span-adapter.ts diff --git a/Cargo.lock b/Cargo.lock index 
8afb21de452..919d242c246 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3782,12 +3782,15 @@ name = "hash-temporal-client" version = "0.0.0" dependencies = [ "error-stack", + "opentelemetry", "serde", "serde_json", "simple-mermaid", "temporalio-client", "temporalio-common", "thiserror 2.0.18", + "tracing", + "tracing-opentelemetry", "type-system", "url", "uuid", diff --git a/apps/hash-ai-worker-ts/package.json b/apps/hash-ai-worker-ts/package.json index d4c3e4ca783..bf12def9436 100644 --- a/apps/hash-ai-worker-ts/package.json +++ b/apps/hash-ai-worker-ts/package.json @@ -56,9 +56,15 @@ "@local/hash-graph-sdk": "workspace:*", "@local/hash-isomorphic-utils": "workspace:*", "@local/status": "workspace:*", + "@opentelemetry/api": "1.9.0", + "@opentelemetry/api-logs": "0.207.0", + "@opentelemetry/instrumentation": "0.207.0", + "@opentelemetry/instrumentation-grpc": "0.207.0", + "@opentelemetry/instrumentation-http": "0.207.0", "@sentry/node": "10.42.0", "@temporalio/activity": "1.12.1", "@temporalio/common": "1.12.1", + "@temporalio/interceptors-opentelemetry": "1.12.1", "@temporalio/proto": "1.12.1", "@temporalio/worker": "1.12.1", "@temporalio/workflow": "1.12.1", diff --git a/apps/hash-ai-worker-ts/src/instrument.ts b/apps/hash-ai-worker-ts/src/instrument.ts new file mode 100644 index 00000000000..3988d8ea9a0 --- /dev/null +++ b/apps/hash-ai-worker-ts/src/instrument.ts @@ -0,0 +1,59 @@ +/** + * OpenTelemetry bootstrap for the AI worker. Imported as the very first + * statement of `main.ts` so the auto-instrumentations can patch http + * and gRPC modules before any other code requires them. + */ +import { + createUndiciInstrumentation, + registerOpenTelemetry, +} from "@local/hash-backend-utils/opentelemetry"; +import { GrpcInstrumentation } from "@opentelemetry/instrumentation-grpc"; +import { HttpInstrumentation } from "@opentelemetry/instrumentation-http"; + +const otlpEndpoint = process.env.HASH_OTLP_ENDPOINT; + +/** + * Setup handles. 
`undefined` when no `HASH_OTLP_ENDPOINT` is configured — + * `main.ts` checks for that and skips workflow-side OTEL wiring in that + * case. + */ +export const otelSetup = otlpEndpoint + ? registerOpenTelemetry({ + endpoint: otlpEndpoint, + serviceName: process.env.OTEL_SERVICE_NAME ?? "AI Worker", + instrumentations: [ + new HttpInstrumentation({ + // Don't trace the OTLP exporter's own outgoing requests, otherwise + // every export creates a span that needs to be exported, recursively. + ignoreOutgoingRequestHook: (options) => options.port === 4317, + // Default span name is just `HTTP POST` — replace with + // `METHOD /path` so outbound LLM / Graph calls are + // distinguishable in Tempo. + requestHook: (span, request) => { + if (!("method" in request) || !request.method) { + return; + } + const candidates = [ + "originalUrl" in request ? request.originalUrl : undefined, + "url" in request ? request.url : undefined, + "path" in request ? request.path : undefined, + ]; + const rawPath = candidates.find( + (value): value is string => typeof value === "string", + ); + const path = rawPath?.split("?")[0]; + if (path) { + span.updateName(`${request.method} ${path}`); + } + }, + }), + new GrpcInstrumentation(), + // Native `fetch` (used by openai / @anthropic-ai/sdk / Vertex AI + // SDKs) goes through undici, which the http instrumentation does + // not patch. The shared helper sets `peer.service` so Tempo's + // service_graphs processor renders OpenAI / Anthropic / Vertex + // as external-service edges in the service map. 
+ createUndiciInstrumentation(), + ], + }) + : undefined; diff --git a/apps/hash-ai-worker-ts/src/main.ts b/apps/hash-ai-worker-ts/src/main.ts index 2e6655ec36d..c010ae289af 100644 --- a/apps/hash-ai-worker-ts/src/main.ts +++ b/apps/hash-ai-worker-ts/src/main.ts @@ -1,4 +1,8 @@ -/* eslint-disable import/first */ +/* eslint-disable import/first, import/order, simple-import-sort/imports */ + +// Must be the first import so OTEL auto-instrumentations can patch +// http / grpc / Sentry's own monkey-patches before they apply. +import { otelSetup } from "./instrument.js"; import * as Sentry from "@sentry/node"; @@ -12,6 +16,12 @@ Sentry.init({ process.env.ENVIRONMENT || (process.env.NODE_ENV === "production" ? "production" : "development"), tracesSampleRate: process.env.NODE_ENV === "production" ? 1.0 : 0, + // When OTEL is configured we already have a global TracerProvider + // wired up to OTLP. Tell Sentry to share it instead of registering its + // own, so Sentry's spans flow through our pipeline and inherit the + // active OTEL context (including parent spans extracted from Temporal + // headers by OpenTelemetryActivityInboundInterceptor). 
+ skipOpenTelemetrySetup: !!process.env.HASH_OTLP_ENDPOINT, }); import * as http from "node:http"; @@ -22,11 +32,23 @@ import { fileURLToPath } from "node:url"; import { createGraphClient } from "@local/hash-backend-utils/create-graph-client"; import { getRequiredEnv } from "@local/hash-backend-utils/environment"; import { createCommonFlowActivities } from "@local/hash-backend-utils/flows"; +import { OpenTelemetryActivityInboundInterceptor } from "@local/hash-backend-utils/temporal/interceptors/activities/opentelemetry"; import { SentryActivityInboundInterceptor } from "@local/hash-backend-utils/temporal/interceptors/activities/sentry"; import { sentrySinks } from "@local/hash-backend-utils/temporal/sinks/sentry"; +import { createTemporalSdkLogger } from "@local/hash-backend-utils/temporal"; +import { wrapWorkflowSpanExporter } from "@local/hash-backend-utils/temporal/workflow-span-adapter"; import { createVaultClient } from "@local/hash-backend-utils/vault"; -import type { WorkerOptions } from "@temporalio/worker"; -import { defaultSinks, NativeConnection, Worker } from "@temporalio/worker"; +import { makeWorkflowExporter } from "@temporalio/interceptors-opentelemetry"; +import type { + ActivityInboundCallsInterceptorFactory, + WorkerOptions, +} from "@temporalio/worker"; +import { + defaultSinks, + NativeConnection, + Runtime, + Worker, +} from "@temporalio/worker"; import { config } from "dotenv-flow"; import { TsconfigPathsPlugin } from "tsconfig-paths-webpack-plugin"; @@ -105,6 +127,27 @@ const workflowOptions: Partial = async function run() { logger.info("Starting AI worker..."); + // Temporal SDK runtime telemetry: emits SDK-internal metrics (worker + // slot utilisation, sticky cache hits, polling latency, activity / + // workflow execution latency) and forwards Rust core logs to OTLP. + // Must be installed before any Temporal Connection / Worker is + // created. Distinct from the per-activity user-code spans the + // interceptors below produce. 
+ if (otelSetup) { + Runtime.install({ + logger: createTemporalSdkLogger(logger), + telemetryOptions: { + metrics: { + otel: { + url: process.env.HASH_OTLP_ENDPOINT!, + metricsExportInterval: "30s", + }, + }, + logging: { forward: { level: "INFO" } }, + }, + }); + } + const graphApiClient = createGraphClient(logger, { host: getRequiredEnv("HASH_GRAPH_HTTP_HOST"), port: parseInt(getRequiredEnv("HASH_GRAPH_HTTP_PORT"), 10), @@ -146,14 +189,53 @@ async function run() { maxHeartbeatThrottleInterval: "10 seconds", namespace: "HASH", taskQueue: "ai", - sinks: { ...defaultSinks(), ...sentrySinks() }, + sinks: { + ...defaultSinks(), + ...sentrySinks(), + ...(otelSetup + ? { + exporter: makeWorkflowExporter( + // Wrap our v2 OTLPTraceExporter so it can ingest the v1- + // shaped `ReadableSpan`s that Temporal's workflow sandbox + // produces. Without this, the `instrumentationLibrary` → + // `instrumentationScope` rename in sdk-trace-base v2 + // causes the OTLP transformer to crash on every workflow + // span export. + wrapWorkflowSpanExporter( + otelSetup.traceExporter, + ) as unknown as Parameters[0], + otelSetup.resource as unknown as Parameters< + typeof makeWorkflowExporter + >[1], + ), + } + : {}), + }, interceptors: { workflowModules: [ require.resolve( "@local/hash-backend-utils/temporal/interceptors/workflows/sentry", ), + ...(otelSetup + ? [ + require.resolve( + "@local/hash-backend-utils/temporal/interceptors/workflows/opentelemetry", + ), + ] + : []), + ], + // OTEL interceptor must run as the OUTER wrapper so it can + // extract the trace context the workflow client injected into + // the activity headers before any other span is created. Sentry + // doesn't know the Temporal `_tracer-data` header convention, so + // a Sentry-first ordering produces a parent-less span and breaks + // the caller→workflow→activity trace chain. + activityInbound: [ + ...((otelSetup + ? 
[(ctx) => new OpenTelemetryActivityInboundInterceptor(ctx)] + : []) satisfies ActivityInboundCallsInterceptorFactory[]), + (ctx) => new SentryActivityInboundInterceptor(ctx), ], - activityInbound: [(ctx) => new SentryActivityInboundInterceptor(ctx)], }, }); @@ -166,14 +248,21 @@ async function run() { await worker.run(); } -process.on("SIGINT", () => { - logger.info("Received SIGINT, exiting..."); - process.exit(1); -}); -process.on("SIGTERM", () => { - logger.info("Received SIGTERM, exiting..."); +const shutdown = async (signal: NodeJS.Signals) => { + logger.info(`Received ${signal}, exiting...`); + // Flush any pending OTLP exports before tearing down the process. + // Without this, the last seconds of telemetry are lost on every + // graceful restart. + try { + await otelSetup?.shutdown(); + } catch (error) { + logger.error("Failed to flush OpenTelemetry", { error }); + } process.exit(1); -}); +}; + +process.on("SIGINT", (signal) => void shutdown(signal)); +process.on("SIGTERM", (signal) => void shutdown(signal)); run().catch((error: unknown) => { logger.error("Error running worker", { error }); diff --git a/apps/hash-api/src/ensure-system-graph-is-initialized.ts b/apps/hash-api/src/ensure-system-graph-is-initialized.ts index 47d8416d357..67ee8978b79 100644 --- a/apps/hash-api/src/ensure-system-graph-is-initialized.ts +++ b/apps/hash-api/src/ensure-system-graph-is-initialized.ts @@ -17,7 +17,7 @@ const context: ImpureGraphContext = { host: getRequiredEnv("HASH_GRAPH_HTTP_HOST"), port: Number.parseInt(getRequiredEnv("HASH_GRAPH_HTTP_PORT"), 10), }), - temporalClient: await createTemporalClient(logger), + temporalClient: await createTemporalClient(), }; await ensureSystemGraphIsInitialized({ diff --git a/apps/hash-api/src/graphql/opentelemetry.ts b/apps/hash-api/src/graphql/opentelemetry.ts deleted file mode 100644 index 3aded15c9dc..00000000000 --- a/apps/hash-api/src/graphql/opentelemetry.ts +++ /dev/null @@ -1,91 +0,0 @@ -import { logs } from 
"@opentelemetry/api-logs"; -import { OTLPLogExporter } from "@opentelemetry/exporter-logs-otlp-grpc"; -import { OTLPTraceExporter } from "@opentelemetry/exporter-trace-otlp-grpc"; -import { registerInstrumentations } from "@opentelemetry/instrumentation"; -import { - ExpressInstrumentation, - ExpressLayerType, -} from "@opentelemetry/instrumentation-express"; -import { GraphQLInstrumentation } from "@opentelemetry/instrumentation-graphql"; -import { HttpInstrumentation } from "@opentelemetry/instrumentation-http"; -import { - defaultResource, - resourceFromAttributes, -} from "@opentelemetry/resources"; -import { - LoggerProvider, - SimpleLogRecordProcessor, -} from "@opentelemetry/sdk-logs"; -import { SimpleSpanProcessor } from "@opentelemetry/sdk-trace-base"; -import { NodeTracerProvider } from "@opentelemetry/sdk-trace-node"; - -import { logger } from "../logger"; - -const traceTimeout = 5000; - -const unregisterInstrumentations = registerInstrumentations({ - instrumentations: [ - new HttpInstrumentation({ - ignoreOutgoingRequestHook: (options) => { - return options.port === 4317; - }, - }), - new ExpressInstrumentation({ - ignoreLayersType: [ExpressLayerType.MIDDLEWARE], - }), - new GraphQLInstrumentation({ - allowValues: true, - depth: 5, - mergeItems: true, - ignoreTrivialResolveSpans: true, - }), - ], -}); - -export const registerOpenTelemetry = ( - otlpGrpcEndpoint: string | null, - serviceName: string, -): (() => void) => { - if (!otlpGrpcEndpoint) { - logger.warn( - "No OpenTelemetry Protocol endpoint given. 
Not sending telemetry anywhere.", - ); - return () => {}; - } - - const collectorOptions = { - timeoutMillis: traceTimeout, - url: otlpGrpcEndpoint, - }; - - // Setup Tracing - const traceExporter = new OTLPTraceExporter(collectorOptions); - const traceProvider = new NodeTracerProvider({ - resource: defaultResource().merge( - resourceFromAttributes({ "service.name": serviceName }), - ), - spanProcessors: [new SimpleSpanProcessor(traceExporter)], - }); - traceProvider.register(); - - // Setup Logs - const logExporter = new OTLPLogExporter(collectorOptions); - const logProvider = new LoggerProvider({ - resource: defaultResource().merge( - resourceFromAttributes({ "service.name": serviceName }), - ), - processors: [new SimpleLogRecordProcessor(logExporter)], - }); - - logs.setGlobalLoggerProvider(logProvider); - - logger.info( - `Registered OpenTelemetry (traces + logs) at endpoint ${otlpGrpcEndpoint} for ${serviceName}`, - ); - - return () => { - traceProvider.shutdown().catch(logger.error); - logProvider.shutdown().catch(logger.error); - unregisterInstrumentations(); - }; -}; diff --git a/apps/hash-api/src/index.ts b/apps/hash-api/src/index.ts index 2df03434c5f..6abfa0e6cc0 100644 --- a/apps/hash-api/src/index.ts +++ b/apps/hash-api/src/index.ts @@ -12,6 +12,7 @@ import { realtimeSyncEnabled, waitOnResource, } from "@local/hash-backend-utils/environment"; +import { getActiveOpenTelemetrySetup } from "@local/hash-backend-utils/opentelemetry"; import { createRedisClient } from "@local/hash-backend-utils/redis"; import { GracefulShutdown } from "@local/hash-backend-utils/shutdown"; import { createTemporalClient } from "@local/hash-backend-utils/temporal"; @@ -100,6 +101,14 @@ const httpServer = http.createServer(app); const shutdown = new GracefulShutdown(logger, "SIGINT", "SIGTERM"); +// Flush OpenTelemetry last so cleanup hooks above this point can still +// emit shutdown spans / logs before the providers disconnect from the +// collector. 
+const otelSetup = getActiveOpenTelemetrySetup(); +if (otelSetup) { + shutdown.addCleanup("OpenTelemetry", otelSetup.shutdown); +} + const baseRateLimitOptions: Partial = { windowMs: process.env.NODE_ENV === "test" ? 10 : 1000 * 10, // 10 seconds limit: 12, // Limit each IP to 12 requests every 10 seconds @@ -244,6 +253,30 @@ const sanitizeProxyLogArgs = (args: unknown[]): unknown[] => typeof arg === "string" ? redactAuthQueryParams(arg) : arg, ); +/** + * Forward a `http-proxy-middleware` variadic log call to the + * structured logger as `(message, meta?)`. Without this, passing the + * raw `args` array as a single argument to `logger.info` results in + * the OTLP body being a JSON-encoded array (`["[HPM] …"]`) instead of + * the proxy log message itself. + */ +const forwardProxyLog = (level: "info" | "warn" | "error", args: unknown[]) => { + const sanitized = sanitizeProxyLogArgs(args); + if (sanitized.length === 0) { + return; + } + const [first, ...rest] = sanitized; + if (typeof first === "string") { + if (rest.length === 0) { + logger[level](first); + } else { + logger[level](first, { args: rest }); + } + return; + } + logger[level]("[HPM]", { args: sanitized }); +}; + const kratosProxyLogger = { /** * `http-proxy-middleware` logs include request URLs. @@ -251,15 +284,9 @@ const kratosProxyLogger = { * `/auth/*` requests can include Ory self-service query parameters, so we * sanitize all forwarded log levels consistently. 
*/ - info: (...args: unknown[]) => { - logger.info(sanitizeProxyLogArgs(args)); - }, - warn: (...args: unknown[]) => { - logger.warn(sanitizeProxyLogArgs(args)); - }, - error: (...args: unknown[]) => { - logger.error(sanitizeProxyLogArgs(args)); - }, + info: (...args: unknown[]) => forwardProxyLog("info", args), + warn: (...args: unknown[]) => forwardProxyLog("warn", args), + error: (...args: unknown[]) => forwardProxyLog("error", args), }; const kratosProxy = createProxyMiddleware({ @@ -418,7 +445,7 @@ const main = async () => { // Setup upload storage provider and express routes for local file uploads const uploadProvider = setupStorageProviders(app, FILE_UPLOAD_PROVIDER); - const temporalClient = await createTemporalClient(logger); + const temporalClient = await createTemporalClient(); const vaultClient = await createVaultClient({ logger }); diff --git a/apps/hash-api/src/instrument.mjs b/apps/hash-api/src/instrument.mjs index 4e72042f03c..acb62854383 100644 --- a/apps/hash-api/src/instrument.mjs +++ b/apps/hash-api/src/instrument.mjs @@ -5,14 +5,69 @@ import * as Sentry from "@sentry/node"; import { isProdEnv } from "./lib/env-config"; -// Initialize OpenTelemetry BEFORE any app code +// Initialize OpenTelemetry BEFORE any app code. The setup handle is +// retrievable via `getActiveOpenTelemetrySetup()` so `index.ts` can +// wire `shutdown()` into the GracefulShutdown chain — without it, +// in-flight OTLP exports are lost on SIGTERM. 
const otlpEndpoint = process.env.HASH_OTLP_ENDPOINT; if (otlpEndpoint) { - const { registerOpenTelemetry } = await import("./graphql/opentelemetry.js"); - registerOpenTelemetry( - otlpEndpoint, - process.env.OTEL_SERVICE_NAME || "Node API", + const { createUndiciInstrumentation, registerOpenTelemetry } = await import( + "@local/hash-backend-utils/opentelemetry" ); + const [ + { HttpInstrumentation }, + { ExpressInstrumentation, ExpressLayerType }, + { GraphQLInstrumentation }, + ] = await Promise.all([ + import("@opentelemetry/instrumentation-http"), + import("@opentelemetry/instrumentation-express"), + import("@opentelemetry/instrumentation-graphql"), + ]); + + registerOpenTelemetry({ + endpoint: otlpEndpoint, + serviceName: process.env.OTEL_SERVICE_NAME || "Node API", + instrumentations: [ + new HttpInstrumentation({ + ignoreOutgoingRequestHook: (options) => options.port === 4317, + // The default span name is just `HTTP POST` etc. Replace with + // `METHOD /path`. Incoming requests (IncomingMessage) expose + // `originalUrl` (Express) or `url`; outgoing requests + // (ClientRequest) expose `path`. Cover both so neither side + // ends up with a bare `POST` span name. + requestHook: (span, request) => { + if (!("method" in request) || !request.method) { + return; + } + const rawPath = + ("originalUrl" in request && request.originalUrl) || + ("url" in request && request.url) || + ("path" in request && request.path) || + ""; + // Strip query string to keep cardinality bounded. + const path = String(rawPath).split("?")[0]; + if (path) { + span.updateName(`${request.method} ${path}`); + } + }, + }), + new ExpressInstrumentation({ + ignoreLayersType: [ExpressLayerType.MIDDLEWARE], + }), + new GraphQLInstrumentation({ + allowValues: true, + depth: 5, + mergeItems: true, + ignoreTrivialResolveSpans: true, + }), + // Native `fetch` (used by openai SDK and outbound API calls in + // resolvers) goes through undici, which the http instrumentation + // does not patch. 
The shared helper sets `peer.service` so + // Tempo's service_graphs processor renders external dependencies + // as edges in the service map. + createUndiciInstrumentation(), + ], + }); } const sentryDsn = process.env.NODE_API_SENTRY_DSN; @@ -26,4 +81,11 @@ Sentry.init({ (isProdEnv ? "production" : "development"), sendDefaultPii: true, tracesSampleRate: isProdEnv ? 1.0 : 0, + // When OTEL is configured we already have a global TracerProvider + // wired up to OTLP. Tell Sentry to share it instead of registering its + // own — otherwise Sentry hijacks the global provider and the + // OpenTelemetryWorkflowClientInterceptor that injects trace context + // into Temporal workflow headers ends up running on a non-functional + // tracer, breaking caller→workflow→activity correlation. + skipOpenTelemetrySetup: !!otlpEndpoint, }); diff --git a/apps/hash-api/src/integrations/linear/webhook.ts b/apps/hash-api/src/integrations/linear/webhook.ts index dbb709c77e1..c5b39974f73 100644 --- a/apps/hash-api/src/integrations/linear/webhook.ts +++ b/apps/hash-api/src/integrations/linear/webhook.ts @@ -59,7 +59,7 @@ export const linearWebhook: RequestHandler< const payload = JSON.parse(req.body) as LinearWebhookPayload; - const temporalClient = await createTemporalClient(logger); + const temporalClient = await createTemporalClient(); const organizationId = payload.organizationId; diff --git a/apps/hash-external-services/docker-compose.yml b/apps/hash-external-services/docker-compose.yml index 916282b1c98..9da1bf8ba10 100644 --- a/apps/hash-external-services/docker-compose.yml +++ b/apps/hash-external-services/docker-compose.yml @@ -237,6 +237,9 @@ services: POSTGRES_USER: "${HASH_TEMPORAL_PG_USER}" POSTGRES_PWD: "${HASH_TEMPORAL_PG_PASSWORD}" POSTGRES_SEEDS: "postgres" # the hostname of the postgres container + # Expose Prometheus-format metrics on port 8000 so the otel-collector + # can scrape them and forward to Mimir. 
+ PROMETHEUS_ENDPOINT: "0.0.0.0:8000" security_opt: - no-new-privileges:true ports: diff --git a/apps/hash-external-services/opentelemetry-collector/otel-collector-config.yaml b/apps/hash-external-services/opentelemetry-collector/otel-collector-config.yaml index 8805bf18998..36fb21a0efb 100644 --- a/apps/hash-external-services/opentelemetry-collector/otel-collector-config.yaml +++ b/apps/hash-external-services/opentelemetry-collector/otel-collector-config.yaml @@ -5,6 +5,34 @@ receivers: endpoint: 0.0.0.0:4317 http: endpoint: 0.0.0.0:4318 + # Scrape Prometheus-format metrics that the Temporal server cluster + # exposes on its `PROMETHEUS_ENDPOINT` (port 8000) so we get + # server-side counters (workflow_started, persistence latency, + # mutable-state cache, queue depth, …) alongside the worker SDK + # metrics that arrive via OTLP. + # + # Temporal emits unprefixed metric names (`cache_miss`, + # `request_latency`, `activity_end_to_end_latency`, …) that collide + # visually with metrics from every other scrape job. Force a + # `temporal_` prefix on everything coming out of this scrape so + # they're disambiguated in Mimir / Grafana auto-complete. Go runtime + # metrics from the Temporal process are also prefixed — that's + # intentional, those are *Temporal's* runtime, not ours. 
+ prometheus: + config: + scrape_configs: + - job_name: temporal-server + scrape_interval: 30s + static_configs: + - targets: [temporal:8000] + labels: + service.name: temporal-server + metric_relabel_configs: + - source_labels: [__name__] + regex: ^(.+)$ + target_label: __name__ + replacement: temporal_${1} + action: replace processors: batch: @@ -40,7 +68,7 @@ service: processors: [resource/deployment_environment, batch] exporters: [otlp/tempo] metrics: - receivers: [otlp] + receivers: [otlp, prometheus] processors: [batch] exporters: [otlphttp/mimir] logs: diff --git a/apps/hash-integration-worker/package.json b/apps/hash-integration-worker/package.json index b26526c56a3..f09f045a2cf 100644 --- a/apps/hash-integration-worker/package.json +++ b/apps/hash-integration-worker/package.json @@ -28,8 +28,14 @@ "@local/hash-graph-sdk": "workspace:*", "@local/hash-isomorphic-utils": "workspace:*", "@local/status": "workspace:*", + "@opentelemetry/api": "1.9.0", + "@opentelemetry/api-logs": "0.207.0", + "@opentelemetry/instrumentation": "0.207.0", + "@opentelemetry/instrumentation-grpc": "0.207.0", + "@opentelemetry/instrumentation-http": "0.207.0", "@sentry/node": "10.42.0", "@temporalio/activity": "1.12.1", + "@temporalio/interceptors-opentelemetry": "1.12.1", "@temporalio/worker": "1.12.1", "@temporalio/workflow": "1.12.1", "agentkeepalive": "4.6.0", diff --git a/apps/hash-integration-worker/src/instrument.ts b/apps/hash-integration-worker/src/instrument.ts new file mode 100644 index 00000000000..ab629fd02f5 --- /dev/null +++ b/apps/hash-integration-worker/src/instrument.ts @@ -0,0 +1,58 @@ +/** + * OpenTelemetry bootstrap for the integration worker. Imported as the + * very first statement of `main.ts` so the auto-instrumentations can + * patch http and gRPC modules before any other code requires them. 
+ */ +import { + createUndiciInstrumentation, + registerOpenTelemetry, +} from "@local/hash-backend-utils/opentelemetry"; +import { GrpcInstrumentation } from "@opentelemetry/instrumentation-grpc"; +import { HttpInstrumentation } from "@opentelemetry/instrumentation-http"; + +const otlpEndpoint = process.env.HASH_OTLP_ENDPOINT; + +/** + * Setup handles. `undefined` when no `HASH_OTLP_ENDPOINT` is configured — + * `main.ts` checks for that and skips workflow-side OTEL wiring in that + * case. + */ +export const otelSetup = otlpEndpoint + ? registerOpenTelemetry({ + endpoint: otlpEndpoint, + serviceName: process.env.OTEL_SERVICE_NAME ?? "Integration Worker", + instrumentations: [ + new HttpInstrumentation({ + // Don't trace the OTLP exporter's own outgoing requests, otherwise + // every export creates a span that needs to be exported, recursively. + ignoreOutgoingRequestHook: (options) => options.port === 4317, + // Default span name is just `HTTP POST` — replace with + // `METHOD /path` so outbound Linear / Graph calls are + // distinguishable in Tempo. + requestHook: (span, request) => { + if (!("method" in request) || !request.method) { + return; + } + const candidates = [ + "originalUrl" in request ? request.originalUrl : undefined, + "url" in request ? request.url : undefined, + "path" in request ? request.path : undefined, + ]; + const rawPath = candidates.find( + (value): value is string => typeof value === "string", + ); + const path = rawPath?.split("?")[0]; + if (path) { + span.updateName(`${request.method} ${path}`); + } + }, + }), + new GrpcInstrumentation(), + // Native `fetch` (used by Linear SDK / outbound API calls) goes + // through undici, which the http instrumentation does not patch. + // The shared helper sets `peer.service` so Tempo's service_graphs + // processor renders Linear etc. as external-service edges. 
+ createUndiciInstrumentation(), + ], + }) + : undefined; diff --git a/apps/hash-integration-worker/src/main.ts b/apps/hash-integration-worker/src/main.ts index 5b3b16eccbf..53158b04f8e 100644 --- a/apps/hash-integration-worker/src/main.ts +++ b/apps/hash-integration-worker/src/main.ts @@ -1,4 +1,8 @@ -/* eslint-disable import/first */ +/* eslint-disable import/first, import/order, simple-import-sort/imports */ + +// Must be the first import so OTEL auto-instrumentations can patch +// http / grpc / Sentry's own monkey-patches before they apply. +import { otelSetup } from "./instrument.js"; import * as Sentry from "@sentry/node"; @@ -12,6 +16,12 @@ Sentry.init({ process.env.ENVIRONMENT || (process.env.NODE_ENV === "production" ? "production" : "development"), tracesSampleRate: process.env.NODE_ENV === "production" ? 1.0 : 0, + // When OTEL is configured we already have a global TracerProvider + // wired up to OTLP. Tell Sentry to share it instead of registering its + // own, so Sentry's spans flow through our pipeline and inherit the + // active OTEL context (including parent spans extracted from Temporal + // headers by OpenTelemetryActivityInboundInterceptor). 
+ skipOpenTelemetrySetup: !!process.env.HASH_OTLP_ENDPOINT, }); import * as http from "node:http"; @@ -23,10 +33,20 @@ import { createGraphClient } from "@local/hash-backend-utils/create-graph-client import { getRequiredEnv } from "@local/hash-backend-utils/environment"; import { createCommonFlowActivities } from "@local/hash-backend-utils/flows"; import { Logger } from "@local/hash-backend-utils/logger"; +import { OpenTelemetryActivityInboundInterceptor } from "@local/hash-backend-utils/temporal/interceptors/activities/opentelemetry"; import { SentryActivityInboundInterceptor } from "@local/hash-backend-utils/temporal/interceptors/activities/sentry"; import { sentrySinks } from "@local/hash-backend-utils/temporal/sinks/sentry"; +import { createTemporalSdkLogger } from "@local/hash-backend-utils/temporal"; +import { wrapWorkflowSpanExporter } from "@local/hash-backend-utils/temporal/workflow-span-adapter"; import type { WorkflowTypeMap } from "@local/hash-backend-utils/temporal-integration-workflow-types"; -import { defaultSinks, NativeConnection, Worker } from "@temporalio/worker"; +import { makeWorkflowExporter } from "@temporalio/interceptors-opentelemetry"; +import type { ActivityInboundCallsInterceptorFactory } from "@temporalio/worker"; +import { + defaultSinks, + NativeConnection, + Runtime, + Worker, +} from "@temporalio/worker"; import { config } from "dotenv-flow"; import { createFlowActivities } from "./activities/flow-activities.js"; @@ -88,8 +108,31 @@ const workflowOption = () => : { workflowsPath: require.resolve("./workflows") }; async function run() { - // eslint-disable-next-line no-console - console.info("Starting integration worker..."); + logger.info("Starting integration worker..."); + + // Temporal SDK runtime telemetry: emits SDK-internal metrics (worker + // slot utilisation, sticky cache hits, polling latency, activity / + // workflow execution latency) and forwards Rust core logs to OTLP. 
+ // Must be installed before any Temporal Connection / Worker is + // created. Distinct from the per-activity user-code spans the + // interceptors below produce. + if (otelSetup) { + Runtime.install({ + // Pipe Rust-core SDK logs through the same structured logger as + // the rest of the worker, so they end up on the OTLP transport + // alongside app logs instead of bypassing it to stderr. + logger: createTemporalSdkLogger(logger), + telemetryOptions: { + metrics: { + otel: { + url: process.env.HASH_OTLP_ENDPOINT!, + metricsExportInterval: "30s", + }, + }, + logging: { forward: { level: "INFO" } }, + }, + }); + } const graphApiClient = createGraphClient(logger, { host: getRequiredEnv("HASH_GRAPH_HTTP_HOST"), @@ -112,39 +155,81 @@ async function run() { }), namespace: "HASH", taskQueue: "integration", - sinks: { ...defaultSinks(), ...sentrySinks() }, + sinks: { + ...defaultSinks(), + ...sentrySinks(), + ...(otelSetup + ? { + exporter: makeWorkflowExporter( + // Wrap our v2 OTLPTraceExporter so it can ingest the v1- + // shaped `ReadableSpan`s that Temporal's workflow sandbox + // produces. Without this, the `instrumentationLibrary` → + // `instrumentationScope` rename in sdk-trace-base v2 + // causes the OTLP transformer to crash on every workflow + // span export. + wrapWorkflowSpanExporter( + otelSetup.traceExporter, + ) as unknown as Parameters[0], + otelSetup.resource as unknown as Parameters< + typeof makeWorkflowExporter + >[1], + ), + } + : {}), + }, interceptors: { workflowModules: [ require.resolve( "@local/hash-backend-utils/temporal/interceptors/workflows/sentry", ), + ...(otelSetup + ? [ + require.resolve( + "@local/hash-backend-utils/temporal/interceptors/workflows/opentelemetry", + ), + ] + : []), + ], + // OTEL interceptor must run as the OUTER wrapper so it can + // extract the trace context the workflow client injected into + // the activity headers before any other span is created. 
Sentry + // doesn't know the Temporal `_tracer-data` header convention, so + // a Sentry-first ordering produces a parent-less span and breaks + // the caller→workflow→activity trace chain. + activityInbound: [ + ...((otelSetup + ? [(ctx) => new OpenTelemetryActivityInboundInterceptor(ctx)] + : []) satisfies ActivityInboundCallsInterceptorFactory[]), + (ctx) => new SentryActivityInboundInterceptor(ctx), ], - activityInbound: [(ctx) => new SentryActivityInboundInterceptor(ctx)], }, }); const httpServer = createHealthCheckServer(); const port = 4300; httpServer.listen({ host: "0.0.0.0", port }); - // eslint-disable-next-line no-console - console.info(`HTTP server listening on port ${port}`); + logger.info(`HTTP server listening on port ${port}`); await worker.run(); } -process.on("SIGINT", () => { - // eslint-disable-next-line no-console - console.info("Received SIGINT, exiting..."); +const shutdown = async (signal: NodeJS.Signals) => { + logger.info(`Received ${signal}, exiting...`); + // Flush any pending OTLP exports before tearing down the process. + // Without this, the last seconds of telemetry are lost on every + // graceful restart. 
+ try { + await otelSetup?.shutdown(); + } catch (error) { + logger.error("Failed to flush OpenTelemetry", { error }); + } process.exit(1); -}); -process.on("SIGTERM", () => { - // eslint-disable-next-line no-console - console.info("Received SIGTERM, exiting..."); - process.exit(1); -}); +}; + +process.on("SIGINT", (signal) => void shutdown(signal)); +process.on("SIGTERM", (signal) => void shutdown(signal)); -run().catch((err) => { - // eslint-disable-next-line no-console - console.error(err); +run().catch((error: unknown) => { + logger.error("Error running worker", { error }); process.exit(1); }); diff --git a/libs/@local/hash-backend-utils/package.json b/libs/@local/hash-backend-utils/package.json index 54e15527af8..8b3d3309f20 100644 --- a/libs/@local/hash-backend-utils/package.json +++ b/libs/@local/hash-backend-utils/package.json @@ -39,12 +39,24 @@ "@local/status": "workspace:*", "@opentelemetry/api": "1.9.0", "@opentelemetry/api-logs": "0.207.0", + "@opentelemetry/core": "2.2.0", + "@opentelemetry/exporter-logs-otlp-grpc": "0.207.0", + "@opentelemetry/exporter-metrics-otlp-grpc": "0.207.0", + "@opentelemetry/exporter-trace-otlp-grpc": "0.207.0", + "@opentelemetry/instrumentation": "0.207.0", + "@opentelemetry/instrumentation-undici": "0.25.0", + "@opentelemetry/resources": "2.2.0", + "@opentelemetry/sdk-logs": "0.207.0", + "@opentelemetry/sdk-metrics": "2.2.0", + "@opentelemetry/sdk-trace-base": "2.2.0", + "@opentelemetry/sdk-trace-node": "2.2.0", "@sentry/node": "10.42.0", "@smithy/protocol-http": "5.3.3", "@smithy/signature-v4": "5.3.3", "@temporalio/activity": "1.12.1", "@temporalio/client": "1.12.1", "@temporalio/common": "1.12.1", + "@temporalio/interceptors-opentelemetry": "1.12.1", "@temporalio/proto": "1.12.1", "@temporalio/worker": "1.12.1", "@temporalio/workflow": "1.12.1", diff --git a/libs/@local/hash-backend-utils/src/opentelemetry.ts b/libs/@local/hash-backend-utils/src/opentelemetry.ts new file mode 100644 index 00000000000..4f54c726403 --- 
/dev/null +++ b/libs/@local/hash-backend-utils/src/opentelemetry.ts @@ -0,0 +1,218 @@ +/** + * OpenTelemetry registration for HASH Node services. + * + * Registers a global trace, log, and metric provider against an OTLP/gRPC + * collector when `endpoint` is set, plus any caller-supplied auto + * instrumentations (HTTP, Express, gRPC, …). + * + * Returns a teardown function that must run during graceful shutdown so + * pending spans / log records / metric points are flushed before exit. + */ +import { metrics } from "@opentelemetry/api"; +import { logs } from "@opentelemetry/api-logs"; +import { OTLPLogExporter } from "@opentelemetry/exporter-logs-otlp-grpc"; +import { OTLPMetricExporter } from "@opentelemetry/exporter-metrics-otlp-grpc"; +import { OTLPTraceExporter } from "@opentelemetry/exporter-trace-otlp-grpc"; +import type { Instrumentation } from "@opentelemetry/instrumentation"; +import { registerInstrumentations } from "@opentelemetry/instrumentation"; +import { UndiciInstrumentation } from "@opentelemetry/instrumentation-undici"; +import type { Resource } from "@opentelemetry/resources"; +import { + defaultResource, + resourceFromAttributes, +} from "@opentelemetry/resources"; +import { + LoggerProvider, + SimpleLogRecordProcessor, +} from "@opentelemetry/sdk-logs"; +import { + MeterProvider, + PeriodicExportingMetricReader, +} from "@opentelemetry/sdk-metrics"; +import type { SpanExporter } from "@opentelemetry/sdk-trace-base"; +import { SimpleSpanProcessor } from "@opentelemetry/sdk-trace-base"; +import { NodeTracerProvider } from "@opentelemetry/sdk-trace-node"; + +const traceTimeoutMs = 5000; +const metricExportIntervalMs = 30_000; + +export interface RegisterOpenTelemetryOptions { + /** + * OTLP gRPC endpoint, e.g. `http://localhost:4317`. Falsy values disable + * registration entirely (useful for local development without a + * collector running). + */ + endpoint: string | null | undefined; + /** `service.name` resource attribute. 
*/ + serviceName: string; + /** Auto-instrumentations to register (HTTP, gRPC, Express, …). */ + instrumentations?: Instrumentation[]; +} + +export interface OpenTelemetrySetup { + /** Run during graceful shutdown to flush pending spans / logs / metrics. */ + shutdown: () => Promise; + /** + * The trace exporter, exposed so callers can build worker-side sinks + * (e.g. Temporal's `makeWorkflowExporter`) that share this connection. + */ + traceExporter: SpanExporter; + /** The resource used for traces / logs / metrics, shared with sinks. */ + resource: Resource; +} + +/** + * Mapping of outbound `host` → `peer.service` label used by Tempo's + * `service_graphs` processor to render external dependencies as + * separate nodes in the service map. Without this attribute the + * undici-instrumentation spans land under the caller service ("AI + * Worker") with a bare `POST` name and no external-service edge. + * + * Keys are matched as exact host or as suffix (`.openai.com` matches + * `api.openai.com`). + */ +const PEER_SERVICE_BY_HOST: Array<[string, string]> = [ + ["api.openai.com", "OpenAI"], + ["api.anthropic.com", "Anthropic"], + ["api.linear.app", "Linear"], + [".googleapis.com", "Google Cloud"], + ["edge-config.vercel.com", "Vercel Edge Config"], + [".vercel.com", "Vercel"], +]; + +const resolvePeerService = (host: string): string | undefined => { + for (const [match, service] of PEER_SERVICE_BY_HOST) { + if (match.startsWith(".") ? host.endsWith(match) : host === match) { + return service; + } + } + return undefined; +}; + +/** + * `@opentelemetry/instrumentation-undici` instance configured with: + * + * - `peer.service` derived from the outbound host (Tempo's + * `service_graphs` processor turns this into an external-service + * edge in the service map). + * - Span name set to `METHOD host/path` rather than the bare HTTP + * verb the instrumentation defaults to. 
+ */ +export const createUndiciInstrumentation = (): UndiciInstrumentation => + new UndiciInstrumentation({ + startSpanHook: (request) => { + try { + const { host } = new URL(request.origin); + const peerService = resolvePeerService(host); + return peerService ? { "peer.service": peerService } : {}; + } catch { + return {}; + } + }, + requestHook: (span, request) => { + // Default span name is just `POST` — replace with `METHOD path` + // so outbound calls are distinguishable in Tempo. The host is + // covered by `peer.service` (set in `startSpanHook`), so the + // span name only carries the path. Strip query string to keep + // cardinality bounded. + const path = request.path.split("?")[0]; + span.updateName(`${request.method} ${path}`); + }, + }); + +/** + * Last setup returned from `registerOpenTelemetry`. Exposed so callers + * that bootstrap OTEL via a `--import` shim (where the setup handle + * isn't naturally accessible from the main entry point) can still wire + * `shutdown()` into their graceful-shutdown chain. + */ +let activeSetup: OpenTelemetrySetup | undefined; + +/** + * Returns the most recently registered OTEL setup, if any. + */ +export const getActiveOpenTelemetrySetup = (): OpenTelemetrySetup | undefined => + activeSetup; + +/** + * Initialise tracing, logging, and metrics. Returns `undefined` when + * `endpoint` is unset so callers can skip workflow-side sink wiring. + */ +export const registerOpenTelemetry = ({ + endpoint, + serviceName, + instrumentations = [], +}: RegisterOpenTelemetryOptions): OpenTelemetrySetup | undefined => { + if (!endpoint) { + // This runs before any logger is wired up, so direct stderr is the + // right channel — and it's a one-shot bootstrap message, not a hot + // log path. + // eslint-disable-next-line no-console + console.warn( + "No OpenTelemetry Protocol endpoint given. 
Not sending telemetry anywhere.", + ); + return undefined; + } + + const collectorOptions = { + timeoutMillis: traceTimeoutMs, + url: endpoint, + }; + + const resource = defaultResource().merge( + resourceFromAttributes({ "service.name": serviceName }), + ); + + // Tracing + const traceExporter = new OTLPTraceExporter(collectorOptions); + const traceProvider = new NodeTracerProvider({ + resource, + spanProcessors: [new SimpleSpanProcessor(traceExporter)], + }); + traceProvider.register(); + + // Logs + const logExporter = new OTLPLogExporter(collectorOptions); + const logProvider = new LoggerProvider({ + resource, + processors: [new SimpleLogRecordProcessor(logExporter)], + }); + logs.setGlobalLoggerProvider(logProvider); + + // Metrics + const metricExporter = new OTLPMetricExporter(collectorOptions); + const meterProvider = new MeterProvider({ + resource, + readers: [ + new PeriodicExportingMetricReader({ + exporter: metricExporter, + exportIntervalMillis: metricExportIntervalMs, + }), + ], + }); + metrics.setGlobalMeterProvider(meterProvider); + + const unregisterInstrumentations = registerInstrumentations({ + instrumentations, + }); + + // eslint-disable-next-line no-console + console.info( + `Registered OpenTelemetry (traces + logs + metrics) at endpoint ${endpoint} for ${serviceName}`, + ); + + const setup: OpenTelemetrySetup = { + traceExporter, + resource, + shutdown: async () => { + await Promise.allSettled([ + traceProvider.shutdown(), + logProvider.shutdown(), + meterProvider.shutdown(), + ]); + unregisterInstrumentations(); + }, + }; + activeSetup = setup; + return setup; +}; diff --git a/libs/@local/hash-backend-utils/src/temporal.ts b/libs/@local/hash-backend-utils/src/temporal.ts index c8afa23a291..268aabe3339 100644 --- a/libs/@local/hash-backend-utils/src/temporal.ts +++ b/libs/@local/hash-backend-utils/src/temporal.ts @@ -1,4 +1,7 @@ +import { trace } from "@opentelemetry/api"; import { Client as TemporalClient, Connection } from 
"@temporalio/client"; +import { OpenTelemetryWorkflowClientInterceptor } from "@temporalio/interceptors-opentelemetry"; +import { DefaultLogger } from "@temporalio/worker"; import { getRequiredEnv } from "./environment.js"; import type { Logger } from "./logger.js"; @@ -7,7 +10,33 @@ export { Client as TemporalClient } from "@temporalio/client"; export const temporalNamespace = "HASH"; -export const createTemporalClient = async (_logger?: Logger) => { +/** + * Create a Temporal SDK `Logger` that pipes Rust-core SDK logs and + * Node-side worker events through the application's structured Winston + * logger. Without this, `Runtime.install({ telemetryOptions: { logging: + * { forward } } })` forwards Rust-core logs to a default sink that + * writes to `stderr` directly — bypassing OTLP and the JSON-formatted + * console output the rest of the worker uses. + */ +export const createTemporalSdkLogger = (logger: Logger): DefaultLogger => + new DefaultLogger("INFO", ({ level, message, meta }) => { + switch (level) { + case "TRACE": + case "DEBUG": + logger.debug(message, meta); + return; + case "INFO": + logger.info(message, meta); + return; + case "WARN": + logger.warn(message, meta); + return; + case "ERROR": + logger.error(message, meta); + } + }); + +export const createTemporalClient = async () => { const temporalServerHost = getRequiredEnv("HASH_TEMPORAL_SERVER_HOST"); const host = new URL(temporalServerHost).hostname; @@ -18,5 +47,25 @@ export const createTemporalClient = async (_logger?: Logger) => { address: `${host}:${port}`, }); - return new TemporalClient({ connection, namespace: temporalNamespace }); + // When the caller has OTEL configured (instrument.mjs ran and a global + // tracer provider is registered), attach the workflow client + // interceptor so the active trace context (e.g. an Express HTTP span) + // gets propagated into the workflow headers. 
The worker-side + // interceptors then pick it up, and the workflow + activity spans + // chain off the caller's trace. + const interceptors = process.env.HASH_OTLP_ENDPOINT + ? { + workflow: [ + new OpenTelemetryWorkflowClientInterceptor({ + tracer: trace.getTracer("@temporalio/interceptors-opentelemetry"), + }), + ], + } + : undefined; + + return new TemporalClient({ + connection, + namespace: temporalNamespace, + interceptors, + }); }; diff --git a/libs/@local/hash-backend-utils/src/temporal/interceptors/activities/opentelemetry.ts b/libs/@local/hash-backend-utils/src/temporal/interceptors/activities/opentelemetry.ts new file mode 100644 index 00000000000..f1b16d22b55 --- /dev/null +++ b/libs/@local/hash-backend-utils/src/temporal/interceptors/activities/opentelemetry.ts @@ -0,0 +1,16 @@ +/** + * Activity-side OpenTelemetry interceptors. + * + * Re-exports the upstream interceptors so workers reference the + * `@local/hash-backend-utils/...` path consistently with the existing + * Sentry interceptor wiring. + * + * Activity duration / outcome metrics (latency histograms, failed counts) + * are emitted by Temporal's SDK runtime telemetry — see + * `Runtime.install({ telemetryOptions })` in each worker's `main.ts`. + * No custom metrics interceptor is needed. + */ +export { + OpenTelemetryActivityInboundInterceptor, + OpenTelemetryActivityOutboundInterceptor, +} from "@temporalio/interceptors-opentelemetry"; diff --git a/libs/@local/hash-backend-utils/src/temporal/interceptors/workflows/opentelemetry.ts b/libs/@local/hash-backend-utils/src/temporal/interceptors/workflows/opentelemetry.ts new file mode 100644 index 00000000000..02b1c74bafa --- /dev/null +++ b/libs/@local/hash-backend-utils/src/temporal/interceptors/workflows/opentelemetry.ts @@ -0,0 +1,21 @@ +/** + * OpenTelemetry workflow interceptors. 
+ * + * Used as a workflow module via Temporal's `workflowModules` option so the + * upstream `@temporalio/interceptors-opentelemetry` interceptors can run + * inside the workflow sandbox. Pair with the worker-side + * `makeWorkflowExporter` sink so the spans created here are exported to + * the worker's OTEL trace provider. + */ +import { + OpenTelemetryInboundInterceptor, + OpenTelemetryInternalsInterceptor, + OpenTelemetryOutboundInterceptor, +} from "@temporalio/interceptors-opentelemetry"; +import type { WorkflowInterceptors } from "@temporalio/workflow"; + +export const interceptors = (): WorkflowInterceptors => ({ + inbound: [new OpenTelemetryInboundInterceptor()], + outbound: [new OpenTelemetryOutboundInterceptor()], + internals: [new OpenTelemetryInternalsInterceptor()], +}); diff --git a/libs/@local/hash-backend-utils/src/temporal/workflow-span-adapter.ts b/libs/@local/hash-backend-utils/src/temporal/workflow-span-adapter.ts new file mode 100644 index 00000000000..999f2843d30 --- /dev/null +++ b/libs/@local/hash-backend-utils/src/temporal/workflow-span-adapter.ts @@ -0,0 +1,85 @@ +/** + * Bridge between `@temporalio/interceptors-opentelemetry` (which pins + * `@opentelemetry/sdk-trace-base@^1`) and our `@opentelemetry/sdk-trace-base@2` + * stack. Two field renames matter on the v1→v2 boundary: + * + * - `instrumentationLibrary` → `instrumentationScope`: v2's + * `OTLPTraceExporter` crashes inside `createResourceMap` reading + * `span.instrumentationScope.name` on a v1-shaped object. + * - `parentSpanId: string` → `parentSpanContext: SpanContext`: v2's + * OTLP transformer encodes parent linkage from `parentSpanContext.spanId` + * only. v1-shaped spans carry `parentSpanId` but no `parentSpanContext`, + * so without translation the OTLP envelope ships with no parent and + * Tempo renders every workflow/activity span as a root in the trace. 
+ * + * `wrapWorkflowSpanExporter` returns a `SpanExporter` that synthesises + * the v2-shaped fields on each span on its way in. + * + * TODO(BE-520): drop this adapter when + * `@temporalio/interceptors-opentelemetry-v2` (PR + * https://github.com/temporalio/sdk-typescript/pull/1951) is released. + */ +import type { ExportResult } from "@opentelemetry/core"; +import type { ReadableSpan, SpanExporter } from "@opentelemetry/sdk-trace-base"; + +interface LegacyReadableSpan { + instrumentationLibrary?: { + name: string; + version?: string; + schemaUrl?: string; + }; + instrumentationScope?: { name: string; version?: string; schemaUrl?: string }; + parentSpanId?: string; +} + +const normaliseSpan = (span: ReadableSpan): ReadableSpan => { + // The cast loses runtime correctness intentionally — Temporal's + // `extractReadableSpan` produces v1-shaped objects whose + // `instrumentationScope` and `parentSpanContext` are genuinely + // undefined at runtime, even though v2's `ReadableSpan` types them + // as required / present. + const legacy = span as ReadableSpan & LegacyReadableSpan; + + const needsScope = !legacy.instrumentationScope; + const needsParent = legacy.parentSpanId && !legacy.parentSpanContext; + if (!needsScope && !needsParent) { + return span; + } + + // Without an instrumentation identifier the OTLP resource-map logic + // still throws, so synthesise an "unknown" scope rather than letting + // the export crash. + const instrumentationScope = legacy.instrumentationScope ?? + legacy.instrumentationLibrary ?? { name: "unknown" }; + + let parentSpanContext = legacy.parentSpanContext; + if (legacy.parentSpanId && !parentSpanContext) { + const ctx = span.spanContext(); + parentSpanContext = { + traceId: ctx.traceId, + spanId: legacy.parentSpanId, + traceFlags: ctx.traceFlags, + // The parent lives outside this span exporter's sandbox — either + // in the worker's host process (RunWorkflow / RunActivity) or in + // the workflow client (e.g. 
an Express HTTP span). + isRemote: true, + }; + } + + return Object.assign( + Object.create(Object.getPrototypeOf(span) as object), + span, + { instrumentationScope, parentSpanContext }, + ) as ReadableSpan; +}; + +export const wrapWorkflowSpanExporter = ( + inner: SpanExporter, +): SpanExporter => ({ + export(spans, resultCallback): void { + const adapted = spans.map(normaliseSpan); + inner.export(adapted, (result: ExportResult) => resultCallback(result)); + }, + shutdown: () => inner.shutdown(), + forceFlush: () => inner.forceFlush?.() ?? Promise.resolve(), +}); diff --git a/libs/@local/temporal-client/Cargo.toml b/libs/@local/temporal-client/Cargo.toml index 7002f2f5e55..370898d10e7 100644 --- a/libs/@local/temporal-client/Cargo.toml +++ b/libs/@local/temporal-client/Cargo.toml @@ -16,14 +16,17 @@ type-system = { workspace = true, public = true } error-stack = { workspace = true } # Private third-party dependencies -serde = { workspace = true } -serde_json = { workspace = true } -simple-mermaid = { workspace = true } -temporalio-client = { workspace = true } -temporalio-common = { workspace = true } -thiserror = { workspace = true } -url = { workspace = true } -uuid = { workspace = true, features = ["v4"] } +opentelemetry = { workspace = true } +serde = { workspace = true } +serde_json = { workspace = true } +simple-mermaid = { workspace = true } +temporalio-client = { workspace = true } +temporalio-common = { workspace = true } +thiserror = { workspace = true } +tracing = { workspace = true } +tracing-opentelemetry = { workspace = true } +url = { workspace = true } +uuid = { workspace = true, features = ["v4"] } [lints] workspace = true diff --git a/libs/@local/temporal-client/src/ai.rs b/libs/@local/temporal-client/src/ai.rs index 877e4daa296..3e4fbac29d3 100644 --- a/libs/@local/temporal-client/src/ai.rs +++ b/libs/@local/temporal-client/src/ai.rs @@ -1,11 +1,21 @@ use std::collections::HashMap; use error_stack::{Report, ResultExt as _}; +use 
opentelemetry::{global, propagation::Injector}; use serde::Serialize; -use temporalio_client::{WorkflowClientTrait as _, WorkflowOptions}; +use temporalio_client::{NamespacedClient, WorkflowService, tonic::IntoRequest as _}; use temporalio_common::protos::{ - ENCODING_PAYLOAD_KEY, JSON_ENCODING_VAL, temporal::api::common::v1::Payload, + ENCODING_PAYLOAD_KEY, JSON_ENCODING_VAL, + coresdk::IntoPayloadsExt as _, + temporal::api::{ + common::v1::{Header, Payload, WorkflowType}, + enums::v1::TaskQueueKind, + taskqueue::v1::TaskQueue, + workflowservice::v1::StartWorkflowExecutionRequest, + }, }; +use tracing::{Span, instrument}; +use tracing_opentelemetry::OpenTelemetrySpanExt as _; use type_system::{ knowledge::entity::EntityId, ontology::{ @@ -17,6 +27,55 @@ use uuid::Uuid; use crate::{TemporalClient, WorkflowError}; +/// Header key used by `@temporalio/interceptors-opentelemetry` to carry the +/// trace-context payload across workflow boundaries. Must stay in sync with +/// the constant declared in that package's `instrumentation.ts`. +const TRACE_HEADER: &str = "_tracer-data"; + +/// Adapter so `opentelemetry`'s text-map propagator can write into a plain +/// `HashMap` carrier. +struct CarrierWriter<'a>(&'a mut HashMap); + +impl Injector for CarrierWriter<'_> { + fn set(&mut self, key: &str, value: String) { + self.0.insert(key.to_owned(), value); + } +} + +/// Build a Temporal `Header` containing the active OTEL trace context as +/// a JSON-encoded text-map under the `_tracer-data` field. +/// +/// Returns `None` if no propagator wrote anything into the carrier (e.g. +/// no active span, or no propagator registered) — the caller should leave +/// the request `header` field empty in that case rather than send an +/// empty payload. +fn build_otel_header() -> Option
{ + let context = Span::current().context(); + let mut carrier = HashMap::::new(); + global::get_text_map_propagator(|propagator| { + propagator.inject_context(&context, &mut CarrierWriter(&mut carrier)); + }); + if carrier.is_empty() { + return None; + } + + let payload = Payload { + metadata: HashMap::from([( + ENCODING_PAYLOAD_KEY.to_owned(), + JSON_ENCODING_VAL.as_bytes().to_vec(), + )]), + // `HashMap` cannot fail to serialise — fail loud + // rather than silently dropping the trace context (which would + // produce a parent-less workflow span on every start). + data: serde_json::to_vec(&carrier).expect("HashMap serialises"), + ..Default::default() + }; + + Some(Header { + fields: HashMap::from([(TRACE_HEADER.to_owned(), payload)]), + }) +} + #[derive(Serialize)] #[serde(rename_all = "camelCase")] struct AuthenticationContext { @@ -24,31 +83,57 @@ struct AuthenticationContext { } impl TemporalClient { + /// Start a workflow on the `ai` task queue with the active OTEL trace + /// context propagated through the workflow headers. The TypeScript + /// worker's `OpenTelemetryInboundInterceptor` extracts this context so + /// the resulting `RunWorkflow` and `RunActivity` spans chain off the + /// caller's trace. + /// + /// Bypasses the `WorkflowClientTrait::start_workflow` helper because + /// that helper does not expose the proto `header` field. Calls + /// `WorkflowService::start_workflow_execution` directly instead. 
+ #[instrument( + skip(self, payload), + fields(workflow_type = workflow, otel.kind = "producer"), + )] async fn start_ai_workflow( &self, workflow: &'static str, payload: &(impl Serialize + Sync), ) -> Result> { - Ok(self - .client - .start_workflow( - vec![Payload { - metadata: HashMap::from([( - ENCODING_PAYLOAD_KEY.to_owned(), - JSON_ENCODING_VAL.as_bytes().to_vec(), - )]), - data: serde_json::to_vec(payload).change_context(WorkflowError(workflow))?, - external_payloads: Vec::new(), - }], - "ai".to_owned(), - Uuid::new_v4().to_string(), - workflow.to_owned(), - None, - WorkflowOptions::default(), - ) - .await - .change_context(WorkflowError(workflow))? - .run_id) + let mut client = self.client.clone(); + let request = StartWorkflowExecutionRequest { + namespace: <_ as NamespacedClient>::namespace(&client), + input: vec![Payload { + metadata: HashMap::from([( + ENCODING_PAYLOAD_KEY.to_owned(), + JSON_ENCODING_VAL.as_bytes().to_vec(), + )]), + data: serde_json::to_vec(payload).change_context(WorkflowError(workflow))?, + ..Default::default() + }] + .into_payloads(), + workflow_id: Uuid::new_v4().to_string(), + workflow_type: Some(WorkflowType { + name: workflow.to_owned(), + }), + task_queue: Some(TaskQueue { + name: "ai".to_owned(), + kind: TaskQueueKind::Unspecified as i32, + normal_name: String::new(), + }), + request_id: Uuid::new_v4().to_string(), + header: build_otel_header(), + ..Default::default() + }; + + let response = + WorkflowService::start_workflow_execution(&mut client, request.into_request()) + .await + .change_context(WorkflowError(workflow))? + .into_inner(); + + Ok(response.run_id) } /// Starts a workflow to update the embeddings for the provided data type. @@ -138,10 +223,7 @@ impl TemporalClient { .await } - /// Starts a workflow to update the embeddings for the provided entities. 
- /// - /// The `embedding_exclusions` parameter specifies which properties should be excluded - /// from embedding generation for specific entity types (e.g., email for User entities). + /// Starts workflows to update the embeddings for the provided entities. /// /// Returns the run IDs of the workflows. /// diff --git a/yarn.lock b/yarn.lock index 65886557dba..5ec7eab46f5 100644 --- a/yarn.lock +++ b/yarn.lock @@ -352,9 +352,15 @@ __metadata: "@local/hash-isomorphic-utils": "workspace:*" "@local/status": "workspace:*" "@local/tsconfig": "workspace:*" + "@opentelemetry/api": "npm:1.9.0" + "@opentelemetry/api-logs": "npm:0.207.0" + "@opentelemetry/instrumentation": "npm:0.207.0" + "@opentelemetry/instrumentation-grpc": "npm:0.207.0" + "@opentelemetry/instrumentation-http": "npm:0.207.0" "@sentry/node": "npm:10.42.0" "@temporalio/activity": "npm:1.12.1" "@temporalio/common": "npm:1.12.1" + "@temporalio/interceptors-opentelemetry": "npm:1.12.1" "@temporalio/proto": "npm:1.12.1" "@temporalio/worker": "npm:1.12.1" "@temporalio/workflow": "npm:1.12.1" @@ -700,8 +706,14 @@ __metadata: "@local/hash-isomorphic-utils": "workspace:*" "@local/status": "workspace:*" "@local/tsconfig": "workspace:*" + "@opentelemetry/api": "npm:1.9.0" + "@opentelemetry/api-logs": "npm:0.207.0" + "@opentelemetry/instrumentation": "npm:0.207.0" + "@opentelemetry/instrumentation-grpc": "npm:0.207.0" + "@opentelemetry/instrumentation-http": "npm:0.207.0" "@sentry/node": "npm:10.42.0" "@temporalio/activity": "npm:1.12.1" + "@temporalio/interceptors-opentelemetry": "npm:1.12.1" "@temporalio/worker": "npm:1.12.1" "@temporalio/workflow": "npm:1.12.1" "@types/dotenv-flow": "npm:3.3.3" @@ -9202,12 +9214,24 @@ __metadata: "@local/tsconfig": "workspace:*" "@opentelemetry/api": "npm:1.9.0" "@opentelemetry/api-logs": "npm:0.207.0" + "@opentelemetry/core": "npm:2.2.0" + "@opentelemetry/exporter-logs-otlp-grpc": "npm:0.207.0" + "@opentelemetry/exporter-metrics-otlp-grpc": "npm:0.207.0" + 
"@opentelemetry/exporter-trace-otlp-grpc": "npm:0.207.0" + "@opentelemetry/instrumentation": "npm:0.207.0" + "@opentelemetry/instrumentation-undici": "npm:0.25.0" + "@opentelemetry/resources": "npm:2.2.0" + "@opentelemetry/sdk-logs": "npm:0.207.0" + "@opentelemetry/sdk-metrics": "npm:2.2.0" + "@opentelemetry/sdk-trace-base": "npm:2.2.0" + "@opentelemetry/sdk-trace-node": "npm:2.2.0" "@sentry/node": "npm:10.42.0" "@smithy/protocol-http": "npm:5.3.3" "@smithy/signature-v4": "npm:5.3.3" "@temporalio/activity": "npm:1.12.1" "@temporalio/client": "npm:1.12.1" "@temporalio/common": "npm:1.12.1" + "@temporalio/interceptors-opentelemetry": "npm:1.12.1" "@temporalio/proto": "npm:1.12.1" "@temporalio/worker": "npm:1.12.1" "@temporalio/workflow": "npm:1.12.1" @@ -10437,6 +10461,15 @@ __metadata: languageName: node linkType: hard +"@opentelemetry/api-logs@npm:0.215.0": + version: 0.215.0 + resolution: "@opentelemetry/api-logs@npm:0.215.0" + dependencies: + "@opentelemetry/api": "npm:^1.3.0" + checksum: 10c0/9ec6a46c064f71d01c21b45d1e90183efb2476f41ef9ac450bc95077c618cc7fe95bdb65fcd6f134d634e1950a1fe97678e6efe1a8382a11ebae6d8e960330e0 + languageName: node + linkType: hard + "@opentelemetry/api-logs@npm:0.41.2": version: 0.41.2 resolution: "@opentelemetry/api-logs@npm:0.41.2" @@ -10455,13 +10488,20 @@ __metadata: languageName: node linkType: hard -"@opentelemetry/api@npm:1.9.0, @opentelemetry/api@npm:^1.0.0, @opentelemetry/api@npm:^1.3.0, @opentelemetry/api@npm:^1.4.1, @opentelemetry/api@npm:^1.9.0": +"@opentelemetry/api@npm:1.9.0": version: 1.9.0 resolution: "@opentelemetry/api@npm:1.9.0" checksum: 10c0/9aae2fe6e8a3a3eeb6c1fdef78e1939cf05a0f37f8a4fae4d6bf2e09eb1e06f966ece85805626e01ba5fab48072b94f19b835449e58b6d26720ee19a58298add languageName: node linkType: hard +"@opentelemetry/api@npm:^1.0.0, @opentelemetry/api@npm:^1.3.0, @opentelemetry/api@npm:^1.4.1, @opentelemetry/api@npm:^1.7.0, @opentelemetry/api@npm:^1.9.0": + version: 1.9.1 + resolution: 
"@opentelemetry/api@npm:1.9.1" + checksum: 10c0/c608485fc8b5a91e1f7e05e843b45b509307456b31cd2ad365933d90813e40ebfedf179f1451c762037e82d7c76aa8500e95d2da3609f640a1206cde5322cd14 + languageName: node + linkType: hard + "@opentelemetry/context-async-hooks@npm:2.2.0": version: 2.2.0 resolution: "@opentelemetry/context-async-hooks@npm:2.2.0" @@ -10511,7 +10551,7 @@ __metadata: languageName: node linkType: hard -"@opentelemetry/core@npm:1.30.1": +"@opentelemetry/core@npm:1.30.1, @opentelemetry/core@npm:^1.19.0": version: 1.30.1 resolution: "@opentelemetry/core@npm:1.30.1" dependencies: @@ -11053,6 +11093,18 @@ __metadata: languageName: node linkType: hard +"@opentelemetry/instrumentation-grpc@npm:0.207.0": + version: 0.207.0 + resolution: "@opentelemetry/instrumentation-grpc@npm:0.207.0" + dependencies: + "@opentelemetry/instrumentation": "npm:0.207.0" + "@opentelemetry/semantic-conventions": "npm:^1.29.0" + peerDependencies: + "@opentelemetry/api": ^1.3.0 + checksum: 10c0/d3dc63193ea941836fbd60dc6d637e06b9ba9a5a2806e1d28117cf6ee6a4d2bf942903ad572c32869f1d4cd06a669f76c7848ce3af32c697c940dbe324f0d3da + languageName: node + linkType: hard + "@opentelemetry/instrumentation-hapi@npm:0.55.0": version: 0.55.0 resolution: "@opentelemetry/instrumentation-hapi@npm:0.55.0" @@ -11450,6 +11502,19 @@ __metadata: languageName: node linkType: hard +"@opentelemetry/instrumentation-undici@npm:0.25.0": + version: 0.25.0 + resolution: "@opentelemetry/instrumentation-undici@npm:0.25.0" + dependencies: + "@opentelemetry/core": "npm:^2.0.0" + "@opentelemetry/instrumentation": "npm:^0.215.0" + "@opentelemetry/semantic-conventions": "npm:^1.24.0" + peerDependencies: + "@opentelemetry/api": ^1.7.0 + checksum: 10c0/03e6133149a33801655dcc9fc3b0459b50581f6379e811bbfd43ff98bcca1e956c5acfb1fdd5ade9b2184c108e06fd1ccc2509b8703d16c290d2e203dacc9de8 + languageName: node + linkType: hard + "@opentelemetry/instrumentation@npm:0.207.0, @opentelemetry/instrumentation@npm:^0.207.0": version: 0.207.0 
resolution: "@opentelemetry/instrumentation@npm:0.207.0" @@ -11476,7 +11541,7 @@ __metadata: languageName: node linkType: hard -"@opentelemetry/instrumentation@npm:0.211.0, @opentelemetry/instrumentation@npm:>=0.52.0 <1, @opentelemetry/instrumentation@npm:^0.211.0": +"@opentelemetry/instrumentation@npm:0.211.0, @opentelemetry/instrumentation@npm:^0.211.0": version: 0.211.0 resolution: "@opentelemetry/instrumentation@npm:0.211.0" dependencies: @@ -11489,6 +11554,19 @@ __metadata: languageName: node linkType: hard +"@opentelemetry/instrumentation@npm:>=0.52.0 <1, @opentelemetry/instrumentation@npm:^0.215.0": + version: 0.215.0 + resolution: "@opentelemetry/instrumentation@npm:0.215.0" + dependencies: + "@opentelemetry/api-logs": "npm:0.215.0" + import-in-the-middle: "npm:^3.0.0" + require-in-the-middle: "npm:^8.0.0" + peerDependencies: + "@opentelemetry/api": ^1.3.0 + checksum: 10c0/d9b3f2f927192a866d4c44b80bdebfea54c958660242bba7c61889357c6e51ab8fda54a711d41b7e93bf5fe41e8c7a5384a90a5262467667de997387c3e54924 + languageName: node + linkType: hard + "@opentelemetry/otlp-exporter-base@npm:0.205.0": version: 0.205.0 resolution: "@opentelemetry/otlp-exporter-base@npm:0.205.0" @@ -11696,7 +11774,7 @@ __metadata: languageName: node linkType: hard -"@opentelemetry/resources@npm:1.30.1, @opentelemetry/resources@npm:^1.15.2": +"@opentelemetry/resources@npm:1.30.1, @opentelemetry/resources@npm:^1.15.2, @opentelemetry/resources@npm:^1.19.0": version: 1.30.1 resolution: "@opentelemetry/resources@npm:1.30.1" dependencies: @@ -11916,7 +11994,7 @@ __metadata: languageName: node linkType: hard -"@opentelemetry/sdk-trace-base@npm:1.30.1, @opentelemetry/sdk-trace-base@npm:^1.15.2": +"@opentelemetry/sdk-trace-base@npm:1.30.1, @opentelemetry/sdk-trace-base@npm:^1.15.2, @opentelemetry/sdk-trace-base@npm:^1.19.0": version: 1.30.1 resolution: "@opentelemetry/sdk-trace-base@npm:1.30.1" dependencies: @@ -17740,6 +17818,24 @@ __metadata: languageName: node linkType: hard 
+"@temporalio/interceptors-opentelemetry@npm:1.12.1": + version: 1.12.1 + resolution: "@temporalio/interceptors-opentelemetry@npm:1.12.1" + dependencies: + "@opentelemetry/api": "npm:^1.7.0" + "@opentelemetry/core": "npm:^1.19.0" + "@opentelemetry/resources": "npm:^1.19.0" + "@opentelemetry/sdk-trace-base": "npm:^1.19.0" + peerDependencies: + "@temporalio/activity": 1.12.1 + "@temporalio/client": 1.12.1 + "@temporalio/common": 1.12.1 + "@temporalio/worker": 1.12.1 + "@temporalio/workflow": 1.12.1 + checksum: 10c0/e933f0580f974b1f44203a4df23d9eed6bbf165b1a9a957b981bbe911ed092bfb183c447873468bf3b858e3f31ac84baaebf944afbd1332cf0c95b35a6c008c4 + languageName: node + linkType: hard + "@temporalio/proto@npm:1.12.1": version: 1.12.1 resolution: "@temporalio/proto@npm:1.12.1" @@ -31482,6 +31578,18 @@ __metadata: languageName: node linkType: hard +"import-in-the-middle@npm:^3.0.0": + version: 3.0.1 + resolution: "import-in-the-middle@npm:3.0.1" + dependencies: + acorn: "npm:^8.15.0" + acorn-import-attributes: "npm:^1.9.5" + cjs-module-lexer: "npm:^2.2.0" + module-details-from-path: "npm:^1.0.4" + checksum: 10c0/afd314edb76764ff53d624e2868f5489a9fb81d22745da3db88121a962f422390b59be86fc5cb1158ed672025ece3b3f8a4943e5dde116bae3dac9f0ff5aff86 + languageName: node + linkType: hard + "import-lazy@npm:~4.0.0": version: 4.0.0 resolution: "import-lazy@npm:4.0.0" From d86b329a37d6bbcff9a4c228cae5c5a3e73d1e74 Mon Sep 17 00:00:00 2001 From: Tim Diekmann Date: Wed, 29 Apr 2026 12:06:10 +0200 Subject: [PATCH 02/17] BE-519: Address review findings, add cleanup helpers and unit tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two rounds of multi-agent review surfaced 5 Critical and 8 Important items, plus a Critical regression introduced by the first cleanup pass. This commit addresses all of them. 
Production telemetry fixes: - Bundle scripts now register the OTEL workflow interceptor as a workflow-bundle module; without this, prod workers shipped no workflow spans because `Worker.create.interceptors.workflowModules` is ignored when a prebuilt bundle is loaded. - `BatchSpanProcessor` / `BatchLogRecordProcessor` replace the Simple variants so OTLP exports stay off the request path. - `registerOpenTelemetry` shutdown surfaces `Promise.allSettled` rejections to stderr and applies a 2-second per-provider timeout. - `instrument.mjs` (and the worker shims) wrap the bootstrap in try/catch so a misconfigured collector falls back to no telemetry rather than crashing the process before the logger is wired. Worker shutdown: - Workers now call `worker.shutdown()` and await `worker.run()` from the SIGTERM handler so in-flight activities drain cleanly. The previous draft called `process.exit` before the drain completed. - Exit code is 0 on graceful signals, 1 only on actual failure. - `OpenTelemetryActivityOutboundInterceptor` is wired so log lines carry `trace_id` / `span_id` for Loki ↔ Tempo correlation. Refactor: - New `runWorker(opts)` helper in `@local/hash-backend-utils/temporal/worker-bootstrap` collapses the previously-duplicated bootstrap logic in both worker `main.ts` files. Sentry init stays per-worker (ESM ordering). - `WorkflowSource` discriminated union (`{ kind: "bundle" | "path" }`) replaces `Partial` for workflow-source config, and `ExtraWorkerOptions = Omit` excludes helper-owned keys from the escape hatch. - `makeV2WorkflowSink(setup)` quarantines the `as unknown as Parameters<...>` casts to a single helper, ready to drop with BE-520. - `httpRequestSpanNameHook` is shared across the three Node bootstraps. - The `getActiveOpenTelemetrySetup` singleton is gone; `instrument.mjs` exports `otelSetup` directly. 
- Rust `build_otel_header` emits a once-per-process warning when the trace-context carrier is empty, surfacing missing-propagator configuration instead of silently producing parent-less workflows. Tests (23 unit tests covering the v1↔v2 normalisation, peer.service resolution, and HTTP hook behaviour): - `wrapWorkflowSpanExporter` / `normaliseSpan` against v1-shaped spans, mixed batches, the existing-`parentSpanContext` precedence case, the `parentSpanId === ""` edge case, and result-callback propagation. - `resolvePeerService` exact / suffix matching and lookalike-domain rejection. - `httpRequestSpanNameHook` for incoming, outgoing, Express-wrapped, query-stripped, and missing-method paths. The drain semantics are verified manually — `TestWorkflowEnvironment` integration tests would be a follow-up. --- .../scripts/bundle-workflow-code.ts | 8 + apps/hash-ai-worker-ts/src/instrument.ts | 61 ++-- apps/hash-ai-worker-ts/src/main.ts | 244 +++------------ apps/hash-api/src/index.ts | 6 +- apps/hash-api/src/instrument.mjs | 141 +++++---- .../scripts/bundle-workflow-code.ts | 8 + .../hash-integration-worker/src/instrument.ts | 57 ++-- apps/hash-integration-worker/src/main.ts | 189 ++--------- libs/@local/hash-backend-utils/package.json | 1 + .../src/opentelemetry.test.ts | 133 ++++++++ .../hash-backend-utils/src/opentelemetry.ts | 175 +++++++---- .../@local/hash-backend-utils/src/temporal.ts | 28 +- .../src/temporal/worker-bootstrap.ts | 293 ++++++++++++++++++ .../temporal/workflow-span-adapter.test.ts | 265 ++++++++++++++++ .../src/temporal/workflow-span-adapter.ts | 72 ++++- libs/@local/temporal-client/src/ai.rs | 37 ++- yarn.lock | 1 + 17 files changed, 1133 insertions(+), 586 deletions(-) create mode 100644 libs/@local/hash-backend-utils/src/opentelemetry.test.ts create mode 100644 libs/@local/hash-backend-utils/src/temporal/worker-bootstrap.ts create mode 100644 libs/@local/hash-backend-utils/src/temporal/workflow-span-adapter.test.ts diff --git 
a/apps/hash-ai-worker-ts/scripts/bundle-workflow-code.ts b/apps/hash-ai-worker-ts/scripts/bundle-workflow-code.ts index 9c37039248c..9cf2a65542c 100644 --- a/apps/hash-ai-worker-ts/scripts/bundle-workflow-code.ts +++ b/apps/hash-ai-worker-ts/scripts/bundle-workflow-code.ts @@ -17,6 +17,14 @@ async function bundle() { require.resolve( "@local/hash-backend-utils/temporal/interceptors/workflows/sentry", ), + // OTEL workflow interceptor must be in the bundle: when the + // worker boots with `workflowBundle`, the `interceptors.workflowModules` + // option on `Worker.create` is ignored. The interceptor is a no-op + // when no global TracerProvider is registered, so it's safe to + // include unconditionally. + require.resolve( + "@local/hash-backend-utils/temporal/interceptors/workflows/opentelemetry", + ), ], }); const codePath = path.join(__dirname, "../dist/workflow-bundle.js"); diff --git a/apps/hash-ai-worker-ts/src/instrument.ts b/apps/hash-ai-worker-ts/src/instrument.ts index 3988d8ea9a0..72e866aa325 100644 --- a/apps/hash-ai-worker-ts/src/instrument.ts +++ b/apps/hash-ai-worker-ts/src/instrument.ts @@ -5,55 +5,50 @@ */ import { createUndiciInstrumentation, + httpRequestSpanNameHook, registerOpenTelemetry, } from "@local/hash-backend-utils/opentelemetry"; import { GrpcInstrumentation } from "@opentelemetry/instrumentation-grpc"; import { HttpInstrumentation } from "@opentelemetry/instrumentation-http"; -const otlpEndpoint = process.env.HASH_OTLP_ENDPOINT; - /** - * Setup handles. `undefined` when no `HASH_OTLP_ENDPOINT` is configured — - * `main.ts` checks for that and skips workflow-side OTEL wiring in that - * case. + * Setup handles. `undefined` when no `HASH_OTLP_ENDPOINT` is configured + * (no collector) or when bootstrap throws. */ -export const otelSetup = otlpEndpoint - ? 
registerOpenTelemetry({ +export const otelSetup: ReturnType = (() => { + const otlpEndpoint = process.env.HASH_OTLP_ENDPOINT; + if (!otlpEndpoint) { + return undefined; + } + try { + return registerOpenTelemetry({ endpoint: otlpEndpoint, serviceName: process.env.OTEL_SERVICE_NAME ?? "AI Worker", instrumentations: [ new HttpInstrumentation({ - // Don't trace the OTLP exporter's own outgoing requests, otherwise - // every export creates a span that needs to be exported, recursively. + // Don't trace the OTLP exporter's own outgoing requests — every + // export would create a span that needs to be exported, recursively. ignoreOutgoingRequestHook: (options) => options.port === 4317, - // Default span name is just `HTTP POST` — replace with - // `METHOD /path` so outbound LLM / Graph calls are - // distinguishable in Tempo. - requestHook: (span, request) => { - if (!("method" in request) || !request.method) { - return; - } - const candidates = [ - "originalUrl" in request ? request.originalUrl : undefined, - "url" in request ? request.url : undefined, - "path" in request ? request.path : undefined, - ]; - const rawPath = candidates.find( - (value): value is string => typeof value === "string", - ); - const path = rawPath?.split("?")[0]; - if (path) { - span.updateName(`${request.method} ${path}`); - } - }, + requestHook: httpRequestSpanNameHook, }), new GrpcInstrumentation(), // Native `fetch` (used by openai / @anthropic-ai/sdk / Vertex AI // SDKs) goes through undici, which the http instrumentation does // not patch. The shared helper sets `peer.service` so Tempo's - // service_graphs processor renders OpenAI / Anthropic / Vertex - // as external-service edges in the service map. + // service_graphs processor renders external dependencies as + // edges in the service map. createUndiciInstrumentation(), ], - }) - : undefined; + }); + } catch (error) { + // Bootstrap runs before any logger is wired up, so direct stderr is + // the right channel. 
Don't rethrow — the worker should still start + // without telemetry. + // eslint-disable-next-line no-console + console.error( + "OpenTelemetry bootstrap failed; AI worker will start without telemetry.", + error, + ); + return undefined; + } +})(); diff --git a/apps/hash-ai-worker-ts/src/main.ts b/apps/hash-ai-worker-ts/src/main.ts index c010ae289af..1f93a495494 100644 --- a/apps/hash-ai-worker-ts/src/main.ts +++ b/apps/hash-ai-worker-ts/src/main.ts @@ -16,15 +16,16 @@ Sentry.init({ process.env.ENVIRONMENT || (process.env.NODE_ENV === "production" ? "production" : "development"), tracesSampleRate: process.env.NODE_ENV === "production" ? 1.0 : 0, - // When OTEL is configured we already have a global TracerProvider - // wired up to OTLP. Tell Sentry to share it instead of registering its - // own, so Sentry's spans flow through our pipeline and inherit the - // active OTEL context (including parent spans extracted from Temporal - // headers by OpenTelemetryActivityInboundInterceptor). - skipOpenTelemetrySetup: !!process.env.HASH_OTLP_ENDPOINT, + // Sentry registers its own global `NodeTracerProvider` by default. + // Letting it run after `registerOpenTelemetry` would replace the + // provider that the OTEL workflow client interceptor (set up in + // `createTemporalClient`) holds via `trace.getTracer(...)`, breaking + // caller → workflow → activity context propagation. With this flag + // Sentry shares our provider so its spans flow through the same + // OTLP pipeline. 
+ skipOpenTelemetrySetup: !!otelSetup, }); -import * as http from "node:http"; import { createRequire } from "node:module"; import path from "node:path"; import { fileURLToPath } from "node:url"; @@ -32,23 +33,9 @@ import { fileURLToPath } from "node:url"; import { createGraphClient } from "@local/hash-backend-utils/create-graph-client"; import { getRequiredEnv } from "@local/hash-backend-utils/environment"; import { createCommonFlowActivities } from "@local/hash-backend-utils/flows"; -import { OpenTelemetryActivityInboundInterceptor } from "@local/hash-backend-utils/temporal/interceptors/activities/opentelemetry"; -import { SentryActivityInboundInterceptor } from "@local/hash-backend-utils/temporal/interceptors/activities/sentry"; -import { sentrySinks } from "@local/hash-backend-utils/temporal/sinks/sentry"; -import { createTemporalSdkLogger } from "@local/hash-backend-utils/temporal"; -import { wrapWorkflowSpanExporter } from "@local/hash-backend-utils/temporal/workflow-span-adapter"; +import type { WorkflowSource } from "@local/hash-backend-utils/temporal/worker-bootstrap"; +import { runWorker } from "@local/hash-backend-utils/temporal/worker-bootstrap"; import { createVaultClient } from "@local/hash-backend-utils/vault"; -import { makeWorkflowExporter } from "@temporalio/interceptors-opentelemetry"; -import type { - ActivityInboundCallsInterceptorFactory, - WorkerOptions, -} from "@temporalio/worker"; -import { - defaultSinks, - NativeConnection, - Runtime, - Worker, -} from "@temporalio/worker"; import { config } from "dotenv-flow"; import { TsconfigPathsPlugin } from "tsconfig-paths-webpack-plugin"; @@ -65,205 +52,76 @@ export const monorepoRootDir = path.resolve(__dirname, "../../.."); config({ silent: true, path: monorepoRootDir }); -const TEMPORAL_HOST = new URL( - process.env.HASH_TEMPORAL_SERVER_HOST ?? "http://localhost", -).hostname; -const TEMPORAL_PORT = process.env.HASH_TEMPORAL_SERVER_PORT - ? 
parseInt(process.env.HASH_TEMPORAL_SERVER_PORT, 10) - : 7233; - -const createHealthCheckServer = () => { - const server = http.createServer((req, res) => { - if (req.method === "GET" && req.url === "/health") { - res.setHeader("Content-Type", "application/json"); - res.writeHead(200); - res.end( - JSON.stringify({ - msg: "worker healthy", - }), - ); - return; - } - res.writeHead(404); - res.end(""); - }); - - return server; -}; - -const workflowOptions: Partial = +const workflowSource: WorkflowSource = process.env.NODE_ENV === "production" ? { - workflowBundle: { - codePath: require.resolve("../dist/workflow-bundle.js"), - }, + kind: "bundle", + bundle: { codePath: require.resolve("../dist/workflow-bundle.js") }, } : { + kind: "path", + workflowsPath: require.resolve("./workflows"), bundlerOptions: { - webpackConfigHook: (webpackConfig) => { - return { - ...webpackConfig, - resolve: { - ...webpackConfig.resolve, - plugins: [ - ...((webpackConfig.plugins as [] | undefined) ?? []), - /** - * Because we run TypeScript directly in development, we need to use the 'paths' in the base tsconfig.json - * This tells TypeScript where to resolve the imports from, overwriting the 'exports' in local dependencies' package.jsons, - * which refer to the transpiled JavaScript code. This plugin converts the 'paths' to webpack 'alias'. - */ - new TsconfigPathsPlugin({ - configFile: - "../../libs/@local/tsconfig/legacy-base-tsconfig-to-refactor.json", - }), - ], - }, - }; - }, + webpackConfigHook: (webpackConfig) => ({ + ...webpackConfig, + resolve: { + ...webpackConfig.resolve, + plugins: [ + ...((webpackConfig.plugins as [] | undefined) ?? []), + /** + * We run TypeScript directly in development, so the 'paths' in + * the base tsconfig.json need to be honoured to override the + * 'exports' in local dependencies' package.jsons (which point + * at transpiled JavaScript). This plugin converts the 'paths' + * to webpack 'alias'. 
+ */ + new TsconfigPathsPlugin({ + configFile: + "../../libs/@local/tsconfig/legacy-base-tsconfig-to-refactor.json", + }), + ], + }, + }), }, - workflowsPath: require.resolve("./workflows"), }; async function run() { - logger.info("Starting AI worker..."); - - // Temporal SDK runtime telemetry: emits SDK-internal metrics (worker - // slot utilisation, sticky cache hits, polling latency, activity / - // workflow execution latency) and forwards Rust core logs to OTLP. - // Must be installed before any Temporal Connection / Worker is - // created. Distinct from the per-activity user-code spans the - // interceptors below produce. - if (otelSetup) { - Runtime.install({ - logger: createTemporalSdkLogger(logger), - telemetryOptions: { - metrics: { - otel: { - url: process.env.HASH_OTLP_ENDPOINT!, - metricsExportInterval: "30s", - }, - }, - logging: { forward: { level: "INFO" } }, - }, - }); - } - const graphApiClient = createGraphClient(logger, { host: getRequiredEnv("HASH_GRAPH_HTTP_HOST"), port: parseInt(getRequiredEnv("HASH_GRAPH_HTTP_PORT"), 10), }); - logger.info("Created Graph client"); const vaultClient = await createVaultClient({ logger }); - if (!vaultClient) { throw new Error("Failed to create Vault client, check preceding logs."); } - logger.info("Created Vault client"); - const connection = await NativeConnection.connect({ - address: `${TEMPORAL_HOST}:${TEMPORAL_PORT}`, - }); - logger.info("Created Temporal connection"); - - const worker = await Worker.create({ - ...workflowOptions, + await runWorker({ + serviceName: "AI worker", + taskQueue: "ai", + healthCheckPort: 4100, activities: { - ...createAiActivities({ - graphApiClient, - }), - ...createGraphActivities({ - graphApiClient, - }), + ...createAiActivities({ graphApiClient }), + ...createGraphActivities({ graphApiClient }), ...createFlowActivities({ vaultClient }), ...createCommonFlowActivities({ graphApiClient }), }, - connection, - /** - * The maximum time that may elapse between heartbeats being 
processed by the server. - * The default maxHeartbeatThrottleInterval is 60s. - * Throttling is also capped at 80% of the heartbeatTimeout set when proxying an activity. - */ - maxHeartbeatThrottleInterval: "10 seconds", - namespace: "HASH", - taskQueue: "ai", - sinks: { - ...defaultSinks(), - ...sentrySinks(), - ...(otelSetup - ? { - exporter: makeWorkflowExporter( - // Wrap our v2 OTLPTraceExporter so it can ingest the v1- - // shaped `ReadableSpan`s that Temporal's workflow sandbox - // produces. Without this, the `instrumentationLibrary` → - // `instrumentationScope` rename in sdk-trace-base v2 - // causes the OTLP transformer to crash on every workflow - // span export. - wrapWorkflowSpanExporter( - otelSetup.traceExporter, - ) as unknown as Parameters[0], - otelSetup.resource as unknown as Parameters< - typeof makeWorkflowExporter - >[1], - ), - } - : {}), - }, - interceptors: { - workflowModules: [ - require.resolve( - "@local/hash-backend-utils/temporal/interceptors/workflows/sentry", - ), - ...(otelSetup - ? [ - require.resolve( - "@local/hash-backend-utils/temporal/interceptors/workflows/opentelemetry", - ), - ] - : []), - ], - // OTEL interceptor must run as the OUTER wrapper so it can - // extract the trace context the workflow client injected into - // the activity headers before any other span is created. Sentry - // doesn't know the Temporal `_tracer-data` header convention, so - // a Sentry-first ordering produces a parent-less span and breaks - // the caller→workflow→activity trace chain. - activityInbound: [ - ...((otelSetup - ? [(ctx) => new OpenTelemetryActivityInboundInterceptor(ctx)] - : []) satisfies ActivityInboundCallsInterceptorFactory[]), - (ctx) => new SentryActivityInboundInterceptor(ctx), - ], + workflowSource, + workerOptions: { + /** + * Maximum interval between heartbeats being processed by the server. 
+ * Default `maxHeartbeatThrottleInterval` is 60s; throttling is also + * capped at 80% of `heartbeatTimeout` set when proxying an activity. + */ + maxHeartbeatThrottleInterval: "10 seconds", }, + otelSetup, + logger, }); - - const httpServer = createHealthCheckServer(); - const port = 4100; - httpServer.listen({ host: "0.0.0.0", port }); - - logger.info(`HTTP server listening on port ${port}`); - - await worker.run(); } -const shutdown = async (signal: NodeJS.Signals) => { - logger.info(`Received ${signal}, exiting...`); - // Flush any pending OTLP exports before tearing down the process. - // Without this, the last seconds of telemetry are lost on every - // graceful restart. - try { - await otelSetup?.shutdown(); - } catch (error) { - logger.error("Failed to flush OpenTelemetry", { error }); - } - process.exit(1); -}; - -process.on("SIGINT", (signal) => void shutdown(signal)); -process.on("SIGTERM", (signal) => void shutdown(signal)); - run().catch((error: unknown) => { logger.error("Error running worker", { error }); process.exit(1); diff --git a/apps/hash-api/src/index.ts b/apps/hash-api/src/index.ts index 6abfa0e6cc0..6646a15118f 100644 --- a/apps/hash-api/src/index.ts +++ b/apps/hash-api/src/index.ts @@ -12,7 +12,6 @@ import { realtimeSyncEnabled, waitOnResource, } from "@local/hash-backend-utils/environment"; -import { getActiveOpenTelemetrySetup } from "@local/hash-backend-utils/opentelemetry"; import { createRedisClient } from "@local/hash-backend-utils/redis"; import { GracefulShutdown } from "@local/hash-backend-utils/shutdown"; import { createTemporalClient } from "@local/hash-backend-utils/temporal"; @@ -67,6 +66,7 @@ import { createEmailTransporter } from "./email/create-email-transporter"; import { ensureSystemGraphIsInitialized } from "./graph/ensure-system-graph-is-initialized"; import { ensureHashSystemAccountExists } from "./graph/system-account"; import { createApolloServer } from "./graphql/create-apollo-server"; +import { otelSetup } from 
"./instrument.mjs"; import { enabledIntegrations } from "./integrations/enabled-integrations"; import { checkGoogleAccessToken } from "./integrations/google/check-access-token"; import { getGoogleAccessToken } from "./integrations/google/get-access-token"; @@ -103,8 +103,8 @@ const shutdown = new GracefulShutdown(logger, "SIGINT", "SIGTERM"); // Flush OpenTelemetry last so cleanup hooks above this point can still // emit shutdown spans / logs before the providers disconnect from the -// collector. -const otelSetup = getActiveOpenTelemetrySetup(); +// collector. `otelSetup` is `undefined` when no `HASH_OTLP_ENDPOINT` is +// configured (no collector) or when bootstrap throws. if (otelSetup) { shutdown.addCleanup("OpenTelemetry", otelSetup.shutdown); } diff --git a/apps/hash-api/src/instrument.mjs b/apps/hash-api/src/instrument.mjs index acb62854383..2d6711a790c 100644 --- a/apps/hash-api/src/instrument.mjs +++ b/apps/hash-api/src/instrument.mjs @@ -1,74 +1,75 @@ /** Required to load environment variables */ import "@local/hash-backend-utils/environment"; +import { + createUndiciInstrumentation, + httpRequestSpanNameHook, + registerOpenTelemetry, +} from "@local/hash-backend-utils/opentelemetry"; +import { + ExpressInstrumentation, + ExpressLayerType, +} from "@opentelemetry/instrumentation-express"; +import { GraphQLInstrumentation } from "@opentelemetry/instrumentation-graphql"; +import { HttpInstrumentation } from "@opentelemetry/instrumentation-http"; import * as Sentry from "@sentry/node"; import { isProdEnv } from "./lib/env-config"; -// Initialize OpenTelemetry BEFORE any app code. The setup handle is -// retrievable via `getActiveOpenTelemetrySetup()` so `index.ts` can -// wire `shutdown()` into the GracefulShutdown chain — without it, -// in-flight OTLP exports are lost on SIGTERM. 
-const otlpEndpoint = process.env.HASH_OTLP_ENDPOINT; -if (otlpEndpoint) { - const { createUndiciInstrumentation, registerOpenTelemetry } = await import( - "@local/hash-backend-utils/opentelemetry" - ); - const [ - { HttpInstrumentation }, - { ExpressInstrumentation, ExpressLayerType }, - { GraphQLInstrumentation }, - ] = await Promise.all([ - import("@opentelemetry/instrumentation-http"), - import("@opentelemetry/instrumentation-express"), - import("@opentelemetry/instrumentation-graphql"), - ]); - - registerOpenTelemetry({ - endpoint: otlpEndpoint, - serviceName: process.env.OTEL_SERVICE_NAME || "Node API", - instrumentations: [ - new HttpInstrumentation({ - ignoreOutgoingRequestHook: (options) => options.port === 4317, - // The default span name is just `HTTP POST` etc. Replace with - // `METHOD /path`. Incoming requests (IncomingMessage) expose - // `originalUrl` (Express) or `url`; outgoing requests - // (ClientRequest) expose `path`. Cover both so neither side - // ends up with a bare `POST` span name. - requestHook: (span, request) => { - if (!("method" in request) || !request.method) { - return; - } - const rawPath = - ("originalUrl" in request && request.originalUrl) || - ("url" in request && request.url) || - ("path" in request && request.path) || - ""; - // Strip query string to keep cardinality bounded. - const path = String(rawPath).split("?")[0]; - if (path) { - span.updateName(`${request.method} ${path}`); - } - }, - }), - new ExpressInstrumentation({ - ignoreLayersType: [ExpressLayerType.MIDDLEWARE], - }), - new GraphQLInstrumentation({ - allowValues: true, - depth: 5, - mergeItems: true, - ignoreTrivialResolveSpans: true, - }), - // Native `fetch` (used by openai SDK and outbound API calls in - // resolvers) goes through undici, which the http instrumentation - // does not patch. The shared helper sets `peer.service` so - // Tempo's service_graphs processor renders external dependencies - // as edges in the service map. 
- createUndiciInstrumentation(), - ], - }); -} +/** + * OpenTelemetry setup handle, exported so `index.ts` can wire + * `shutdown()` into the GracefulShutdown chain. `undefined` when + * `HASH_OTLP_ENDPOINT` is unset (no collector configured) or when + * bootstrap throws. + * + * @type {import("@local/hash-backend-utils/opentelemetry").OpenTelemetrySetup | undefined} + */ +export const otelSetup = (() => { + const otlpEndpoint = process.env.HASH_OTLP_ENDPOINT; + if (!otlpEndpoint) { + return undefined; + } + try { + return registerOpenTelemetry({ + endpoint: otlpEndpoint, + serviceName: process.env.OTEL_SERVICE_NAME || "Node API", + instrumentations: [ + new HttpInstrumentation({ + // Don't trace the OTLP exporter's own outgoing requests — every + // export would create a span that needs to be exported, recursively. + ignoreOutgoingRequestHook: (options) => options.port === 4317, + requestHook: httpRequestSpanNameHook, + }), + new ExpressInstrumentation({ + ignoreLayersType: [ExpressLayerType.MIDDLEWARE], + }), + new GraphQLInstrumentation({ + allowValues: true, + depth: 5, + mergeItems: true, + ignoreTrivialResolveSpans: true, + }), + // Native `fetch` (used by openai SDK and outbound API calls in + // resolvers) goes through undici, which the http instrumentation + // does not patch. The shared helper sets `peer.service` so + // Tempo's service_graphs processor renders external dependencies + // as edges in the service map. + createUndiciInstrumentation(), + ], + }); + } catch (error) { + // Bootstrap runs before the application logger is wired up, so direct + // stderr is the right channel. Do not rethrow: the service should + // still start without telemetry rather than crash on a misconfigured + // collector. 
+ // eslint-disable-next-line no-console + console.error( + "OpenTelemetry bootstrap failed; service will start without telemetry.", + error, + ); + return undefined; + } +})(); const sentryDsn = process.env.NODE_API_SENTRY_DSN; @@ -81,11 +82,9 @@ Sentry.init({ (isProdEnv ? "production" : "development"), sendDefaultPii: true, tracesSampleRate: isProdEnv ? 1.0 : 0, - // When OTEL is configured we already have a global TracerProvider - // wired up to OTLP. Tell Sentry to share it instead of registering its - // own — otherwise Sentry hijacks the global provider and the - // OpenTelemetryWorkflowClientInterceptor that injects trace context - // into Temporal workflow headers ends up running on a non-functional - // tracer, breaking caller→workflow→activity correlation. - skipOpenTelemetrySetup: !!otlpEndpoint, + // Skip Sentry's tracer setup only when our v2 provider actually + // registered. Gating on `otlpEndpoint` instead would skip Sentry's + // setup whenever the env var is set even if our bootstrap threw, + // leaving the process with neither tracer. + skipOpenTelemetrySetup: !!otelSetup, }); diff --git a/apps/hash-integration-worker/scripts/bundle-workflow-code.ts b/apps/hash-integration-worker/scripts/bundle-workflow-code.ts index 9c37039248c..9cf2a65542c 100644 --- a/apps/hash-integration-worker/scripts/bundle-workflow-code.ts +++ b/apps/hash-integration-worker/scripts/bundle-workflow-code.ts @@ -17,6 +17,14 @@ async function bundle() { require.resolve( "@local/hash-backend-utils/temporal/interceptors/workflows/sentry", ), + // OTEL workflow interceptor must be in the bundle: when the + // worker boots with `workflowBundle`, the `interceptors.workflowModules` + // option on `Worker.create` is ignored. The interceptor is a no-op + // when no global TracerProvider is registered, so it's safe to + // include unconditionally. 
+ require.resolve( + "@local/hash-backend-utils/temporal/interceptors/workflows/opentelemetry", + ), ], }); const codePath = path.join(__dirname, "../dist/workflow-bundle.js"); diff --git a/apps/hash-integration-worker/src/instrument.ts b/apps/hash-integration-worker/src/instrument.ts index ab629fd02f5..45a4a73c90a 100644 --- a/apps/hash-integration-worker/src/instrument.ts +++ b/apps/hash-integration-worker/src/instrument.ts @@ -5,47 +5,31 @@ */ import { createUndiciInstrumentation, + httpRequestSpanNameHook, registerOpenTelemetry, } from "@local/hash-backend-utils/opentelemetry"; import { GrpcInstrumentation } from "@opentelemetry/instrumentation-grpc"; import { HttpInstrumentation } from "@opentelemetry/instrumentation-http"; -const otlpEndpoint = process.env.HASH_OTLP_ENDPOINT; - /** - * Setup handles. `undefined` when no `HASH_OTLP_ENDPOINT` is configured — - * `main.ts` checks for that and skips workflow-side OTEL wiring in that - * case. + * Setup handles. `undefined` when no `HASH_OTLP_ENDPOINT` is configured + * (no collector) or when bootstrap throws. */ -export const otelSetup = otlpEndpoint - ? registerOpenTelemetry({ +export const otelSetup: ReturnType = (() => { + const otlpEndpoint = process.env.HASH_OTLP_ENDPOINT; + if (!otlpEndpoint) { + return undefined; + } + try { + return registerOpenTelemetry({ endpoint: otlpEndpoint, serviceName: process.env.OTEL_SERVICE_NAME ?? "Integration Worker", instrumentations: [ new HttpInstrumentation({ - // Don't trace the OTLP exporter's own outgoing requests, otherwise - // every export creates a span that needs to be exported, recursively. + // Don't trace the OTLP exporter's own outgoing requests — every + // export would create a span that needs to be exported, recursively. ignoreOutgoingRequestHook: (options) => options.port === 4317, - // Default span name is just `HTTP POST` — replace with - // `METHOD /path` so outbound Linear / Graph calls are - // distinguishable in Tempo. 
- requestHook: (span, request) => { - if (!("method" in request) || !request.method) { - return; - } - const candidates = [ - "originalUrl" in request ? request.originalUrl : undefined, - "url" in request ? request.url : undefined, - "path" in request ? request.path : undefined, - ]; - const rawPath = candidates.find( - (value): value is string => typeof value === "string", - ); - const path = rawPath?.split("?")[0]; - if (path) { - span.updateName(`${request.method} ${path}`); - } - }, + requestHook: httpRequestSpanNameHook, }), new GrpcInstrumentation(), // Native `fetch` (used by Linear SDK / outbound API calls) goes @@ -54,5 +38,16 @@ export const otelSetup = otlpEndpoint // processor renders Linear etc. as external-service edges. createUndiciInstrumentation(), ], - }) - : undefined; + }); + } catch (error) { + // Bootstrap runs before any logger is wired up, so direct stderr is + // the right channel. Don't rethrow — the worker should still start + // without telemetry. + // eslint-disable-next-line no-console + console.error( + "OpenTelemetry bootstrap failed; integration worker will start without telemetry.", + error, + ); + return undefined; + } +})(); diff --git a/apps/hash-integration-worker/src/main.ts b/apps/hash-integration-worker/src/main.ts index 53158b04f8e..2e85db0bb75 100644 --- a/apps/hash-integration-worker/src/main.ts +++ b/apps/hash-integration-worker/src/main.ts @@ -16,15 +16,16 @@ Sentry.init({ process.env.ENVIRONMENT || (process.env.NODE_ENV === "production" ? "production" : "development"), tracesSampleRate: process.env.NODE_ENV === "production" ? 1.0 : 0, - // When OTEL is configured we already have a global TracerProvider - // wired up to OTLP. Tell Sentry to share it instead of registering its - // own, so Sentry's spans flow through our pipeline and inherit the - // active OTEL context (including parent spans extracted from Temporal - // headers by OpenTelemetryActivityInboundInterceptor). 
- skipOpenTelemetrySetup: !!process.env.HASH_OTLP_ENDPOINT, + // Sentry registers its own global `NodeTracerProvider` by default. + // Letting it run after `registerOpenTelemetry` would replace the + // provider that the OTEL workflow client interceptor (set up in + // `createTemporalClient`) holds via `trace.getTracer(...)`, breaking + // caller → workflow → activity context propagation. With this flag + // Sentry shares our provider so its spans flow through the same + // OTLP pipeline. + skipOpenTelemetrySetup: !!otelSetup, }); -import * as http from "node:http"; import { createRequire } from "node:module"; import path from "node:path"; import { fileURLToPath } from "node:url"; @@ -33,20 +34,9 @@ import { createGraphClient } from "@local/hash-backend-utils/create-graph-client import { getRequiredEnv } from "@local/hash-backend-utils/environment"; import { createCommonFlowActivities } from "@local/hash-backend-utils/flows"; import { Logger } from "@local/hash-backend-utils/logger"; -import { OpenTelemetryActivityInboundInterceptor } from "@local/hash-backend-utils/temporal/interceptors/activities/opentelemetry"; -import { SentryActivityInboundInterceptor } from "@local/hash-backend-utils/temporal/interceptors/activities/sentry"; -import { sentrySinks } from "@local/hash-backend-utils/temporal/sinks/sentry"; -import { createTemporalSdkLogger } from "@local/hash-backend-utils/temporal"; -import { wrapWorkflowSpanExporter } from "@local/hash-backend-utils/temporal/workflow-span-adapter"; +import type { WorkflowSource } from "@local/hash-backend-utils/temporal/worker-bootstrap"; +import { runWorker } from "@local/hash-backend-utils/temporal/worker-bootstrap"; import type { WorkflowTypeMap } from "@local/hash-backend-utils/temporal-integration-workflow-types"; -import { makeWorkflowExporter } from "@temporalio/interceptors-opentelemetry"; -import type { ActivityInboundCallsInterceptorFactory } from "@temporalio/worker"; -import { - defaultSinks, - NativeConnection, - 
Runtime, - Worker, -} from "@temporalio/worker"; import { config } from "dotenv-flow"; import { createFlowActivities } from "./activities/flow-activities.js"; @@ -58,8 +48,9 @@ const __dirname = path.dirname(__filename); const require = createRequire(import.meta.url); -// This is a workaround to ensure that all functions defined in WorkflowTypeMap are exported from the workflows file -// They must be individually exported from the file, and it's impossible to check completeness of exports in the file itself +// Ensures that all functions defined in WorkflowTypeMap are exported from the +// workflows file. They must be individually exported, and it's impossible to +// check completeness of exports in the file itself. // eslint-disable-next-line @typescript-eslint/no-unused-vars const exportMap: WorkflowTypeMap = workflows; @@ -72,163 +63,35 @@ export const logger = new Logger({ serviceName: "integration-worker", }); -const TEMPORAL_HOST = new URL( - process.env.HASH_TEMPORAL_SERVER_HOST ?? "http://localhost", -).hostname; -const TEMPORAL_PORT = process.env.HASH_TEMPORAL_SERVER_PORT - ? parseInt(process.env.HASH_TEMPORAL_SERVER_PORT, 10) - : 7233; - -const createHealthCheckServer = () => { - const server = http.createServer((req, res) => { - if (req.method === "GET" && req.url === "/health") { - res.setHeader("Content-Type", "application/json"); - res.writeHead(200); - res.end( - JSON.stringify({ - msg: "worker healthy", - }), - ); - return; - } - res.writeHead(404); - res.end(""); - }); - - return server; -}; - -const workflowOption = () => +const workflowSource: WorkflowSource = process.env.NODE_ENV === "production" ? 
{ - workflowBundle: { - codePath: require.resolve("../dist/workflow-bundle.js"), - }, + kind: "bundle", + bundle: { codePath: require.resolve("../dist/workflow-bundle.js") }, } - : { workflowsPath: require.resolve("./workflows") }; + : { kind: "path", workflowsPath: require.resolve("./workflows") }; async function run() { - logger.info("Starting integration worker..."); - - // Temporal SDK runtime telemetry: emits SDK-internal metrics (worker - // slot utilisation, sticky cache hits, polling latency, activity / - // workflow execution latency) and forwards Rust core logs to OTLP. - // Must be installed before any Temporal Connection / Worker is - // created. Distinct from the per-activity user-code spans the - // interceptors below produce. - if (otelSetup) { - Runtime.install({ - // Pipe Rust-core SDK logs through the same structured logger as - // the rest of the worker, so they end up on the OTLP transport - // alongside app logs instead of bypassing it to stderr. - logger: createTemporalSdkLogger(logger), - telemetryOptions: { - metrics: { - otel: { - url: process.env.HASH_OTLP_ENDPOINT!, - metricsExportInterval: "30s", - }, - }, - logging: { forward: { level: "INFO" } }, - }, - }); - } - const graphApiClient = createGraphClient(logger, { host: getRequiredEnv("HASH_GRAPH_HTTP_HOST"), port: parseInt(getRequiredEnv("HASH_GRAPH_HTTP_PORT"), 10), }); - const worker = await Worker.create({ - ...workflowOption(), + await runWorker({ + serviceName: "integration worker", + taskQueue: "integration", + healthCheckPort: 4300, activities: { - ...linearActivities.createLinearIntegrationActivities({ - graphApiClient, - }), - ...createFlowActivities({ - graphApiClient, - }), + ...linearActivities.createLinearIntegrationActivities({ graphApiClient }), + ...createFlowActivities({ graphApiClient }), ...createCommonFlowActivities({ graphApiClient }), }, - connection: await NativeConnection.connect({ - address: `${TEMPORAL_HOST}:${TEMPORAL_PORT}`, - }), - namespace: "HASH", - 
taskQueue: "integration", - sinks: { - ...defaultSinks(), - ...sentrySinks(), - ...(otelSetup - ? { - exporter: makeWorkflowExporter( - // Wrap our v2 OTLPTraceExporter so it can ingest the v1- - // shaped `ReadableSpan`s that Temporal's workflow sandbox - // produces. Without this, the `instrumentationLibrary` → - // `instrumentationScope` rename in sdk-trace-base v2 - // causes the OTLP transformer to crash on every workflow - // span export. - wrapWorkflowSpanExporter( - otelSetup.traceExporter, - ) as unknown as Parameters[0], - otelSetup.resource as unknown as Parameters< - typeof makeWorkflowExporter - >[1], - ), - } - : {}), - }, - interceptors: { - workflowModules: [ - require.resolve( - "@local/hash-backend-utils/temporal/interceptors/workflows/sentry", - ), - ...(otelSetup - ? [ - require.resolve( - "@local/hash-backend-utils/temporal/interceptors/workflows/opentelemetry", - ), - ] - : []), - ], - // OTEL interceptor must run as the OUTER wrapper so it can - // extract the trace context the workflow client injected into - // the activity headers before any other span is created. Sentry - // doesn't know the Temporal `_tracer-data` header convention, so - // a Sentry-first ordering produces a parent-less span and breaks - // the caller→workflow→activity trace chain. - activityInbound: [ - ...((otelSetup - ? [(ctx) => new OpenTelemetryActivityInboundInterceptor(ctx)] - : []) satisfies ActivityInboundCallsInterceptorFactory[]), - (ctx) => new SentryActivityInboundInterceptor(ctx), - ], - }, + workflowSource, + otelSetup, + logger, }); - - const httpServer = createHealthCheckServer(); - const port = 4300; - httpServer.listen({ host: "0.0.0.0", port }); - logger.info(`HTTP server listening on port ${port}`); - - await worker.run(); } -const shutdown = async (signal: NodeJS.Signals) => { - logger.info(`Received ${signal}, exiting...`); - // Flush any pending OTLP exports before tearing down the process. 
- // Without this, the last seconds of telemetry are lost on every - // graceful restart. - try { - await otelSetup?.shutdown(); - } catch (error) { - logger.error("Failed to flush OpenTelemetry", { error }); - } - process.exit(1); -}; - -process.on("SIGINT", (signal) => void shutdown(signal)); -process.on("SIGTERM", (signal) => void shutdown(signal)); - run().catch((error: unknown) => { logger.error("Error running worker", { error }); process.exit(1); diff --git a/libs/@local/hash-backend-utils/package.json b/libs/@local/hash-backend-utils/package.json index 8b3d3309f20..fb811fcff9a 100644 --- a/libs/@local/hash-backend-utils/package.json +++ b/libs/@local/hash-backend-utils/package.json @@ -78,6 +78,7 @@ "devDependencies": { "@local/eslint": "workspace:*", "@local/tsconfig": "workspace:*", + "@opentelemetry/instrumentation-http": "0.207.0", "@types/dotenv-flow": "3.3.3", "@types/mime-types": "2.1.4", "@types/node": "22.18.13", diff --git a/libs/@local/hash-backend-utils/src/opentelemetry.test.ts b/libs/@local/hash-backend-utils/src/opentelemetry.test.ts new file mode 100644 index 00000000000..c4de74f7b24 --- /dev/null +++ b/libs/@local/hash-backend-utils/src/opentelemetry.test.ts @@ -0,0 +1,133 @@ +import type { ClientRequest, IncomingMessage } from "node:http"; + +import { type Span, trace } from "@opentelemetry/api"; +import { describe, expect, it } from "vitest"; + +import { + httpRequestSpanNameHook, + resolvePeerService, +} from "./opentelemetry.js"; + +describe("resolvePeerService", () => { + it("matches exact hosts to their service label", () => { + expect(resolvePeerService("api.openai.com")).toBe("OpenAI"); + expect(resolvePeerService("api.anthropic.com")).toBe("Anthropic"); + expect(resolvePeerService("api.linear.app")).toBe("Linear"); + }); + + it("matches suffix rules for subdomains", () => { + expect(resolvePeerService("bigquery.googleapis.com")).toBe("Google Cloud"); + expect(resolvePeerService("aiplatform.googleapis.com")).toBe( + "Google Cloud", + 
); + }); + + it("does not match a suffix rule against the bare domain", () => { + // `.googleapis.com` (with leading dot) only matches if the host + // ends with that — `googleapis.com` itself does not. + expect(resolvePeerService("googleapis.com")).toBeUndefined(); + }); + + it("does not match unrelated hosts", () => { + expect(resolvePeerService("example.com")).toBeUndefined(); + expect(resolvePeerService("openai.com")).toBeUndefined(); + expect(resolvePeerService("anthropic.com")).toBeUndefined(); + }); + + it("does not match a substring inside a host segment", () => { + expect(resolvePeerService("not-api.openai.com.evil.test")).toBeUndefined(); + }); + + // Suffix rules use `.endsWith(rule.suffix)` with the leading dot, so a + // host that happens to share the suffix without the dot boundary + // (e.g. `evilgoogleapis.com`) must not match. This is the property + // that prevents lookalike-domain attribution. + it("requires the suffix dot boundary", () => { + expect(resolvePeerService("evilgoogleapis.com")).toBeUndefined(); + expect(resolvePeerService("googleapis.com.evil.test")).toBeUndefined(); + }); +}); + +describe("httpRequestSpanNameHook", () => { + /** + * Build a recording span with a mutable `updateName` capture so the + * hook's effect can be asserted without a full TracerProvider. 
+ */ + const makeSpan = (): { span: Span; updates: string[] } => { + const updates: string[] = []; + const noopSpan = trace.getTracer("test").startSpan("noop"); + const span: Span = Object.assign(noopSpan, { + updateName: (name: string) => { + updates.push(name); + return span; + }, + }); + return { span, updates }; + }; + + it("renames incoming requests to METHOD /path", () => { + const { span, updates } = makeSpan(); + const incoming = { + method: "GET", + url: "/api/v1/widgets", + } as Partial; + + httpRequestSpanNameHook(span, incoming as IncomingMessage); + + expect(updates).toEqual(["GET /api/v1/widgets"]); + }); + + it("prefers Express's `originalUrl` over `url` when both are present", () => { + const { span, updates } = makeSpan(); + const incoming = { + method: "POST", + originalUrl: "/graphql", + url: "/", // Express rewrites url after route matching + } as Partial & { originalUrl: string }; + + httpRequestSpanNameHook( + span, + incoming as IncomingMessage & { originalUrl: string }, + ); + + expect(updates).toEqual(["POST /graphql"]); + }); + + it("renames outgoing requests using `path`", () => { + const { span, updates } = makeSpan(); + const outgoing = { + method: "POST", + path: "/v1/embeddings", + } as unknown as ClientRequest; + + httpRequestSpanNameHook(span, outgoing); + + expect(updates).toEqual(["POST /v1/embeddings"]); + }); + + it("strips query string to keep cardinality bounded", () => { + const { span, updates } = makeSpan(); + httpRequestSpanNameHook(span, { + method: "GET", + url: "/search?q=secret&page=2", + } as IncomingMessage); + + expect(updates).toEqual(["GET /search"]); + }); + + it("does nothing when method is missing", () => { + const { span, updates } = makeSpan(); + httpRequestSpanNameHook(span, { + url: "/api/widgets", + } as IncomingMessage); + + expect(updates).toEqual([]); + }); + + it("does nothing when no path source is available", () => { + const { span, updates } = makeSpan(); + httpRequestSpanNameHook(span, { method: 
"GET" } as IncomingMessage); + + expect(updates).toEqual([]); + }); +}); diff --git a/libs/@local/hash-backend-utils/src/opentelemetry.ts b/libs/@local/hash-backend-utils/src/opentelemetry.ts index 4f54c726403..e4bd886aa16 100644 --- a/libs/@local/hash-backend-utils/src/opentelemetry.ts +++ b/libs/@local/hash-backend-utils/src/opentelemetry.ts @@ -15,6 +15,7 @@ import { OTLPMetricExporter } from "@opentelemetry/exporter-metrics-otlp-grpc"; import { OTLPTraceExporter } from "@opentelemetry/exporter-trace-otlp-grpc"; import type { Instrumentation } from "@opentelemetry/instrumentation"; import { registerInstrumentations } from "@opentelemetry/instrumentation"; +import type { HttpInstrumentationConfig } from "@opentelemetry/instrumentation-http"; import { UndiciInstrumentation } from "@opentelemetry/instrumentation-undici"; import type { Resource } from "@opentelemetry/resources"; import { @@ -22,19 +23,20 @@ import { resourceFromAttributes, } from "@opentelemetry/resources"; import { + BatchLogRecordProcessor, LoggerProvider, - SimpleLogRecordProcessor, } from "@opentelemetry/sdk-logs"; import { MeterProvider, PeriodicExportingMetricReader, } from "@opentelemetry/sdk-metrics"; import type { SpanExporter } from "@opentelemetry/sdk-trace-base"; -import { SimpleSpanProcessor } from "@opentelemetry/sdk-trace-base"; +import { BatchSpanProcessor } from "@opentelemetry/sdk-trace-base"; import { NodeTracerProvider } from "@opentelemetry/sdk-trace-node"; const traceTimeoutMs = 5000; const metricExportIntervalMs = 30_000; +const shutdownTimeoutMs = 2000; export interface RegisterOpenTelemetryOptions { /** @@ -50,6 +52,8 @@ export interface RegisterOpenTelemetryOptions { } export interface OpenTelemetrySetup { + /** OTLP gRPC endpoint this setup is attached to. */ + endpoint: string; /** Run during graceful shutdown to flush pending spans / logs / metrics. 
*/ shutdown: () => Promise; /** @@ -64,39 +68,44 @@ export interface OpenTelemetrySetup { /** * Mapping of outbound `host` → `peer.service` label used by Tempo's * `service_graphs` processor to render external dependencies as - * separate nodes in the service map. Without this attribute the - * undici-instrumentation spans land under the caller service ("AI - * Worker") with a bare `POST` name and no external-service edge. + * separate nodes in the service map. * - * Keys are matched as exact host or as suffix (`.openai.com` matches - * `api.openai.com`). + * Order matters: the first match wins, so place narrower exact matches + * before broader suffix matches. `kind: "suffix"` matches against the + * tail of the host (e.g. `.googleapis.com` matches `bigquery.googleapis.com` + * but not the bare `googleapis.com`). */ -const PEER_SERVICE_BY_HOST: Array<[string, string]> = [ - ["api.openai.com", "OpenAI"], - ["api.anthropic.com", "Anthropic"], - ["api.linear.app", "Linear"], - [".googleapis.com", "Google Cloud"], - ["edge-config.vercel.com", "Vercel Edge Config"], - [".vercel.com", "Vercel"], +type PeerServiceRule = + | { kind: "exact"; host: string; service: string } + | { kind: "suffix"; suffix: string; service: string }; + +const PEER_SERVICE_RULES: readonly PeerServiceRule[] = [ + { kind: "exact", host: "api.openai.com", service: "OpenAI" }, + { kind: "exact", host: "api.anthropic.com", service: "Anthropic" }, + { kind: "exact", host: "api.linear.app", service: "Linear" }, + { kind: "suffix", suffix: ".googleapis.com", service: "Google Cloud" }, ]; -const resolvePeerService = (host: string): string | undefined => { - for (const [match, service] of PEER_SERVICE_BY_HOST) { - if (match.startsWith(".") ? 
host.endsWith(match) : host === match) { - return service; +export const resolvePeerService = (host: string): string | undefined => { + for (const rule of PEER_SERVICE_RULES) { + if (rule.kind === "exact" && rule.host === host) { + return rule.service; + } + if (rule.kind === "suffix" && host.endsWith(rule.suffix)) { + return rule.service; } } return undefined; }; /** - * `@opentelemetry/instrumentation-undici` instance configured with: + * Undici instrumentation configured to: * - * - `peer.service` derived from the outbound host (Tempo's - * `service_graphs` processor turns this into an external-service - * edge in the service map). - * - Span name set to `METHOD host/path` rather than the bare HTTP - * verb the instrumentation defaults to. + * - Tag spans with `peer.service` derived from the outbound host. Tempo's + * `service_graphs` processor turns this into an external-service edge + * in the service map. + * - Name spans `METHOD path`. The host already lives in `peer.service`, + * so the span name only carries the path. */ export const createUndiciInstrumentation = (): UndiciInstrumentation => new UndiciInstrumentation({ @@ -110,29 +119,68 @@ export const createUndiciInstrumentation = (): UndiciInstrumentation => } }, requestHook: (span, request) => { - // Default span name is just `POST` — replace with `METHOD path` - // so outbound calls are distinguishable in Tempo. The host is - // covered by `peer.service` (set in `startSpanHook`), so the - // span name only carries the path. Strip query string to keep - // cardinality bounded. + if (typeof request.path !== "string") { + return; + } + // Strip query string to keep cardinality bounded. const path = request.path.split("?")[0]; - span.updateName(`${request.method} ${path}`); + if (path) { + span.updateName(`${request.method} ${path}`); + } }, }); /** - * Last setup returned from `registerOpenTelemetry`. 
Exposed so callers - * that bootstrap OTEL via a `--import` shim (where the setup handle - * isn't naturally accessible from the main entry point) can still wire - * `shutdown()` into their graceful-shutdown chain. + * `requestHook` for `@opentelemetry/instrumentation-http` that names + * spans `METHOD /path`. Path source depends on the request shape: + * outgoing `ClientRequest` exposes `path`, incoming `IncomingMessage` + * exposes `url`. `originalUrl` is checked first as a no-cost fallback + * for the case where Express has already wrapped the request before + * the hook reads it. */ -let activeSetup: OpenTelemetrySetup | undefined; +export const httpRequestSpanNameHook: NonNullable< + HttpInstrumentationConfig["requestHook"] +> = (span, request) => { + if (!("method" in request) || !request.method) { + return; + } + const candidates = [ + "originalUrl" in request ? request.originalUrl : undefined, + "url" in request ? request.url : undefined, + "path" in request ? request.path : undefined, + ]; + const rawPath = candidates.find( + (value): value is string => typeof value === "string", + ); + // Strip query string to keep cardinality bounded. + const path = rawPath?.split("?")[0]; + if (path) { + span.updateName(`${request.method} ${path}`); + } +}; -/** - * Returns the most recently registered OTEL setup, if any. - */ -export const getActiveOpenTelemetrySetup = (): OpenTelemetrySetup | undefined => - activeSetup; +const shutdownWithTimeout = async ( + label: string, + shutdown: () => Promise, +): Promise => { + let timer: NodeJS.Timeout | undefined; + const timeoutPromise = new Promise((_, reject) => { + timer = setTimeout(() => { + reject( + new Error( + `${label} shutdown exceeded ${shutdownTimeoutMs}ms — pending exports may be dropped.`, + ), + ); + }, shutdownTimeoutMs); + }); + try { + await Promise.race([shutdown(), timeoutPromise]); + } finally { + if (timer) { + clearTimeout(timer); + } + } +}; /** * Initialise tracing, logging, and metrics. 
Returns `undefined` when @@ -144,9 +192,8 @@ export const registerOpenTelemetry = ({ instrumentations = [], }: RegisterOpenTelemetryOptions): OpenTelemetrySetup | undefined => { if (!endpoint) { - // This runs before any logger is wired up, so direct stderr is the - // right channel — and it's a one-shot bootstrap message, not a hot - // log path. + // Runs before any logger is wired up, so direct stderr is the + // right channel. // eslint-disable-next-line no-console console.warn( "No OpenTelemetry Protocol endpoint given. Not sending telemetry anywhere.", @@ -163,23 +210,24 @@ export const registerOpenTelemetry = ({ resourceFromAttributes({ "service.name": serviceName }), ); - // Tracing + // Batch processors keep span / log export off the request path. The + // Simple variants export each record synchronously, which under load + // saturates the gRPC connection and adds tail latency to every + // request. const traceExporter = new OTLPTraceExporter(collectorOptions); const traceProvider = new NodeTracerProvider({ resource, - spanProcessors: [new SimpleSpanProcessor(traceExporter)], + spanProcessors: [new BatchSpanProcessor(traceExporter)], }); traceProvider.register(); - // Logs const logExporter = new OTLPLogExporter(collectorOptions); const logProvider = new LoggerProvider({ resource, - processors: [new SimpleLogRecordProcessor(logExporter)], + processors: [new BatchLogRecordProcessor(logExporter)], }); logs.setGlobalLoggerProvider(logProvider); - // Metrics const metricExporter = new OTLPMetricExporter(collectorOptions); const meterProvider = new MeterProvider({ resource, @@ -201,18 +249,33 @@ export const registerOpenTelemetry = ({ `Registered OpenTelemetry (traces + logs + metrics) at endpoint ${endpoint} for ${serviceName}`, ); - const setup: OpenTelemetrySetup = { + return { + endpoint, traceExporter, resource, shutdown: async () => { - await Promise.allSettled([ - traceProvider.shutdown(), - logProvider.shutdown(), - meterProvider.shutdown(), - ]); + // 
Flush each provider with a per-provider timeout so a stuck + // exporter (collector unreachable, gRPC channel hung) cannot + // block the SIGTERM handler indefinitely. Failures are surfaced + // to stderr because the logger may already be shutting down. + const targets: Array Promise]> = [ + ["trace provider", () => traceProvider.shutdown()], + ["log provider", () => logProvider.shutdown()], + ["meter provider", () => meterProvider.shutdown()], + ]; + const results = await Promise.allSettled( + targets.map(([label, run]) => shutdownWithTimeout(label, run)), + ); + for (const [index, result] of results.entries()) { + if (result.status === "rejected") { + // eslint-disable-next-line no-console + console.error( + `OpenTelemetry ${targets[index]![0]} shutdown failed:`, + result.reason, + ); + } + } unregisterInstrumentations(); }, }; - activeSetup = setup; - return setup; }; diff --git a/libs/@local/hash-backend-utils/src/temporal.ts b/libs/@local/hash-backend-utils/src/temporal.ts index 268aabe3339..18cc1dca253 100644 --- a/libs/@local/hash-backend-utils/src/temporal.ts +++ b/libs/@local/hash-backend-utils/src/temporal.ts @@ -11,14 +11,14 @@ export { Client as TemporalClient } from "@temporalio/client"; export const temporalNamespace = "HASH"; /** - * Create a Temporal SDK `Logger` that pipes Rust-core SDK logs and - * Node-side worker events through the application's structured Winston - * logger. Without this, `Runtime.install({ telemetryOptions: { logging: - * { forward } } })` forwards Rust-core logs to a default sink that - * writes to `stderr` directly — bypassing OTLP and the JSON-formatted - * console output the rest of the worker uses. + * Adapter that pipes Temporal SDK logs (both Rust core and Node-side + * worker events) through the application logger, keeping them in the + * same JSON format and log-level scheme as the rest of the worker + * output. Pass to `Runtime.install({ logger })`. 
*/ export const createTemporalSdkLogger = (logger: Logger): DefaultLogger => + // DefaultLogger filters at INFO, so TRACE / DEBUG paths only fire + // when the level is bumped at the call site. new DefaultLogger("INFO", ({ level, message, meta }) => { switch (level) { case "TRACE": @@ -33,6 +33,12 @@ export const createTemporalSdkLogger = (logger: Logger): DefaultLogger => return; case "ERROR": logger.error(message, meta); + return; + default: + logger.warn(`Unknown Temporal SDK log level: ${level as string}`, { + message, + meta, + }); } }); @@ -47,12 +53,10 @@ export const createTemporalClient = async () => { address: `${host}:${port}`, }); - // When the caller has OTEL configured (instrument.mjs ran and a global - // tracer provider is registered), attach the workflow client - // interceptor so the active trace context (e.g. an Express HTTP span) - // gets propagated into the workflow headers. The worker-side - // interceptors then pick it up, and the workflow + activity spans - // chain off the caller's trace. + // When OTEL is configured the active trace context (e.g. an Express + // HTTP span) is injected into workflow start headers. The worker-side + // interceptors extract it and parent the workflow + activity spans + // off the caller's trace. const interceptors = process.env.HASH_OTLP_ENDPOINT ? { workflow: [ diff --git a/libs/@local/hash-backend-utils/src/temporal/worker-bootstrap.ts b/libs/@local/hash-backend-utils/src/temporal/worker-bootstrap.ts new file mode 100644 index 00000000000..f699e1c5893 --- /dev/null +++ b/libs/@local/hash-backend-utils/src/temporal/worker-bootstrap.ts @@ -0,0 +1,293 @@ +/** + * Shared bootstrap for HASH Temporal workers. Both + * `hash-ai-worker-ts` and `hash-integration-worker` enter through + * `runWorker`, supplying per-service deltas (service name, task queue, + * port, activities, workflow bundle path) as options. 
+ * + * `Sentry.init` stays in each worker's `main.ts` because ESM import + * ordering requires it before the rest of the imports — that cannot + * be reproduced from a helper module. Everything from `Runtime.install` + * onwards is centralised here. + */ +import * as http from "node:http"; +import { createRequire } from "node:module"; + +import type { + ActivityInterceptorsFactory, + WorkerOptions, + WorkflowBundleOption, +} from "@temporalio/worker"; +import { + defaultSinks, + NativeConnection, + Runtime, + Worker, +} from "@temporalio/worker"; + +import type { Logger } from "../logger.js"; +import type { OpenTelemetrySetup } from "../opentelemetry.js"; +import { createTemporalSdkLogger } from "../temporal.js"; +import { + OpenTelemetryActivityInboundInterceptor, + OpenTelemetryActivityOutboundInterceptor, +} from "./interceptors/activities/opentelemetry.js"; +import { SentryActivityInboundInterceptor } from "./interceptors/activities/sentry.js"; +import { sentrySinks } from "./sinks/sentry.js"; +import { makeV2WorkflowSink } from "./workflow-span-adapter.js"; + +const require = createRequire(import.meta.url); + +const TEMPORAL_DEFAULT_PORT = 7233; + +const getTemporalAddress = (): string => { + const host = new URL( + process.env.HASH_TEMPORAL_SERVER_HOST ?? "http://localhost", + ).hostname; + const port = process.env.HASH_TEMPORAL_SERVER_PORT + ? parseInt(process.env.HASH_TEMPORAL_SERVER_PORT, 10) + : TEMPORAL_DEFAULT_PORT; + return `${host}:${port}`; +}; + +const createHealthCheckServer = (): http.Server => + http.createServer((req, res) => { + if (req.method === "GET" && req.url === "/health") { + res.setHeader("Content-Type", "application/json"); + res.writeHead(200); + res.end(JSON.stringify({ msg: "worker healthy" })); + return; + } + res.writeHead(404); + res.end(""); + }); + +/** + * Source of workflow code passed to `Worker.create`. 
+ * + * - `bundle` — a prebuilt webpack bundle (`workflowBundle.codePath`), + * produced by the per-worker `bundle-workflow-code.ts` script. Used + * in production builds. + * - `path` — a TypeScript entry-point that the worker bundles in-process + * (`workflowsPath`), with optional `bundlerOptions` for things like + * `tsconfig-paths-webpack-plugin`. Used in development. + */ +export type WorkflowSource = + | { kind: "bundle"; bundle: WorkflowBundleOption } + | { + kind: "path"; + workflowsPath: string; + bundlerOptions?: WorkerOptions["bundlerOptions"]; + }; + +/** + * Per-worker tuning passed straight through to `Worker.create`. Keys + * owned by the helper (`activities`, `connection`, `namespace`, + * `taskQueue`, `sinks`, `interceptors`) and the workflow-source keys + * (`workflowBundle`, `workflowsPath`, `bundlerOptions`) are excluded — + * `workflowSource` is the only way to set them. + */ +export type ExtraWorkerOptions = Omit< + WorkerOptions, + | "activities" + | "connection" + | "namespace" + | "taskQueue" + | "sinks" + | "interceptors" + | "workflowBundle" + | "workflowsPath" + | "bundlerOptions" +>; + +export interface RunWorkerOptions { + /** Logged once at startup; also used as Temporal worker identity. */ + serviceName: string; + /** Temporal task queue this worker pulls work from. */ + taskQueue: string; + /** Port the health-check server listens on. */ + healthCheckPort: number; + /** + * Activities object passed to `Worker.create({ activities })`. The + * caller assembles this from per-worker activity factories. + */ + activities: WorkerOptions["activities"]; + /** Where the workflow code lives. See {@link WorkflowSource}. */ + workflowSource: WorkflowSource; + /** + * Additional `Worker.create` options for per-worker tuning (e.g. + * `maxHeartbeatThrottleInterval`). + */ + workerOptions?: ExtraWorkerOptions; + /** OTEL setup handle from `instrument.ts`; `undefined` disables OTEL wiring. 
*/ + otelSetup: OpenTelemetrySetup | undefined; + /** Application logger shared with the rest of the worker. */ + logger: Logger; +} + +const expandWorkflowSource = ( + source: WorkflowSource, +): Pick< + WorkerOptions, + "workflowBundle" | "workflowsPath" | "bundlerOptions" +> => { + switch (source.kind) { + case "bundle": + return { workflowBundle: source.bundle }; + case "path": + return { + workflowsPath: source.workflowsPath, + bundlerOptions: source.bundlerOptions, + }; + } +}; + +/** + * Boot a HASH Temporal worker. Installs the Temporal SDK runtime + * telemetry (when OTEL is configured), connects to the Temporal server, + * builds the activity interceptor chain, registers the workflow + sink + * wiring, starts a health-check HTTP server, and finally enters + * `worker.run()`. Returns once the worker has drained on SIGTERM/SIGINT. + */ +export async function runWorker(opts: RunWorkerOptions): Promise { + const { logger, otelSetup } = opts; + + logger.info(`Starting ${opts.serviceName}...`); + + // Temporal SDK runtime telemetry: emits SDK-internal metrics (worker + // slot utilisation, sticky cache hits, polling latency, activity / + // workflow execution latency) directly to OTLP, and forwards Rust + // core logs through the Node-side logger so they share the + // application's log pipeline. Must run before any Connection / + // Worker is created. Separate channel from the per-activity user-code + // spans the interceptors below produce. + if (otelSetup) { + Runtime.install({ + logger: createTemporalSdkLogger(logger), + telemetryOptions: { + metrics: { + otel: { + url: otelSetup.endpoint, + metricsExportInterval: "30s", + }, + }, + logging: { forward: { level: "INFO" } }, + }, + }); + } + + const connection = await NativeConnection.connect({ + address: getTemporalAddress(), + }); + logger.info("Created Temporal connection"); + + // OTEL interceptor must precede Sentry: `composeInterceptors` builds + // the chain right-to-left so index 0 is outermost. 
The OTEL inbound + // half extracts the trace context that the workflow's outbound OTEL + // interceptor injected via `scheduleActivity`, re-establishing the + // parent before any other interceptor opens a span. The outbound half + // stamps `trace_id` / `span_id` / `trace_flags` onto activity log + // lines so Loki ↔ Tempo correlation works. + const activityInterceptors: ActivityInterceptorsFactory[] = []; + if (otelSetup) { + activityInterceptors.push((ctx) => ({ + inbound: new OpenTelemetryActivityInboundInterceptor(ctx), + outbound: new OpenTelemetryActivityOutboundInterceptor(ctx), + })); + } + activityInterceptors.push((ctx) => ({ + inbound: new SentryActivityInboundInterceptor(ctx), + })); + + const worker = await Worker.create({ + ...opts.workerOptions, + ...expandWorkflowSource(opts.workflowSource), + activities: opts.activities, + connection, + namespace: "HASH", + taskQueue: opts.taskQueue, + sinks: { + ...defaultSinks(), + ...sentrySinks(), + ...(otelSetup ? { exporter: makeV2WorkflowSink(otelSetup) } : {}), + }, + interceptors: { + workflowModules: [ + require.resolve( + "@local/hash-backend-utils/temporal/interceptors/workflows/sentry", + ), + require.resolve( + "@local/hash-backend-utils/temporal/interceptors/workflows/opentelemetry", + ), + ], + activity: activityInterceptors, + }, + }); + + const httpServer = createHealthCheckServer(); + httpServer.on("error", (error) => + logger.error("Health-check server error", { error }), + ); + await new Promise((resolve, reject) => { + const onError = (error: Error) => { + httpServer.removeListener("error", onError); + reject(error); + }; + httpServer.once("error", onError); + httpServer.listen({ host: "0.0.0.0", port: opts.healthCheckPort }, () => { + httpServer.removeListener("error", onError); + logger.info(`HTTP server listening on port ${opts.healthCheckPort}`); + resolve(); + }); + }); + + // Start the worker; `worker.run()` resolves once the SDK has fully + // drained in-flight activities after 
`worker.shutdown()` is called. + // The shutdown handler awaits this promise so SIGTERM doesn't kill + // activities mid-execution. + const workerRunPromise = worker.run(); + + let shuttingDown = false; + const shutdown = async (signal: NodeJS.Signals) => { + if (shuttingDown) { + return; + } + shuttingDown = true; + logger.info(`Received ${signal}, exiting...`); + let exitCode = 0; + try { + worker.shutdown(); + } catch (error) { + logger.error("Worker shutdown trigger failed", { error }); + exitCode = 1; + } + try { + await workerRunPromise; + } catch (error) { + logger.error("Worker drain failed", { error }); + exitCode = 1; + } + try { + httpServer.close(); + } catch (error) { + logger.error("Health-check server close failed", { error }); + } + try { + await otelSetup?.shutdown(); + } catch (error) { + logger.error("Failed to flush OpenTelemetry", { error }); + exitCode = 1; + } + process.exit(exitCode); + }; + + const onSignal = (signal: NodeJS.Signals) => { + shutdown(signal).catch((error: unknown) => { + logger.error("Shutdown handler threw", { error }); + process.exit(1); + }); + }; + process.on("SIGINT", onSignal); + process.on("SIGTERM", onSignal); + + await workerRunPromise; +} diff --git a/libs/@local/hash-backend-utils/src/temporal/workflow-span-adapter.test.ts b/libs/@local/hash-backend-utils/src/temporal/workflow-span-adapter.test.ts new file mode 100644 index 00000000000..3ec2ac79431 --- /dev/null +++ b/libs/@local/hash-backend-utils/src/temporal/workflow-span-adapter.test.ts @@ -0,0 +1,265 @@ +import type { ReadableSpan, SpanExporter } from "@opentelemetry/sdk-trace-base"; +import { describe, expect, it } from "vitest"; + +import { wrapWorkflowSpanExporter } from "./workflow-span-adapter.js"; + +/** + * Recording exporter that captures the spans handed to it without + * actually exporting anywhere. Used to assert that the adapter + * produces v2-shaped spans before the OTLP transformer sees them. 
+ */ +const recordingExporter = (): { + exporter: SpanExporter; + exported: ReadableSpan[]; +} => { + const exported: ReadableSpan[] = []; + return { + exported, + exporter: { + export: (spans, callback) => { + exported.push(...spans); + callback({ code: 0 }); + }, + shutdown: () => Promise.resolve(), + forceFlush: () => Promise.resolve(), + }, + }; +}; + +/** + * Build a v1-shaped Temporal span. Mirrors the shape that + * `@temporalio/interceptors-opentelemetry`'s `extractReadableSpan` + * produces: `instrumentationLibrary` (not `instrumentationScope`) and + * `parentSpanId` (not `parentSpanContext`). + */ +const v1Span = (overrides: Partial> = {}) => + ({ + name: "RunWorkflow:exampleWorkflow", + spanContext: () => ({ + traceId: "0af7651916cd43dd8448eb211c80319c", + spanId: "b7ad6b7169203331", + traceFlags: 1, + }), + instrumentationLibrary: { name: "@temporalio/interceptor-workflow" }, + parentSpanId: "00f067aa0ba902b7", + ...overrides, + }) as unknown as ReadableSpan; + +describe("wrapWorkflowSpanExporter / normaliseSpan", () => { + it("synthesises instrumentationScope from instrumentationLibrary", () => { + const { exporter, exported } = recordingExporter(); + const wrapped = wrapWorkflowSpanExporter(exporter); + + wrapped.export([v1Span()], () => {}); + + expect(exported).toHaveLength(1); + expect(exported[0]!.instrumentationScope).toEqual({ + name: "@temporalio/interceptor-workflow", + }); + }); + + it("synthesises parentSpanContext from parentSpanId, marking parent remote", () => { + const { exporter, exported } = recordingExporter(); + const wrapped = wrapWorkflowSpanExporter(exporter); + + wrapped.export([v1Span()], () => {}); + + const ctx = exported[0]!.parentSpanContext; + expect(ctx).toEqual({ + traceId: "0af7651916cd43dd8448eb211c80319c", + spanId: "00f067aa0ba902b7", + traceFlags: 1, + isRemote: true, + }); + }); + + it("falls back to an 'unknown' scope when both legacy and v2 fields are missing", () => { + const { exporter, exported } = 
recordingExporter(); + const wrapped = wrapWorkflowSpanExporter(exporter); + + wrapped.export( + [v1Span({ instrumentationLibrary: undefined, parentSpanId: undefined })], + () => {}, + ); + + expect(exported[0]!.instrumentationScope).toEqual({ name: "unknown" }); + expect(exported[0]!.parentSpanContext).toBeUndefined(); + }); + + it("leaves a span with no parent untouched on the parent field", () => { + const { exporter, exported } = recordingExporter(); + const wrapped = wrapWorkflowSpanExporter(exporter); + + wrapped.export([v1Span({ parentSpanId: undefined })], () => {}); + + expect(exported[0]!.parentSpanContext).toBeUndefined(); + }); + + it("passes through spans that are already v2-shaped", () => { + const { exporter, exported } = recordingExporter(); + const wrapped = wrapWorkflowSpanExporter(exporter); + + const scope = { name: "@opentelemetry/sdk-trace-node" }; + const parentSpanContext = { + traceId: "0af7651916cd43dd8448eb211c80319c", + spanId: "00f067aa0ba902b7", + traceFlags: 1, + isRemote: false, + }; + const v2 = { + ...v1Span(), + instrumentationLibrary: undefined, + parentSpanId: undefined, + instrumentationScope: scope, + parentSpanContext, + } as unknown as ReadableSpan; + + wrapped.export([v2], () => {}); + + expect(exported[0]).toBe(v2); + }); + + it("preserves attributes, events, kind, and `spanContext()` callability", () => { + const { exporter, exported } = recordingExporter(); + const wrapped = wrapWorkflowSpanExporter(exporter); + + const ctx = { + traceId: "0af7651916cd43dd8448eb211c80319c", + spanId: "b7ad6b7169203331", + traceFlags: 1, + }; + const events = [{ name: "evt", time: [0, 0], attributes: { x: 1 } }]; + const links = [{ context: ctx }]; + const span = v1Span({ + kind: 1, + attributes: { "service.name": "x" }, + events, + links, + startTime: [123, 0], + endTime: [124, 0], + duration: [1, 0], + ended: true, + droppedAttributesCount: 0, + droppedEventsCount: 0, + droppedLinksCount: 0, + spanContext: () => ctx, + }); + + 
wrapped.export([span], () => {}); + + const out = exported[0]!; + expect(out.kind).toBe(1); + expect(out.attributes).toEqual({ "service.name": "x" }); + expect(out.events).toBe(events); + expect(out.links).toBe(links); + expect(out.duration).toEqual([1, 0]); + expect(out.ended).toBe(true); + // The arrow-function `spanContext` is an own property in + // `extractReadableSpan`'s output. Spreading must keep it callable + // — the OTLP transformer reads `span.spanContext().spanId`. + expect(out.spanContext()).toEqual(ctx); + }); + + it("treats parentSpanId === '' as 'no parent' rather than synthesising an empty span ID", () => { + const { exporter, exported } = recordingExporter(); + const wrapped = wrapWorkflowSpanExporter(exporter); + + wrapped.export([v1Span({ parentSpanId: "" })], () => {}); + + // Empty string is falsy, so `needsParent` short-circuits; an + // explicit "" must not become `parentSpanContext.spanId === ""` on + // the wire (which Tempo would interpret as a malformed parent). + expect(exported[0]!.parentSpanContext).toBeUndefined(); + }); + + it("does not overwrite an existing parentSpanContext when legacy parentSpanId is also set", () => { + const { exporter, exported } = recordingExporter(); + const wrapped = wrapWorkflowSpanExporter(exporter); + + const existing = { + traceId: "0af7651916cd43dd8448eb211c80319c", + spanId: "ffffffffffffffff", + traceFlags: 1, + isRemote: false, + }; + wrapped.export( + [ + v1Span({ + parentSpanId: "00f067aa0ba902b7", + parentSpanContext: existing, + }), + ], + () => {}, + ); + + // v2 field wins — the adapter is meant to *fill in* gaps, not + // overwrite a context that's already present. 
+ expect(exported[0]!.parentSpanContext).toBe(existing); + }); + + it("normalises a mixed v1 / v2 batch per-span", () => { + const { exporter, exported } = recordingExporter(); + const wrapped = wrapWorkflowSpanExporter(exporter); + + const v1 = v1Span(); + const v2 = { + ...v1Span(), + instrumentationLibrary: undefined, + parentSpanId: undefined, + instrumentationScope: { name: "v2-scope" }, + } as unknown as ReadableSpan; + + wrapped.export([v1, v2, v1Span({ parentSpanId: undefined })], () => {}); + + expect(exported).toHaveLength(3); + expect(exported[0]!.instrumentationScope).toEqual({ + name: "@temporalio/interceptor-workflow", + }); + expect(exported[1]).toBe(v2); + expect(exported[2]!.parentSpanContext).toBeUndefined(); + }); + + it("propagates the inner exporter's result code to the outer callback", async () => { + const failing = { + export: ( + _spans: ReadableSpan[], + cb: (result: { code: number; error?: Error }) => void, + ) => cb({ code: 1, error: new Error("downstream failed") }), + shutdown: () => Promise.resolve(), + forceFlush: () => Promise.resolve(), + }; + const wrapped = wrapWorkflowSpanExporter(failing); + + const result = await new Promise<{ code: number; error?: Error }>( + (resolve) => { + wrapped.export([v1Span()], resolve); + }, + ); + + expect(result.code).toBe(1); + expect(result.error?.message).toBe("downstream failed"); + }); + + it("delegates shutdown and forceFlush to the inner exporter", async () => { + let shutdownCalls = 0; + let flushCalls = 0; + const inner: SpanExporter = { + export: (_, cb) => cb({ code: 0 }), + shutdown: () => { + shutdownCalls += 1; + return Promise.resolve(); + }, + forceFlush: () => { + flushCalls += 1; + return Promise.resolve(); + }, + }; + const wrapped = wrapWorkflowSpanExporter(inner); + + await wrapped.shutdown(); + await wrapped.forceFlush!(); + + expect(shutdownCalls).toBe(1); + expect(flushCalls).toBe(1); + }); +}); diff --git 
a/libs/@local/hash-backend-utils/src/temporal/workflow-span-adapter.ts b/libs/@local/hash-backend-utils/src/temporal/workflow-span-adapter.ts index 999f2843d30..e76977216b5 100644 --- a/libs/@local/hash-backend-utils/src/temporal/workflow-span-adapter.ts +++ b/libs/@local/hash-backend-utils/src/temporal/workflow-span-adapter.ts @@ -13,15 +13,30 @@ * Tempo renders every workflow/activity span as a root in the trace. * * `wrapWorkflowSpanExporter` returns a `SpanExporter` that synthesises - * the v2-shaped fields on each span on its way in. + * the v2-shaped fields on each span on its way in. `makeV2WorkflowSink` + * is the standard entry point for worker bootstraps — it produces a + * `WorkflowSinks` entry from an `OpenTelemetrySetup`, hiding the v1↔v2 + * type-cast in one place. * * TODO(BE-520): drop this adapter when * `@temporalio/interceptors-opentelemetry-v2` (PR * https://github.com/temporalio/sdk-typescript/pull/1951) is released. */ +import type { SpanContext } from "@opentelemetry/api"; import type { ExportResult } from "@opentelemetry/core"; import type { ReadableSpan, SpanExporter } from "@opentelemetry/sdk-trace-base"; +import { makeWorkflowExporter } from "@temporalio/interceptors-opentelemetry"; +import type { OpenTelemetrySetup } from "../opentelemetry.js"; + +/** + * v1-shaped fields that may appear on spans produced by Temporal's + * `extractReadableSpan`. The v2 `instrumentationScope` and + * `parentSpanContext` fields are also re-declared as optional because + * the runtime shape is narrower than v2's `ReadableSpan` types pretend + * — `extractReadableSpan` genuinely produces objects where they are + * `undefined`. 
+ */ interface LegacyReadableSpan { instrumentationLibrary?: { name: string; @@ -30,15 +45,23 @@ interface LegacyReadableSpan { }; instrumentationScope?: { name: string; version?: string; schemaUrl?: string }; parentSpanId?: string; + parentSpanContext?: SpanContext; } +/** + * `Omit` the v2 fields the legacy spans don't reliably populate, then + * re-add them via `LegacyReadableSpan` as optional. Without this the + * intersection inherits v2's required `instrumentationScope` typing + * and TypeScript flags every defensive `?? fallback` as "always truthy". + */ +type FlexibleSpan = Omit< + ReadableSpan, + "instrumentationScope" | "parentSpanContext" +> & + LegacyReadableSpan; + const normaliseSpan = (span: ReadableSpan): ReadableSpan => { - // The cast loses runtime correctness intentionally — Temporal's - // `extractReadableSpan` produces v1-shaped objects whose - // `instrumentationScope` and `parentSpanContext` are genuinely - // undefined at runtime, even though v2's `ReadableSpan` types them - // as required / present. - const legacy = span as ReadableSpan & LegacyReadableSpan; + const legacy = span as unknown as FlexibleSpan; const needsScope = !legacy.instrumentationScope; const needsParent = legacy.parentSpanId && !legacy.parentSpanContext; @@ -46,9 +69,8 @@ const normaliseSpan = (span: ReadableSpan): ReadableSpan => { return span; } - // Without an instrumentation identifier the OTLP resource-map logic - // still throws, so synthesise an "unknown" scope rather than letting - // the export crash. + // Synthesise an "unknown" scope as the last fallback — the OTLP + // resource-map logic crashes on a missing identifier. const instrumentationScope = legacy.instrumentationScope ?? legacy.instrumentationLibrary ?? 
{ name: "unknown" }; @@ -66,11 +88,10 @@ const normaliseSpan = (span: ReadableSpan): ReadableSpan => { }; } - return Object.assign( - Object.create(Object.getPrototypeOf(span) as object), - span, - { instrumentationScope, parentSpanContext }, - ) as ReadableSpan; + // Spread is safe: `extractReadableSpan` produces a plain object with + // `spanContext` as an own arrow-function property, not as a prototype + // method, so we don't lose any callable surface. + return { ...span, instrumentationScope, parentSpanContext } as ReadableSpan; }; export const wrapWorkflowSpanExporter = ( @@ -83,3 +104,24 @@ export const wrapWorkflowSpanExporter = ( shutdown: () => inner.shutdown(), forceFlush: () => inner.forceFlush?.() ?? Promise.resolve(), }); + +/** + * Build the Temporal workflow sink that exports workflow-sandbox spans + * through the application's OTLP trace exporter, normalising the v1↔v2 + * `ReadableSpan` shape on the way in. + * + * The `as unknown as` casts on the exporter and resource arguments are + * required because `@temporalio/interceptors-opentelemetry@1.x` declares + * them against `@opentelemetry/sdk-trace-base@1` types, while we run + * `@2`. They are the only place in our codebase that pins the v1 shape; + * removing them is the BE-520 deliverable. + */ +export const makeV2WorkflowSink = ( + setup: OpenTelemetrySetup, +): ReturnType => + makeWorkflowExporter( + wrapWorkflowSpanExporter(setup.traceExporter) as unknown as Parameters< + typeof makeWorkflowExporter + >[0], + setup.resource as unknown as Parameters[1], + ); diff --git a/libs/@local/temporal-client/src/ai.rs b/libs/@local/temporal-client/src/ai.rs index 3e4fbac29d3..95e083e0649 100644 --- a/libs/@local/temporal-client/src/ai.rs +++ b/libs/@local/temporal-client/src/ai.rs @@ -29,7 +29,10 @@ use crate::{TemporalClient, WorkflowError}; /// Header key used by `@temporalio/interceptors-opentelemetry` to carry the /// trace-context payload across workflow boundaries. 
Must stay in sync with -/// the constant declared in that package's `instrumentation.ts`. +/// `TRACE_HEADER` in that package's `instrumentation.ts`; if it drifts, +/// workflows started from Rust will ship correct headers that the TypeScript +/// inbound interceptor silently ignores, and every resulting span renders +/// parent-less in Tempo. const TRACE_HEADER: &str = "_tracer-data"; /// Adapter so `opentelemetry`'s text-map propagator can write into a plain @@ -56,6 +59,20 @@ fn build_otel_header() -> Option
{ propagator.inject_context(&context, &mut CarrierWriter(&mut carrier)); }); if carrier.is_empty() { + // Surface this once per process: an empty carrier means either no + // active tracing span (caller missing `#[instrument]`) or no + // global propagator registered (telemetry bootstrap missing + // `set_text_map_propagator`). Either way the workflow will start + // with no parent context and the worker-side span renders detached + // from the caller's trace. + static WARNED: std::sync::OnceLock<()> = std::sync::OnceLock::new(); + WARNED.get_or_init(|| { + tracing::warn!( + "OpenTelemetry text-map propagator wrote no headers when starting workflow; \ + workflow spans will be parent-less. Verify the global propagator is installed \ + and the calling fn carries an active tracing span." + ); + }); return None; } @@ -83,15 +100,17 @@ struct AuthenticationContext { } impl TemporalClient { - /// Start a workflow on the `ai` task queue with the active OTEL trace - /// context propagated through the workflow headers. The TypeScript - /// worker's `OpenTelemetryInboundInterceptor` extracts this context so - /// the resulting `RunWorkflow` and `RunActivity` spans chain off the - /// caller's trace. + /// Start a workflow on the `ai` task queue, injecting the active + /// OTEL trace context into the workflow start headers so the + /// worker-side interceptors can parent the workflow + activity + /// spans off the caller's trace. /// - /// Bypasses the `WorkflowClientTrait::start_workflow` helper because - /// that helper does not expose the proto `header` field. Calls - /// `WorkflowService::start_workflow_execution` directly instead. + /// Goes via the low-level `WorkflowService::start_workflow_execution` + /// because `WorkflowClientTrait::start_workflow` does not expose the + /// proto `header` field. 
The span is annotated with `otel.kind = + /// "producer"` for the asynchronous fire-and-forget shape (the value + /// is case-sensitive; `tracing-opentelemetry` falls back to + /// `Internal` on typos). #[instrument( skip(self, payload), fields(workflow_type = workflow, otel.kind = "producer"), diff --git a/yarn.lock b/yarn.lock index 5ec7eab46f5..33a7017bf6c 100644 --- a/yarn.lock +++ b/yarn.lock @@ -9219,6 +9219,7 @@ __metadata: "@opentelemetry/exporter-metrics-otlp-grpc": "npm:0.207.0" "@opentelemetry/exporter-trace-otlp-grpc": "npm:0.207.0" "@opentelemetry/instrumentation": "npm:0.207.0" + "@opentelemetry/instrumentation-http": "npm:0.207.0" "@opentelemetry/instrumentation-undici": "npm:0.25.0" "@opentelemetry/resources": "npm:2.2.0" "@opentelemetry/sdk-logs": "npm:0.207.0" From b46d8666a3e6d1e496224216f4afdec12bf830a6 Mon Sep 17 00:00:00 2001 From: Tim Diekmann Date: Thu, 30 Apr 2026 13:28:22 +0200 Subject: [PATCH 03/17] BE-519: Address PR review findings - Fix tsc failure in @tests/hash-backend-integration: setup-opentelemetry.ts pointed at the removed @apps/hash-api/src/graphql/opentelemetry module; migrate it to @local/hash-backend-utils/opentelemetry with the new options-object signature. - Restore the workflow-start identity field that the high-level WorkflowClientTrait::start_workflow auto-populated; the low-level StartWorkflowExecutionRequest defaulted it to "" so Temporal Server could not attribute starts to a client. - Use URL.hostname (not URL.host) when resolving peer.service so outbound origins like https://api.openai.com:443/ still match the exact-host rules. - Fix the worker-bootstrap serviceName doc-comment: it claimed the value is also used as Temporal worker identity, but Worker.create keeps the SDK default (pid@hostname). Document the actual usage (service.name in the OTEL resource). - Move the OTEL shutdown error logging into the per-target try/catch so the label is captured at execution time instead of indexed back out of the targets array. 
--- .../hash-backend-utils/src/opentelemetry.ts | 28 ++++++++++--------- .../src/temporal/worker-bootstrap.ts | 7 ++++- libs/@local/temporal-client/src/ai.rs | 7 +++++ .../src/tests/setup-opentelemetry.ts | 12 ++++++-- 4 files changed, 37 insertions(+), 17 deletions(-) diff --git a/libs/@local/hash-backend-utils/src/opentelemetry.ts b/libs/@local/hash-backend-utils/src/opentelemetry.ts index e4bd886aa16..d4f9b21c99c 100644 --- a/libs/@local/hash-backend-utils/src/opentelemetry.ts +++ b/libs/@local/hash-backend-utils/src/opentelemetry.ts @@ -111,8 +111,11 @@ export const createUndiciInstrumentation = (): UndiciInstrumentation => new UndiciInstrumentation({ startSpanHook: (request) => { try { - const { host } = new URL(request.origin); - const peerService = resolvePeerService(host); + // `hostname` (not `host`): the latter includes the port for non-default + // schemes (e.g. `api.openai.com:443`), which would miss exact matches + // in `resolvePeerService`. + const { hostname } = new URL(request.origin); + const peerService = resolvePeerService(hostname); return peerService ? 
{ "peer.service": peerService } : {}; } catch { return {}; @@ -263,18 +266,17 @@ export const registerOpenTelemetry = ({ ["log provider", () => logProvider.shutdown()], ["meter provider", () => meterProvider.shutdown()], ]; - const results = await Promise.allSettled( - targets.map(([label, run]) => shutdownWithTimeout(label, run)), + await Promise.allSettled( + targets.map(async ([label, run]) => { + try { + await shutdownWithTimeout(label, run); + } catch (error) { + // eslint-disable-next-line no-console + console.error("OpenTelemetry %s shutdown failed:", label, error); + throw error; + } + }), ); - for (const [index, result] of results.entries()) { - if (result.status === "rejected") { - // eslint-disable-next-line no-console - console.error( - `OpenTelemetry ${targets[index]![0]} shutdown failed:`, - result.reason, - ); - } - } unregisterInstrumentations(); }, }; diff --git a/libs/@local/hash-backend-utils/src/temporal/worker-bootstrap.ts b/libs/@local/hash-backend-utils/src/temporal/worker-bootstrap.ts index f699e1c5893..1852888842a 100644 --- a/libs/@local/hash-backend-utils/src/temporal/worker-bootstrap.ts +++ b/libs/@local/hash-backend-utils/src/temporal/worker-bootstrap.ts @@ -100,7 +100,12 @@ export type ExtraWorkerOptions = Omit< >; export interface RunWorkerOptions { - /** Logged once at startup; also used as Temporal worker identity. */ + /** + * Logged once at startup and used as `service.name` for OTEL traces / + * logs / metrics. The Temporal worker identity stays at the SDK default + * (`pid@hostname`) so multiple replicas remain distinguishable in the + * Temporal UI. + */ serviceName: string; /** Temporal task queue this worker pulls work from. 
*/ taskQueue: string; diff --git a/libs/@local/temporal-client/src/ai.rs b/libs/@local/temporal-client/src/ai.rs index 95e083e0649..9a45849a5e8 100644 --- a/libs/@local/temporal-client/src/ai.rs +++ b/libs/@local/temporal-client/src/ai.rs @@ -121,6 +121,12 @@ impl TemporalClient { payload: &(impl Serialize + Sync), ) -> Result> { let mut client = self.client.clone(); + // `WorkflowClientTrait::start_workflow` auto-populates `identity` from + // `ClientOptions` (typically `pid@hostname`). The low-level + // `StartWorkflowExecutionRequest` defaults it to an empty string, + // which makes Temporal Server / UI unable to attribute starts to a + // client. Read it back from the configured client. + let identity = client.get_client().identity().to_owned(); let request = StartWorkflowExecutionRequest { namespace: <_ as NamespacedClient>::namespace(&client), input: vec![Payload { @@ -141,6 +147,7 @@ impl TemporalClient { kind: TaskQueueKind::Unspecified as i32, normal_name: String::new(), }), + identity, request_id: Uuid::new_v4().to_string(), header: build_otel_header(), ..Default::default() diff --git a/tests/hash-backend-integration/src/tests/setup-opentelemetry.ts b/tests/hash-backend-integration/src/tests/setup-opentelemetry.ts index c689f5fc760..a6aed910015 100644 --- a/tests/hash-backend-integration/src/tests/setup-opentelemetry.ts +++ b/tests/hash-backend-integration/src/tests/setup-opentelemetry.ts @@ -1,7 +1,13 @@ -import { registerOpenTelemetry } from "@apps/hash-api/src/graphql/opentelemetry"; +import { + createUndiciInstrumentation, + registerOpenTelemetry, +} from "@local/hash-backend-utils/opentelemetry"; -// Initialize OpenTelemetry for backend integration tests const otlpEndpoint = process.env.HASH_OTLP_ENDPOINT; if (otlpEndpoint) { - registerOpenTelemetry(otlpEndpoint, "BE Integration Tests"); + registerOpenTelemetry({ + endpoint: otlpEndpoint, + serviceName: "BE Integration Tests", + instrumentations: [createUndiciInstrumentation()], + }); } From 
2b2962d8d9047a02cb4d3b54e91ce8de146133db Mon Sep 17 00:00:00 2001 From: Tim Diekmann Date: Thu, 30 Apr 2026 13:36:47 +0200 Subject: [PATCH 04/17] Avoid cloning client identity in Temporal client --- libs/@local/temporal-client/src/ai.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/@local/temporal-client/src/ai.rs b/libs/@local/temporal-client/src/ai.rs index 9a45849a5e8..8eae08258bd 100644 --- a/libs/@local/temporal-client/src/ai.rs +++ b/libs/@local/temporal-client/src/ai.rs @@ -126,7 +126,7 @@ impl TemporalClient { // `StartWorkflowExecutionRequest` defaults it to an empty string, // which makes Temporal Server / UI unable to attribute starts to a // client. Read it back from the configured client. - let identity = client.get_client().identity().to_owned(); + let identity = client.get_client().identity(); let request = StartWorkflowExecutionRequest { namespace: <_ as NamespacedClient>::namespace(&client), input: vec![Payload { From ec05476d322de6e0b02176bdf50c82d71085b0ea Mon Sep 17 00:00:00 2001 From: Tim Diekmann Date: Thu, 30 Apr 2026 14:04:13 +0200 Subject: [PATCH 05/17] BE-519: Extract createHttpInstrumentation helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The OTLP-port filter on HttpInstrumentation was hardcoded to 4317 in three instrument files. A non-default HASH_OTLP_ENDPOINT port (e.g. :4318 for OTLP/HTTP) bypassed the filter and let the http instrumentation trace the exporter's own outbound traffic — each export creating a span that gets queued for the next export, amplifying batch volume. - Add createHttpInstrumentation(otlpEndpoint, extra?) to the shared OTEL module. Parses the port out of the endpoint URL once at bootstrap and falls back to 4317 when the URL has no explicit port or is malformed. - Replace the three inline new HttpInstrumentation({...}) blocks with the helper. 
- Move @opentelemetry/instrumentation-http from devDependencies to dependencies in @local/hash-backend-utils (the helper now constructs it at runtime); drop the now-unused direct dep from each app. --- apps/hash-ai-worker-ts/package.json | 1 - apps/hash-ai-worker-ts/src/instrument.ts | 10 +--- apps/hash-api/package.json | 1 - apps/hash-api/src/instrument.mjs | 10 +--- apps/hash-integration-worker/package.json | 1 - .../hash-integration-worker/src/instrument.ts | 10 +--- libs/@local/hash-backend-utils/package.json | 2 +- .../hash-backend-utils/src/opentelemetry.ts | 53 ++++++++++++++++++- yarn.lock | 3 -- 9 files changed, 59 insertions(+), 32 deletions(-) diff --git a/apps/hash-ai-worker-ts/package.json b/apps/hash-ai-worker-ts/package.json index bf12def9436..04069ff3804 100644 --- a/apps/hash-ai-worker-ts/package.json +++ b/apps/hash-ai-worker-ts/package.json @@ -60,7 +60,6 @@ "@opentelemetry/api-logs": "0.207.0", "@opentelemetry/instrumentation": "0.207.0", "@opentelemetry/instrumentation-grpc": "0.207.0", - "@opentelemetry/instrumentation-http": "0.207.0", "@sentry/node": "10.42.0", "@temporalio/activity": "1.12.1", "@temporalio/common": "1.12.1", diff --git a/apps/hash-ai-worker-ts/src/instrument.ts b/apps/hash-ai-worker-ts/src/instrument.ts index 72e866aa325..c50f46e5ec6 100644 --- a/apps/hash-ai-worker-ts/src/instrument.ts +++ b/apps/hash-ai-worker-ts/src/instrument.ts @@ -4,12 +4,11 @@ * and gRPC modules before any other code requires them. */ import { + createHttpInstrumentation, createUndiciInstrumentation, - httpRequestSpanNameHook, registerOpenTelemetry, } from "@local/hash-backend-utils/opentelemetry"; import { GrpcInstrumentation } from "@opentelemetry/instrumentation-grpc"; -import { HttpInstrumentation } from "@opentelemetry/instrumentation-http"; /** * Setup handles. 
`undefined` when no `HASH_OTLP_ENDPOINT` is configured @@ -25,12 +24,7 @@ export const otelSetup: ReturnType = (() => { endpoint: otlpEndpoint, serviceName: process.env.OTEL_SERVICE_NAME ?? "AI Worker", instrumentations: [ - new HttpInstrumentation({ - // Don't trace the OTLP exporter's own outgoing requests — every - // export would create a span that needs to be exported, recursively. - ignoreOutgoingRequestHook: (options) => options.port === 4317, - requestHook: httpRequestSpanNameHook, - }), + createHttpInstrumentation(otlpEndpoint), new GrpcInstrumentation(), // Native `fetch` (used by openai / @anthropic-ai/sdk / Vertex AI // SDKs) goes through undici, which the http instrumentation does diff --git a/apps/hash-api/package.json b/apps/hash-api/package.json index 796ce43a439..58972e49f89 100644 --- a/apps/hash-api/package.json +++ b/apps/hash-api/package.json @@ -54,7 +54,6 @@ "@opentelemetry/instrumentation": "0.207.0", "@opentelemetry/instrumentation-express": "0.56.0", "@opentelemetry/instrumentation-graphql": "0.55.0", - "@opentelemetry/instrumentation-http": "0.207.0", "@opentelemetry/resources": "2.2.0", "@opentelemetry/sdk-logs": "0.207.0", "@opentelemetry/sdk-trace-base": "2.2.0", diff --git a/apps/hash-api/src/instrument.mjs b/apps/hash-api/src/instrument.mjs index 2d6711a790c..2a0654086ef 100644 --- a/apps/hash-api/src/instrument.mjs +++ b/apps/hash-api/src/instrument.mjs @@ -2,8 +2,8 @@ import "@local/hash-backend-utils/environment"; import { + createHttpInstrumentation, createUndiciInstrumentation, - httpRequestSpanNameHook, registerOpenTelemetry, } from "@local/hash-backend-utils/opentelemetry"; import { @@ -11,7 +11,6 @@ import { ExpressLayerType, } from "@opentelemetry/instrumentation-express"; import { GraphQLInstrumentation } from "@opentelemetry/instrumentation-graphql"; -import { HttpInstrumentation } from "@opentelemetry/instrumentation-http"; import * as Sentry from "@sentry/node"; import { isProdEnv } from "./lib/env-config"; @@ -34,12 
+33,7 @@ export const otelSetup = (() => { endpoint: otlpEndpoint, serviceName: process.env.OTEL_SERVICE_NAME || "Node API", instrumentations: [ - new HttpInstrumentation({ - // Don't trace the OTLP exporter's own outgoing requests — every - // export would create a span that needs to be exported, recursively. - ignoreOutgoingRequestHook: (options) => options.port === 4317, - requestHook: httpRequestSpanNameHook, - }), + createHttpInstrumentation(otlpEndpoint), new ExpressInstrumentation({ ignoreLayersType: [ExpressLayerType.MIDDLEWARE], }), diff --git a/apps/hash-integration-worker/package.json b/apps/hash-integration-worker/package.json index f09f045a2cf..9a12d26600a 100644 --- a/apps/hash-integration-worker/package.json +++ b/apps/hash-integration-worker/package.json @@ -32,7 +32,6 @@ "@opentelemetry/api-logs": "0.207.0", "@opentelemetry/instrumentation": "0.207.0", "@opentelemetry/instrumentation-grpc": "0.207.0", - "@opentelemetry/instrumentation-http": "0.207.0", "@sentry/node": "10.42.0", "@temporalio/activity": "1.12.1", "@temporalio/interceptors-opentelemetry": "1.12.1", diff --git a/apps/hash-integration-worker/src/instrument.ts b/apps/hash-integration-worker/src/instrument.ts index 45a4a73c90a..a7e0e049e6d 100644 --- a/apps/hash-integration-worker/src/instrument.ts +++ b/apps/hash-integration-worker/src/instrument.ts @@ -4,12 +4,11 @@ * patch http and gRPC modules before any other code requires them. */ import { + createHttpInstrumentation, createUndiciInstrumentation, - httpRequestSpanNameHook, registerOpenTelemetry, } from "@local/hash-backend-utils/opentelemetry"; import { GrpcInstrumentation } from "@opentelemetry/instrumentation-grpc"; -import { HttpInstrumentation } from "@opentelemetry/instrumentation-http"; /** * Setup handles. `undefined` when no `HASH_OTLP_ENDPOINT` is configured @@ -25,12 +24,7 @@ export const otelSetup: ReturnType = (() => { endpoint: otlpEndpoint, serviceName: process.env.OTEL_SERVICE_NAME ?? 
"Integration Worker", instrumentations: [ - new HttpInstrumentation({ - // Don't trace the OTLP exporter's own outgoing requests — every - // export would create a span that needs to be exported, recursively. - ignoreOutgoingRequestHook: (options) => options.port === 4317, - requestHook: httpRequestSpanNameHook, - }), + createHttpInstrumentation(otlpEndpoint), new GrpcInstrumentation(), // Native `fetch` (used by Linear SDK / outbound API calls) goes // through undici, which the http instrumentation does not patch. diff --git a/libs/@local/hash-backend-utils/package.json b/libs/@local/hash-backend-utils/package.json index fb811fcff9a..53a7c7ade1f 100644 --- a/libs/@local/hash-backend-utils/package.json +++ b/libs/@local/hash-backend-utils/package.json @@ -44,6 +44,7 @@ "@opentelemetry/exporter-metrics-otlp-grpc": "0.207.0", "@opentelemetry/exporter-trace-otlp-grpc": "0.207.0", "@opentelemetry/instrumentation": "0.207.0", + "@opentelemetry/instrumentation-http": "0.207.0", "@opentelemetry/instrumentation-undici": "0.25.0", "@opentelemetry/resources": "2.2.0", "@opentelemetry/sdk-logs": "0.207.0", @@ -78,7 +79,6 @@ "devDependencies": { "@local/eslint": "workspace:*", "@local/tsconfig": "workspace:*", - "@opentelemetry/instrumentation-http": "0.207.0", "@types/dotenv-flow": "3.3.3", "@types/mime-types": "2.1.4", "@types/node": "22.18.13", diff --git a/libs/@local/hash-backend-utils/src/opentelemetry.ts b/libs/@local/hash-backend-utils/src/opentelemetry.ts index d4f9b21c99c..5b3f8a33f0e 100644 --- a/libs/@local/hash-backend-utils/src/opentelemetry.ts +++ b/libs/@local/hash-backend-utils/src/opentelemetry.ts @@ -15,7 +15,10 @@ import { OTLPMetricExporter } from "@opentelemetry/exporter-metrics-otlp-grpc"; import { OTLPTraceExporter } from "@opentelemetry/exporter-trace-otlp-grpc"; import type { Instrumentation } from "@opentelemetry/instrumentation"; import { registerInstrumentations } from "@opentelemetry/instrumentation"; -import type { HttpInstrumentationConfig } 
from "@opentelemetry/instrumentation-http"; +import { + HttpInstrumentation, + type HttpInstrumentationConfig, +} from "@opentelemetry/instrumentation-http"; import { UndiciInstrumentation } from "@opentelemetry/instrumentation-undici"; import type { Resource } from "@opentelemetry/resources"; import { @@ -162,6 +165,54 @@ export const httpRequestSpanNameHook: NonNullable< } }; +/** + * Default OTLP/gRPC port. Used when the configured endpoint URL does not + * carry an explicit port (e.g. `http://collector` resolves via gRPC default). + */ +const DEFAULT_OTLP_PORT = 4317; + +const otlpPortFromEndpoint = (otlpEndpoint: string): number => { + try { + const { port } = new URL(otlpEndpoint); + return port ? Number.parseInt(port, 10) : DEFAULT_OTLP_PORT; + } catch { + // `registerOpenTelemetry` will surface the malformed-URL error when + // it builds the exporter; here we just fall back so the filter does + // not throw on every outgoing request. + return DEFAULT_OTLP_PORT; + } +}; + +/** + * `@opentelemetry/instrumentation-http` configured for HASH services: + * + * - Skips outgoing requests to the OTLP collector port. Without this filter + * each export would itself produce a span, which would be batched for + * export, which would produce another span — amplifying export volume on + * every batch. The port is derived from `otlpEndpoint` so a non-default + * collector port (e.g. `:4318` for OTLP/HTTP) still gets ignored. + * - Names spans `METHOD /path` via {@link httpRequestSpanNameHook}. + * + * Pass `extra` to merge per-service options (e.g. `ignoreIncomingPaths`). + * `ignoreOutgoingRequestHook` and `requestHook` are intentionally not + * mergeable here — callers needing different shapes should construct + * `HttpInstrumentation` directly. 
+ */ +export const createHttpInstrumentation = ( + otlpEndpoint: string, + extra: Omit< + HttpInstrumentationConfig, + "ignoreOutgoingRequestHook" | "requestHook" + > = {}, +): HttpInstrumentation => { + const otlpPort = otlpPortFromEndpoint(otlpEndpoint); + return new HttpInstrumentation({ + ...extra, + ignoreOutgoingRequestHook: (options) => options.port === otlpPort, + requestHook: httpRequestSpanNameHook, + }); +}; + const shutdownWithTimeout = async ( label: string, shutdown: () => Promise, diff --git a/yarn.lock b/yarn.lock index 33a7017bf6c..25150abacb4 100644 --- a/yarn.lock +++ b/yarn.lock @@ -356,7 +356,6 @@ __metadata: "@opentelemetry/api-logs": "npm:0.207.0" "@opentelemetry/instrumentation": "npm:0.207.0" "@opentelemetry/instrumentation-grpc": "npm:0.207.0" - "@opentelemetry/instrumentation-http": "npm:0.207.0" "@sentry/node": "npm:10.42.0" "@temporalio/activity": "npm:1.12.1" "@temporalio/common": "npm:1.12.1" @@ -448,7 +447,6 @@ __metadata: "@opentelemetry/instrumentation": "npm:0.207.0" "@opentelemetry/instrumentation-express": "npm:0.56.0" "@opentelemetry/instrumentation-graphql": "npm:0.55.0" - "@opentelemetry/instrumentation-http": "npm:0.207.0" "@opentelemetry/resources": "npm:2.2.0" "@opentelemetry/sdk-logs": "npm:0.207.0" "@opentelemetry/sdk-trace-base": "npm:2.2.0" @@ -710,7 +708,6 @@ __metadata: "@opentelemetry/api-logs": "npm:0.207.0" "@opentelemetry/instrumentation": "npm:0.207.0" "@opentelemetry/instrumentation-grpc": "npm:0.207.0" - "@opentelemetry/instrumentation-http": "npm:0.207.0" "@sentry/node": "npm:10.42.0" "@temporalio/activity": "npm:1.12.1" "@temporalio/interceptors-opentelemetry": "npm:1.12.1" From 3c35253023ece73c49818179249059e0a86e1654 Mon Sep 17 00:00:00 2001 From: Tim Diekmann Date: Thu, 30 Apr 2026 14:05:12 +0200 Subject: [PATCH 06/17] BE-519: Flush OTEL on fatal worker errors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If worker.run() rejects with a fatal error (no signal), 
the signal-driven shutdown handler never runs, so BatchSpanProcessor / BatchLogRecordProcessor buffers were dropped — losing telemetry from the very session whose crash the operator most wants to investigate. Wrap await workerRunPromise in try/catch; on rejection, close the health server and flush the OTEL providers before rethrowing into main.ts. Guard with shuttingDown to avoid double-flushing if a signal raced in. --- .../src/temporal/worker-bootstrap.ts | 25 ++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/libs/@local/hash-backend-utils/src/temporal/worker-bootstrap.ts b/libs/@local/hash-backend-utils/src/temporal/worker-bootstrap.ts index 1852888842a..7d9b51a881a 100644 --- a/libs/@local/hash-backend-utils/src/temporal/worker-bootstrap.ts +++ b/libs/@local/hash-backend-utils/src/temporal/worker-bootstrap.ts @@ -294,5 +294,28 @@ export async function runWorker(opts: RunWorkerOptions): Promise { process.on("SIGINT", onSignal); process.on("SIGTERM", onSignal); - await workerRunPromise; + try { + await workerRunPromise; + } catch (error) { + // Fatal worker error (no signal): the signal-driven `shutdown()` never + // ran, so the OTEL batch processors still hold the spans / logs from + // the crashing session. Flush them before letting `main.ts` exit. + // The `shuttingDown` guard avoids double-flushing if a signal raced in + // and `shutdown()` is already running concurrently — TS can't see the + // closure mutation, hence the disable. 
+ // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition + if (!shuttingDown) { + try { + httpServer.close(); + } catch (closeError) { + logger.error("Health-check server close failed", { error: closeError }); + } + try { + await otelSetup?.shutdown(); + } catch (flushError) { + logger.error("Failed to flush OpenTelemetry", { error: flushError }); + } + } + throw error; + } } From 575689c3bab85cf4c83f0e47f223cf58ab17e1ef Mon Sep 17 00:00:00 2001 From: Tim Diekmann Date: Thu, 30 Apr 2026 14:15:48 +0200 Subject: [PATCH 07/17] BE-519: Surface OTEL shutdown failures as AggregateError Promise.allSettled never rejects, so the caller's `await otelSetup.shutdown()` always resolved successfully even when individual provider shutdowns failed. That made the surrounding `catch (error) { exitCode = 1 }` blocks in worker-bootstrap.ts dead code: a flush failure left the worker exiting 0, hiding partial telemetry loss from K8s / the operator. Inspect the settled results and throw an AggregateError when any provider rejected. Per-provider stderr logging stays in place; the AggregateError just lets the caller's existing escalation paths (exitCode = 1) actually fire. 
--- .../hash-backend-utils/src/opentelemetry.ts | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/libs/@local/hash-backend-utils/src/opentelemetry.ts b/libs/@local/hash-backend-utils/src/opentelemetry.ts index 5b3f8a33f0e..ef6fb58d48d 100644 --- a/libs/@local/hash-backend-utils/src/opentelemetry.ts +++ b/libs/@local/hash-backend-utils/src/opentelemetry.ts @@ -317,7 +317,7 @@ export const registerOpenTelemetry = ({ ["log provider", () => logProvider.shutdown()], ["meter provider", () => meterProvider.shutdown()], ]; - await Promise.allSettled( + const results = await Promise.allSettled( targets.map(async ([label, run]) => { try { await shutdownWithTimeout(label, run); @@ -329,6 +329,20 @@ export const registerOpenTelemetry = ({ }), ); unregisterInstrumentations(); + // `Promise.allSettled` itself never rejects, so without inspecting the + // results the caller's `catch` block would never fire and downstream + // exit-code / error-reporting logic would treat partial flush failures + // as success. Surface them as an `AggregateError` so the caller can + // react. + const failures = results.flatMap((result) => + result.status === "rejected" ? [result.reason as unknown] : [], + ); + if (failures.length > 0) { + throw new AggregateError( + failures, + "One or more OpenTelemetry providers failed to shut down", + ); + } }, }; }; From 55a7b53e367ab523be7bd3e7a0892f1dd4f2a747 Mon Sep 17 00:00:00 2001 From: Tim Diekmann Date: Thu, 30 Apr 2026 14:28:50 +0200 Subject: [PATCH 08/17] BE-519: Move worker cleanup onto the linear path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bugbot caught a race in the previous shutdown structure. When SIGTERM arrived, the signal handler's shutdown() called worker.shutdown() which resolved workerRunPromise. Both awaits (shutdown's and the main flow's) woke up, runWorker returned to main.ts, run() resolved — but shutdown() was still mid-flush as fire-and-forget. 
With otelSetup undefined the event loop could empty before process.exit() fired, racing the chosen exit code against Node's natural termination. Restructure: signal handler only triggers worker.shutdown() and flips a flag. All cleanup (HTTP close + OTEL flush + exit) lives on a single linear path that runs on every termination mode (signal / fatal worker error / clean worker exit). This also covers a previously-missing case: clean worker exit was returning without flushing OTEL. --- .../src/temporal/worker-bootstrap.ts | 89 ++++++++----------- 1 file changed, 37 insertions(+), 52 deletions(-) diff --git a/libs/@local/hash-backend-utils/src/temporal/worker-bootstrap.ts b/libs/@local/hash-backend-utils/src/temporal/worker-bootstrap.ts index 7d9b51a881a..641de30c662 100644 --- a/libs/@local/hash-backend-utils/src/temporal/worker-bootstrap.ts +++ b/libs/@local/hash-backend-utils/src/temporal/worker-bootstrap.ts @@ -245,77 +245,62 @@ export async function runWorker(opts: RunWorkerOptions): Promise { }); }); - // Start the worker; `worker.run()` resolves once the SDK has fully - // drained in-flight activities after `worker.shutdown()` is called. - // The shutdown handler awaits this promise so SIGTERM doesn't kill - // activities mid-execution. + // `worker.run()` resolves once the SDK has fully drained in-flight + // activities after `worker.shutdown()` is called. const workerRunPromise = worker.run(); + // Signal handler ONLY triggers drain. All cleanup (HTTP close + OTEL + // flush + process.exit) lives on the linear path below, so it runs on + // every termination mode: signal-driven shutdown, fatal worker error, + // and clean worker exit. A fire-and-forget signal handler that owns + // cleanup races against the main flow returning to `main.ts`. 
let shuttingDown = false; - const shutdown = async (signal: NodeJS.Signals) => { + const onSignal = (signal: NodeJS.Signals) => { if (shuttingDown) { return; } shuttingDown = true; logger.info(`Received ${signal}, exiting...`); - let exitCode = 0; try { worker.shutdown(); } catch (error) { logger.error("Worker shutdown trigger failed", { error }); - exitCode = 1; - } - try { - await workerRunPromise; - } catch (error) { - logger.error("Worker drain failed", { error }); - exitCode = 1; - } - try { - httpServer.close(); - } catch (error) { - logger.error("Health-check server close failed", { error }); } - try { - await otelSetup?.shutdown(); - } catch (error) { - logger.error("Failed to flush OpenTelemetry", { error }); - exitCode = 1; - } - process.exit(exitCode); - }; - - const onSignal = (signal: NodeJS.Signals) => { - shutdown(signal).catch((error: unknown) => { - logger.error("Shutdown handler threw", { error }); - process.exit(1); - }); }; process.on("SIGINT", onSignal); process.on("SIGTERM", onSignal); + let exitCode = 0; + let workerError: unknown; try { await workerRunPromise; } catch (error) { - // Fatal worker error (no signal): the signal-driven `shutdown()` never - // ran, so the OTEL batch processors still hold the spans / logs from - // the crashing session. Flush them before letting `main.ts` exit. - // The `shuttingDown` guard avoids double-flushing if a signal raced in - // and `shutdown()` is already running concurrently — TS can't see the - // closure mutation, hence the disable. - // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition - if (!shuttingDown) { - try { - httpServer.close(); - } catch (closeError) { - logger.error("Health-check server close failed", { error: closeError }); - } - try { - await otelSetup?.shutdown(); - } catch (flushError) { - logger.error("Failed to flush OpenTelemetry", { error: flushError }); - } - } - throw error; + workerError = error; + exitCode = 1; + logger.error(shuttingDown ? 
"Worker drain failed" : "Worker run failed", { + error, + }); + } + + try { + httpServer.close(); + } catch (error) { + logger.error("Health-check server close failed", { error }); + } + try { + await otelSetup?.shutdown(); + } catch (error) { + logger.error("Failed to flush OpenTelemetry", { error }); + exitCode = 1; + } + + if (shuttingDown) { + // Signal-driven exit: `main.ts` has nothing to do after `run()` + // returns, so terminate explicitly with the chosen exit code. + process.exit(exitCode); + } + if (workerError) { + throw workerError; } + // Clean worker exit: return; `main.ts` has no further work. } From 669ed316bdc96b4672c4dfece3c2ea3a0dfc1b76 Mon Sep 17 00:00:00 2001 From: Tim Diekmann Date: Thu, 30 Apr 2026 15:03:39 +0200 Subject: [PATCH 09/17] BE-519: Silence lint on closure-mutated shuttingDown ESLint's no-unnecessary-condition narrows shuttingDown to its initial false here because the only mutation lives inside the SIGINT/SIGTERM handler closure, which TS-flow does not see from the surrounding scope. Same shape for the rethrow: workerError holds whatever the SDK threw (typed unknown), so only-throw-error fires on the rethrow even though preserving the original value is the correct behaviour. --- .../hash-backend-utils/src/temporal/worker-bootstrap.ts | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/libs/@local/hash-backend-utils/src/temporal/worker-bootstrap.ts b/libs/@local/hash-backend-utils/src/temporal/worker-bootstrap.ts index 641de30c662..a9943c03bf1 100644 --- a/libs/@local/hash-backend-utils/src/temporal/worker-bootstrap.ts +++ b/libs/@local/hash-backend-utils/src/temporal/worker-bootstrap.ts @@ -277,6 +277,10 @@ export async function runWorker(opts: RunWorkerOptions): Promise { } catch (error) { workerError = error; exitCode = 1; + // `shuttingDown` is mutated in the signal-handler closure; TS narrows + // it to the initial `false` here so the conditional looks dead to + // eslint. 
+ // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition logger.error(shuttingDown ? "Worker drain failed" : "Worker run failed", { error, }); @@ -294,12 +298,16 @@ export async function runWorker(opts: RunWorkerOptions): Promise { exitCode = 1; } + // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition if (shuttingDown) { // Signal-driven exit: `main.ts` has nothing to do after `run()` // returns, so terminate explicitly with the chosen exit code. process.exit(exitCode); } if (workerError) { + // The catch block stored whatever the SDK threw; rethrow the same + // value so the caller sees the original. + // eslint-disable-next-line @typescript-eslint/only-throw-error throw workerError; } // Clean worker exit: return; `main.ts` has no further work. From 6fca0381648b94c8c316716087d300b2536bd6e7 Mon Sep 17 00:00:00 2001 From: Tim Diekmann Date: Thu, 30 Apr 2026 16:11:56 +0200 Subject: [PATCH 10/17] BE-519: Move createTemporalSdkLogger to worker-bootstrap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The helper imports DefaultLogger from @temporalio/worker, which bundles the native Rust core bindings. Keeping it in temporal.ts pulled the worker package into every consumer of createTemporalClient — including hash-api, where it has no business being. worker-bootstrap.ts is the only caller, so the helper moves there and becomes file-private. temporal.ts is back to a pure @temporalio/client surface. 
--- .../@local/hash-backend-utils/src/temporal.ts | 34 ------------------ .../src/temporal/worker-bootstrap.ts | 36 ++++++++++++++++++- 2 files changed, 35 insertions(+), 35 deletions(-) diff --git a/libs/@local/hash-backend-utils/src/temporal.ts b/libs/@local/hash-backend-utils/src/temporal.ts index 18cc1dca253..da82626d559 100644 --- a/libs/@local/hash-backend-utils/src/temporal.ts +++ b/libs/@local/hash-backend-utils/src/temporal.ts @@ -1,47 +1,13 @@ import { trace } from "@opentelemetry/api"; import { Client as TemporalClient, Connection } from "@temporalio/client"; import { OpenTelemetryWorkflowClientInterceptor } from "@temporalio/interceptors-opentelemetry"; -import { DefaultLogger } from "@temporalio/worker"; import { getRequiredEnv } from "./environment.js"; -import type { Logger } from "./logger.js"; export { Client as TemporalClient } from "@temporalio/client"; export const temporalNamespace = "HASH"; -/** - * Adapter that pipes Temporal SDK logs (both Rust core and Node-side - * worker events) through the application logger, keeping them in the - * same JSON format and log-level scheme as the rest of the worker - * output. Pass to `Runtime.install({ logger })`. - */ -export const createTemporalSdkLogger = (logger: Logger): DefaultLogger => - // DefaultLogger filters at INFO, so TRACE / DEBUG paths only fire - // when the level is bumped at the call site. 
- new DefaultLogger("INFO", ({ level, message, meta }) => { - switch (level) { - case "TRACE": - case "DEBUG": - logger.debug(message, meta); - return; - case "INFO": - logger.info(message, meta); - return; - case "WARN": - logger.warn(message, meta); - return; - case "ERROR": - logger.error(message, meta); - return; - default: - logger.warn(`Unknown Temporal SDK log level: ${level as string}`, { - message, - meta, - }); - } - }); - export const createTemporalClient = async () => { const temporalServerHost = getRequiredEnv("HASH_TEMPORAL_SERVER_HOST"); diff --git a/libs/@local/hash-backend-utils/src/temporal/worker-bootstrap.ts b/libs/@local/hash-backend-utils/src/temporal/worker-bootstrap.ts index a9943c03bf1..abb47168965 100644 --- a/libs/@local/hash-backend-utils/src/temporal/worker-bootstrap.ts +++ b/libs/@local/hash-backend-utils/src/temporal/worker-bootstrap.ts @@ -18,6 +18,7 @@ import type { WorkflowBundleOption, } from "@temporalio/worker"; import { + DefaultLogger, defaultSinks, NativeConnection, Runtime, @@ -26,7 +27,6 @@ import { import type { Logger } from "../logger.js"; import type { OpenTelemetrySetup } from "../opentelemetry.js"; -import { createTemporalSdkLogger } from "../temporal.js"; import { OpenTelemetryActivityInboundInterceptor, OpenTelemetryActivityOutboundInterceptor, @@ -37,6 +37,40 @@ import { makeV2WorkflowSink } from "./workflow-span-adapter.js"; const require = createRequire(import.meta.url); +/** + * Adapter that pipes Temporal SDK logs (both Rust core and Node-side + * worker events) through the application logger, keeping them in the + * same JSON format and log-level scheme as the rest of the worker + * output. Lives here rather than in `temporal.ts` so the API server's + * import of `createTemporalClient` does not pull in `@temporalio/worker` + * (which bundles native Rust core bindings). 
+ */ +const createTemporalSdkLogger = (logger: Logger): DefaultLogger => + // DefaultLogger filters at INFO, so TRACE / DEBUG paths only fire + // when the level is bumped at the call site. + new DefaultLogger("INFO", ({ level, message, meta }) => { + switch (level) { + case "TRACE": + case "DEBUG": + logger.debug(message, meta); + return; + case "INFO": + logger.info(message, meta); + return; + case "WARN": + logger.warn(message, meta); + return; + case "ERROR": + logger.error(message, meta); + return; + default: + logger.warn(`Unknown Temporal SDK log level: ${level as string}`, { + message, + meta, + }); + } + }); + const TEMPORAL_DEFAULT_PORT = 7233; const getTemporalAddress = (): string => { From 71569460d3ae4d390e972645bfed769bb3d3f38b Mon Sep 17 00:00:00 2001 From: Tim Diekmann Date: Thu, 30 Apr 2026 16:53:12 +0200 Subject: [PATCH 11/17] BE-519: Address PR review-team findings (round 1) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - worker-bootstrap.ts: drop the dead try/catch around httpServer.close(); http.Server.close() reports failures via the optional callback, not synchronously. Pass an error-logging callback instead so close failures actually surface. - opentelemetry.ts: tighten the host vs hostname comment. The example api.openai.com:443 was wrong — URL strips the default :443 for https. Use collector:4318 instead so the rationale matches what the URL spec does. - hash-api/index.ts: call out the LIFO cleanup order explicitly. Without the GracefulShutdown.reverse() reference, a future maintainer reading "Flush OpenTelemetry last" alongside addCleanup() calls below would reasonably conclude the comment is stale and move the registration. 
--- apps/hash-api/src/index.ts | 10 ++++++---- .../@local/hash-backend-utils/src/opentelemetry.ts | 6 +++--- .../src/temporal/worker-bootstrap.ts | 14 +++++++++----- 3 files changed, 18 insertions(+), 12 deletions(-) diff --git a/apps/hash-api/src/index.ts b/apps/hash-api/src/index.ts index 6646a15118f..0f9f29e84b2 100644 --- a/apps/hash-api/src/index.ts +++ b/apps/hash-api/src/index.ts @@ -101,10 +101,12 @@ const httpServer = http.createServer(app); const shutdown = new GracefulShutdown(logger, "SIGINT", "SIGTERM"); -// Flush OpenTelemetry last so cleanup hooks above this point can still -// emit shutdown spans / logs before the providers disconnect from the -// collector. `otelSetup` is `undefined` when no `HASH_OTLP_ENDPOINT` is -// configured (no collector) or when bootstrap throws. +// Register OpenTelemetry first so it flushes last — `GracefulShutdown` +// runs cleanups in reverse registration order. Cleanup hooks added below +// can still emit shutdown spans / logs before the providers disconnect +// from the collector. `otelSetup` is `undefined` when no +// `HASH_OTLP_ENDPOINT` is configured (no collector) or when bootstrap +// throws. if (otelSetup) { shutdown.addCleanup("OpenTelemetry", otelSetup.shutdown); } diff --git a/libs/@local/hash-backend-utils/src/opentelemetry.ts b/libs/@local/hash-backend-utils/src/opentelemetry.ts index ef6fb58d48d..6444453e10f 100644 --- a/libs/@local/hash-backend-utils/src/opentelemetry.ts +++ b/libs/@local/hash-backend-utils/src/opentelemetry.ts @@ -114,9 +114,9 @@ export const createUndiciInstrumentation = (): UndiciInstrumentation => new UndiciInstrumentation({ startSpanHook: (request) => { try { - // `hostname` (not `host`): the latter includes the port for non-default - // schemes (e.g. `api.openai.com:443`), which would miss exact matches - // in `resolvePeerService`. + // `hostname` strips the port unconditionally; `host` keeps it for + // non-default ports (e.g. 
`collector:4318`), which would miss + // exact-host matches in `resolvePeerService`. const { hostname } = new URL(request.origin); const peerService = resolvePeerService(hostname); return peerService ? { "peer.service": peerService } : {}; diff --git a/libs/@local/hash-backend-utils/src/temporal/worker-bootstrap.ts b/libs/@local/hash-backend-utils/src/temporal/worker-bootstrap.ts index abb47168965..979508100ec 100644 --- a/libs/@local/hash-backend-utils/src/temporal/worker-bootstrap.ts +++ b/libs/@local/hash-backend-utils/src/temporal/worker-bootstrap.ts @@ -320,11 +320,15 @@ export async function runWorker(opts: RunWorkerOptions): Promise { }); } - try { - httpServer.close(); - } catch (error) { - logger.error("Health-check server close failed", { error }); - } + // `http.Server.close()` reports failures via the optional callback, + // not synchronously, so a `try`/`catch` around the bare call would be + // dead. Pass a callback to log close failures and let the OTEL flush + // below give the listening socket time to release before exit. + httpServer.close((error) => { + if (error) { + logger.error("Health-check server close failed", { error }); + } + }); try { await otelSetup?.shutdown(); } catch (error) { From d7d1ed83cd98774e169a2e3ceff95f1f013d54e2 Mon Sep 17 00:00:00 2001 From: Tim Diekmann Date: Thu, 30 Apr 2026 16:55:05 +0200 Subject: [PATCH 12/17] BE-519: Register worker signal handlers up-front MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A SIGTERM during the worker startup window — NativeConnection.connect, workflow bundle compile, Worker.create — would hit the Node default handler and terminate the process without flushing OTEL. K8s pod evictions during startup are a real edge case in rolling deploys. Move SIGINT/SIGTERM registration to the top of runWorker, before any async startup work. 
The handler captures `worker` once it's set; if a signal arrives earlier, it just flips `shuttingDown` and the linear cleanup path below handles flush+exit. The worker.run() call gates on `shuttingDown` so we skip running entirely if startup got interrupted. --- .../src/temporal/worker-bootstrap.ts | 60 +++++++++++-------- 1 file changed, 36 insertions(+), 24 deletions(-) diff --git a/libs/@local/hash-backend-utils/src/temporal/worker-bootstrap.ts b/libs/@local/hash-backend-utils/src/temporal/worker-bootstrap.ts index 979508100ec..fbdbb5014ad 100644 --- a/libs/@local/hash-backend-utils/src/temporal/worker-bootstrap.ts +++ b/libs/@local/hash-backend-utils/src/temporal/worker-bootstrap.ts @@ -192,6 +192,37 @@ export async function runWorker(opts: RunWorkerOptions): Promise { logger.info(`Starting ${opts.serviceName}...`); + // Install signal handlers up-front so a SIGTERM during the + // potentially-slow startup phase (NativeConnection.connect, workflow + // bundle compile, Worker.create) doesn't terminate the process via + // the Node default — we'd lose the OTEL flush. The handler captures a + // reference to `worker` once it's set; until then the linear cleanup + // path below handles flush+exit when `workerRunPromise` (also unset + // pre-Worker.create) is replaced by a sentinel that resolves + // immediately on signal during startup. + let worker: Worker | undefined; + let shuttingDown = false; + const onSignal = (signal: NodeJS.Signals) => { + if (shuttingDown) { + return; + } + shuttingDown = true; + logger.info(`Received ${signal}, exiting...`); + if (worker) { + try { + worker.shutdown(); + } catch (error) { + logger.error("Worker shutdown trigger failed", { error }); + } + } + // If the worker hasn't been created yet, do nothing more here: + // `runWorker` is still in startup and the early-exit branch below + // will fall through to the linear cleanup path when control + // returns from whichever `await` is pending. 
+ }; + process.on("SIGINT", onSignal); + process.on("SIGTERM", onSignal); + // Temporal SDK runtime telemetry: emits SDK-internal metrics (worker // slot utilisation, sticky cache hits, polling latency, activity / // workflow execution latency) directly to OTLP, and forwards Rust @@ -237,7 +268,7 @@ export async function runWorker(opts: RunWorkerOptions): Promise { inbound: new SentryActivityInboundInterceptor(ctx), })); - const worker = await Worker.create({ + worker = await Worker.create({ ...opts.workerOptions, ...expandWorkflowSource(opts.workflowSource), activities: opts.activities, @@ -280,29 +311,10 @@ export async function runWorker(opts: RunWorkerOptions): Promise { }); // `worker.run()` resolves once the SDK has fully drained in-flight - // activities after `worker.shutdown()` is called. - const workerRunPromise = worker.run(); - - // Signal handler ONLY triggers drain. All cleanup (HTTP close + OTEL - // flush + process.exit) lives on the linear path below, so it runs on - // every termination mode: signal-driven shutdown, fatal worker error, - // and clean worker exit. A fire-and-forget signal handler that owns - // cleanup races against the main flow returning to `main.ts`. - let shuttingDown = false; - const onSignal = (signal: NodeJS.Signals) => { - if (shuttingDown) { - return; - } - shuttingDown = true; - logger.info(`Received ${signal}, exiting...`); - try { - worker.shutdown(); - } catch (error) { - logger.error("Worker shutdown trigger failed", { error }); - } - }; - process.on("SIGINT", onSignal); - process.on("SIGTERM", onSignal); + // activities after `worker.shutdown()` is called. If a signal already + // arrived during startup, skip running entirely and fall through to + // the cleanup path below. + const workerRunPromise = shuttingDown ? 
Promise.resolve() : worker.run(); let exitCode = 0; let workerError: unknown; From 320d223b8862e00cd7e1490e91865c8684739a88 Mon Sep 17 00:00:00 2001 From: Tim Diekmann Date: Thu, 30 Apr 2026 16:56:59 +0200 Subject: [PATCH 13/17] BE-519: Test createHttpInstrumentation OTLP-port filter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Locks down the port-derivation contract that prevents the exporter from tracing its own outbound traffic. The failure mode is exponential span amplification per export batch, hard to spot from logs — easier to catch with a unit test. Covers: configured gRPC port (4317), non-default port from URL (4318), fallback when the URL has no explicit port, and fallback on a malformed endpoint (the helper must not throw on every outgoing request). Reads the hook back via HttpInstrumentation.getConfig() so the test exercises the same wiring runtime callers see. --- .../src/opentelemetry.test.ts | 52 +++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/libs/@local/hash-backend-utils/src/opentelemetry.test.ts b/libs/@local/hash-backend-utils/src/opentelemetry.test.ts index c4de74f7b24..443e3e45c70 100644 --- a/libs/@local/hash-backend-utils/src/opentelemetry.test.ts +++ b/libs/@local/hash-backend-utils/src/opentelemetry.test.ts @@ -1,9 +1,11 @@ import type { ClientRequest, IncomingMessage } from "node:http"; import { type Span, trace } from "@opentelemetry/api"; +import type { HttpInstrumentationConfig } from "@opentelemetry/instrumentation-http"; import { describe, expect, it } from "vitest"; import { + createHttpInstrumentation, httpRequestSpanNameHook, resolvePeerService, } from "./opentelemetry.js"; @@ -131,3 +133,53 @@ describe("httpRequestSpanNameHook", () => { expect(updates).toEqual([]); }); }); + +describe("createHttpInstrumentation OTLP-port filter", () => { + /** + * Read the configured `ignoreOutgoingRequestHook` back off the + * instrumentation. 
Without this, a regression that drops the filter + * would only show up at runtime as exporter traffic feeding back into + * itself, amplifying span volume per export batch. + */ + const ignoreOutgoingFor = (otlpEndpoint: string) => { + const config = createHttpInstrumentation( + otlpEndpoint, + ).getConfig() as HttpInstrumentationConfig; + const hook = config.ignoreOutgoingRequestHook; + if (!hook) { + throw new Error("ignoreOutgoingRequestHook should be set"); + } + return (port: number | undefined) => + hook({ port } as Parameters>[0]); + }; + + it("ignores outgoing requests to the configured OTLP gRPC port", () => { + const ignored = ignoreOutgoingFor("http://collector:4317"); + expect(ignored(4317)).toBe(true); + expect(ignored(443)).toBe(false); + }); + + it("derives a non-default OTLP port from the endpoint URL", () => { + // `:4318` is the OTLP/HTTP convention; if the helper hardcoded 4317 + // a self-instrumented exporter on 4318 would feed back into itself. + const ignored = ignoreOutgoingFor("http://collector:4318"); + expect(ignored(4318)).toBe(true); + expect(ignored(4317)).toBe(false); + }); + + it("falls back to 4317 when the endpoint has no explicit port", () => { + const ignored = ignoreOutgoingFor("http://collector"); + expect(ignored(4317)).toBe(true); + expect(ignored(8080)).toBe(false); + }); + + it("falls back to 4317 when the endpoint is malformed", () => { + // Bad URL must not throw on every outgoing request — that would + // disable HTTP tracing entirely. Falling back to 4317 keeps the + // tracer running; `registerOpenTelemetry` surfaces the URL error + // separately when it builds the exporter. 
+    const ignored = ignoreOutgoingFor("not a url");
+    expect(ignored(4317)).toBe(true);
+    expect(ignored(443)).toBe(false);
+  });
+});

From c75c1e15904861b86aab37a11032f1ba7c9f4061 Mon Sep 17 00:00:00 2001
From: Tim Diekmann <tim@hash.ai>
Date: Thu, 30 Apr 2026 16:58:51 +0200
Subject: [PATCH 14/17] BE-519: Tighten ExtraWorkerOptions to an allowlist

Switching from Omit to Pick means future Temporal SDK additions don't
silently leak through and let callers override helper-owned wiring
(activities, connection, taskQueue, sinks, interceptors,
workflowBundle). The only key currently in use is
maxHeartbeatThrottleInterval; add more on demand.
---
 .../src/temporal/worker-bootstrap.ts          | 19 ++++---------------
 1 file changed, 4 insertions(+), 15 deletions(-)

diff --git a/libs/@local/hash-backend-utils/src/temporal/worker-bootstrap.ts b/libs/@local/hash-backend-utils/src/temporal/worker-bootstrap.ts
index fbdbb5014ad..b07f016f742 100644
--- a/libs/@local/hash-backend-utils/src/temporal/worker-bootstrap.ts
+++ b/libs/@local/hash-backend-utils/src/temporal/worker-bootstrap.ts
@@ -114,23 +114,12 @@ export type WorkflowSource =
     };
 
 /**
- * Per-worker tuning passed straight through to `Worker.create`. Keys
- * owned by the helper (`activities`, `connection`, `namespace`,
- * `taskQueue`, `sinks`, `interceptors`) and the workflow-source keys
- * (`workflowBundle`, `workflowsPath`, `bundlerOptions`) are excluded —
- * `workflowSource` is the only way to set them.
+ * Per-worker tuning passed straight through to `Worker.create`. Add
+ * keys here as needed; helper-owned wiring stays inaccessible.
*/ -export type ExtraWorkerOptions = Omit< +export type ExtraWorkerOptions = Pick< WorkerOptions, - | "activities" - | "connection" - | "namespace" - | "taskQueue" - | "sinks" - | "interceptors" - | "workflowBundle" - | "workflowsPath" - | "bundlerOptions" + "maxHeartbeatThrottleInterval" >; export interface RunWorkerOptions { From e2f41c10ae1b67005408ae7c1318570fc266add6 Mon Sep 17 00:00:00 2001 From: Tim Diekmann Date: Thu, 30 Apr 2026 17:01:04 +0200 Subject: [PATCH 15/17] BE-519: Fail-loud on bootstrap errors outside production MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous catch swallowed every throw — including the kind that matters most in dev / CI: typos in instrumentation construction, bad endpoint URLs, missing peer deps. A regression that disabled telemetry on every deploy would slip through. Keep the fall-through-to-undefined behaviour for production (don't crash a prod service over the collector layer) but rethrow elsewhere so the bootstrap error surfaces during development. --- apps/hash-ai-worker-ts/src/instrument.ts | 9 ++++++--- apps/hash-api/src/instrument.mjs | 10 ++++++---- apps/hash-integration-worker/src/instrument.ts | 9 ++++++--- 3 files changed, 18 insertions(+), 10 deletions(-) diff --git a/apps/hash-ai-worker-ts/src/instrument.ts b/apps/hash-ai-worker-ts/src/instrument.ts index c50f46e5ec6..05650f8aae1 100644 --- a/apps/hash-ai-worker-ts/src/instrument.ts +++ b/apps/hash-ai-worker-ts/src/instrument.ts @@ -35,9 +35,12 @@ export const otelSetup: ReturnType = (() => { ], }); } catch (error) { - // Bootstrap runs before any logger is wired up, so direct stderr is - // the right channel. Don't rethrow — the worker should still start - // without telemetry. + // Outside production, fail loud: realistic causes here are coding + // errors (bad URL, malformed instrumentation config) and hiding + // them in dev/CI loses regressions. 
+ if (process.env.NODE_ENV !== "production") { + throw error; + } // eslint-disable-next-line no-console console.error( "OpenTelemetry bootstrap failed; AI worker will start without telemetry.", diff --git a/apps/hash-api/src/instrument.mjs b/apps/hash-api/src/instrument.mjs index 2a0654086ef..9aff48c593b 100644 --- a/apps/hash-api/src/instrument.mjs +++ b/apps/hash-api/src/instrument.mjs @@ -52,10 +52,12 @@ export const otelSetup = (() => { ], }); } catch (error) { - // Bootstrap runs before the application logger is wired up, so direct - // stderr is the right channel. Do not rethrow: the service should - // still start without telemetry rather than crash on a misconfigured - // collector. + // Outside production, fail loud: the only realistic causes here are + // coding errors (bad URL, malformed instrumentation config) and + // hiding them in dev/CI loses regressions. + if (!isProdEnv) { + throw error; + } // eslint-disable-next-line no-console console.error( "OpenTelemetry bootstrap failed; service will start without telemetry.", diff --git a/apps/hash-integration-worker/src/instrument.ts b/apps/hash-integration-worker/src/instrument.ts index a7e0e049e6d..004412282c2 100644 --- a/apps/hash-integration-worker/src/instrument.ts +++ b/apps/hash-integration-worker/src/instrument.ts @@ -34,9 +34,12 @@ export const otelSetup: ReturnType = (() => { ], }); } catch (error) { - // Bootstrap runs before any logger is wired up, so direct stderr is - // the right channel. Don't rethrow — the worker should still start - // without telemetry. + // Outside production, fail loud: realistic causes here are coding + // errors (bad URL, malformed instrumentation config) and hiding + // them in dev/CI loses regressions. 
+ if (process.env.NODE_ENV !== "production") { + throw error; + } // eslint-disable-next-line no-console console.error( "OpenTelemetry bootstrap failed; integration worker will start without telemetry.", From ed943056ff4d3a3ad863fc916b68aa13b185630a Mon Sep 17 00:00:00 2001 From: Tim Diekmann Date: Thu, 30 Apr 2026 17:46:27 +0200 Subject: [PATCH 16/17] BE-519: Drop unused import + closure-narrowing eslint-disable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit biome auto-removed the now-redundant as HttpInstrumentationConfig cast (getConfig() already types it), leaving the import unused. worker-bootstrap.ts:306: same TS-narrowing-vs-closure-mutation pattern as the other shuttingDown checks — TS sees the initial false but the SIGTERM handler can flip it before this line runs. --- libs/@local/hash-backend-utils/src/opentelemetry.test.ts | 5 +---- .../hash-backend-utils/src/temporal/worker-bootstrap.ts | 1 + 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/libs/@local/hash-backend-utils/src/opentelemetry.test.ts b/libs/@local/hash-backend-utils/src/opentelemetry.test.ts index 443e3e45c70..1af461f1331 100644 --- a/libs/@local/hash-backend-utils/src/opentelemetry.test.ts +++ b/libs/@local/hash-backend-utils/src/opentelemetry.test.ts @@ -1,7 +1,6 @@ import type { ClientRequest, IncomingMessage } from "node:http"; import { type Span, trace } from "@opentelemetry/api"; -import type { HttpInstrumentationConfig } from "@opentelemetry/instrumentation-http"; import { describe, expect, it } from "vitest"; import { @@ -142,9 +141,7 @@ describe("createHttpInstrumentation OTLP-port filter", () => { * itself, amplifying span volume per export batch. 
*/ const ignoreOutgoingFor = (otlpEndpoint: string) => { - const config = createHttpInstrumentation( - otlpEndpoint, - ).getConfig() as HttpInstrumentationConfig; + const config = createHttpInstrumentation(otlpEndpoint).getConfig(); const hook = config.ignoreOutgoingRequestHook; if (!hook) { throw new Error("ignoreOutgoingRequestHook should be set"); diff --git a/libs/@local/hash-backend-utils/src/temporal/worker-bootstrap.ts b/libs/@local/hash-backend-utils/src/temporal/worker-bootstrap.ts index b07f016f742..5a2059b243f 100644 --- a/libs/@local/hash-backend-utils/src/temporal/worker-bootstrap.ts +++ b/libs/@local/hash-backend-utils/src/temporal/worker-bootstrap.ts @@ -303,6 +303,7 @@ export async function runWorker(opts: RunWorkerOptions): Promise { // activities after `worker.shutdown()` is called. If a signal already // arrived during startup, skip running entirely and fall through to // the cleanup path below. + // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition const workerRunPromise = shuttingDown ? Promise.resolve() : worker.run(); let exitCode = 0; From 98041ffcd09c08206e71fb728c74eb56cccfd1dd Mon Sep 17 00:00:00 2001 From: Tim Diekmann Date: Thu, 30 Apr 2026 17:49:49 +0200 Subject: [PATCH 17/17] BE-519: Coerce port for OTLP filter equality MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit http.RequestOptions.port is string | number | null | undefined; some callers pass a string and `"4317" === 4317` is false. The filter would miss the exporter's own outbound traffic and the very feedback loop this filter exists to prevent would slip through. Number(options.port) handles both shapes, undefined → NaN → not equal, which preserves the original "let unrelated traffic through" path. Test covers the string and undefined cases the original suite missed. 
--- .../hash-backend-utils/src/opentelemetry.test.ts | 11 ++++++++++- libs/@local/hash-backend-utils/src/opentelemetry.ts | 7 ++++++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/libs/@local/hash-backend-utils/src/opentelemetry.test.ts b/libs/@local/hash-backend-utils/src/opentelemetry.test.ts index 1af461f1331..c610bc06797 100644 --- a/libs/@local/hash-backend-utils/src/opentelemetry.test.ts +++ b/libs/@local/hash-backend-utils/src/opentelemetry.test.ts @@ -146,7 +146,7 @@ describe("createHttpInstrumentation OTLP-port filter", () => { if (!hook) { throw new Error("ignoreOutgoingRequestHook should be set"); } - return (port: number | undefined) => + return (port: number | string | undefined) => hook({ port } as Parameters>[0]); }; @@ -156,6 +156,15 @@ describe("createHttpInstrumentation OTLP-port filter", () => { expect(ignored(443)).toBe(false); }); + // RequestOptions.port is `string | number | null | undefined`; a strict + // `===` would let "4317" through and the exporter would self-trace. + it("matches the OTLP port whether callers pass a number or a string", () => { + const ignored = ignoreOutgoingFor("http://collector:4317"); + expect(ignored("4317")).toBe(true); + expect(ignored("443")).toBe(false); + expect(ignored(undefined)).toBe(false); + }); + it("derives a non-default OTLP port from the endpoint URL", () => { // `:4318` is the OTLP/HTTP convention; if the helper hardcoded 4317 // a self-instrumented exporter on 4318 would feed back into itself. 
diff --git a/libs/@local/hash-backend-utils/src/opentelemetry.ts b/libs/@local/hash-backend-utils/src/opentelemetry.ts index 6444453e10f..638746a7c9d 100644 --- a/libs/@local/hash-backend-utils/src/opentelemetry.ts +++ b/libs/@local/hash-backend-utils/src/opentelemetry.ts @@ -208,7 +208,12 @@ export const createHttpInstrumentation = ( const otlpPort = otlpPortFromEndpoint(otlpEndpoint); return new HttpInstrumentation({ ...extra, - ignoreOutgoingRequestHook: (options) => options.port === otlpPort, + ignoreOutgoingRequestHook: (options) => + // `RequestOptions.port` is `string | number | null | undefined`; + // some callers (raw `http.request`, axios) pass a string. Coerce + // to compare consistently — otherwise `"4317" === 4317` is false + // and the exporter's own outbound traffic slips through. + Number(options.port) === otlpPort, requestHook: httpRequestSpanNameHook, }); };