From b36722a59d1560b1f4ce00008ce8940d4406a7a8 Mon Sep 17 00:00:00 2001
From: Nicacio Oliveira
Date: Tue, 2 Jun 2026 15:48:53 -0300
Subject: [PATCH] feat(chart): opt-in Argo Rollouts (blue-green) + pre-deploy
 migration Job
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds two opt-in flags to the studio Helm chart so consumers can switch from
the default Deployment to an Argo Rollouts Rollout with blue-green strategy,
and move DB migrations out of pod startup into a dedicated pre-sync Job.

Both default to off — existing installs keep the exact same Deployment with
the on-startup migration path. No selector / label changes, no PVC churn.

## What's new

- `argoRollouts.enabled` — when true, render a `Rollout` (argoproj.io/v1alpha1)
  instead of the `Deployment`. Pod template is shared via the new
  `chart-deco-studio.podTemplate` helper so the two workload kinds describe
  an identical pod surface. Supports `blueGreen` (default) and `canary`
  strategies.

- `migrationJob.enabled` — when true, render a Job that runs
  `bun run --cwd=apps/mesh migrate` ONCE before pods start. Carries BOTH
  `helm.sh/hook: pre-install,pre-upgrade` and
  `argocd.argoproj.io/hook: PreSync` annotations so it sequences correctly
  whether installed via `helm upgrade` directly or synced by ArgoCD. The
  runtime pod command gets `--skip-migrations` appended (the studio CLI
  already exposes this flag — see `apps/mesh/src/cli.ts`), eliminating the
  race between N replicas migrating concurrently and giving a clear
  pre-deploy gate: migration Job fails → release aborted.

## New / modified files

- `templates/_pod-template.tpl` (new) — shared pod template helper + the
  `podCommand` helper that appends `--skip-migrations` when migrationJob
  is on.
- `templates/deployment.yaml` — now wraps in
  `{{- if not argoRollouts.enabled }}` and references the helper; lifts the
  entire `spec.template` body out.
- `templates/rollout.yaml` (new) — gated on `argoRollouts.enabled`, mirrors
  the Deployment via the same helper, picks blueGreen or canary based on
  values. Mutual-exclusion `fail` for both-on configs.
- `templates/migration-job.yaml` (new) — gated on `migrationJob.enabled`.
  Sync-wave -1, dual hooks, bounded
  backoffLimit/activeDeadlineSeconds/TTL.
- `templates/service-preview.yaml` (new) — rendered only for blue-green;
  Argo Rollouts manages its selector to point at the preview ReplicaSet.
- `values.yaml` — adds `argoRollouts` and `migrationJob` blocks; both off
  by default, defaults preserve current behavior.

## Why opt-in

The chart is open-source and not everyone has the argo-rollouts controller
installed. Defaulting to Deployment keeps zero requirements on the
consumer's cluster. Internal CD (deco-apps-cd) flips both flags on for the
deco-studio / deco-studio-stg releases — that's a separate change.

## Migration discipline note

Blue-green amplifies the schema/code overlap window. The Job moves
migrations to a single execution point BEFORE the new ReplicaSet probes,
but it does NOT make destructive DDL safe — the old (blue) ReplicaSet still
serves traffic during the overlap window with the migrated schema.
Destructive changes (DROP/RENAME/type changes) still require
expand-contract discipline at the migration code level. This is independent
of the chart and is being handled team-side as a code/review practice.

## Verification

- `helm lint deploy/helm/studio` passes
- `helm template deco-studio deploy/helm/studio` (default) renders
  identical workload surface to before — Deployment with
  `bun run deco --no-local-mode`, no Rollout, no preview Service, no
  migration Job
- `helm template ... --set argoRollouts.enabled=true --set
  migrationJob.enabled=true` renders Rollout with blueGreen, preview
  Service, migration Job with the PreSync hooks, and `--skip-migrations`
  appended to the pod command

Co-Authored-By: Claude Opus 4.7
---
 .../helm/studio/templates/_pod-template.tpl   | 220 ++++++++++++++++++
 deploy/helm/studio/templates/deployment.yaml  | 196 +---------------
 .../helm/studio/templates/migration-job.yaml  | 105 +++++++++
 deploy/helm/studio/templates/rollout.yaml     |  92 ++++++++
 .../studio/templates/service-preview.yaml     |  42 ++++
 deploy/helm/studio/values.yaml                |  67 ++++++
 6 files changed, 529 insertions(+), 193 deletions(-)
 create mode 100644 deploy/helm/studio/templates/_pod-template.tpl
 create mode 100644 deploy/helm/studio/templates/migration-job.yaml
 create mode 100644 deploy/helm/studio/templates/rollout.yaml
 create mode 100644 deploy/helm/studio/templates/service-preview.yaml

diff --git a/deploy/helm/studio/templates/_pod-template.tpl b/deploy/helm/studio/templates/_pod-template.tpl
new file mode 100644
index 0000000000..5479eb185f
--- /dev/null
+++ b/deploy/helm/studio/templates/_pod-template.tpl
@@ -0,0 +1,220 @@
+{{/*
+Shared pod template body used by both Deployment and Rollout workload kinds.
+
+This contains everything from `template:` down to (but not including) the
+workload-level `strategy:` block — i.e. metadata + spec for the pod.
+
+Keep all pod-level concerns here (container env, volumes, securityContext, etc.)
+so the Deployment ↔ Rollout opt-in stays a one-line toggle for consumers and the
+two workload templates stay byte-identical in pod surface.
+*/}}
+{{- define "chart-deco-studio.podTemplate" -}}
+metadata:
+  {{- with .Values.podAnnotations }}
+  annotations:
+    {{- toYaml . | nindent 4 }}
+  {{- end }}
+  labels:
+    {{- include "chart-deco-studio.labels" . | nindent 4 }}
+    {{- with .Values.podLabels }}
+    {{- toYaml . | nindent 4 }}
+    {{- end }}
+spec:
+  {{- if .Values.terminationGracePeriodSeconds }}
+  terminationGracePeriodSeconds: {{ .Values.terminationGracePeriodSeconds }}
+  {{- end }}
+  {{- with .Values.imagePullSecrets }}
+  imagePullSecrets:
+    {{- toYaml . | nindent 4 }}
+  {{- end }}
+  serviceAccountName: {{ include "chart-deco-studio.serviceAccountName" . }}
+  securityContext:
+    {{- include "chart-deco-studio.podSecurityContext" . | nindent 4 }}
+  containers:
+    - name: {{ .Chart.Name }}
+      {{- with .Values.securityContext }}
+      securityContext:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      image: "{{ .Values.image.repository }}{{- if and .Values.image.tag (hasPrefix "sha256:" .Values.image.tag) }}@{{ .Values.image.tag }}{{- else }}:{{ .Values.image.tag | default .Chart.AppVersion }}{{- end }}"
+      imagePullPolicy: {{ .Values.image.pullPolicy }}
+      command:
+        {{- include "chart-deco-studio.podCommand" . | nindent 8 }}
+      ports:
+        - name: http
+          containerPort: {{ .Values.service.targetPort | default 3000 }}
+          protocol: TCP
+      envFrom:
+        - configMapRef:
+            name: {{ include "chart-deco-studio.fullname" . }}-config
+        - secretRef:
+            name: {{ include "chart-deco-studio.secretName" . }}
+      env:
+        {{- if .Values.otel.enabled }}
+        {{- if .Values.otel.protocol }}
+        - name: OTEL_EXPORTER_OTLP_PROTOCOL
+          value: {{ .Values.otel.protocol | quote }}
+        {{- end }}
+        {{- if .Values.otel.service }}
+        - name: OTEL_SERVICE_NAME
+          value: {{ .Values.otel.service | quote }}
+        {{- end }}
+        {{- if .Values.otel.endpoint }}
+        - name: OTEL_EXPORTER_OTLP_ENDPOINT
+          value: {{ .Values.otel.endpoint | quote }}
+        {{- else if .Values.otel.collector.enabled }}
+        - name: OTEL_EXPORTER_OTLP_ENDPOINT
+          value: {{ printf "http://%s-opentelemetry-collector:4318" .Release.Name | quote }}
+        {{- end }}
+        {{- if and .Values.otel.headers (or (not .Values.otel.collector.enabled) .Values.otel.endpoint) }}
+        - name: OTEL_EXPORTER_OTLP_HEADERS
+          value: {{ include "chart-deco-studio.otelHeaders" . | quote }}
+        {{- end }}
+        {{- if .Values.otel.attributes }}
+        - name: OTEL_RESOURCE_ATTRIBUTES
+          value: {{ include "chart-deco-studio.otelAttributes" . | quote }}
+        {{- end }}
+        {{- end }}
+        {{- if and .Values.dbosConductor .Values.dbosConductor.enabled }}
+        - name: DBOS_CONDUCTOR_KEY
+          valueFrom:
+            secretKeyRef:
+              name: {{ .Values.dbosConductor.existingSecret | default (printf "%s-dbos-conductor" (include "chart-deco-studio.fullname" .)) }}
+              key: {{ .Values.dbosConductor.existingSecretKey | default "DBOS_CONDUCTOR_KEY" }}
+        {{- with .Values.dbosConductor.url }}
+        - name: DBOS_CONDUCTOR_URL
+          value: {{ . | quote }}
+        {{- end }}
+        {{- end }}
+        {{- with .Values.env }}
+        {{- toYaml . | nindent 8 }}
+        {{- end }}
+      {{- with .Values.livenessProbe }}
+      livenessProbe:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      {{- with .Values.readinessProbe }}
+      readinessProbe:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      {{- with .Values.lifecycle }}
+      lifecycle:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      {{- with .Values.resources }}
+      resources:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      volumeMounts:
+        - name: data
+          mountPath: {{ .Values.configMap.meshConfig.DATA_DIR | default "/app/data" }}
+        {{- if and (eq (lower (default "sqlite" .Values.database.engine)) "postgresql") .Values.database.caCert }}
+        - name: ca-cert
+          mountPath: /etc/ssl/certs
+          readOnly: true
+        {{- end }}
+        {{- with .Values.volumeMounts }}
+        {{- toYaml . | nindent 8 }}
+        {{- end }}
+    {{- if .Values.s3Sync.enabled }}
+    - name: s3-sync
+      image: "{{ .Values.s3Sync.image.repository }}:{{ .Values.s3Sync.image.tag }}"
+      imagePullPolicy: {{ .Values.s3Sync.image.pullPolicy }}
+      command: ["/bin/sh", "/scripts/sync.sh"]
+      {{- with .Values.securityContext }}
+      securityContext:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      {{- with .Values.s3Sync.resources }}
+      resources:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      volumeMounts:
+        - name: data
+          mountPath: {{ .Values.configMap.meshConfig.DATA_DIR | default "/app/data" }}
+        - name: s3-sync-script
+          mountPath: /scripts
+          readOnly: true
+    {{- end }}
+    {{- with .Values.extraContainers }}
+    {{- toYaml . | nindent 4 }}
+    {{- end }}
+  volumes:
+    {{- if and (eq (lower (default "sqlite" .Values.database.engine)) "postgresql") .Values.database.caCert }}
+    - name: ca-cert
+      configMap:
+        name: {{ include "chart-deco-studio.fullname" . }}-ca-cert
+        items:
+          - key: ca-cert.pem
+            path: ca-cert.pem
+    {{- end }}
+    {{- if .Values.persistence.enabled }}
+    - name: data
+      {{- if .Values.persistence.claimName }}
+      persistentVolumeClaim:
+        claimName: {{ .Values.persistence.claimName }}
+      {{- else }}
+      persistentVolumeClaim:
+        claimName: {{ include "chart-deco-studio.fullname" . }}-data
+      {{- end }}
+    {{- else }}
+    - name: data
+      emptyDir:
+        sizeLimit: {{ .Values.persistence.emptyDirSizeLimit | default "10Gi" }}
+    {{- end }}
+    {{- if .Values.s3Sync.enabled }}
+    - name: s3-sync-script
+      configMap:
+        name: {{ include "chart-deco-studio.fullname" . }}-s3-sync
+        defaultMode: 0755
+        items:
+          - key: sync.sh
+            path: sync.sh
+    {{- end }}
+    {{- with .Values.volumes }}
+    {{- toYaml . | nindent 4 }}
+    {{- end }}
+  {{- with .Values.nodeSelector }}
+  nodeSelector:
+    {{- toYaml . | nindent 4 }}
+  {{- end }}
+  {{- with .Values.affinity }}
+  affinity:
+    {{- toYaml . | nindent 4 }}
+  {{- end }}
+  {{- with .Values.tolerations }}
+  tolerations:
+    {{- toYaml . | nindent 4 }}
+  {{- end }}
+  {{- if and .Values.topologySpreadConstraints (gt (len .Values.topologySpreadConstraints) 0) }}
+  topologySpreadConstraints:
+    {{- range .Values.topologySpreadConstraints }}
+    - {{- if not .labelSelector }}
+      {{- fail "labelSelector é obrigatório em topologySpreadConstraints. Especifique explicitamente os labels." }}
+      {{- else }}
+      labelSelector:
+        {{- toYaml .labelSelector | nindent 8 }}
+      {{- end }}
+      maxSkew: {{ .maxSkew }}
+      topologyKey: {{ .topologyKey }}
+      whenUnsatisfiable: {{ .whenUnsatisfiable }}
+    {{- end }}
+  {{- end }}
+{{- end }}
+
+{{/*
+Resolves the pod command, appending `--skip-migrations` when migrationJob.enabled.
+
+When the chart runs migrations in a dedicated pre-sync Job, the runtime must
+NOT also run them on boot — otherwise N pods race against the lock and the
+Job's whole point (single execution point + pre-deploy gate) is undermined.
+The studio CLI already exposes `--skip-migrations` (see apps/mesh/src/cli.ts),
+so we just append it to the configured command.
+*/}}
+{{- define "chart-deco-studio.podCommand" -}}
+{{- $cmd := default (list "bun" "run" "deco" "--no-local-mode") .Values.image.command -}}
+{{- if and .Values.migrationJob .Values.migrationJob.enabled -}}
+{{- $cmd = append $cmd "--skip-migrations" -}}
+{{- end -}}
+{{- toYaml $cmd -}}
+{{- end -}}
diff --git a/deploy/helm/studio/templates/deployment.yaml b/deploy/helm/studio/templates/deployment.yaml
index 790c9d7a14..e712573d3a 100644
--- a/deploy/helm/studio/templates/deployment.yaml
+++ b/deploy/helm/studio/templates/deployment.yaml
@@ -1,3 +1,4 @@
+{{- if not (and .Values.argoRollouts .Values.argoRollouts.enabled) }}
 apiVersion: apps/v1
 kind: Deployment
 metadata:
@@ -24,196 +25,5 @@ spec:
     matchLabels:
       {{- include "chart-deco-studio.selectorLabels" . | nindent 6 }}
   template:
-    metadata:
-      {{- with .Values.podAnnotations }}
-      annotations:
-        {{- toYaml . | nindent 8 }}
-      {{- end }}
-      labels:
-        {{- include "chart-deco-studio.labels" . | nindent 8 }}
-        {{- with .Values.podLabels }}
-        {{- toYaml . | nindent 8 }}
-        {{- end }}
-    spec:
-      {{- if .Values.terminationGracePeriodSeconds }}
-      terminationGracePeriodSeconds: {{ .Values.terminationGracePeriodSeconds }}
-      {{- end }}
-      {{- with .Values.imagePullSecrets }}
-      imagePullSecrets:
-        {{- toYaml . | nindent 8 }}
-      {{- end }}
-      serviceAccountName: {{ include "chart-deco-studio.serviceAccountName" . }}
-      securityContext:
-        {{- include "chart-deco-studio.podSecurityContext" . | nindent 8 }}
-      containers:
-        - name: {{ .Chart.Name }}
-          {{- with .Values.securityContext }}
-          securityContext:
-            {{- toYaml . | nindent 12 }}
-          {{- end }}
-          image: "{{ .Values.image.repository }}{{- if and .Values.image.tag (hasPrefix "sha256:" .Values.image.tag) }}@{{ .Values.image.tag }}{{- else }}:{{ .Values.image.tag | default .Chart.AppVersion }}{{- end }}"
-          imagePullPolicy: {{ .Values.image.pullPolicy }}
-          {{- with .Values.image.command }}
-          command:
-            {{- toYaml . | nindent 12 }}
-          {{- end }}
-          ports:
-            - name: http
-              containerPort: {{ .Values.service.targetPort | default 3000 }}
-              protocol: TCP
-          envFrom:
-            - configMapRef:
-                name: {{ include "chart-deco-studio.fullname" . }}-config
-            - secretRef:
-                name: {{ include "chart-deco-studio.secretName" . }}
-          env:
-            {{- if .Values.otel.enabled }}
-            {{- if .Values.otel.protocol }}
-            - name: OTEL_EXPORTER_OTLP_PROTOCOL
-              value: {{ .Values.otel.protocol | quote }}
-            {{- end }}
-            {{- if .Values.otel.service }}
-            - name: OTEL_SERVICE_NAME
-              value: {{ .Values.otel.service | quote }}
-            {{- end }}
-            {{- if .Values.otel.endpoint }}
-            - name: OTEL_EXPORTER_OTLP_ENDPOINT
-              value: {{ .Values.otel.endpoint | quote }}
-            {{- else if .Values.otel.collector.enabled }}
-            - name: OTEL_EXPORTER_OTLP_ENDPOINT
-              value: {{ printf "http://%s-opentelemetry-collector:4318" .Release.Name | quote }}
-            {{- end }}
-            {{- if and .Values.otel.headers (or (not .Values.otel.collector.enabled) .Values.otel.endpoint) }}
-            - name: OTEL_EXPORTER_OTLP_HEADERS
-              value: {{ include "chart-deco-studio.otelHeaders" . | quote }}
-            {{- end }}
-            {{- if .Values.otel.attributes }}
-            - name: OTEL_RESOURCE_ATTRIBUTES
-              value: {{ include "chart-deco-studio.otelAttributes" . | quote }}
-            {{- end }}
-            {{- end }}
-            {{- if and .Values.dbosConductor .Values.dbosConductor.enabled }}
-            - name: DBOS_CONDUCTOR_KEY
-              valueFrom:
-                secretKeyRef:
-                  name: {{ .Values.dbosConductor.existingSecret | default (printf "%s-dbos-conductor" (include "chart-deco-studio.fullname" .)) }}
-                  key: {{ .Values.dbosConductor.existingSecretKey | default "DBOS_CONDUCTOR_KEY" }}
-            {{- with .Values.dbosConductor.url }}
-            - name: DBOS_CONDUCTOR_URL
-              value: {{ . | quote }}
-            {{- end }}
-            {{- end }}
-            {{- with .Values.env }}
-            {{- toYaml . | nindent 12 }}
-            {{- end }}
-          {{- with .Values.livenessProbe }}
-          livenessProbe:
-            {{- toYaml . | nindent 12 }}
-          {{- end }}
-          {{- with .Values.readinessProbe }}
-          readinessProbe:
-            {{- toYaml . | nindent 12 }}
-          {{- end }}
-          {{- with .Values.lifecycle }}
-          lifecycle:
-            {{- toYaml . | nindent 12 }}
-          {{- end }}
-          {{- with .Values.resources }}
-          resources:
-            {{- toYaml . | nindent 12 }}
-          {{- end }}
-          volumeMounts:
-            - name: data
-              mountPath: {{ .Values.configMap.meshConfig.DATA_DIR | default "/app/data" }}
-            {{- if and (eq (lower (default "sqlite" .Values.database.engine)) "postgresql") .Values.database.caCert }}
-            - name: ca-cert
-              mountPath: /etc/ssl/certs
-              readOnly: true
-            {{- end }}
-            {{- with .Values.volumeMounts }}
-            {{- toYaml . | nindent 12 }}
-            {{- end }}
-        {{- if .Values.s3Sync.enabled }}
-        - name: s3-sync
-          image: "{{ .Values.s3Sync.image.repository }}:{{ .Values.s3Sync.image.tag }}"
-          imagePullPolicy: {{ .Values.s3Sync.image.pullPolicy }}
-          command: ["/bin/sh", "/scripts/sync.sh"]
-          {{- with .Values.securityContext }}
-          securityContext:
-            {{- toYaml . | nindent 12 }}
-          {{- end }}
-          {{- with .Values.s3Sync.resources }}
-          resources:
-            {{- toYaml . | nindent 12 }}
-          {{- end }}
-          volumeMounts:
-            - name: data
-              mountPath: {{ .Values.configMap.meshConfig.DATA_DIR | default "/app/data" }}
-            - name: s3-sync-script
-              mountPath: /scripts
-              readOnly: true
-        {{- end }}
-        {{- with .Values.extraContainers }}
-        {{- toYaml . | nindent 8 }}
-        {{- end }}
-      volumes:
-        {{- if and (eq (lower (default "sqlite" .Values.database.engine)) "postgresql") .Values.database.caCert }}
-        - name: ca-cert
-          configMap:
-            name: {{ include "chart-deco-studio.fullname" . }}-ca-cert
-            items:
-              - key: ca-cert.pem
-                path: ca-cert.pem
-        {{- end }}
-        {{- if .Values.persistence.enabled }}
-        - name: data
-          {{- if .Values.persistence.claimName }}
-          persistentVolumeClaim:
-            claimName: {{ .Values.persistence.claimName }}
-          {{- else }}
-          persistentVolumeClaim:
-            claimName: {{ include "chart-deco-studio.fullname" . }}-data
-          {{- end }}
-        {{- else }}
-        - name: data
-          emptyDir:
-            sizeLimit: {{ .Values.persistence.emptyDirSizeLimit | default "10Gi" }}
-        {{- end }}
-        {{- if .Values.s3Sync.enabled }}
-        - name: s3-sync-script
-          configMap:
-            name: {{ include "chart-deco-studio.fullname" . }}-s3-sync
-            defaultMode: 0755
-            items:
-              - key: sync.sh
-                path: sync.sh
-        {{- end }}
-        {{- with .Values.volumes }}
-        {{- toYaml . | nindent 8 }}
-        {{- end }}
-      {{- with .Values.nodeSelector }}
-      nodeSelector:
-        {{- toYaml . | nindent 8 }}
-      {{- end }}
-      {{- with .Values.affinity }}
-      affinity:
-        {{- toYaml . | nindent 8 }}
-      {{- end }}
-      {{- with .Values.tolerations }}
-      tolerations:
-        {{- toYaml . | nindent 8 }}
-      {{- end }}
-      {{- if and .Values.topologySpreadConstraints (gt (len .Values.topologySpreadConstraints) 0) }}
-      topologySpreadConstraints:
-        {{- range .Values.topologySpreadConstraints }}
-        - {{- if not .labelSelector }}
-          {{- fail "labelSelector é obrigatório em topologySpreadConstraints. Especifique explicitamente os labels." }}
-          {{- else }}
-          labelSelector:
-            {{- toYaml .labelSelector | nindent 12 }}
-          {{- end }}
-          maxSkew: {{ .maxSkew }}
-          topologyKey: {{ .topologyKey }}
-          whenUnsatisfiable: {{ .whenUnsatisfiable }}
-        {{- end }}
-      {{- end }}
+    {{- include "chart-deco-studio.podTemplate" . | nindent 4 }}
+{{- end }}
diff --git a/deploy/helm/studio/templates/migration-job.yaml b/deploy/helm/studio/templates/migration-job.yaml
new file mode 100644
index 0000000000..1424086451
--- /dev/null
+++ b/deploy/helm/studio/templates/migration-job.yaml
@@ -0,0 +1,105 @@
+{{- if and .Values.migrationJob .Values.migrationJob.enabled }}
+{{- /*
+Pre-deploy migration Job — opt-in via `migrationJob.enabled: true`.
+
+Runs `bun run --cwd=apps/mesh migrate` ONCE before the workload (Deployment or
+Rollout) sees the new image. Both Helm and ArgoCD hook semantics are declared
+so the Job ordering is preserved whether the chart is installed via `helm
+upgrade` directly or synced by ArgoCD:
+
+  - helm.sh/hook: pre-install,pre-upgrade   (helm)
+  - argocd.argoproj.io/hook: PreSync        (argocd)
+  - sync-wave: -1                           (extra ordering hint)
+
+When this Job is enabled, the workload pod command gets `--skip-migrations`
+appended (see _pod-template.tpl) so the runtime DOES NOT also run migrations on
+boot. This eliminates the race between N replicas trying to migrate simultaneously
+and gives an explicit pre-deploy gate: if the migration Job fails, the release
+is aborted before the new ReplicaSet is created.
+
+IMPORTANT: this Job moves migration ordering but does NOT eliminate the
+schema/code compatibility problem during the overlap window between old and
+new pods. Destructive migrations (DROP/RENAME/type changes) still require
+expand-contract discipline at the migration code level. See chart README for
+the migration playbook.
+*/}}
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: {{ include "chart-deco-studio.fullname" . }}-migrate-{{ .Release.Revision | default 0 }}
+  labels:
+    {{- include "chart-deco-studio.labels" . | nindent 4 }}
+  annotations:
+    "helm.sh/hook": pre-install,pre-upgrade
+    "helm.sh/hook-weight": "-1"
+    "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
+    "argocd.argoproj.io/hook": PreSync
+    "argocd.argoproj.io/hook-delete-policy": BeforeHookCreation
+    "argocd.argoproj.io/sync-wave": "-1"
+spec:
+  backoffLimit: {{ default 0 .Values.migrationJob.backoffLimit }}
+  activeDeadlineSeconds: {{ default 600 .Values.migrationJob.activeDeadlineSeconds }}
+  ttlSecondsAfterFinished: {{ default 300 .Values.migrationJob.ttlSecondsAfterFinished }}
+  template:
+    metadata:
+      labels:
+        {{- include "chart-deco-studio.labels" . | nindent 8 }}
+        {{- with .Values.podLabels }}
+        {{- toYaml . | nindent 8 }}
+        {{- end }}
+      {{- with .Values.podAnnotations }}
+      annotations:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+    spec:
+      restartPolicy: Never
+      {{- with .Values.imagePullSecrets }}
+      imagePullSecrets:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      serviceAccountName: {{ include "chart-deco-studio.serviceAccountName" . }}
+      securityContext:
+        {{- include "chart-deco-studio.podSecurityContext" . | nindent 8 }}
+      containers:
+        - name: migrate
+          {{- with .Values.securityContext }}
+          securityContext:
+            {{- toYaml . | nindent 12 }}
+          {{- end }}
+          image: "{{ .Values.image.repository }}{{- if and .Values.image.tag (hasPrefix "sha256:" .Values.image.tag) }}@{{ .Values.image.tag }}{{- else }}:{{ .Values.image.tag | default .Chart.AppVersion }}{{- end }}"
+          imagePullPolicy: {{ .Values.image.pullPolicy }}
+          command:
+            {{- toYaml (default (list "bun" "run" "--cwd=apps/mesh" "migrate") .Values.migrationJob.command) | nindent 12 }}
+          envFrom:
+            - configMapRef:
+                name: {{ include "chart-deco-studio.fullname" . }}-config
+            - secretRef:
+                name: {{ include "chart-deco-studio.secretName" . }}
+          {{- with .Values.migrationJob.resources }}
+          resources:
+            {{- toYaml . | nindent 12 }}
+          {{- end }}
+          {{- if and (eq (lower (default "sqlite" .Values.database.engine)) "postgresql") .Values.database.caCert }}
+          volumeMounts:
+            - name: ca-cert
+              mountPath: /etc/ssl/certs
+              readOnly: true
+          {{- end }}
+      {{- if and (eq (lower (default "sqlite" .Values.database.engine)) "postgresql") .Values.database.caCert }}
+      volumes:
+        - name: ca-cert
+          configMap:
+            name: {{ include "chart-deco-studio.fullname" . }}-ca-cert
+            items:
+              - key: ca-cert.pem
+                path: ca-cert.pem
+      {{- end }}
+      {{- with .Values.nodeSelector }}
+      nodeSelector:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      {{- with .Values.tolerations }}
+      tolerations:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+{{- end }}
diff --git a/deploy/helm/studio/templates/rollout.yaml b/deploy/helm/studio/templates/rollout.yaml
new file mode 100644
index 0000000000..b4badac67e
--- /dev/null
+++ b/deploy/helm/studio/templates/rollout.yaml
@@ -0,0 +1,92 @@
+{{- if and .Values.argoRollouts .Values.argoRollouts.enabled }}
+{{- /*
+Argo Rollouts Rollout — opt-in via `argoRollouts.enabled: true`.
+
+Replaces the standard Deployment with a Rollout managed by argo-rollouts. The
+pod template is shared with the Deployment path via the `podTemplate` helper,
+so both kinds describe the same workload; only the orchestration around traffic
+switching differs.
+
+BlueGreen strategy:
+  - activeService: routes prod traffic. Argo updates its selector to point to
+    the active ReplicaSet (the post-promotion one).
+  - previewService: routes preview/probe traffic. Argo points it at the new
+    ReplicaSet before promotion so it can be smoke-tested / inspected.
+  - autoPromotionEnabled: false by default — promote manually via the Argo UI
+    after verifying the preview, or wire prePromotionAnalysis to gate it.
+  - scaleDownDelaySeconds: keeps the old ReplicaSet alive for N seconds after
+    promotion so in-flight requests drain; an explicit 0 is rendered as-is.
+
+Canary strategy: also supported — set `argoRollouts.strategy.canary` instead.
+Pick one; setting both throws at install time.
+
+IMPORTANT for stateful workloads (long-lived sessions, MCP / chat): destructive
+schema migrations during a blue/green window will break the old version. Pair
+this Rollout with `migrationJob.enabled: true` so migrations run before the new
+ReplicaSet is brought up, AND adopt expand-contract migration discipline so the
+overlap window between old and new code is safe.
+*/}}
+apiVersion: argoproj.io/v1alpha1
+kind: Rollout
+metadata:
+  name: {{ include "chart-deco-studio.fullname" . }}
+  labels:
+    {{- include "chart-deco-studio.labels" . | nindent 4 }}
+spec:
+  {{- if not .Values.autoscaling.enabled }}
+  replicas: {{ .Values.replicaCount }}
+  {{- end }}
+  selector:
+    matchLabels:
+      {{- include "chart-deco-studio.selectorLabels" . | nindent 6 }}
+  {{- if not (kindIs "invalid" .Values.argoRollouts.revisionHistoryLimit) }}
+  revisionHistoryLimit: {{ .Values.argoRollouts.revisionHistoryLimit }}
+  {{- end }}
+  template:
+    {{- include "chart-deco-studio.podTemplate" . | nindent 4 }}
+  strategy:
+    {{- $bg := default dict (default dict .Values.argoRollouts.strategy).blueGreen }}
+    {{- $canary := default dict (default dict .Values.argoRollouts.strategy).canary }}
+    {{- if and $bg.enabled $canary.enabled }}
+    {{- fail "chart-deco-studio: argoRollouts.strategy.blueGreen.enabled and argoRollouts.strategy.canary.enabled are mutually exclusive — pick one." }}
+    {{- end }}
+    {{- if $canary.enabled }}
+    canary:
+      {{- with $canary.canaryService }}
+      canaryService: {{ . }}
+      {{- end }}
+      {{- with $canary.stableService }}
+      stableService: {{ . }}
+      {{- end }}
+      {{- with $canary.steps }}
+      steps:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      {{- with $canary.trafficRouting }}
+      trafficRouting:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+    {{- else }}
+    blueGreen:
+      activeService: {{ default (include "chart-deco-studio.fullname" .) $bg.activeServiceName }}
+      previewService: {{ default (printf "%s-preview" (include "chart-deco-studio.fullname" .)) $bg.previewServiceName }}
+      autoPromotionEnabled: {{ default false $bg.autoPromotionEnabled }}
+      {{- with $bg.autoPromotionSeconds }}
+      autoPromotionSeconds: {{ . }}
+      {{- end }}
+      {{- if not (kindIs "invalid" $bg.scaleDownDelaySeconds) }}
+      scaleDownDelaySeconds: {{ $bg.scaleDownDelaySeconds }}
+      {{- end }}
+      {{- with $bg.previewReplicaCount }}
+      previewReplicaCount: {{ . }}
+      {{- end }}
+      {{- with $bg.prePromotionAnalysis }}
+      prePromotionAnalysis:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      {{- with $bg.postPromotionAnalysis }}
+      postPromotionAnalysis:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+    {{- end }}
+{{- end }}
diff --git a/deploy/helm/studio/templates/service-preview.yaml b/deploy/helm/studio/templates/service-preview.yaml
new file mode 100644
index 0000000000..adc21191f6
--- /dev/null
+++ b/deploy/helm/studio/templates/service-preview.yaml
@@ -0,0 +1,42 @@
+{{- /*
+Preview Service for blue-green Rollouts.
+
+Argo Rollouts manages the selectors of `activeService` (this chart's main
+Service) and `previewService` (this one) — it injects the rollout's pod-
+template-hash so each Service routes to the correct ReplicaSet at each phase:
+
+  - previewService → new ReplicaSet, available BEFORE promotion (use it to
+    smoke-test the green version, or to wire prePromotionAnalysis probes).
+  - activeService  → current ReplicaSet that serves prod traffic.
+
+Rendered whenever the Rollout takes its blueGreen branch (canary not enabled),
+matching rollout.yaml. Canary uses traffic splitting (Istio/NGINX) instead.
+*/}}
+{{- $rollouts := default dict .Values.argoRollouts }}
+{{- $strategy := default dict $rollouts.strategy }}
+{{- $bg := default dict $strategy.blueGreen }}
+{{- if and $rollouts.enabled (not (default dict $strategy.canary).enabled) }}
+apiVersion: v1
+kind: Service
+metadata:
+  name: {{ default (printf "%s-preview" (include "chart-deco-studio.fullname" .)) $bg.previewServiceName }}
+  labels:
+    {{- include "chart-deco-studio.labels" . | nindent 4 }}
+    app.kubernetes.io/component: preview
+spec:
+  type: {{ .Values.service.type }}
+  ports:
+    - port: {{ .Values.service.port }}
+      targetPort: http
+      protocol: TCP
+      name: http
+  selector:
+    {{- include "chart-deco-studio.selectorLabels" . | nindent 4 }}
+  {{- if .Values.service.sessionAffinity }}
+  sessionAffinity: {{ .Values.service.sessionAffinity }}
+  {{- end }}
+  {{- if .Values.service.sessionAffinityConfig }}
+  sessionAffinityConfig:
+    {{- toYaml .Values.service.sessionAffinityConfig | nindent 4 }}
+  {{- end }}
+{{- end }}
diff --git a/deploy/helm/studio/values.yaml b/deploy/helm/studio/values.yaml
index b068428a0c..2b3df4d22e 100644
--- a/deploy/helm/studio/values.yaml
+++ b/deploy/helm/studio/values.yaml
@@ -52,6 +52,73 @@ autoscaling:
   targetCPUUtilizationPercentage: 80
   targetMemoryUtilizationPercentage: 80
 
+# Argo Rollouts opt-in.
+#
+# When `argoRollouts.enabled: true`, the chart renders a Rollout (argoproj.io/v1alpha1)
+# in place of the standard Deployment. The pod template is shared between both
+# kinds via the `podTemplate` helper, so toggling this flag changes ONLY the
+# orchestration around traffic switching — never the workload itself.
+#
+# Requires: argo-rollouts controller installed in the cluster
+# (https://argo-rollouts.readthedocs.io/).
+#
+# IMPORTANT: blue-green amplifies the schema/code overlap window. Destructive
+# DB migrations (DROP/RENAME/type changes) on a single release WILL break the
+# old (blue) ReplicaSet while it still serves traffic. Pair with
+# `migrationJob.enabled: true` so migrations run before the new ReplicaSet is
+# scaled up, and adopt expand-contract migration discipline at the code level.
+argoRollouts:
+  enabled: false
+  # revisionHistoryLimit: 5  # how many old ReplicaSets to keep
+  strategy:
+    # Pick exactly ONE of `blueGreen.enabled` or `canary.enabled`.
+    blueGreen:
+      enabled: true
+      # autoPromotionEnabled: false → manual promote in the Argo UI after
+      # validating preview. Set true (or wire prePromotionAnalysis) to
+      # auto-promote once the new ReplicaSet is healthy.
+      autoPromotionEnabled: false
+      # autoPromotionSeconds: 0  # auto-promote N seconds after preview becomes available
+      # Keep the old ReplicaSet alive for N seconds after promotion so in-flight
+      # requests drain cleanly. Reasonable default for long-lived sessions
+      # (MCP / chat connections).
+      scaleDownDelaySeconds: 30
+      # previewReplicaCount: 1  # scale preview to N replicas until promote
+      # activeServiceName: ""  # default: the chart fullname
+      # previewServiceName: ""  # default: the chart fullname + "-preview"
+      # prePromotionAnalysis: {}
+      # postPromotionAnalysis: {}
+    canary:
+      enabled: false
+      # canaryService: ""
+      # stableService: ""
+      # steps: []
+      # trafficRouting: {}
+
+# Pre-deploy migration Job (Helm pre-upgrade + ArgoCD PreSync hook).
+#
+# When enabled, the chart renders a Job that runs `bun run --cwd=apps/mesh
+# migrate` ONCE before the workload (Deployment or Rollout) sees the new
+# image. The runtime pod command gets `--skip-migrations` appended (see
+# _pod-template.tpl) so pods don't also try to migrate on boot — eliminating
+# the race between N replicas migrating concurrently and giving a clear
+# pre-deploy gate (Job fails → release aborted).
+#
+# Critical companion to `argoRollouts.enabled: true` for blue-green safety:
+# migrations have already settled before the new ReplicaSet starts probing.
+migrationJob:
+  enabled: false
+  # command: ["bun", "run", "--cwd=apps/mesh", "migrate"]  # override if needed
+  backoffLimit: 0
+  activeDeadlineSeconds: 600
+  ttlSecondsAfterFinished: 300
+  resources:
+    requests:
+      memory: "1Gi"
+      cpu: "500m"
+    limits:
+      memory: "2Gi"
+
 configMap:
   meshConfig:
     NODE_ENV: "production"