diff --git a/operator/Makefile b/operator/Makefile
index c341ee150..45a4ecbe8 100644
--- a/operator/Makefile
+++ b/operator/Makefile
@@ -222,6 +222,24 @@ run-scale-test:
 	@echo "> Running scale tests..."
 	@cd e2e && go test -count=1 -tags=e2e ./tests/scale/... -v -timeout 40m
 
+# Run the soak / churn benchmark against an existing cluster.
+# Gated behind the `soak` build tag so it does NOT run as part of
+# run-e2e or run-scale-test. Long-running (~30 min default).
+#
+# Usage: make run-soak-test [DIAG_DIR=<path>] [DIAG_MODE=<mode>] \
+#                           [SOAK_BASE=<n>] [SOAK_PEAK=<n>] [SOAK_CYCLES=<n>]
+# Examples:
+#   make run-soak-test                                       # Defaults: 25 → 50 PCS, 10 cycles
+#   make run-soak-test SOAK_CYCLES=3                         # Shorter run for quick validation
+#   make run-soak-test DIAG_DIR=/tmp/soak-baseline           # Capture artifacts for later comparison
+.PHONY: run-soak-test
+run-soak-test: export GROVE_E2E_DIAG_DIR = $(DIAG_DIR)
+run-soak-test: export GROVE_E2E_DIAG_MODE = $(DIAG_MODE)
+run-soak-test:
+	@echo "> Running soak / churn benchmark (long-running, on-demand)..."
+	@cd e2e && SOAK_BASE=$(SOAK_BASE) SOAK_PEAK=$(SOAK_PEAK) SOAK_CYCLES=$(SOAK_CYCLES) \
+		go test -count=1 -tags='e2e soak' ./tests/scale/... -v -timeout 60m -run Test_SoakChurn
+
 # Make targets for local development and testing
 # -------------------------------------------------------------
 # Starts a local k8s cluster using kind.
diff --git a/operator/e2e/measurement/condition/pod.go b/operator/e2e/measurement/condition/pod.go
index f3f22669c..271845264 100644
--- a/operator/e2e/measurement/condition/pod.go
+++ b/operator/e2e/measurement/condition/pod.go
@@ -135,3 +135,38 @@ func (c *PodsReadyCondition) Met(ctx context.Context) (bool, error) {
 func (c *PodsReadyCondition) Progress(_ context.Context) string {
 	return fmt.Sprintf("%d/%d pods ready", c.lastReady, c.ExpectedCount)
 }
+
+// PodsAtCountCondition fires when the live pod count drops to ExpectedCount or
+// below. Intended for scale-down milestones where PodsCreatedCondition (≥-only)
+// would fire immediately because the starting count already exceeds the target.
+// Using ≤ rather than == makes the condition robust to a transient overshoot
+// during cascade-delete where two consecutive polls might skip the exact target.
+type PodsAtCountCondition struct {
+	Client        client.Client
+	Namespace     string
+	LabelSelector string
+	ExpectedCount int
+	lastCount     int
+	sel           parsedSelector
+}
+
+// Met returns true once the live pod count is ≤ ExpectedCount.
+func (c *PodsAtCountCondition) Met(ctx context.Context) (bool, error) {
+	if c.ExpectedCount < 0 {
+		return false, errors.New("expected count cannot be negative")
+	}
+
+	c.sel.init(c.LabelSelector)
+	pods, err := listPods(ctx, c.Client, c.Namespace, &c.sel)
+	if err != nil {
+		return false, err
+	}
+
+	c.lastCount = len(pods)
+	return c.lastCount <= c.ExpectedCount, nil
+}
+
+// Progress returns a human-readable progress string.
+func (c *PodsAtCountCondition) Progress(_ context.Context) string {
+	return fmt.Sprintf("%d pods (target ≤%d)", c.lastCount, c.ExpectedCount)
+}
diff --git a/operator/e2e/tests/scale/soak_churn.md b/operator/e2e/tests/scale/soak_churn.md
new file mode 100644
index 000000000..429fa563e
--- /dev/null
+++ b/operator/e2e/tests/scale/soak_churn.md
@@ -0,0 +1,175 @@
+# Soak / Churn Benchmark
+
+## GitHub Issue
+
+Use the **Enhancement** template (`.github/ISSUE_TEMPLATE/ENHANCEMENT.yaml`).
+The fields below are ready to paste into the form.
+
+**Title**
+
+> Add long-running soak / churn benchmark to e2e scale tests
+
+**Labels**
+
+`enhancement`, `area/testing`, `area/operator`
+
+---
+
+### What you would like to be added?
+
+Add a long-running benchmark under `operator/e2e/tests/scale/` that
+boots a small `PodCliqueSet` (~100 pods) and then drives repeated
+scale-up / scale-down cycles against it over an extended duration.
+The goal is to surface bugs that only appear after many incremental
+reconciles — leaks, monotonically growing slices, gradually drifting
+counters, finalizer pile-ups — that single-shot benchmarks cannot see.
+
+**Scope (single test, parameterized):**
+
+| Parameter | Default | Env override | Rationale |
+|---|---|---|---|
+| Base PCS replicas | 25 (50 pods) | `SOAK_BASE` | Small enough to fit dev clusters; large enough to exercise multi-child reconciles. |
+| Peak PCS replicas | 50 (100 pods) | `SOAK_PEAK` | 2x amplitude per cycle. |
+| Cycles | 10 | `SOAK_CYCLES` | Enough per-cycle iterations to surface slow drift; bounded total runtime. |
+| Per-cycle hold | ~30 s | — | Lets watch events flush and pprof capture overlap consecutive no-op reconciles. |
+| Worker nodes | 30 kwok nodes | — | Same fixture as scale-up/down tests. |
+| Timeout | 45 min | — | Soft cap; expected runtime ~25–30 min. |
+
+**Deliverables:**
+
+- `operator/e2e/tests/scale/soak_test.go`.
+- `operator/e2e/yaml/soak-churn.yaml` (PCS sized at base; ~50 pods).
+- Reuse `runScaleTest`, `ScalePCS`, `PodsCreatedCondition`,
+  `TimerCondition`. The same scale-down milestone gap noted in the
+  scale-up-down issue applies here (`PodsAtCountCondition` is
+  prerequisite).
+- Final-check phase asserting no leaks/orphans/unbounded counters at
+  the end of the run.
+
+**Cycle shape (executed N times):**
+
+1. **scale-up** — patch `spec.replicas` base → peak. Wait until live
+   pod count reaches `peak * podsPerReplica`.
+2. **hold-peak** — 30 s window.
+3. **scale-down** — patch `spec.replicas` peak → base. Wait until live
+   pod count reaches `base * podsPerReplica`.
+4. **hold-base** — 30 s window.
+
+**Final-check assertions:**
+
+- Live pod count equals base target (no leaked pods).
+- No `PodCliqueScalingGroup` or `PodClique` exists outside the expected
+  child set.
+- `status.updateProgress` (if present) has bounded counters:
+  `UpdatedPodCliquesCount ≤ TotalPodCliquesCount`.
+- `status.lastErrors` is empty.
+
+---
+
+### Why is this needed?
+
+Today's scale benchmarks run for minutes and measure a single event
+(deploy, resize, delete). They are good at catching gross regressions
+in per-event throughput, but they pass cleanly on a controller that is
+slowly leaking heap, accumulating stale entries in an informer index,
+or letting one status field drift by an item per reconcile.
+
+Issues we have personally been bitten by, and that a soak/churn test
+**would catch**:
+
+- **#567 — unbounded slices in status.** The original
+  `UpdatedPodCliques` / `UpdatedPodCliqueScalingGroups` slices grew
+  with every reconcile. A single-shot benchmark looked fine; a soak
+  test would have flagged it within an hour.
+- **Finalizer / cascade-delete drift.** Repeated scale-down cycles
+  exercise the partial-delete path many times. Any reconcile that
+  forgets to clear an expectation, or that re-emits a finalizer, will
+  accumulate across cycles.
+- **Watch-event amplification.** Status mutators that fire a Patch on
+  byte-identical status (no real change) waste a write per reconcile
+  and trigger a cascade of watch events. The equality-short-circuit
+  guards added in #567 prevent this, but only a churn test exercises
+  the steady-state-Patch-suppression path enough to verify it.
+- **Heap growth without a leak.** Even without an obvious bug, churn
+  reveals whether the controller's working-set memory plateaus or
+  keeps climbing. A pprof window over the second half of the run is
+  the cheapest way to spot a slow trend.
+
+---
+
+## Design Detail
+
+The sections below are for the implementation PR — they are not part of
+the GitHub issue body.
+
+### Goals
+
+- Drive ~10 scale-up / scale-down cycles against a single PCS over a
+  duration long enough to expose slow leaks (target: 30–60 min).
+- Run on the smaller dev fixture (≤ 30 worker nodes, ~100 pods peak).
+  The point is reconcile churn, not absolute capacity.
+- Reuse the existing `runScaleTest` scaffolding and pprof hook so the
+  output format and capture mechanism match the rest of the suite.
+- Assert at the end that the PCS reaches its final steady state
+  cleanly — no orphaned children, counters bounded, no unexpected
+  `lastErrors`.
+
+### Non-Goals
+
+- Multi-PCS / multi-tenant soak. Single-PCS churn is enough to surface
+  controller-internal leaks; multi-tenant belongs in a separate test.
+- Failure-injection (operator restarts, API server flakes). The soak
+  test is for steady-state churn; chaos is out of scope.
+- Comparison against historical runs. The output is captured in the
+  same JSON format as other scale tests, but no regression gate is
+  defined here.
+
+### Test Shape
+
+Three top-level phases:
+
+1. **deploy** — apply the YAML at base; wait for `pods-ready`.
+2. **churn-cycles** — loop N times executing the cycle shape above.
+   Each cycle adds milestones to the timeline so the per-cycle cost is
+   visible in the output.
+3. **soak-final-check** — assertions listed in the issue body; if any
+   fails the test fails.
+
+There is no explicit `delete` phase — the test fixture cleanup tears
+the PCS down at the end. (If we want a delete-time measurement, add
+it; otherwise keep the scope tight.)
+
+### Implementation Notes
+
+- New file: `operator/e2e/tests/scale/soak_test.go`.
+- New YAML fixture: `operator/e2e/yaml/soak-churn.yaml`. PCS sized at
+  the base (25 replicas, 50 pods) so the deploy phase is short and the
+  churn phases dominate the timeline.
+- Per-cycle phases can be added programmatically in a loop inside the
+  test's `addPhases` callback — each iteration calls
+  `tracker.AddPhase` four times with unique names suffixed by the
+  cycle index (e.g. `scale-up-c3`).
+- Reuses existing `ScalePCS`, `PodsCreatedCondition`, and
+  `TimerCondition`. **Same caveat as the scale-down doc:** the
+  scale-down legs need a "live pod count drops to N" condition that
+  `PodsCreatedCondition` (which is monotonic-up) does not provide.
+- The pprof hook in `setupPprofHook` is already per-run; one capture
+  spans the whole soak. If we want per-cycle pprof captures, that's a
+  follow-up.
+- Make cycle count and amplitude env-tunable (`SOAK_CYCLES`,
+  `SOAK_BASE`, `SOAK_PEAK`) so the same test can run at small scale in
+  CI and at larger scale on dev clusters without a new test function.
+
+### Open Questions
+
+- Should the soak test be tagged with a build tag stricter than `e2e`
+  (e.g. `e2e_soak`) so it isn't pulled into the default suite by
+  accident? The 30-min default runtime is too long for routine CI.
+- Where should the final-check assertions live? Adding them as
+  measurement conditions keeps the timeline coherent; embedding them
+  as `t.Fatalf` checks after `tracker.Wait()` is simpler but breaks
+  the "everything is a phase" pattern.
+- Do we want a heap-comparison assertion (e.g. compare pprof heap
+  samples between cycle 1 and cycle N, fail if delta exceeds a
+  threshold)? Useful but stateful — defer until the first real
+  regression motivates it.
diff --git a/operator/e2e/tests/scale/soak_test.go b/operator/e2e/tests/scale/soak_test.go
new file mode 100644
index 000000000..5aba8842d
--- /dev/null
+++ b/operator/e2e/tests/scale/soak_test.go
@@ -0,0 +1,257 @@
+//go:build e2e && soak
+
+// /*
+// Copyright 2026 The Grove Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// */
+
+package scale
+
+import (
+	"context"
+	"fmt"
+	"os"
+	"strconv"
+	"testing"
+	"time"
+
+	grovecorev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1"
+	"github.com/ai-dynamo/grove/operator/e2e/grove/workload"
+	"github.com/ai-dynamo/grove/operator/e2e/k8s/resources"
+	"github.com/ai-dynamo/grove/operator/e2e/measurement"
+	"github.com/ai-dynamo/grove/operator/e2e/measurement/condition"
+	"github.com/ai-dynamo/grove/operator/e2e/testctx"
+
+	"sigs.k8s.io/controller-runtime/pkg/client"
+)
+
+const (
+	soakTimeout      = 60 * time.Minute
+	soakWorkerNodes  = 30
+	soakPerCycleHold = 30 * time.Second
+	soakPodsPerCLQ   = 2
+
+	soakDefaultBase   = 25
+	soakDefaultPeak   = 50
+	soakDefaultCycles = 10
+
+	soakWorkloadName = "soak-churn"
+	soakYAMLPath     = "../../yaml/soak-churn.yaml"
+)
+
+// soakConfig is the resolved cycle configuration. Read once from env at the
+// top of the test so a failing parse fails the test immediately rather than
+// midway through a 30-minute run.
+type soakConfig struct {
+	base   int
+	peak   int
+	cycles int
+}
+
+func loadSoakConfig() soakConfig {
+	return soakConfig{
+		base:   envInt("SOAK_BASE", soakDefaultBase),
+		peak:   envInt("SOAK_PEAK", soakDefaultPeak),
+		cycles: envInt("SOAK_CYCLES", soakDefaultCycles),
+	}
+}
+
+func envInt(key string, def int) int {
+	raw := os.Getenv(key)
+	if raw == "" {
+		return def
+	}
+	n, err := strconv.Atoi(raw)
+	if err != nil || n <= 0 {
+		return def
+	}
+	return n
+}
+
+func (c soakConfig) basePods() int { return c.base * soakPodsPerCLQ }
+func (c soakConfig) peakPods() int { return c.peak * soakPodsPerCLQ }
+
+// Test_SoakChurn drives repeated scale-up / scale-down cycles against a single
+// small PCS to surface bugs that only appear after many incremental reconciles
+// — leaks, monotonically growing fields, gradually drifting counters,
+// finalizer pile-ups. Gated behind the `soak` build tag so it does not run as
+// part of the default e2e suite. See operator/e2e/tests/scale/soak_churn.md.
+func Test_SoakChurn(t *testing.T) {
+	cfg := loadSoakConfig()
+	if cfg.peak <= cfg.base {
+		t.Fatalf("SOAK_PEAK (%d) must be > SOAK_BASE (%d)", cfg.peak, cfg.base)
+	}
+
+	runScaleTest(t, scaleTestConfig{
+		name:         "SoakChurn",
+		workload:     soakWorkloadName,
+		yamlPath:     soakYAMLPath,
+		expectedPods: cfg.peakPods(),
+		pcsCount:     defaultScalePCSCount,
+		workerNodes:  soakWorkerNodes,
+		timeout:      soakTimeout,
+		pollInterval: defaultScalePollInterval,
+	}, func(tracker *measurement.TimelineTracker, tc *testctx.TestContext, _ string) {
+		addSoakPhases(tracker, tc, cfg)
+		addFinalCheckPhase(tracker, tc, cfg)
+	})
+}
+
+// addSoakPhases adds the deploy phase plus N churn cycles to the tracker. Each
+// cycle contributes four phases: scale-up, hold-peak, scale-down, hold-base.
+func addSoakPhases(tracker *measurement.TimelineTracker, tc *testctx.TestContext, cfg soakConfig) {
+	tracker.AddPhase(measurement.PhaseDefinition{
+		Name: "deploy",
+		ActionFn: func(ctx context.Context) error {
+			_, err := resources.NewResourceManager(tc.Client, Logger).ApplyYAMLFile(ctx, tc.Workload.YAMLPath, tc.Namespace)
+			return err
+		},
+		Milestones: []measurement.MilestoneDefinition{
+			{
+				Name: "base-pods-ready",
+				Condition: &condition.PodsReadyCondition{
+					Client:        tc.Client.Client,
+					Namespace:     tc.Namespace,
+					LabelSelector: tc.GetLabelSelector(),
+					ExpectedCount: cfg.basePods(),
+				},
+			},
+		},
+	})
+
+	for cycle := 1; cycle <= cfg.cycles; cycle++ {
+		addCyclePhases(tracker, tc, cfg, cycle)
+	}
+}
+
+// addCyclePhases registers the four phases of a single churn cycle. Phase
+// names are suffixed with the cycle index so per-cycle costs are visible in
+// the exported timeline.
+func addCyclePhases(tracker *measurement.TimelineTracker, tc *testctx.TestContext, cfg soakConfig, cycle int) {
+	wm := workload.NewWorkloadManager(tc.Client, Logger)
+
+	tracker.AddPhase(measurement.PhaseDefinition{
+		Name: fmt.Sprintf("scale-up-c%d", cycle),
+		ActionFn: func(ctx context.Context) error {
+			Logger.Infof("cycle %d: scaling %s %d → %d PCS replicas", cycle, tc.Workload.Name, cfg.base, cfg.peak)
+			return wm.ScalePCS(ctx, tc.Namespace, tc.Workload.Name, cfg.peak)
+		},
+		Milestones: []measurement.MilestoneDefinition{
+			{
+				Name: "peak-pods-ready",
+				Condition: &condition.PodsReadyCondition{
+					Client:        tc.Client.Client,
+					Namespace:     tc.Namespace,
+					LabelSelector: tc.GetLabelSelector(),
+					ExpectedCount: cfg.peakPods(),
+				},
+			},
+		},
+	})
+
+	tracker.AddPhase(measurement.PhaseDefinition{
+		Name:     fmt.Sprintf("hold-peak-c%d", cycle),
+		ActionFn: func(_ context.Context) error { return nil },
+		Milestones: []measurement.MilestoneDefinition{
+			{
+				Name:      "peak-hold-elapsed",
+				Condition: &condition.TimerCondition{Duration: soakPerCycleHold},
+			},
+		},
+	})
+
+	tracker.AddPhase(measurement.PhaseDefinition{
+		Name: fmt.Sprintf("scale-down-c%d", cycle),
+		ActionFn: func(ctx context.Context) error {
+			Logger.Infof("cycle %d: scaling %s %d → %d PCS replicas", cycle, tc.Workload.Name, cfg.peak, cfg.base)
+			return wm.ScalePCS(ctx, tc.Namespace, tc.Workload.Name, cfg.base)
+		},
+		Milestones: []measurement.MilestoneDefinition{
+			{
+				Name: "base-pods-restored",
+				Condition: &condition.PodsAtCountCondition{
+					Client:        tc.Client.Client,
+					Namespace:     tc.Namespace,
+					LabelSelector: tc.GetLabelSelector(),
+					ExpectedCount: cfg.basePods(),
+				},
+			},
+		},
+	})
+
+	tracker.AddPhase(measurement.PhaseDefinition{
+		Name:     fmt.Sprintf("hold-base-c%d", cycle),
+		ActionFn: func(_ context.Context) error { return nil },
+		Milestones: []measurement.MilestoneDefinition{
+			{
+				Name:      "base-hold-elapsed",
+				Condition: &condition.TimerCondition{Duration: soakPerCycleHold},
+			},
+		},
+	})
+}
+
+// addFinalCheckPhase appends a synchronous final-check phase to the timeline.
+// The phase must run before runScaleTest's deferred cleanup deletes the PCS,
+// which is why it lives inside the timeline rather than after tracker.Wait().
+// The action runs the end-state assertions and returns an error to fail the
+// phase (and the test) if any invariant is violated.
+func addFinalCheckPhase(tracker *measurement.TimelineTracker, tc *testctx.TestContext, cfg soakConfig) {
+	tracker.AddPhase(measurement.PhaseDefinition{
+		Name: "final-check",
+		ActionFn: func(ctx context.Context) error {
+			return runSoakFinalChecks(ctx, tc, cfg)
+		},
+	})
+}
+
+// runSoakFinalChecks asserts end-state invariants. Returns an error rather
+// than calling t.Fatalf so it can be invoked from a phase ActionFn; the
+// tracker propagates the error into a test failure.
+func runSoakFinalChecks(ctx context.Context, tc *testctx.TestContext, cfg soakConfig) error {
+	// 1. Live pod count is back at base.
+	pods, err := tc.ListPods()
+	if err != nil {
+		return fmt.Errorf("list pods: %w", err)
+	}
+	if got, want := len(pods.Items), cfg.basePods(); got != want {
+		return fmt.Errorf("live pod count = %d, want %d (potential leak)", got, want)
+	}
+
+	// 2. Fetch the PCS once for the next two checks.
+	pcs := &grovecorev1alpha1.PodCliqueSet{}
+	if err := tc.Client.Client.Get(ctx, client.ObjectKey{Namespace: tc.Namespace, Name: soakWorkloadName}, pcs); err != nil {
+		return fmt.Errorf("get PCS: %w", err)
+	}
+
+	// 3. UpdateProgress counters stay bounded (the regression #567 protects against).
+	if up := pcs.Status.UpdateProgress; up != nil {
+		if up.UpdatedPodCliquesCount > up.TotalPodCliquesCount {
+			return fmt.Errorf("UpdatedPodCliquesCount (%d) > TotalPodCliquesCount (%d)",
+				up.UpdatedPodCliquesCount, up.TotalPodCliquesCount)
+		}
+		if up.UpdatedPodCliqueScalingGroupsCount > up.TotalPodCliqueScalingGroupsCount {
+			return fmt.Errorf("UpdatedPodCliqueScalingGroupsCount (%d) > TotalPodCliqueScalingGroupsCount (%d)",
+				up.UpdatedPodCliqueScalingGroupsCount, up.TotalPodCliqueScalingGroupsCount)
+		}
+	}
+
+	// 4. lastErrors is empty — any error accumulated across N cycles is a real bug.
+	if len(pcs.Status.LastErrors) > 0 {
+		return fmt.Errorf("%d LastErrors on PCS after %d cycles: %+v",
+			len(pcs.Status.LastErrors), cfg.cycles, pcs.Status.LastErrors)
+	}
+
+	return nil
+}
diff --git a/operator/e2e/yaml/soak-churn.yaml b/operator/e2e/yaml/soak-churn.yaml
new file mode 100644
index 000000000..83d472d30
--- /dev/null
+++ b/operator/e2e/yaml/soak-churn.yaml
@@ -0,0 +1,43 @@
+# Soak / Churn Benchmark workload
+# Boots a small PCS at the soak base (25 replicas × 2 pods/clique = 50 pods).
+# The test then drives repeated scale-up → hold → scale-down → hold cycles.
+# Sized small on purpose: the soak test is about reconcile churn under load,
+# not absolute capacity.
+---
+apiVersion: grove.io/v1alpha1
+kind: PodCliqueSet
+metadata:
+  name: soak-churn
+  labels:
+    app: soak-churn
+spec:
+  replicas: 25
+  template:
+    cliques:
+      - name: expert-worker
+        spec:
+          roleName: expert
+          replicas: 2
+          minAvailable: 2
+          podSpec:
+            schedulerName: default-scheduler
+            affinity:
+              nodeAffinity:
+                requiredDuringSchedulingIgnoredDuringExecution:
+                  nodeSelectorTerms:
+                    - matchExpressions:
+                        - key: type
+                          operator: In
+                          values:
+                            - kwok
+            tolerations:
+              - key: node_role.e2e.grove.nvidia.com
+                operator: Equal
+                value: agent
+                effect: NoSchedule
+            containers:
+              - name: expert-worker
+                image: registry:5001/nginx:alpine-slim
+                resources:
+                  requests:
+                    memory: 10Mi