diff --git a/operator/Makefile b/operator/Makefile index c341ee150..45a4ecbe8 100644 --- a/operator/Makefile +++ b/operator/Makefile @@ -222,6 +222,24 @@ run-scale-test: @echo "> Running scale tests..." @cd e2e && go test -count=1 -tags=e2e ./tests/scale/... -v -timeout 40m +# Run the soak / churn benchmark against an existing cluster. +# Gated behind the `soak` build tag so it does NOT run as part of +# run-e2e or run-scale-test. Long-running (~30 min default). +# +# Usage: make run-soak-test [DIAG_DIR=] [DIAG_MODE=] \ +# [SOAK_BASE=] [SOAK_PEAK=] [SOAK_CYCLES=] +# Examples: +# make run-soak-test # Defaults: 25 → 50 PCS, 10 cycles +# make run-soak-test SOAK_CYCLES=3 # Shorter run for quick validation +# make run-soak-test DIAG_DIR=/tmp/soak-baseline # Capture artifacts for later comparison +.PHONY: run-soak-test +run-soak-test: export GROVE_E2E_DIAG_DIR = $(DIAG_DIR) +run-soak-test: export GROVE_E2E_DIAG_MODE = $(DIAG_MODE) +run-soak-test: + @echo "> Running soak / churn benchmark (long-running, on-demand)..." + @cd e2e && SOAK_BASE=$(SOAK_BASE) SOAK_PEAK=$(SOAK_PEAK) SOAK_CYCLES=$(SOAK_CYCLES) \ + go test -count=1 -tags='e2e soak' ./tests/scale/... -v -timeout 60m -run Test_SoakChurn + # Make targets for local development and testing # ------------------------------------------------------------- # Starts a local k8s cluster using kind. diff --git a/operator/e2e/measurement/condition/pod.go b/operator/e2e/measurement/condition/pod.go index f3f22669c..271845264 100644 --- a/operator/e2e/measurement/condition/pod.go +++ b/operator/e2e/measurement/condition/pod.go @@ -135,3 +135,38 @@ func (c *PodsReadyCondition) Met(ctx context.Context) (bool, error) { func (c *PodsReadyCondition) Progress(_ context.Context) string { return fmt.Sprintf("%d/%d pods ready", c.lastReady, c.ExpectedCount) } + +// PodsAtCountCondition fires when the live pod count drops to ExpectedCount or +// below. Intended for scale-down milestones where PodsCreatedCondition (≥-only) +// would fire immediately because the starting count already exceeds the target. +// Using ≤ rather than == makes the condition robust to a transient overshoot +// during cascade-delete where two consecutive polls might skip the exact target. +type PodsAtCountCondition struct { + Client client.Client + Namespace string + LabelSelector string + ExpectedCount int + lastCount int + sel parsedSelector +} + +// Met returns true once the live pod count is ≤ ExpectedCount. +func (c *PodsAtCountCondition) Met(ctx context.Context) (bool, error) { + if c.ExpectedCount < 0 { + return false, errors.New("expected count cannot be negative") + } + + c.sel.init(c.LabelSelector) + pods, err := listPods(ctx, c.Client, c.Namespace, &c.sel) + if err != nil { + return false, err + } + + c.lastCount = len(pods) + return c.lastCount <= c.ExpectedCount, nil +} + +// Progress returns a human-readable progress string. +func (c *PodsAtCountCondition) Progress(_ context.Context) string { + return fmt.Sprintf("%d pods (target ≤%d)", c.lastCount, c.ExpectedCount) +} diff --git a/operator/e2e/tests/scale/soak_churn.md b/operator/e2e/tests/scale/soak_churn.md new file mode 100644 index 000000000..429fa563e --- /dev/null +++ b/operator/e2e/tests/scale/soak_churn.md @@ -0,0 +1,175 @@ +# Soak / Churn Benchmark + +## GitHub Issue + +Use the **Enhancement** template (`.github/ISSUE_TEMPLATE/ENHANCEMENT.yaml`). +The fields below are ready to paste into the form. + +**Title** + +> Add long-running soak / churn benchmark to e2e scale tests + +**Labels** + +`enhancement`, `area/testing`, `area/operator` + +--- + +### What you would like to be added? + +Add a long-running benchmark under `operator/e2e/tests/scale/` that +boots a small `PodCliqueSet` (~100 pods) and then drives repeated +scale-up / scale-down cycles against it over an extended duration. +The goal is to surface bugs that only appear after many incremental +reconciles — leaks, monotonically growing slices, gradually drifting +counters, finalizer pile-ups — that single-shot benchmarks cannot see. + +**Scope (single test, parameterized):** + +| Parameter | Default | Env override | Rationale | +|---|---|---|---| +| Base PCS replicas | 25 (50 pods) | `SOAK_BASE` | Small enough to fit dev clusters; large enough to exercise multi-child reconciles. | +| Peak PCS replicas | 50 (100 pods) | `SOAK_PEAK` | 2x amplitude per cycle. | +| Cycles | 10 | `SOAK_CYCLES` | Enough per-cycle iterations to surface slow drift; bounded total runtime. | +| Per-cycle hold | ~30 s | — | Lets watch events flush and pprof capture overlap consecutive no-op reconciles. | +| Worker nodes | 30 kwok nodes | — | Same fixture as scale-up/down tests. | +| Timeout | 45 min | — | Soft cap; expected runtime ~25–30 min. | + +**Deliverables:** + +- `operator/e2e/tests/scale/soak_test.go`. +- `operator/e2e/yaml/soak-churn.yaml` (PCS sized at base; ~50 pods). +- Reuse `runScaleTest`, `ScalePCS`, `PodsCreatedCondition`, + `TimerCondition`. The same scale-down milestone gap noted in the + scale-up-down issue applies here (`PodsAtCountCondition` is + prerequisite). +- Final-check phase asserting no leaks/orphans/unbounded counters at + the end of the run. + +**Cycle shape (executed N times):** + +1. **scale-up** — patch `spec.replicas` base → peak. Wait until live + pod count reaches `peak * podsPerReplica`. +2. **hold-peak** — 30 s window. +3. **scale-down** — patch `spec.replicas` peak → base. Wait until live + pod count reaches `base * podsPerReplica`. +4. **hold-base** — 30 s window. + +**Final-check assertions:** + +- Live pod count equals base target (no leaked pods). +- No `PodCliqueScalingGroup` or `PodClique` exists outside the expected + child set. +- `status.updateProgress` (if present) has bounded counters: + `UpdatedPodCliquesCount ≤ TotalPodCliquesCount`. +- `status.lastErrors` is empty. + +--- + +### Why is this needed? + +Today's scale benchmarks run for minutes and measure a single event +(deploy, resize, delete). They are good at catching gross regressions +in per-event throughput, but they pass cleanly on a controller that is +slowly leaking heap, accumulating stale entries in an informer index, +or letting one status field drift by an item per reconcile. + +Issues we have personally been bitten by, and that a soak/churn test +**would catch**: + +- **#567 — unbounded slices in status.** The original + `UpdatedPodCliques` / `UpdatedPodCliqueScalingGroups` slices grew + with every reconcile. A single-shot benchmark looked fine; a soak + test would have flagged it within an hour. +- **Finalizer / cascade-delete drift.** Repeated scale-down cycles + exercise the partial-delete path many times. Any reconcile that + forgets to clear an expectation, or that re-emits a finalizer, will + accumulate across cycles. +- **Watch-event amplification.** Status mutators that fire a Patch on + byte-identical status (no real change) waste a write per reconcile + and trigger a cascade of watch events. The equality-short-circuit + guards added in #567 prevent this, but only a churn test exercises + the steady-state-Patch-suppression path enough to verify it. +- **Heap growth without a leak.** Even without an obvious bug, churn + reveals whether the controller's working-set memory plateaus or + keeps climbing. A pprof window over the second half of the run is + the cheapest way to spot a slow trend. + +--- + +## Design Detail + +The sections below are for the implementation PR — they are not part of +the GitHub issue body. + +### Goals + +- Drive ~10 scale-up / scale-down cycles against a single PCS over a + duration long enough to expose slow leaks (target: 30–60 min). +- Run on the smaller dev fixture (≤ 30 worker nodes, ~100 pods peak). + The point is reconcile churn, not absolute capacity. +- Reuse the existing `runScaleTest` scaffolding and pprof hook so the + output format and capture mechanism match the rest of the suite. +- Assert at the end that the PCS reaches its final steady state + cleanly — no orphaned children, counters bounded, no unexpected + `lastErrors`. + +### Non-Goals + +- Multi-PCS / multi-tenant soak. Single-PCS churn is enough to surface + controller-internal leaks; multi-tenant belongs in a separate test. +- Failure-injection (operator restarts, API server flakes). The soak + test is for steady-state churn; chaos is out of scope. +- Comparison against historical runs. The output is captured in the + same JSON format as other scale tests, but no regression gate is + defined here. + +### Test Shape + +Three top-level phases: + +1. **deploy** — apply the YAML at base; wait for `pods-ready`. +2. **churn-cycles** — loop N times executing the cycle shape above. + Each cycle adds milestones to the timeline so the per-cycle cost is + visible in the output. +3. **soak-final-check** — assertions listed in the issue body; if any + fails the test fails. + +There is no explicit `delete` phase — the test fixture cleanup tears +the PCS down at the end. (If we want a delete-time measurement, add +it; otherwise keep the scope tight.) + +### Implementation Notes + +- New file: `operator/e2e/tests/scale/soak_test.go`. +- New YAML fixture: `operator/e2e/yaml/soak-churn.yaml`. PCS sized at + the base (25 replicas, 50 pods) so the deploy phase is short and the + churn phases dominate the timeline. +- Per-cycle phases can be added programmatically in a loop inside the + test's `addPhases` callback — each iteration calls + `tracker.AddPhase` four times with unique names suffixed by the + cycle index (e.g. `scale-up-c3`). +- Reuses existing `ScalePCS`, `PodsCreatedCondition`, and + `TimerCondition`. **Same caveat as the scale-down doc:** the + scale-down legs need a "live pod count drops to N" condition that + `PodsCreatedCondition` (which is monotonic-up) does not provide. +- The pprof hook in `setupPprofHook` is already per-run; one capture + spans the whole soak. If we want per-cycle pprof captures, that's a + follow-up. +- Make cycle count and amplitude env-tunable (`SOAK_CYCLES`, + `SOAK_BASE`, `SOAK_PEAK`) so the same test can run at small scale in + CI and at larger scale on dev clusters without a new test function. + +### Open Questions + +- Should the soak test be tagged with a build tag stricter than `e2e` + (e.g. `e2e_soak`) so it isn't pulled into the default suite by + accident? The 30-min default runtime is too long for routine CI. +- Where should the final-check assertions live? Adding them as + measurement conditions keeps the timeline coherent; embedding them + as `t.Fatalf` checks after `tracker.Wait()` is simpler but breaks + the "everything is a phase" pattern. +- Do we want a heap-comparison assertion (e.g. compare pprof heap + samples between cycle 1 and cycle N, fail if delta exceeds a + threshold)? Useful but stateful — defer until the first real + regression motivates it. diff --git a/operator/e2e/tests/scale/soak_test.go b/operator/e2e/tests/scale/soak_test.go new file mode 100644 index 000000000..5aba8842d --- /dev/null +++ b/operator/e2e/tests/scale/soak_test.go @@ -0,0 +1,257 @@ +//go:build e2e && soak + +// /* +// Copyright 2026 The Grove Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// */ + +package scale + +import ( + "context" + "fmt" + "os" + "strconv" + "testing" + "time" + + grovecorev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1" + "github.com/ai-dynamo/grove/operator/e2e/grove/workload" + "github.com/ai-dynamo/grove/operator/e2e/k8s/resources" + "github.com/ai-dynamo/grove/operator/e2e/measurement" + "github.com/ai-dynamo/grove/operator/e2e/measurement/condition" + "github.com/ai-dynamo/grove/operator/e2e/testctx" + + "sigs.k8s.io/controller-runtime/pkg/client" +) + +const ( + soakTimeout = 60 * time.Minute + soakWorkerNodes = 30 + soakPerCycleHold = 30 * time.Second + soakPodsPerCLQ = 2 + + soakDefaultBase = 25 + soakDefaultPeak = 50 + soakDefaultCycles = 10 + + soakWorkloadName = "soak-churn" + soakYAMLPath = "../../yaml/soak-churn.yaml" +) + +// soakConfig is the resolved cycle configuration. Read once from env at the +// top of the test so a failing parse fails the test immediately rather than +// midway through a 30-minute run. +type soakConfig struct { + base int + peak int + cycles int +} + +func loadSoakConfig() soakConfig { + return soakConfig{ + base: envInt("SOAK_BASE", soakDefaultBase), + peak: envInt("SOAK_PEAK", soakDefaultPeak), + cycles: envInt("SOAK_CYCLES", soakDefaultCycles), + } +} + +func envInt(key string, def int) int { + raw := os.Getenv(key) + if raw == "" { + return def + } + n, err := strconv.Atoi(raw) + if err != nil || n <= 0 { + return def + } + return n +} + +func (c soakConfig) basePods() int { return c.base * soakPodsPerCLQ } +func (c soakConfig) peakPods() int { return c.peak * soakPodsPerCLQ } + +// Test_SoakChurn drives repeated scale-up / scale-down cycles against a single +// small PCS to surface bugs that only appear after many incremental reconciles +// — leaks, monotonically growing fields, gradually drifting counters, +// finalizer pile-ups. Gated behind the `soak` build tag so it does not run as +// part of the default e2e suite. See operator/e2e/tests/scale/soak_churn.md. +func Test_SoakChurn(t *testing.T) { + cfg := loadSoakConfig() + if cfg.peak <= cfg.base { + t.Fatalf("SOAK_PEAK (%d) must be > SOAK_BASE (%d)", cfg.peak, cfg.base) + } + + runScaleTest(t, scaleTestConfig{ + name: "SoakChurn", + workload: soakWorkloadName, + yamlPath: soakYAMLPath, + expectedPods: cfg.peakPods(), + pcsCount: defaultScalePCSCount, + workerNodes: soakWorkerNodes, + timeout: soakTimeout, + pollInterval: defaultScalePollInterval, + }, func(tracker *measurement.TimelineTracker, tc *testctx.TestContext, _ string) { + addSoakPhases(tracker, tc, cfg) + addFinalCheckPhase(tracker, tc, cfg) + }) +} + +// addSoakPhases adds the deploy phase plus N churn cycles to the tracker. Each +// cycle contributes four phases: scale-up, hold-peak, scale-down, hold-base. +func addSoakPhases(tracker *measurement.TimelineTracker, tc *testctx.TestContext, cfg soakConfig) { + tracker.AddPhase(measurement.PhaseDefinition{ + Name: "deploy", + ActionFn: func(ctx context.Context) error { + _, err := resources.NewResourceManager(tc.Client, Logger).ApplyYAMLFile(ctx, tc.Workload.YAMLPath, tc.Namespace) + return err + }, + Milestones: []measurement.MilestoneDefinition{ + { + Name: "base-pods-ready", + Condition: &condition.PodsReadyCondition{ + Client: tc.Client.Client, + Namespace: tc.Namespace, + LabelSelector: tc.GetLabelSelector(), + ExpectedCount: cfg.basePods(), + }, + }, + }, + }) + + for cycle := 1; cycle <= cfg.cycles; cycle++ { + addCyclePhases(tracker, tc, cfg, cycle) + } +} + +// addCyclePhases registers the four phases of a single churn cycle. Phase +// names are suffixed with the cycle index so per-cycle costs are visible in +// the exported timeline. +func addCyclePhases(tracker *measurement.TimelineTracker, tc *testctx.TestContext, cfg soakConfig, cycle int) { + wm := workload.NewWorkloadManager(tc.Client, Logger) + + tracker.AddPhase(measurement.PhaseDefinition{ + Name: fmt.Sprintf("scale-up-c%d", cycle), + ActionFn: func(ctx context.Context) error { + Logger.Infof("cycle %d: scaling %s %d → %d PCS replicas", cycle, tc.Workload.Name, cfg.base, cfg.peak) + return wm.ScalePCS(ctx, tc.Namespace, tc.Workload.Name, cfg.peak) + }, + Milestones: []measurement.MilestoneDefinition{ + { + Name: "peak-pods-ready", + Condition: &condition.PodsReadyCondition{ + Client: tc.Client.Client, + Namespace: tc.Namespace, + LabelSelector: tc.GetLabelSelector(), + ExpectedCount: cfg.peakPods(), + }, + }, + }, + }) + + tracker.AddPhase(measurement.PhaseDefinition{ + Name: fmt.Sprintf("hold-peak-c%d", cycle), + ActionFn: func(_ context.Context) error { return nil }, + Milestones: []measurement.MilestoneDefinition{ + { + Name: "peak-hold-elapsed", + Condition: &condition.TimerCondition{Duration: soakPerCycleHold}, + }, + }, + }) + + tracker.AddPhase(measurement.PhaseDefinition{ + Name: fmt.Sprintf("scale-down-c%d", cycle), + ActionFn: func(ctx context.Context) error { + Logger.Infof("cycle %d: scaling %s %d → %d PCS replicas", cycle, tc.Workload.Name, cfg.peak, cfg.base) + return wm.ScalePCS(ctx, tc.Namespace, tc.Workload.Name, cfg.base) + }, + Milestones: []measurement.MilestoneDefinition{ + { + Name: "base-pods-restored", + Condition: &condition.PodsAtCountCondition{ + Client: tc.Client.Client, + Namespace: tc.Namespace, + LabelSelector: tc.GetLabelSelector(), + ExpectedCount: cfg.basePods(), + }, + }, + }, + }) + + tracker.AddPhase(measurement.PhaseDefinition{ + Name: fmt.Sprintf("hold-base-c%d", cycle), + ActionFn: func(_ context.Context) error { return nil }, + Milestones: []measurement.MilestoneDefinition{ + { + Name: "base-hold-elapsed", + Condition: &condition.TimerCondition{Duration: soakPerCycleHold}, + }, + }, + }) +} + +// addFinalCheckPhase appends a synchronous final-check phase to the timeline. +// The phase must run before runScaleTest's deferred cleanup deletes the PCS, +// which is why it lives inside the timeline rather than after tracker.Wait(). +// The action runs the end-state assertions and returns an error to fail the +// phase (and the test) if any invariant is violated. +func addFinalCheckPhase(tracker *measurement.TimelineTracker, tc *testctx.TestContext, cfg soakConfig) { + tracker.AddPhase(measurement.PhaseDefinition{ + Name: "final-check", + ActionFn: func(ctx context.Context) error { + return runSoakFinalChecks(ctx, tc, cfg) + }, + }) +} + +// runSoakFinalChecks asserts end-state invariants. Returns an error rather +// than calling t.Fatalf so it can be invoked from a phase ActionFn; the +// tracker propagates the error into a test failure. +func runSoakFinalChecks(ctx context.Context, tc *testctx.TestContext, cfg soakConfig) error { + // 1. Live pod count is back at base. + pods, err := tc.ListPods() + if err != nil { + return fmt.Errorf("list pods: %w", err) + } + if got, want := len(pods.Items), cfg.basePods(); got != want { + return fmt.Errorf("live pod count = %d, want %d (potential leak)", got, want) + } + + // 2. Fetch the PCS once for the next two checks. + pcs := &grovecorev1alpha1.PodCliqueSet{} + if err := tc.Client.Client.Get(ctx, client.ObjectKey{Namespace: tc.Namespace, Name: soakWorkloadName}, pcs); err != nil { + return fmt.Errorf("get PCS: %w", err) + } + + // 3. UpdateProgress counters stay bounded (the regression #567 protects against). + if up := pcs.Status.UpdateProgress; up != nil { + if up.UpdatedPodCliquesCount > up.TotalPodCliquesCount { + return fmt.Errorf("UpdatedPodCliquesCount (%d) > TotalPodCliquesCount (%d)", + up.UpdatedPodCliquesCount, up.TotalPodCliquesCount) + } + if up.UpdatedPodCliqueScalingGroupsCount > up.TotalPodCliqueScalingGroupsCount { + return fmt.Errorf("UpdatedPodCliqueScalingGroupsCount (%d) > TotalPodCliqueScalingGroupsCount (%d)", + up.UpdatedPodCliqueScalingGroupsCount, up.TotalPodCliqueScalingGroupsCount) + } + } + + // 4. lastErrors is empty — any error accumulated across N cycles is a real bug. + if len(pcs.Status.LastErrors) > 0 { + return fmt.Errorf("%d LastErrors on PCS after %d cycles: %+v", + len(pcs.Status.LastErrors), cfg.cycles, pcs.Status.LastErrors) + } + + return nil +} diff --git a/operator/e2e/yaml/soak-churn.yaml b/operator/e2e/yaml/soak-churn.yaml new file mode 100644 index 000000000..83d472d30 --- /dev/null +++ b/operator/e2e/yaml/soak-churn.yaml @@ -0,0 +1,43 @@ +# Soak / Churn Benchmark workload +# Boots a small PCS at the soak base (25 replicas × 2 pods/clique = 50 pods). +# The test then drives repeated scale-up → hold → scale-down → hold cycles. +# Sized small on purpose: the soak test is about reconcile churn under load, +# not absolute capacity. +--- +apiVersion: grove.io/v1alpha1 +kind: PodCliqueSet +metadata: + name: soak-churn + labels: + app: soak-churn +spec: + replicas: 25 + template: + cliques: + - name: expert-worker + spec: + roleName: expert + replicas: 2 + minAvailable: 2 + podSpec: + schedulerName: default-scheduler + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: type + operator: In + values: + - kwok + tolerations: + - key: node_role.e2e.grove.nvidia.com + operator: Equal + value: agent + effect: NoSchedule + containers: + - name: expert-worker + image: registry:5001/nginx:alpine-slim + resources: + requests: + memory: 10Mi