operator/Makefile: 8 additions & 2 deletions
@@ -214,13 +214,19 @@ scale-cluster-down:
 	@$(MODULE_HACK_DIR)/infra-manager.py delete k3d-cluster
 
 # Run scale tests against an existing cluster
-# Usage: make run-scale-test [DIAG_DIR=<path>] [DIAG_MODE=<mode>]
+# Usage: make run-scale-test [TEST_PATTERN=<pattern>] [DIAG_DIR=<path>] [DIAG_MODE=<mode>]
+# Examples:
+#   make run-scale-test                                      # Run all scale tests
+#   make run-scale-test TEST_PATTERN=Test_ScaleUp            # Run all scale-up variants
+#   make run-scale-test TEST_PATTERN=Test_ScaleDown          # Run all scale-down variants
+#   make run-scale-test TEST_PATTERN='Test_Scale(Up|Down)'   # Run both directions
+#   make run-scale-test TEST_PATTERN=Test_ScaleUp_Tiny       # Run the sanity test only
 .PHONY: run-scale-test
 run-scale-test: export GROVE_E2E_DIAG_DIR = $(DIAG_DIR)
 run-scale-test: export GROVE_E2E_DIAG_MODE = $(DIAG_MODE)
 run-scale-test:
 	@echo "> Running scale tests..."
-	@cd e2e && go test -count=1 -tags=e2e ./tests/scale/... -v -timeout 40m
+	@cd e2e && go test -count=1 -tags=e2e ./tests/scale/... -v -timeout 40m $(if $(TEST_PATTERN),-run $(TEST_PATTERN))
 
 # Make targets for local development and testing
 # -------------------------------------------------------------
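For reference, go test's -run flag treats the pattern as an unanchored regular expression matched against each top-level test name, which is why the prefix patterns in the Makefile examples select whole families of variants. A minimal Go sketch of that selection, using test names from this PR (the program itself is illustrative only):

package main

import (
	"fmt"
	"regexp"
)

func main() {
	// go test -run matches its argument as an unanchored regexp against each
	// top-level test name, so a prefix such as Test_ScaleDown selects every
	// scale-down variant below.
	names := []string{
		"Test_ScaleUp_Tiny",
		"Test_ScaleDown_Tiny",
		"Test_ScaleDown_ToZero",
		"Test_ScaleDown_SmallDelta",
		"Test_ScaleDown_LargeDelta",
	}
	for _, pattern := range []string{"Test_ScaleDown", `Test_Scale(Up|Down)`} {
		re := regexp.MustCompile(pattern)
		for _, name := range names {
			if re.MatchString(name) {
				fmt.Printf("-run %q selects %s\n", pattern, name)
			}
		}
	}
}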
operator/e2e/measurement/condition/pod.go: 35 additions & 0 deletions
@@ -135,3 +135,38 @@ func (c *PodsReadyCondition) Met(ctx context.Context) (bool, error) {
func (c *PodsReadyCondition) Progress(_ context.Context) string {
return fmt.Sprintf("%d/%d pods ready", c.lastReady, c.ExpectedCount)
}

// PodsAtCountCondition fires when the live pod count drops to ExpectedCount or
// below. Intended for scale-down milestones where PodsCreatedCondition (≥-only)
// would fire immediately because the starting count already exceeds the target.
// Using ≤ rather than == keeps the condition robust during cascade delete,
// where deletion can outpace polling and two consecutive polls skip past the
// exact target count without ever observing it.
type PodsAtCountCondition struct {
Client client.Client
Namespace string
LabelSelector string
ExpectedCount int
lastCount int
sel parsedSelector
}

// Met returns true once the live pod count is ≤ ExpectedCount.
func (c *PodsAtCountCondition) Met(ctx context.Context) (bool, error) {
if c.ExpectedCount < 0 {
return false, errors.New("expected count cannot be negative")
}

c.sel.init(c.LabelSelector)
pods, err := listPods(ctx, c.Client, c.Namespace, &c.sel)
if err != nil {
return false, err
}

c.lastCount = len(pods)
return c.lastCount <= c.ExpectedCount, nil
}

// Progress returns a human-readable progress string.
func (c *PodsAtCountCondition) Progress(_ context.Context) string {
return fmt.Sprintf("%d pods (target ≤%d)", c.lastCount, c.ExpectedCount)
}
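The poll loop that drives these conditions is not part of this diff. As a rough sketch of how a milestone runner might call Met and Progress between ticks, assuming a hypothetical waitFor helper and a condition interface shaped like the methods above (not the measurement package's actual API):

package sketch

import (
	"context"
	"fmt"
	"time"
)

// condition mirrors the Met/Progress method shape used by the conditions
// above; the name and the waitFor helper are illustrative assumptions.
type condition interface {
	Met(ctx context.Context) (bool, error)
	Progress(ctx context.Context) string
}

// waitFor polls c at the given interval until it is met, a poll errors, or
// ctx expires; on timeout it surfaces the last progress string, e.g.
// "990 pods (target ≤0)".
func waitFor(ctx context.Context, c condition, interval time.Duration) error {
	ticker := time.NewTicker(interval)
	defer ticker.Stop()
	for {
		select {
		case <-ctx.Done():
			return fmt.Errorf("condition not met: %s: %w", c.Progress(ctx), ctx.Err())
		case <-ticker.C:
			met, err := c.Met(ctx)
			if err != nil {
				return err
			}
			if met {
				return nil
			}
		}
	}
}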
operator/e2e/tests/scale/scale_down_test.go: 199 additions & 0 deletions
@@ -0,0 +1,199 @@
//go:build e2e

// /*
// Copyright 2026 The Grove Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// */

package scale

import (
"context"
"testing"
"time"

"github.com/ai-dynamo/grove/operator/e2e/grove/workload"
"github.com/ai-dynamo/grove/operator/e2e/k8s/resources"
"github.com/ai-dynamo/grove/operator/e2e/measurement"
"github.com/ai-dynamo/grove/operator/e2e/measurement/condition"
"github.com/ai-dynamo/grove/operator/e2e/testctx"
)

const (
scaleDownTimeout = 15 * time.Minute
// scaleDownWorkerNodes matches scaleUpWorkerNodes: ~1100 kwok pods on 30 nodes
// (~37 pods/node) is under the default 110-pod kubelet limit, so these tests
// run on smaller dev clusters.
scaleDownWorkerNodes = 30
)

// scaleDownVariant configures a single scale-down scenario. Each variant boots a
// PCS at the YAML-encoded initialReplicas and then patches spec.replicas to
// targetReplicas so the timeline isolates the marginal scale-down cost.
type scaleDownVariant struct {
name string
workloadName string
yamlPath string
initialPods int
targetReplicas int
targetPods int
// workerNodes overrides scaleDownWorkerNodes for variants that need fewer
// nodes (e.g. the tiny sanity test). Zero means "use the default".
workerNodes int
}

// Test_ScaleDown_Tiny is a sanity-check variant that scales from 5 replicas
// (10 pods) down to 0. It runs the same code paths as the real benchmarks but
// completes in seconds; use it to validate cluster setup and the new
// PodsAtCountCondition before running the 500/1000-pod scenarios.
func Test_ScaleDown_Tiny(t *testing.T) {
runScaleDownTest(t, scaleDownVariant{
name: "ScaleDown_Tiny",
workloadName: "scale-down-tiny",
yamlPath: "../../yaml/scale-down-tiny.yaml",
initialPods: 10,
targetReplicas: 0,
targetPods: 0,
workerNodes: 5,
})
}

// Test_ScaleDown_ToZero scales an existing 500-replica PCS (1000 pods) down to 0.
// Captures the cascade-delete-everything case where every child must be torn down.
func Test_ScaleDown_ToZero(t *testing.T) {
runScaleDownTest(t, scaleDownVariant{
name: "ScaleDown_ToZero",
workloadName: "scale-down-to-zero",
yamlPath: "../../yaml/scale-down-to-zero.yaml",
initialPods: 1000,
targetReplicas: 0,
targetPods: 0,
})
}

// Test_ScaleDown_SmallDelta scales an existing 1100-pod PCS down to 1000 pods
// (a ~9% shrink). Captures the steady-state-with-modest-shrink case.
func Test_ScaleDown_SmallDelta(t *testing.T) {
runScaleDownTest(t, scaleDownVariant{
name: "ScaleDown_SmallDelta",
workloadName: "scale-down-small-delta",
yamlPath: "../../yaml/scale-down-small-delta.yaml",
initialPods: 1100,
targetReplicas: 500,
targetPods: 1000,
})
}

// Test_ScaleDown_LargeDelta halves an existing 1000-pod PCS (to 500 pods).
// Captures the burst-shrink case where the controller has to tear down as many
// replicas as it keeps.
func Test_ScaleDown_LargeDelta(t *testing.T) {
runScaleDownTest(t, scaleDownVariant{
name: "ScaleDown_LargeDelta",
workloadName: "scale-down-large-delta",
yamlPath: "../../yaml/scale-down-large-delta.yaml",
initialPods: 1000,
targetReplicas: 250,
targetPods: 500,
})
}

// runScaleDownTest builds the deploy → scale-down → delete timeline for a variant.
// The deploy phase brings the PCS up at the YAML's initial replica count; the
// scale-down phase is the measurement of interest, and its milestone fires once
// the live pod count drops to the target.
func runScaleDownTest(t *testing.T, v scaleDownVariant) {
workerNodes := scaleDownWorkerNodes
if v.workerNodes > 0 {
workerNodes = v.workerNodes
}
// expectedPods is the upper bound used to size cluster fixtures; the scale-down
// phase shrinks below it.
runScaleTest(t, scaleTestConfig{
name: v.name,
workload: v.workloadName,
yamlPath: v.yamlPath,
expectedPods: v.initialPods,
pcsCount: defaultScalePCSCount,
workerNodes: workerNodes,
timeout: scaleDownTimeout,
pollInterval: defaultScalePollInterval,
}, func(tracker *measurement.TimelineTracker, tc *testctx.TestContext, _ string) {
tracker.AddPhase(measurement.PhaseDefinition{
Name: "deploy",
ActionFn: func(ctx context.Context) error {
_, err := resources.NewResourceManager(tc.Client, Logger).ApplyYAMLFile(ctx, tc.Workload.YAMLPath, tc.Namespace)
return err
},
Milestones: []measurement.MilestoneDefinition{
{
Name: "initial-pods-created",
Condition: &condition.PodsCreatedCondition{
Client: tc.Client.Client,
Namespace: tc.Namespace,
LabelSelector: tc.GetLabelSelector(),
ExpectedCount: v.initialPods,
},
},
{
Name: "initial-pods-ready",
Condition: &condition.PodsReadyCondition{
Client: tc.Client.Client,
Namespace: tc.Namespace,
LabelSelector: tc.GetLabelSelector(),
ExpectedCount: v.initialPods,
},
},
},
})

tracker.AddPhase(measurement.PhaseDefinition{
Name: "scale-down",
ActionFn: func(ctx context.Context) error {
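// In these fixtures each PCS replica owns two pods (e.g. 500 replicas =
// 1000 pods), so initialPods/2 recovers the starting replica count for
// this log line.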
Logger.Infof("scaling %s from %d to %d PCS replicas (target %d pods)",
tc.Workload.Name, v.initialPods/2, v.targetReplicas, v.targetPods)
return workload.NewWorkloadManager(tc.Client, Logger).ScalePCS(ctx, tc.Namespace, tc.Workload.Name, v.targetReplicas)
},
Milestones: []measurement.MilestoneDefinition{
{
Name: "pods-at-target",
Condition: &condition.PodsAtCountCondition{
Client: tc.Client.Client,
Namespace: tc.Namespace,
LabelSelector: tc.GetLabelSelector(),
ExpectedCount: v.targetPods,
},
},
},
})

tracker.AddPhase(measurement.PhaseDefinition{
Name: "delete",
ActionFn: func(ctx context.Context) error {
return workload.NewWorkloadManager(tc.Client, Logger).DeletePCS(ctx, tc.Namespace, tc.Workload.Name)
},
Milestones: []measurement.MilestoneDefinition{
{
Name: "pcs-deleted",
Condition: &condition.PCSAndSubresourcesDeletedCondition{
Client: tc.Client.Client,
Name: tc.Workload.Name,
Namespace: tc.Namespace,
LabelSelector: tc.GetLabelSelector(),
},
},
},
})
})
}
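With the TEST_PATTERN plumbing from the Makefile change above, each new variant can be run individually; for example, the sanity check alone:

make run-scale-test TEST_PATTERN=Test_ScaleDown_Tiny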