diff --git a/operator/Makefile b/operator/Makefile
index c341ee150..5850f12c7 100644
--- a/operator/Makefile
+++ b/operator/Makefile
@@ -214,13 +214,19 @@ scale-cluster-down:
 	@$(MODULE_HACK_DIR)/infra-manager.py delete k3d-cluster
 
 # Run scale tests against an existing cluster
-# Usage: make run-scale-test [DIAG_DIR=] [DIAG_MODE=]
+# Usage: make run-scale-test [TEST_PATTERN=] [DIAG_DIR=] [DIAG_MODE=]
+# Examples:
+#   make run-scale-test                                      # Run all scale tests
+#   make run-scale-test TEST_PATTERN=Test_ScaleUp            # Run all scale-up variants
+#   make run-scale-test TEST_PATTERN=Test_ScaleDown          # Run all scale-down variants
+#   make run-scale-test TEST_PATTERN='Test_Scale(Up|Down)'   # Run both directions
+#   make run-scale-test TEST_PATTERN=Test_ScaleUp_Tiny       # Run the sanity test only
 .PHONY: run-scale-test
 run-scale-test: export GROVE_E2E_DIAG_DIR = $(DIAG_DIR)
 run-scale-test: export GROVE_E2E_DIAG_MODE = $(DIAG_MODE)
 run-scale-test:
 	@echo "> Running scale tests..."
-	@cd e2e && go test -count=1 -tags=e2e ./tests/scale/... -v -timeout 40m
+	@cd e2e && go test -count=1 -tags=e2e ./tests/scale/... -v -timeout 40m $(if $(TEST_PATTERN),-run '$(TEST_PATTERN)')
 
 # Make targets for local development and testing
 # -------------------------------------------------------------
diff --git a/operator/e2e/measurement/condition/pod.go b/operator/e2e/measurement/condition/pod.go
index f3f22669c..271845264 100644
--- a/operator/e2e/measurement/condition/pod.go
+++ b/operator/e2e/measurement/condition/pod.go
@@ -135,3 +135,38 @@ func (c *PodsReadyCondition) Met(ctx context.Context) (bool, error) {
 func (c *PodsReadyCondition) Progress(_ context.Context) string {
 	return fmt.Sprintf("%d/%d pods ready", c.lastReady, c.ExpectedCount)
 }
+
+// PodsAtCountCondition fires when the live pod count drops to ExpectedCount or
+// below. Intended for scale-down milestones where PodsCreatedCondition (≥-only)
+// would fire immediately because the starting count already exceeds the target.
+// Using ≤ rather than == makes the condition robust to a transient overshoot
+// during cascade-delete where two consecutive polls might skip the exact target.
+type PodsAtCountCondition struct {
+	Client        client.Client
+	Namespace     string
+	LabelSelector string
+	ExpectedCount int
+	lastCount     int
+	sel           parsedSelector
+}
+
+// Met returns true once the live pod count is ≤ ExpectedCount.
+func (c *PodsAtCountCondition) Met(ctx context.Context) (bool, error) {
+	if c.ExpectedCount < 0 {
+		return false, errors.New("expected count cannot be negative")
+	}
+
+	c.sel.init(c.LabelSelector)
+	pods, err := listPods(ctx, c.Client, c.Namespace, &c.sel)
+	if err != nil {
+		return false, err
+	}
+
+	c.lastCount = len(pods)
+	return c.lastCount <= c.ExpectedCount, nil
+}
+
+// Progress returns a human-readable progress string.
+func (c *PodsAtCountCondition) Progress(_ context.Context) string {
+	return fmt.Sprintf("%d pods (target ≤%d)", c.lastCount, c.ExpectedCount)
+}
diff --git a/operator/e2e/tests/scale/scale_down_test.go b/operator/e2e/tests/scale/scale_down_test.go
new file mode 100644
index 000000000..44fc82af8
--- /dev/null
+++ b/operator/e2e/tests/scale/scale_down_test.go
@@ -0,0 +1,199 @@
+//go:build e2e
+
+// /*
+// Copyright 2026 The Grove Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// */
+
+package scale
+
+import (
+	"context"
+	"testing"
+	"time"
+
+	"github.com/ai-dynamo/grove/operator/e2e/grove/workload"
+	"github.com/ai-dynamo/grove/operator/e2e/k8s/resources"
+	"github.com/ai-dynamo/grove/operator/e2e/measurement"
+	"github.com/ai-dynamo/grove/operator/e2e/measurement/condition"
+	"github.com/ai-dynamo/grove/operator/e2e/testctx"
+)
+
+const (
+	scaleDownTimeout = 15 * time.Minute
+	// scaleDownWorkerNodes matches scaleUpWorkerNodes: ~1100 kwok pods on 30 nodes
+	// (~37 pods/node) is under the default 110-pod kubelet limit, so these tests
+	// run on smaller dev clusters.
+	scaleDownWorkerNodes = 30
+)
+
+// scaleDownVariant configures a single scale-down scenario. Each variant boots a
+// PCS at the YAML-encoded initialReplicas and then patches spec.replicas to
+// targetReplicas so the timeline isolates the marginal scale-down cost.
+type scaleDownVariant struct {
+	name           string
+	workloadName   string
+	yamlPath       string
+	initialPods    int
+	targetReplicas int
+	targetPods     int
+	// workerNodes overrides scaleDownWorkerNodes for variants that need fewer
+	// nodes (e.g. the tiny sanity test). Zero means "use the default".
+	workerNodes int
+}
+
+// Test_ScaleDown_Tiny is a sanity-check variant that scales from 5 replicas
+// (10 pods) down to 0. It runs the same code paths as the real benchmarks but
+// completes in seconds — use it to validate cluster setup and the new
+// PodsAtCountCondition before running the 500/1000-pod scenarios.
+func Test_ScaleDown_Tiny(t *testing.T) {
+	runScaleDownTest(t, scaleDownVariant{
+		name:           "ScaleDown_Tiny",
+		workloadName:   "scale-down-tiny",
+		yamlPath:       "../../yaml/scale-down-tiny.yaml",
+		initialPods:    10,
+		targetReplicas: 0,
+		targetPods:     0,
+		workerNodes:    5,
+	})
+}
+
+// Test_ScaleDown_ToZero scales an existing 500-replica PCS (1000 pods) down to 0.
+// Captures the cascade-delete-everything case where every child must be torn down.
+func Test_ScaleDown_ToZero(t *testing.T) {
+	runScaleDownTest(t, scaleDownVariant{
+		name:           "ScaleDown_ToZero",
+		workloadName:   "scale-down-to-zero",
+		yamlPath:       "../../yaml/scale-down-to-zero.yaml",
+		initialPods:    1000,
+		targetReplicas: 0,
+		targetPods:     0,
+	})
+}
+
+// Test_ScaleDown_SmallDelta scales an existing 1100-pod PCS by -10% (to 1000 pods).
+// Captures the steady-state-with-modest-shrink case.
+func Test_ScaleDown_SmallDelta(t *testing.T) {
+	runScaleDownTest(t, scaleDownVariant{
+		name:           "ScaleDown_SmallDelta",
+		workloadName:   "scale-down-small-delta",
+		yamlPath:       "../../yaml/scale-down-small-delta.yaml",
+		initialPods:    1100,
+		targetReplicas: 500,
+		targetPods:     1000,
+	})
+}
+
+// Test_ScaleDown_LargeDelta scales an existing 1000-pod PCS by 0.5x (to 500 pods).
+// Captures the burst-shrink case where the controller has to tear down as many
+// replicas as it keeps.
+func Test_ScaleDown_LargeDelta(t *testing.T) {
+	runScaleDownTest(t, scaleDownVariant{
+		name:           "ScaleDown_LargeDelta",
+		workloadName:   "scale-down-large-delta",
+		yamlPath:       "../../yaml/scale-down-large-delta.yaml",
+		initialPods:    1000,
+		targetReplicas: 250,
+		targetPods:     500,
+	})
+}
+
+// runScaleDownTest builds the deploy → scale-down → delete timeline for a variant.
+// The deploy phase brings the PCS up at the YAML's initial replica count; the
+// scale-down phase is the measurement of interest and milestones-out at
+// pod-count-at-target.
+func runScaleDownTest(t *testing.T, v scaleDownVariant) {
+	workerNodes := scaleDownWorkerNodes
+	if v.workerNodes > 0 {
+		workerNodes = v.workerNodes
+	}
+	// expectedPods is the upper bound used to size cluster fixtures; the scale-down
+	// phase shrinks below it.
+	runScaleTest(t, scaleTestConfig{
+		name:         v.name,
+		workload:     v.workloadName,
+		yamlPath:     v.yamlPath,
+		expectedPods: v.initialPods,
+		pcsCount:     defaultScalePCSCount,
+		workerNodes:  workerNodes,
+		timeout:      scaleDownTimeout,
+		pollInterval: defaultScalePollInterval,
+	}, func(tracker *measurement.TimelineTracker, tc *testctx.TestContext, _ string) {
+		tracker.AddPhase(measurement.PhaseDefinition{
+			Name: "deploy",
+			ActionFn: func(ctx context.Context) error {
+				_, err := resources.NewResourceManager(tc.Client, Logger).ApplyYAMLFile(ctx, tc.Workload.YAMLPath, tc.Namespace)
+				return err
+			},
+			Milestones: []measurement.MilestoneDefinition{
+				{
+					Name: "initial-pods-created",
+					Condition: &condition.PodsCreatedCondition{
+						Client:        tc.Client.Client,
+						Namespace:     tc.Namespace,
+						LabelSelector: tc.GetLabelSelector(),
+						ExpectedCount: v.initialPods,
+					},
+				},
+				{
+					Name: "initial-pods-ready",
+					Condition: &condition.PodsReadyCondition{
+						Client:        tc.Client.Client,
+						Namespace:     tc.Namespace,
+						LabelSelector: tc.GetLabelSelector(),
+						ExpectedCount: v.initialPods,
+					},
+				},
+			},
+		})
+
+		tracker.AddPhase(measurement.PhaseDefinition{
+			Name: "scale-down",
+			ActionFn: func(ctx context.Context) error {
+				Logger.Infof("scaling %s from %d to %d PCS replicas (target %d pods)",
+					tc.Workload.Name, v.initialPods/2, v.targetReplicas, v.targetPods)
+				return workload.NewWorkloadManager(tc.Client, Logger).ScalePCS(ctx, tc.Namespace, tc.Workload.Name, v.targetReplicas)
+			},
+			Milestones: []measurement.MilestoneDefinition{
+				{
+					Name: "pods-at-target",
+					Condition: &condition.PodsAtCountCondition{
+						Client:        tc.Client.Client,
+						Namespace:     tc.Namespace,
+						LabelSelector: tc.GetLabelSelector(),
+						ExpectedCount: v.targetPods,
+					},
+				},
+			},
+		})
+
+		tracker.AddPhase(measurement.PhaseDefinition{
+			Name: "delete",
+			ActionFn: func(ctx context.Context) error {
+				return workload.NewWorkloadManager(tc.Client, Logger).DeletePCS(ctx, tc.Namespace, tc.Workload.Name)
+			},
+			Milestones: []measurement.MilestoneDefinition{
+				{
+					Name: "pcs-deleted",
+					Condition: &condition.PCSAndSubresourcesDeletedCondition{
+						Client:        tc.Client.Client,
+						Name:          tc.Workload.Name,
+						Namespace:     tc.Namespace,
+						LabelSelector: tc.GetLabelSelector(),
+					},
+				},
+			},
+		})
+	})
+}
diff --git a/operator/e2e/tests/scale/scale_up_test.go b/operator/e2e/tests/scale/scale_up_test.go
new file mode 100644
index 000000000..fb6da4d0e
--- /dev/null
+++ b/operator/e2e/tests/scale/scale_up_test.go
@@ -0,0 +1,210 @@
+//go:build e2e
+
+// /*
+// Copyright 2026 The Grove Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// */
+
+package scale
+
+import (
+	"context"
+	"testing"
+	"time"
+
+	"github.com/ai-dynamo/grove/operator/e2e/grove/workload"
+	"github.com/ai-dynamo/grove/operator/e2e/k8s/resources"
+	"github.com/ai-dynamo/grove/operator/e2e/measurement"
+	"github.com/ai-dynamo/grove/operator/e2e/measurement/condition"
+	"github.com/ai-dynamo/grove/operator/e2e/testctx"
+)
+
+const (
+	scaleUpTimeout = 15 * time.Minute
+	// scaleUpWorkerNodes is intentionally lower than defaultScaleWorkerNodes (100)
+	// so these tests run on smaller dev clusters. ~1000 kwok pods on 30 nodes
+	// (~33 pods/node) is well under the default 110-pod kubelet limit.
+	scaleUpWorkerNodes = 30
+)
+
+// scaleUpVariant configures a single scale-up scenario. Each variant boots a PCS
+// at initialReplicas (encoded in the YAML) and then patches spec.replicas to
+// targetReplicas so the timeline isolates the marginal scale-up cost.
+type scaleUpVariant struct {
+	name           string
+	workloadName   string
+	yamlPath       string
+	initialPods    int
+	targetReplicas int
+	targetPods     int
+	// workerNodes overrides scaleUpWorkerNodes for variants that need fewer
+	// nodes (e.g. the tiny sanity test). Zero means "use the default".
+	workerNodes int
+}
+
+// Test_ScaleUp_Tiny is a sanity-check variant that scales from 0 to 5 replicas
+// (10 pods). It runs the same code paths as the real benchmarks but completes
+// in seconds — use it to validate cluster setup and test plumbing before
+// running the 500/1000-pod scenarios.
+func Test_ScaleUp_Tiny(t *testing.T) {
+	runScaleUpTest(t, scaleUpVariant{
+		name:           "ScaleUp_Tiny",
+		workloadName:   "scale-up-tiny",
+		yamlPath:       "../../yaml/scale-up-tiny.yaml",
+		initialPods:    0,
+		targetReplicas: 5,
+		targetPods:     10,
+		workerNodes:    5,
+	})
+}
+
+// Test_ScaleUp_FromZero scales an existing PCS from 0 to 500 replicas (1000 pods).
+// Captures the cold-start case where no PCSGs/PodCliques exist yet.
+func Test_ScaleUp_FromZero(t *testing.T) {
+	runScaleUpTest(t, scaleUpVariant{
+		name:           "ScaleUp_FromZero",
+		workloadName:   "scale-up-from-zero",
+		yamlPath:       "../../yaml/scale-up-from-zero.yaml",
+		initialPods:    0,
+		targetReplicas: 500,
+		targetPods:     1000,
+	})
+}
+
+// Test_ScaleUp_SmallDelta scales an existing 1000-pod PCS by +10% (to 1100 pods).
+// Captures the steady-state-with-modest-growth case.
+func Test_ScaleUp_SmallDelta(t *testing.T) {
+	runScaleUpTest(t, scaleUpVariant{
+		name:           "ScaleUp_SmallDelta",
+		workloadName:   "scale-up-small-delta",
+		yamlPath:       "../../yaml/scale-up-small-delta.yaml",
+		initialPods:    1000,
+		targetReplicas: 550,
+		targetPods:     1100,
+	})
+}
+
+// Test_ScaleUp_LargeDelta scales an existing 500-pod PCS by 2x (to 1000 pods).
+// Captures the burst-growth case where the controller has to create as many
+// new replicas as already exist.
+func Test_ScaleUp_LargeDelta(t *testing.T) {
+	runScaleUpTest(t, scaleUpVariant{
+		name:           "ScaleUp_LargeDelta",
+		workloadName:   "scale-up-large-delta",
+		yamlPath:       "../../yaml/scale-up-large-delta.yaml",
+		initialPods:    500,
+		targetReplicas: 500,
+		targetPods:     1000,
+	})
+}
+
+// runScaleUpTest builds the deploy → scale-up → delete timeline for a variant.
+// The deploy phase brings the PCS up at the YAML's initial replica count; the
+// scale-up phase is the measurement of interest and milestones-out at all-pods-ready.
+func runScaleUpTest(t *testing.T, v scaleUpVariant) {
+	workerNodes := scaleUpWorkerNodes
+	if v.workerNodes > 0 {
+		workerNodes = v.workerNodes
+	}
+	runScaleTest(t, scaleTestConfig{
+		name:         v.name,
+		workload:     v.workloadName,
+		yamlPath:     v.yamlPath,
+		expectedPods: v.targetPods,
+		pcsCount:     defaultScalePCSCount,
+		workerNodes:  workerNodes,
+		timeout:      scaleUpTimeout,
+		pollInterval: defaultScalePollInterval,
+	}, func(tracker *measurement.TimelineTracker, tc *testctx.TestContext, _ string) {
+		var deployMilestones []measurement.MilestoneDefinition
+		if v.initialPods > 0 {
+			deployMilestones = append(deployMilestones,
+				measurement.MilestoneDefinition{
+					Name: "initial-pods-created",
+					Condition: &condition.PodsCreatedCondition{
+						Client:        tc.Client.Client,
+						Namespace:     tc.Namespace,
+						LabelSelector: tc.GetLabelSelector(),
+						ExpectedCount: v.initialPods,
+					},
+				},
+				measurement.MilestoneDefinition{
+					Name: "initial-pods-ready",
+					Condition: &condition.PodsReadyCondition{
+						Client:        tc.Client.Client,
+						Namespace:     tc.Namespace,
+						LabelSelector: tc.GetLabelSelector(),
+						ExpectedCount: v.initialPods,
+					},
+				},
+			)
+		}
+
+		tracker.AddPhase(measurement.PhaseDefinition{
+			Name: "deploy",
+			ActionFn: func(ctx context.Context) error {
+				_, err := resources.NewResourceManager(tc.Client, Logger).ApplyYAMLFile(ctx, tc.Workload.YAMLPath, tc.Namespace)
+				return err
+			},
+			Milestones: deployMilestones,
+		})
+
+		tracker.AddPhase(measurement.PhaseDefinition{
+			Name: "scale-up",
+			ActionFn: func(ctx context.Context) error {
+				Logger.Infof("scaling %s from %d to %d PCS replicas (target %d pods)",
+					tc.Workload.Name, v.initialPods/2, v.targetReplicas, v.targetPods)
+				return workload.NewWorkloadManager(tc.Client, Logger).ScalePCS(ctx, tc.Namespace, tc.Workload.Name, v.targetReplicas)
+			},
+			Milestones: []measurement.MilestoneDefinition{
+				{
+					Name: "all-pods-created",
+					Condition: &condition.PodsCreatedCondition{
+						Client:        tc.Client.Client,
+						Namespace:     tc.Namespace,
+						LabelSelector: tc.GetLabelSelector(),
+						ExpectedCount: v.targetPods,
+					},
+				},
+				{
+					Name: "all-pods-ready",
+					Condition: &condition.PodsReadyCondition{
+						Client:        tc.Client.Client,
+						Namespace:     tc.Namespace,
+						LabelSelector: tc.GetLabelSelector(),
+						ExpectedCount: v.targetPods,
+					},
+				},
+			},
+		})
+
+		tracker.AddPhase(measurement.PhaseDefinition{
+			Name: "delete",
+			ActionFn: func(ctx context.Context) error {
+				return workload.NewWorkloadManager(tc.Client, Logger).DeletePCS(ctx, tc.Namespace, tc.Workload.Name)
+			},
+			Milestones: []measurement.MilestoneDefinition{
+				{
+					Name: "pcs-deleted",
+					Condition: &condition.PCSAndSubresourcesDeletedCondition{
+						Client:        tc.Client.Client,
+						Name:          tc.Workload.Name,
+						Namespace:     tc.Namespace,
+						LabelSelector: tc.GetLabelSelector(),
+					},
+				},
+			},
+		})
+	})
+}
diff --git a/operator/e2e/yaml/scale-down-large-delta.yaml b/operator/e2e/yaml/scale-down-large-delta.yaml
new file mode 100644
index 000000000..2fdb6573c
--- /dev/null
+++ b/operator/e2e/yaml/scale-down-large-delta.yaml
@@ -0,0 +1,40 @@
+# Scale Down Test: large-delta variant (-50%)
+# Starts at 500 PCS replicas (1000 pods); the test scales to 250 (500 pods).
+---
+apiVersion: grove.io/v1alpha1
+kind: PodCliqueSet
+metadata:
+  name: scale-down-large-delta
+  labels:
+    app: scale-down-large-delta
+spec:
+  replicas: 500
+  template:
+    cliques:
+      - name: expert-worker
+        spec:
+          roleName: expert
+          replicas: 2
+          minAvailable: 2
+          podSpec:
+            schedulerName: default-scheduler
+            affinity:
+              nodeAffinity:
+                requiredDuringSchedulingIgnoredDuringExecution:
+                  nodeSelectorTerms:
+                    - matchExpressions:
+                        - key: type
+                          operator: In
+                          values:
+                            - kwok
+            tolerations:
+              - key: node_role.e2e.grove.nvidia.com
+                operator: Equal
+                value: agent
+                effect: NoSchedule
+            containers:
+              - name: expert-worker
+                image: registry:5001/nginx:alpine-slim
+                resources:
+                  requests:
+                    memory: 10Mi
diff --git a/operator/e2e/yaml/scale-down-small-delta.yaml b/operator/e2e/yaml/scale-down-small-delta.yaml
new file mode 100644
index 000000000..dd7d240d5
--- /dev/null
+++ b/operator/e2e/yaml/scale-down-small-delta.yaml
@@ -0,0 +1,40 @@
+# Scale Down Test: small-delta variant (-10%)
+# Starts at 550 PCS replicas (1100 pods); the test scales to 500 (1000 pods).
+---
+apiVersion: grove.io/v1alpha1
+kind: PodCliqueSet
+metadata:
+  name: scale-down-small-delta
+  labels:
+    app: scale-down-small-delta
+spec:
+  replicas: 550
+  template:
+    cliques:
+      - name: expert-worker
+        spec:
+          roleName: expert
+          replicas: 2
+          minAvailable: 2
+          podSpec:
+            schedulerName: default-scheduler
+            affinity:
+              nodeAffinity:
+                requiredDuringSchedulingIgnoredDuringExecution:
+                  nodeSelectorTerms:
+                    - matchExpressions:
+                        - key: type
+                          operator: In
+                          values:
+                            - kwok
+            tolerations:
+              - key: node_role.e2e.grove.nvidia.com
+                operator: Equal
+                value: agent
+                effect: NoSchedule
+            containers:
+              - name: expert-worker
+                image: registry:5001/nginx:alpine-slim
+                resources:
+                  requests:
+                    memory: 10Mi
diff --git a/operator/e2e/yaml/scale-down-tiny.yaml b/operator/e2e/yaml/scale-down-tiny.yaml
new file mode 100644
index 000000000..523e61bc0
--- /dev/null
+++ b/operator/e2e/yaml/scale-down-tiny.yaml
@@ -0,0 +1,43 @@
+# Scale Down Test: tiny sanity variant
+# Starts at 5 PCS replicas (10 pods); the test scales to 0.
+# Used to validate cluster + test plumbing end-to-end (including the new
+# PodsAtCountCondition milestone) without paying the cost of the real
+# 500/1000-pod scenarios.
+---
+apiVersion: grove.io/v1alpha1
+kind: PodCliqueSet
+metadata:
+  name: scale-down-tiny
+  labels:
+    app: scale-down-tiny
+spec:
+  replicas: 5
+  template:
+    cliques:
+      - name: expert-worker
+        spec:
+          roleName: expert
+          replicas: 2
+          minAvailable: 2
+          podSpec:
+            schedulerName: default-scheduler
+            affinity:
+              nodeAffinity:
+                requiredDuringSchedulingIgnoredDuringExecution:
+                  nodeSelectorTerms:
+                    - matchExpressions:
+                        - key: type
+                          operator: In
+                          values:
+                            - kwok
+            tolerations:
+              - key: node_role.e2e.grove.nvidia.com
+                operator: Equal
+                value: agent
+                effect: NoSchedule
+            containers:
+              - name: expert-worker
+                image: registry:5001/nginx:alpine-slim
+                resources:
+                  requests:
+                    memory: 10Mi
diff --git a/operator/e2e/yaml/scale-down-to-zero.yaml b/operator/e2e/yaml/scale-down-to-zero.yaml
new file mode 100644
index 000000000..dbc72e961
--- /dev/null
+++ b/operator/e2e/yaml/scale-down-to-zero.yaml
@@ -0,0 +1,40 @@
+# Scale Down Test: to-zero variant
+# Starts at 500 PCS replicas (1000 pods); the test scales to 0.
+---
+apiVersion: grove.io/v1alpha1
+kind: PodCliqueSet
+metadata:
+  name: scale-down-to-zero
+  labels:
+    app: scale-down-to-zero
+spec:
+  replicas: 500
+  template:
+    cliques:
+      - name: expert-worker
+        spec:
+          roleName: expert
+          replicas: 2
+          minAvailable: 2
+          podSpec:
+            schedulerName: default-scheduler
+            affinity:
+              nodeAffinity:
+                requiredDuringSchedulingIgnoredDuringExecution:
+                  nodeSelectorTerms:
+                    - matchExpressions:
+                        - key: type
+                          operator: In
+                          values:
+                            - kwok
+            tolerations:
+              - key: node_role.e2e.grove.nvidia.com
+                operator: Equal
+                value: agent
+                effect: NoSchedule
+            containers:
+              - name: expert-worker
+                image: registry:5001/nginx:alpine-slim
+                resources:
+                  requests:
+                    memory: 10Mi
diff --git a/operator/e2e/yaml/scale-up-from-zero.yaml b/operator/e2e/yaml/scale-up-from-zero.yaml
new file mode 100644
index 000000000..07f7d3e8a
--- /dev/null
+++ b/operator/e2e/yaml/scale-up-from-zero.yaml
@@ -0,0 +1,40 @@
+# Scale Up Test: from-zero variant
+# Starts at replicas: 0; the test scales to 500 PCS replicas (1000 pods).
+---
+apiVersion: grove.io/v1alpha1
+kind: PodCliqueSet
+metadata:
+  name: scale-up-from-zero
+  labels:
+    app: scale-up-from-zero
+spec:
+  replicas: 0
+  template:
+    cliques:
+      - name: expert-worker
+        spec:
+          roleName: expert
+          replicas: 2
+          minAvailable: 2
+          podSpec:
+            schedulerName: default-scheduler
+            affinity:
+              nodeAffinity:
+                requiredDuringSchedulingIgnoredDuringExecution:
+                  nodeSelectorTerms:
+                    - matchExpressions:
+                        - key: type
+                          operator: In
+                          values:
+                            - kwok
+            tolerations:
+              - key: node_role.e2e.grove.nvidia.com
+                operator: Equal
+                value: agent
+                effect: NoSchedule
+            containers:
+              - name: expert-worker
+                image: registry:5001/nginx:alpine-slim
+                resources:
+                  requests:
+                    memory: 10Mi
diff --git a/operator/e2e/yaml/scale-up-large-delta.yaml b/operator/e2e/yaml/scale-up-large-delta.yaml
new file mode 100644
index 000000000..3e901ede0
--- /dev/null
+++ b/operator/e2e/yaml/scale-up-large-delta.yaml
@@ -0,0 +1,40 @@
+# Scale Up Test: large-delta variant (2x)
+# Starts at 250 PCS replicas (500 pods); the test scales to 500 (1000 pods).
+---
+apiVersion: grove.io/v1alpha1
+kind: PodCliqueSet
+metadata:
+  name: scale-up-large-delta
+  labels:
+    app: scale-up-large-delta
+spec:
+  replicas: 250
+  template:
+    cliques:
+      - name: expert-worker
+        spec:
+          roleName: expert
+          replicas: 2
+          minAvailable: 2
+          podSpec:
+            schedulerName: default-scheduler
+            affinity:
+              nodeAffinity:
+                requiredDuringSchedulingIgnoredDuringExecution:
+                  nodeSelectorTerms:
+                    - matchExpressions:
+                        - key: type
+                          operator: In
+                          values:
+                            - kwok
+            tolerations:
+              - key: node_role.e2e.grove.nvidia.com
+                operator: Equal
+                value: agent
+                effect: NoSchedule
+            containers:
+              - name: expert-worker
+                image: registry:5001/nginx:alpine-slim
+                resources:
+                  requests:
+                    memory: 10Mi
diff --git a/operator/e2e/yaml/scale-up-small-delta.yaml b/operator/e2e/yaml/scale-up-small-delta.yaml
new file mode 100644
index 000000000..ede8ccb1d
--- /dev/null
+++ b/operator/e2e/yaml/scale-up-small-delta.yaml
@@ -0,0 +1,40 @@
+# Scale Up Test: small-delta variant (+10%)
+# Starts at 500 PCS replicas (1000 pods); the test scales to 550 (1100 pods).
+---
+apiVersion: grove.io/v1alpha1
+kind: PodCliqueSet
+metadata:
+  name: scale-up-small-delta
+  labels:
+    app: scale-up-small-delta
+spec:
+  replicas: 500
+  template:
+    cliques:
+      - name: expert-worker
+        spec:
+          roleName: expert
+          replicas: 2
+          minAvailable: 2
+          podSpec:
+            schedulerName: default-scheduler
+            affinity:
+              nodeAffinity:
+                requiredDuringSchedulingIgnoredDuringExecution:
+                  nodeSelectorTerms:
+                    - matchExpressions:
+                        - key: type
+                          operator: In
+                          values:
+                            - kwok
+            tolerations:
+              - key: node_role.e2e.grove.nvidia.com
+                operator: Equal
+                value: agent
+                effect: NoSchedule
+            containers:
+              - name: expert-worker
+                image: registry:5001/nginx:alpine-slim
+                resources:
+                  requests:
+                    memory: 10Mi
diff --git a/operator/e2e/yaml/scale-up-tiny.yaml b/operator/e2e/yaml/scale-up-tiny.yaml
new file mode 100644
index 000000000..daaa76313
--- /dev/null
+++ b/operator/e2e/yaml/scale-up-tiny.yaml
@@ -0,0 +1,42 @@
+# Scale Up Test: tiny sanity variant
+# Starts at replicas: 0; the test scales to 5 PCS replicas (10 pods).
+# Used to validate cluster + test plumbing end-to-end without paying the cost
+# of the real 500/1000-pod scenarios.
+---
+apiVersion: grove.io/v1alpha1
+kind: PodCliqueSet
+metadata:
+  name: scale-up-tiny
+  labels:
+    app: scale-up-tiny
+spec:
+  replicas: 0
+  template:
+    cliques:
+      - name: expert-worker
+        spec:
+          roleName: expert
+          replicas: 2
+          minAvailable: 2
+          podSpec:
+            schedulerName: default-scheduler
+            affinity:
+              nodeAffinity:
+                requiredDuringSchedulingIgnoredDuringExecution:
+                  nodeSelectorTerms:
+                    - matchExpressions:
+                        - key: type
+                          operator: In
+                          values:
+                            - kwok
+            tolerations:
+              - key: node_role.e2e.grove.nvidia.com
+                operator: Equal
+                value: agent
+                effect: NoSchedule
+            containers:
+              - name: expert-worker
+                image: registry:5001/nginx:alpine-slim
+                resources:
+                  requests:
+                    memory: 10Mi
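Note (not part of the patch): a minimal sketch of how a further scale-down variant could be layered on the scaleDownVariant struct and runScaleDownTest helper introduced above. The MediumDelta test name, workload name, and YAML path are hypothetical; a matching manifest analogous to scale-down-large-delta.yaml (starting at 500 PCS replicas) would be required.

// Hypothetical example only: a -25% scale-down variant reusing the helpers above.
// Assumes a scale-down-medium-delta.yaml manifest that starts at 500 PCS replicas
// (1000 pods), mirroring scale-down-large-delta.yaml.
func Test_ScaleDown_MediumDelta(t *testing.T) {
	runScaleDownTest(t, scaleDownVariant{
		name:           "ScaleDown_MediumDelta",
		workloadName:   "scale-down-medium-delta",                 // hypothetical workload name
		yamlPath:       "../../yaml/scale-down-medium-delta.yaml", // hypothetical manifest
		initialPods:    1000,                                      // 500 replicas x 2 pods per replica
		targetReplicas: 375,                                       // 500 - 25%
		targetPods:     750,
	})
}

It would then be selectable via the Makefile knob added in this change, e.g. make run-scale-test TEST_PATTERN=Test_ScaleDown_MediumDelta.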