diff --git a/operator/Makefile b/operator/Makefile
index c341ee150..5850f12c7 100644
--- a/operator/Makefile
+++ b/operator/Makefile
@@ -214,13 +214,19 @@ scale-cluster-down:
 	@$(MODULE_HACK_DIR)/infra-manager.py delete k3d-cluster
 
 # Run scale tests against an existing cluster
-# Usage: make run-scale-test [DIAG_DIR=] [DIAG_MODE=]
+# Usage: make run-scale-test [TEST_PATTERN=] [DIAG_DIR=] [DIAG_MODE=]
+# Examples:
+#   make run-scale-test                                      # Run all scale tests
+#   make run-scale-test TEST_PATTERN=Test_ScaleUp            # Run all scale-up variants
+#   make run-scale-test TEST_PATTERN=Test_ScaleDown          # Run all scale-down variants
+#   make run-scale-test TEST_PATTERN='Test_Scale(Up|Down)'   # Run both directions
+#   make run-scale-test TEST_PATTERN=Test_ScaleUp_Tiny       # Run the sanity test only
 .PHONY: run-scale-test
 run-scale-test: export GROVE_E2E_DIAG_DIR = $(DIAG_DIR)
 run-scale-test: export GROVE_E2E_DIAG_MODE = $(DIAG_MODE)
 run-scale-test:
 	@echo "> Running scale tests..."
-	@cd e2e && go test -count=1 -tags=e2e ./tests/scale/... -v -timeout 40m
+	@cd e2e && go test -count=1 -tags=e2e ./tests/scale/... -v -timeout 40m $(if $(TEST_PATTERN),-run '$(TEST_PATTERN)')
 
 # Make targets for local development and testing
 # -------------------------------------------------------------
diff --git a/operator/e2e/measurement/condition/pod.go b/operator/e2e/measurement/condition/pod.go
index f3f22669c..271845264 100644
--- a/operator/e2e/measurement/condition/pod.go
+++ b/operator/e2e/measurement/condition/pod.go
@@ -135,3 +135,38 @@ func (c *PodsReadyCondition) Met(ctx context.Context) (bool, error) {
 func (c *PodsReadyCondition) Progress(_ context.Context) string {
 	return fmt.Sprintf("%d/%d pods ready", c.lastReady, c.ExpectedCount)
 }
+
+// PodsAtCountCondition fires when the live pod count drops to ExpectedCount or
+// below. Intended for scale-down milestones where PodsCreatedCondition (≥-only)
+// would fire immediately because the starting count already exceeds the target.
+// Using ≤ rather than == makes the condition robust to a transient overshoot
+// during cascade-delete where two consecutive polls might skip the exact target.
+type PodsAtCountCondition struct {
+	Client        client.Client
+	Namespace     string
+	LabelSelector string
+	ExpectedCount int
+	lastCount     int
+	sel           parsedSelector
+}
+
+// Met returns true once the live pod count is ≤ ExpectedCount.
+func (c *PodsAtCountCondition) Met(ctx context.Context) (bool, error) {
+	if c.ExpectedCount < 0 {
+		return false, errors.New("expected count cannot be negative")
+	}
+
+	c.sel.init(c.LabelSelector)
+	pods, err := listPods(ctx, c.Client, c.Namespace, &c.sel)
+	if err != nil {
+		return false, err
+	}
+
+	c.lastCount = len(pods)
+	return c.lastCount <= c.ExpectedCount, nil
+}
+
+// Progress returns a human-readable progress string.
+func (c *PodsAtCountCondition) Progress(_ context.Context) string {
+	return fmt.Sprintf("%d pods (target ≤%d)", c.lastCount, c.ExpectedCount)
+}
diff --git a/operator/e2e/tests/scale/scale_down_test.go b/operator/e2e/tests/scale/scale_down_test.go
new file mode 100644
index 000000000..44fc82af8
--- /dev/null
+++ b/operator/e2e/tests/scale/scale_down_test.go
@@ -0,0 +1,199 @@
+//go:build e2e
+
+// /*
+// Copyright 2026 The Grove Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// */
+
+package scale
+
+import (
+	"context"
+	"testing"
+	"time"
+
+	"github.com/ai-dynamo/grove/operator/e2e/grove/workload"
+	"github.com/ai-dynamo/grove/operator/e2e/k8s/resources"
+	"github.com/ai-dynamo/grove/operator/e2e/measurement"
+	"github.com/ai-dynamo/grove/operator/e2e/measurement/condition"
+	"github.com/ai-dynamo/grove/operator/e2e/testctx"
+)
+
+const (
+	scaleDownTimeout = 15 * time.Minute
+	// scaleDownWorkerNodes matches scaleUpWorkerNodes: ~1100 kwok pods on 30 nodes
+	// (~37 pods/node) is under the default 110-pod kubelet limit, so these tests
+	// run on smaller dev clusters.
+	scaleDownWorkerNodes = 30
+)
+
+// scaleDownVariant configures a single scale-down scenario. Each variant boots a
+// PCS at the YAML-encoded initialReplicas and then patches spec.replicas to
+// targetReplicas so the timeline isolates the marginal scale-down cost.
+type scaleDownVariant struct {
+	name           string
+	workloadName   string
+	yamlPath       string
+	initialPods    int
+	targetReplicas int
+	targetPods     int
+	// workerNodes overrides scaleDownWorkerNodes for variants that need fewer
+	// nodes (e.g. the tiny sanity test). Zero means "use the default".
+	workerNodes int
+}
+
+// Test_ScaleDown_Tiny is a sanity-check variant that scales from 5 replicas
+// (10 pods) down to 0. It runs the same code paths as the real benchmarks but
+// completes in seconds — use it to validate cluster setup and the new
+// PodsAtCountCondition before running the 500/1000-pod scenarios.
+func Test_ScaleDown_Tiny(t *testing.T) {
+	runScaleDownTest(t, scaleDownVariant{
+		name:           "ScaleDown_Tiny",
+		workloadName:   "scale-down-tiny",
+		yamlPath:       "../../yaml/scale-down-tiny.yaml",
+		initialPods:    10,
+		targetReplicas: 0,
+		targetPods:     0,
+		workerNodes:    5,
+	})
+}
+
+// Test_ScaleDown_ToZero scales an existing 500-replica PCS (1000 pods) down to 0.
+// Captures the cascade-delete-everything case where every child must be torn down.
+func Test_ScaleDown_ToZero(t *testing.T) {
+	runScaleDownTest(t, scaleDownVariant{
+		name:           "ScaleDown_ToZero",
+		workloadName:   "scale-down-to-zero",
+		yamlPath:       "../../yaml/scale-down-to-zero.yaml",
+		initialPods:    1000,
+		targetReplicas: 0,
+		targetPods:     0,
+	})
+}
+
+// Test_ScaleDown_SmallDelta scales an existing 1100-pod PCS by -10% (to 1000 pods).
+// Captures the steady-state-with-modest-shrink case.
+func Test_ScaleDown_SmallDelta(t *testing.T) {
+	runScaleDownTest(t, scaleDownVariant{
+		name:           "ScaleDown_SmallDelta",
+		workloadName:   "scale-down-small-delta",
+		yamlPath:       "../../yaml/scale-down-small-delta.yaml",
+		initialPods:    1100,
+		targetReplicas: 500,
+		targetPods:     1000,
+	})
+}
+
+// Test_ScaleDown_LargeDelta scales an existing 1000-pod PCS by 0.5x (to 500 pods).
+// Captures the burst-shrink case where the controller has to tear down as many
+// replicas as it keeps.
+func Test_ScaleDown_LargeDelta(t *testing.T) {
+	runScaleDownTest(t, scaleDownVariant{
+		name:           "ScaleDown_LargeDelta",
+		workloadName:   "scale-down-large-delta",
+		yamlPath:       "../../yaml/scale-down-large-delta.yaml",
+		initialPods:    1000,
+		targetReplicas: 250,
+		targetPods:     500,
+	})
+}
+
+// runScaleDownTest builds the deploy → scale-down → delete timeline for a variant.
+// The deploy phase brings the PCS up at the YAML's initial replica count; the
+// scale-down phase is the measurement of interest and milestones-out at
+// pod-count-at-target.
+func runScaleDownTest(t *testing.T, v scaleDownVariant) {
+	workerNodes := scaleDownWorkerNodes
+	if v.workerNodes > 0 {
+		workerNodes = v.workerNodes
+	}
+	// expectedPods is the upper bound used to size cluster fixtures; the scale-down
+	// phase shrinks below it.
+	runScaleTest(t, scaleTestConfig{
+		name:         v.name,
+		workload:     v.workloadName,
+		yamlPath:     v.yamlPath,
+		expectedPods: v.initialPods,
+		pcsCount:     defaultScalePCSCount,
+		workerNodes:  workerNodes,
+		timeout:      scaleDownTimeout,
+		pollInterval: defaultScalePollInterval,
+	}, func(tracker *measurement.TimelineTracker, tc *testctx.TestContext, _ string) {
+		tracker.AddPhase(measurement.PhaseDefinition{
+			Name: "deploy",
+			ActionFn: func(ctx context.Context) error {
+				_, err := resources.NewResourceManager(tc.Client, Logger).ApplyYAMLFile(ctx, tc.Workload.YAMLPath, tc.Namespace)
+				return err
+			},
+			Milestones: []measurement.MilestoneDefinition{
+				{
+					Name: "initial-pods-created",
+					Condition: &condition.PodsCreatedCondition{
+						Client:        tc.Client.Client,
+						Namespace:     tc.Namespace,
+						LabelSelector: tc.GetLabelSelector(),
+						ExpectedCount: v.initialPods,
+					},
+				},
+				{
+					Name: "initial-pods-ready",
+					Condition: &condition.PodsReadyCondition{
+						Client:        tc.Client.Client,
+						Namespace:     tc.Namespace,
+						LabelSelector: tc.GetLabelSelector(),
+						ExpectedCount: v.initialPods,
+					},
+				},
+			},
+		})
+
+		tracker.AddPhase(measurement.PhaseDefinition{
+			Name: "scale-down",
+			ActionFn: func(ctx context.Context) error {
+				Logger.Infof("scaling %s from %d to %d PCS replicas (target %d pods)",
+					tc.Workload.Name, v.initialPods/2, v.targetReplicas, v.targetPods)
+				return workload.NewWorkloadManager(tc.Client, Logger).ScalePCS(ctx, tc.Namespace, tc.Workload.Name, v.targetReplicas)
+			},
+			Milestones: []measurement.MilestoneDefinition{
+				{
+					Name: "pods-at-target",
+					Condition: &condition.PodsAtCountCondition{
+						Client:        tc.Client.Client,
+						Namespace:     tc.Namespace,
+						LabelSelector: tc.GetLabelSelector(),
+						ExpectedCount: v.targetPods,
+					},
+				},
+			},
+		})
+
+		tracker.AddPhase(measurement.PhaseDefinition{
+			Name: "delete",
+			ActionFn: func(ctx context.Context) error {
+				return workload.NewWorkloadManager(tc.Client, Logger).DeletePCS(ctx, tc.Namespace, tc.Workload.Name)
+			},
+			Milestones: []measurement.MilestoneDefinition{
+				{
+					Name: "pcs-deleted",
+					Condition: &condition.PCSAndSubresourcesDeletedCondition{
+						Client:        tc.Client.Client,
+						Name:          tc.Workload.Name,
+						Namespace:     tc.Namespace,
+						LabelSelector: tc.GetLabelSelector(),
+					},
+				},
+			},
+		})
+	})
+}
diff --git a/operator/e2e/tests/scale/scale_up_test.go b/operator/e2e/tests/scale/scale_up_test.go
new file mode 100644
index 000000000..fb6da4d0e
--- /dev/null
+++ b/operator/e2e/tests/scale/scale_up_test.go
@@ -0,0 +1,210 @@
+//go:build e2e
+
+// /*
+// Copyright 2026 The Grove Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// */
+
+package scale
+
+import (
+	"context"
+	"testing"
+	"time"
+
+	"github.com/ai-dynamo/grove/operator/e2e/grove/workload"
+	"github.com/ai-dynamo/grove/operator/e2e/k8s/resources"
+	"github.com/ai-dynamo/grove/operator/e2e/measurement"
+	"github.com/ai-dynamo/grove/operator/e2e/measurement/condition"
+	"github.com/ai-dynamo/grove/operator/e2e/testctx"
+)
+
+const (
+	scaleUpTimeout = 15 * time.Minute
+	// scaleUpWorkerNodes is intentionally lower than defaultScaleWorkerNodes (100)
+	// so these tests run on smaller dev clusters. ~1000 kwok pods on 30 nodes
+	// (~33 pods/node) is well under the default 110-pod kubelet limit.
+	scaleUpWorkerNodes = 30
+)
+
+// scaleUpVariant configures a single scale-up scenario. Each variant boots a PCS
+// at initialReplicas (encoded in the YAML) and then patches spec.replicas to
+// targetReplicas so the timeline isolates the marginal scale-up cost.
+type scaleUpVariant struct {
+	name           string
+	workloadName   string
+	yamlPath       string
+	initialPods    int
+	targetReplicas int
+	targetPods     int
+	// workerNodes overrides scaleUpWorkerNodes for variants that need fewer
+	// nodes (e.g. the tiny sanity test). Zero means "use the default".
+	workerNodes int
+}
+
+// Test_ScaleUp_Tiny is a sanity-check variant that scales from 0 to 5 replicas
+// (10 pods). It runs the same code paths as the real benchmarks but completes
+// in seconds — use it to validate cluster setup and test plumbing before
+// running the 500/1000-pod scenarios.
+func Test_ScaleUp_Tiny(t *testing.T) {
+	runScaleUpTest(t, scaleUpVariant{
+		name:           "ScaleUp_Tiny",
+		workloadName:   "scale-up-tiny",
+		yamlPath:       "../../yaml/scale-up-tiny.yaml",
+		initialPods:    0,
+		targetReplicas: 5,
+		targetPods:     10,
+		workerNodes:    5,
+	})
+}
+
+// Test_ScaleUp_FromZero scales an existing PCS from 0 to 500 replicas (1000 pods).
+// Captures the cold-start case where no PCSGs/PodCliques exist yet.
+func Test_ScaleUp_FromZero(t *testing.T) {
+	runScaleUpTest(t, scaleUpVariant{
+		name:           "ScaleUp_FromZero",
+		workloadName:   "scale-up-from-zero",
+		yamlPath:       "../../yaml/scale-up-from-zero.yaml",
+		initialPods:    0,
+		targetReplicas: 500,
+		targetPods:     1000,
+	})
+}
+
+// Test_ScaleUp_SmallDelta scales an existing 1000-pod PCS by +10% (to 1100 pods).
+// Captures the steady-state-with-modest-growth case.
+func Test_ScaleUp_SmallDelta(t *testing.T) {
+	runScaleUpTest(t, scaleUpVariant{
+		name:           "ScaleUp_SmallDelta",
+		workloadName:   "scale-up-small-delta",
+		yamlPath:       "../../yaml/scale-up-small-delta.yaml",
+		initialPods:    1000,
+		targetReplicas: 550,
+		targetPods:     1100,
+	})
+}
+
+// Test_ScaleUp_LargeDelta scales an existing 500-pod PCS by 2x (to 1000 pods).
+// Captures the burst-growth case where the controller has to create as many
+// new replicas as already exist.
+func Test_ScaleUp_LargeDelta(t *testing.T) {
+	runScaleUpTest(t, scaleUpVariant{
+		name:           "ScaleUp_LargeDelta",
+		workloadName:   "scale-up-large-delta",
+		yamlPath:       "../../yaml/scale-up-large-delta.yaml",
+		initialPods:    500,
+		targetReplicas: 500,
+		targetPods:     1000,
+	})
+}
+
+// runScaleUpTest builds the deploy → scale-up → delete timeline for a variant.
+// The deploy phase brings the PCS up at the YAML's initial replica count; the
+// scale-up phase is the measurement of interest and milestones-out at all-pods-ready.
+func runScaleUpTest(t *testing.T, v scaleUpVariant) {
+	workerNodes := scaleUpWorkerNodes
+	if v.workerNodes > 0 {
+		workerNodes = v.workerNodes
+	}
+	runScaleTest(t, scaleTestConfig{
+		name:         v.name,
+		workload:     v.workloadName,
+		yamlPath:     v.yamlPath,
+		expectedPods: v.targetPods,
+		pcsCount:     defaultScalePCSCount,
+		workerNodes:  workerNodes,
+		timeout:      scaleUpTimeout,
+		pollInterval: defaultScalePollInterval,
+	}, func(tracker *measurement.TimelineTracker, tc *testctx.TestContext, _ string) {
+		var deployMilestones []measurement.MilestoneDefinition
+		if v.initialPods > 0 {
+			deployMilestones = append(deployMilestones,
+				measurement.MilestoneDefinition{
+					Name: "initial-pods-created",
+					Condition: &condition.PodsCreatedCondition{
+						Client:        tc.Client.Client,
+						Namespace:     tc.Namespace,
+						LabelSelector: tc.GetLabelSelector(),
+						ExpectedCount: v.initialPods,
+					},
+				},
+				measurement.MilestoneDefinition{
+					Name: "initial-pods-ready",
+					Condition: &condition.PodsReadyCondition{
+						Client:        tc.Client.Client,
+						Namespace:     tc.Namespace,
+						LabelSelector: tc.GetLabelSelector(),
+						ExpectedCount: v.initialPods,
+					},
+				},
+			)
+		}
+
+		tracker.AddPhase(measurement.PhaseDefinition{
+			Name: "deploy",
+			ActionFn: func(ctx context.Context) error {
+				_, err := resources.NewResourceManager(tc.Client, Logger).ApplyYAMLFile(ctx, tc.Workload.YAMLPath, tc.Namespace)
+				return err
+			},
+			Milestones: deployMilestones,
+		})
+
+		tracker.AddPhase(measurement.PhaseDefinition{
+			Name: "scale-up",
+			ActionFn: func(ctx context.Context) error {
+				Logger.Infof("scaling %s from %d to %d PCS replicas (target %d pods)",
+					tc.Workload.Name, v.initialPods/2, v.targetReplicas, v.targetPods)
+				return workload.NewWorkloadManager(tc.Client, Logger).ScalePCS(ctx, tc.Namespace, tc.Workload.Name, v.targetReplicas)
+			},
+			Milestones: []measurement.MilestoneDefinition{
+				{
+					Name: "all-pods-created",
+					Condition: &condition.PodsCreatedCondition{
+						Client:        tc.Client.Client,
+						Namespace:     tc.Namespace,
+						LabelSelector: tc.GetLabelSelector(),
+						ExpectedCount: v.targetPods,
+					},
+				},
+				{
+					Name: "all-pods-ready",
+					Condition: &condition.PodsReadyCondition{
+						Client:        tc.Client.Client,
+						Namespace:     tc.Namespace,
+						LabelSelector: tc.GetLabelSelector(),
+						ExpectedCount: v.targetPods,
+					},
+				},
+			},
+		})
+
+		tracker.AddPhase(measurement.PhaseDefinition{
+			Name: "delete",
+			ActionFn: func(ctx context.Context) error {
+				return workload.NewWorkloadManager(tc.Client, Logger).DeletePCS(ctx, tc.Namespace, tc.Workload.Name)
+			},
+			Milestones: []measurement.MilestoneDefinition{
+				{
+					Name: "pcs-deleted",
+					Condition: &condition.PCSAndSubresourcesDeletedCondition{
+						Client:        tc.Client.Client,
+						Name:          tc.Workload.Name,
+						Namespace:     tc.Namespace,
+						LabelSelector: tc.GetLabelSelector(),
+					},
+				},
+			},
+		})
+	})
+}
diff --git a/operator/e2e/yaml/scale-down-large-delta.yaml b/operator/e2e/yaml/scale-down-large-delta.yaml
new file mode 100644
index 000000000..2fdb6573c
--- /dev/null
+++ b/operator/e2e/yaml/scale-down-large-delta.yaml
@@ -0,0 +1,40 @@
+# Scale Down Test: large-delta variant (-50%)
+# Starts at 500 PCS replicas (1000 pods); the test scales to 250 (500 pods).
+---
+apiVersion: grove.io/v1alpha1
+kind: PodCliqueSet
+metadata:
+  name: scale-down-large-delta
+  labels:
+    app: scale-down-large-delta
+spec:
+  replicas: 500
+  template:
+    cliques:
+      - name: expert-worker
+        spec:
+          roleName: expert
+          replicas: 2
+          minAvailable: 2
+          podSpec:
+            schedulerName: default-scheduler
+            affinity:
+              nodeAffinity:
+                requiredDuringSchedulingIgnoredDuringExecution:
+                  nodeSelectorTerms:
+                    - matchExpressions:
+                        - key: type
+                          operator: In
+                          values:
+                            - kwok
+            tolerations:
+              - key: node_role.e2e.grove.nvidia.com
+                operator: Equal
+                value: agent
+                effect: NoSchedule
+            containers:
+              - name: expert-worker
+                image: registry:5001/nginx:alpine-slim
+                resources:
+                  requests:
+                    memory: 10Mi
diff --git a/operator/e2e/yaml/scale-down-small-delta.yaml b/operator/e2e/yaml/scale-down-small-delta.yaml
new file mode 100644
index 000000000..dd7d240d5
--- /dev/null
+++ b/operator/e2e/yaml/scale-down-small-delta.yaml
@@ -0,0 +1,40 @@
+# Scale Down Test: small-delta variant (-10%)
+# Starts at 550 PCS replicas (1100 pods); the test scales to 500 (1000 pods).
+---
+apiVersion: grove.io/v1alpha1
+kind: PodCliqueSet
+metadata:
+  name: scale-down-small-delta
+  labels:
+    app: scale-down-small-delta
+spec:
+  replicas: 550
+  template:
+    cliques:
+      - name: expert-worker
+        spec:
+          roleName: expert
+          replicas: 2
+          minAvailable: 2
+          podSpec:
+            schedulerName: default-scheduler
+            affinity:
+              nodeAffinity:
+                requiredDuringSchedulingIgnoredDuringExecution:
+                  nodeSelectorTerms:
+                    - matchExpressions:
+                        - key: type
+                          operator: In
+                          values:
+                            - kwok
+            tolerations:
+              - key: node_role.e2e.grove.nvidia.com
+                operator: Equal
+                value: agent
+                effect: NoSchedule
+            containers:
+              - name: expert-worker
+                image: registry:5001/nginx:alpine-slim
+                resources:
+                  requests:
+                    memory: 10Mi
diff --git a/operator/e2e/yaml/scale-down-tiny.yaml b/operator/e2e/yaml/scale-down-tiny.yaml
new file mode 100644
index 000000000..523e61bc0
--- /dev/null
+++ b/operator/e2e/yaml/scale-down-tiny.yaml
@@ -0,0 +1,43 @@
+# Scale Down Test: tiny sanity variant
+# Starts at 5 PCS replicas (10 pods); the test scales to 0.
+# Used to validate cluster + test plumbing end-to-end (including the new
+# PodsAtCountCondition milestone) without paying the cost of the real
+# 500/1000-pod scenarios.
+---
+apiVersion: grove.io/v1alpha1
+kind: PodCliqueSet
+metadata:
+  name: scale-down-tiny
+  labels:
+    app: scale-down-tiny
+spec:
+  replicas: 5
+  template:
+    cliques:
+      - name: expert-worker
+        spec:
+          roleName: expert
+          replicas: 2
+          minAvailable: 2
+          podSpec:
+            schedulerName: default-scheduler
+            affinity:
+              nodeAffinity:
+                requiredDuringSchedulingIgnoredDuringExecution:
+                  nodeSelectorTerms:
+                    - matchExpressions:
+                        - key: type
+                          operator: In
+                          values:
+                            - kwok
+            tolerations:
+              - key: node_role.e2e.grove.nvidia.com
+                operator: Equal
+                value: agent
+                effect: NoSchedule
+            containers:
+              - name: expert-worker
+                image: registry:5001/nginx:alpine-slim
+                resources:
+                  requests:
+                    memory: 10Mi
diff --git a/operator/e2e/yaml/scale-down-to-zero.yaml b/operator/e2e/yaml/scale-down-to-zero.yaml
new file mode 100644
index 000000000..dbc72e961
--- /dev/null
+++ b/operator/e2e/yaml/scale-down-to-zero.yaml
@@ -0,0 +1,40 @@
+# Scale Down Test: to-zero variant
+# Starts at 500 PCS replicas (1000 pods); the test scales to 0.
+---
+apiVersion: grove.io/v1alpha1
+kind: PodCliqueSet
+metadata:
+  name: scale-down-to-zero
+  labels:
+    app: scale-down-to-zero
+spec:
+  replicas: 500
+  template:
+    cliques:
+      - name: expert-worker
+        spec:
+          roleName: expert
+          replicas: 2
+          minAvailable: 2
+          podSpec:
+            schedulerName: default-scheduler
+            affinity:
+              nodeAffinity:
+                requiredDuringSchedulingIgnoredDuringExecution:
+                  nodeSelectorTerms:
+                    - matchExpressions:
+                        - key: type
+                          operator: In
+                          values:
+                            - kwok
+            tolerations:
+              - key: node_role.e2e.grove.nvidia.com
+                operator: Equal
+                value: agent
+                effect: NoSchedule
+            containers:
+              - name: expert-worker
+                image: registry:5001/nginx:alpine-slim
+                resources:
+                  requests:
+                    memory: 10Mi
diff --git a/operator/e2e/yaml/scale-up-from-zero.yaml b/operator/e2e/yaml/scale-up-from-zero.yaml
new file mode 100644
index 000000000..07f7d3e8a
--- /dev/null
+++ b/operator/e2e/yaml/scale-up-from-zero.yaml
@@ -0,0 +1,40 @@
+# Scale Up Test: from-zero variant
+# Starts at replicas: 0; the test scales to 500 PCS replicas (1000 pods).
+---
+apiVersion: grove.io/v1alpha1
+kind: PodCliqueSet
+metadata:
+  name: scale-up-from-zero
+  labels:
+    app: scale-up-from-zero
+spec:
+  replicas: 0
+  template:
+    cliques:
+      - name: expert-worker
+        spec:
+          roleName: expert
+          replicas: 2
+          minAvailable: 2
+          podSpec:
+            schedulerName: default-scheduler
+            affinity:
+              nodeAffinity:
+                requiredDuringSchedulingIgnoredDuringExecution:
+                  nodeSelectorTerms:
+                    - matchExpressions:
+                        - key: type
+                          operator: In
+                          values:
+                            - kwok
+            tolerations:
+              - key: node_role.e2e.grove.nvidia.com
+                operator: Equal
+                value: agent
+                effect: NoSchedule
+            containers:
+              - name: expert-worker
+                image: registry:5001/nginx:alpine-slim
+                resources:
+                  requests:
+                    memory: 10Mi
diff --git a/operator/e2e/yaml/scale-up-large-delta.yaml b/operator/e2e/yaml/scale-up-large-delta.yaml
new file mode 100644
index 000000000..3e901ede0
--- /dev/null
+++ b/operator/e2e/yaml/scale-up-large-delta.yaml
@@ -0,0 +1,40 @@
+# Scale Up Test: large-delta variant (2x)
+# Starts at 250 PCS replicas (500 pods); the test scales to 500 (1000 pods).
+---
+apiVersion: grove.io/v1alpha1
+kind: PodCliqueSet
+metadata:
+  name: scale-up-large-delta
+  labels:
+    app: scale-up-large-delta
+spec:
+  replicas: 250
+  template:
+    cliques:
+      - name: expert-worker
+        spec:
+          roleName: expert
+          replicas: 2
+          minAvailable: 2
+          podSpec:
+            schedulerName: default-scheduler
+            affinity:
+              nodeAffinity:
+                requiredDuringSchedulingIgnoredDuringExecution:
+                  nodeSelectorTerms:
+                    - matchExpressions:
+                        - key: type
+                          operator: In
+                          values:
+                            - kwok
+            tolerations:
+              - key: node_role.e2e.grove.nvidia.com
+                operator: Equal
+                value: agent
+                effect: NoSchedule
+            containers:
+              - name: expert-worker
+                image: registry:5001/nginx:alpine-slim
+                resources:
+                  requests:
+                    memory: 10Mi
diff --git a/operator/e2e/yaml/scale-up-small-delta.yaml b/operator/e2e/yaml/scale-up-small-delta.yaml
new file mode 100644
index 000000000..ede8ccb1d
--- /dev/null
+++ b/operator/e2e/yaml/scale-up-small-delta.yaml
@@ -0,0 +1,40 @@
+# Scale Up Test: small-delta variant (+10%)
+# Starts at 500 PCS replicas (1000 pods); the test scales to 550 (1100 pods).
+---
+apiVersion: grove.io/v1alpha1
+kind: PodCliqueSet
+metadata:
+  name: scale-up-small-delta
+  labels:
+    app: scale-up-small-delta
+spec:
+  replicas: 500
+  template:
+    cliques:
+      - name: expert-worker
+        spec:
+          roleName: expert
+          replicas: 2
+          minAvailable: 2
+          podSpec:
+            schedulerName: default-scheduler
+            affinity:
+              nodeAffinity:
+                requiredDuringSchedulingIgnoredDuringExecution:
+                  nodeSelectorTerms:
+                    - matchExpressions:
+                        - key: type
+                          operator: In
+                          values:
+                            - kwok
+            tolerations:
+              - key: node_role.e2e.grove.nvidia.com
+                operator: Equal
+                value: agent
+                effect: NoSchedule
+            containers:
+              - name: expert-worker
+                image: registry:5001/nginx:alpine-slim
+                resources:
+                  requests:
+                    memory: 10Mi
diff --git a/operator/e2e/yaml/scale-up-tiny.yaml b/operator/e2e/yaml/scale-up-tiny.yaml
new file mode 100644
index 000000000..daaa76313
--- /dev/null
+++ b/operator/e2e/yaml/scale-up-tiny.yaml
@@ -0,0 +1,42 @@
+# Scale Up Test: tiny sanity variant
+# Starts at replicas: 0; the test scales to 5 PCS replicas (10 pods).
+# Used to validate cluster + test plumbing end-to-end without paying the cost
+# of the real 500/1000-pod scenarios.
+---
+apiVersion: grove.io/v1alpha1
+kind: PodCliqueSet
+metadata:
+  name: scale-up-tiny
+  labels:
+    app: scale-up-tiny
+spec:
+  replicas: 0
+  template:
+    cliques:
+      - name: expert-worker
+        spec:
+          roleName: expert
+          replicas: 2
+          minAvailable: 2
+          podSpec:
+            schedulerName: default-scheduler
+            affinity:
+              nodeAffinity:
+                requiredDuringSchedulingIgnoredDuringExecution:
+                  nodeSelectorTerms:
+                    - matchExpressions:
+                        - key: type
+                          operator: In
+                          values:
+                            - kwok
+            tolerations:
+              - key: node_role.e2e.grove.nvidia.com
+                operator: Equal
+                value: agent
+                effect: NoSchedule
+            containers:
+              - name: expert-worker
+                image: registry:5001/nginx:alpine-slim
+                resources:
+                  requests:
+                    memory: 10Mi
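Note (not part of the patch): a minimal sketch of how a further scale-down variant could be layered on the scaleDownVariant struct and runScaleDownTest helper introduced above. The MediumDelta test name, workload name, and YAML path are hypothetical; a matching manifest analogous to scale-down-large-delta.yaml (starting at 500 PCS replicas) would be required.

// Hypothetical example only: a -25% scale-down variant reusing the helpers above.
// Assumes a scale-down-medium-delta.yaml manifest that starts at 500 PCS replicas
// (1000 pods), mirroring scale-down-large-delta.yaml.
func Test_ScaleDown_MediumDelta(t *testing.T) {
	runScaleDownTest(t, scaleDownVariant{
		name:           "ScaleDown_MediumDelta",
		workloadName:   "scale-down-medium-delta",                 // hypothetical workload name
		yamlPath:       "../../yaml/scale-down-medium-delta.yaml", // hypothetical manifest
		initialPods:    1000,                                      // 500 replicas x 2 pods per replica
		targetReplicas: 375,                                       // 500 - 25%
		targetPods:     750,
	})
}

It would then be selectable via the Makefile knob added in this change, e.g. make run-scale-test TEST_PATTERN=Test_ScaleDown_MediumDelta.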