operator/Makefile: 8 additions & 2 deletions
@@ -214,13 +214,19 @@ scale-cluster-down:
 	@$(MODULE_HACK_DIR)/infra-manager.py delete k3d-cluster
 
 # Run scale tests against an existing cluster
-# Usage: make run-scale-test [DIAG_DIR=<path>] [DIAG_MODE=<mode>]
+# Usage: make run-scale-test [TEST_PATTERN=<pattern>] [DIAG_DIR=<path>] [DIAG_MODE=<mode>]
+# Examples:
+#   make run-scale-test                                      # Run all scale tests
+#   make run-scale-test TEST_PATTERN=Test_ScaleUp            # Run all scale-up variants
+#   make run-scale-test TEST_PATTERN=Test_ScaleDown          # Run all scale-down variants
+#   make run-scale-test TEST_PATTERN='Test_Scale(Up|Down)'   # Run both directions
+#   make run-scale-test TEST_PATTERN=Test_ScaleUp_Tiny       # Run the sanity test only
 .PHONY: run-scale-test
 run-scale-test: export GROVE_E2E_DIAG_DIR = $(DIAG_DIR)
 run-scale-test: export GROVE_E2E_DIAG_MODE = $(DIAG_MODE)
 run-scale-test:
 	@echo "> Running scale tests..."
-	@cd e2e && go test -count=1 -tags=e2e ./tests/scale/... -v -timeout 40m
+	@cd e2e && go test -count=1 -tags=e2e ./tests/scale/... -v -timeout 40m $(if $(TEST_PATTERN),-run $(TEST_PATTERN))
 
 # Make targets for local development and testing
 # -------------------------------------------------------------
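For reference, go test's -run flag treats the pattern as an unanchored regular expression matched against each top-level test name, which is why the prefix patterns in the Makefile examples select whole families of variants. A minimal Go sketch of that selection, using test names from this PR (the program itself is illustrative only):

package main

import (
	"fmt"
	"regexp"
)

func main() {
	// go test -run matches its argument as an unanchored regexp against each
	// top-level test name, so a prefix such as Test_ScaleDown selects every
	// scale-down variant below.
	names := []string{
		"Test_ScaleUp_Tiny",
		"Test_ScaleDown_Tiny",
		"Test_ScaleDown_ToZero",
		"Test_ScaleDown_SmallDelta",
		"Test_ScaleDown_LargeDelta",
	}
	for _, pattern := range []string{"Test_ScaleDown", `Test_Scale(Up|Down)`} {
		re := regexp.MustCompile(pattern)
		for _, name := range names {
			if re.MatchString(name) {
				fmt.Printf("-run %q selects %s\n", pattern, name)
			}
		}
	}
}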
operator/e2e/measurement/condition/pod.go: 35 additions & 0 deletions
@@ -135,3 +135,38 @@ func (c *PodsReadyCondition) Met(ctx context.Context) (bool, error) {
func (c *PodsReadyCondition) Progress(_ context.Context) string {
return fmt.Sprintf("%d/%d pods ready", c.lastReady, c.ExpectedCount)
}

// PodsAtCountCondition fires when the live pod count drops to ExpectedCount or
// below. Intended for scale-down milestones where PodsCreatedCondition (≥-only)
// would fire immediately because the starting count already exceeds the target.
// Using ≤ rather than == keeps the condition robust during cascade delete,
// where deletion can outpace polling and two consecutive polls skip past the
// exact target count without ever observing it.
type PodsAtCountCondition struct {
Client client.Client
Namespace string
LabelSelector string
ExpectedCount int
lastCount int
sel parsedSelector
}

// Met returns true once the live pod count is ≤ ExpectedCount.
func (c *PodsAtCountCondition) Met(ctx context.Context) (bool, error) {
if c.ExpectedCount < 0 {
return false, errors.New("expected count cannot be negative")
}

c.sel.init(c.LabelSelector)
pods, err := listPods(ctx, c.Client, c.Namespace, &c.sel)
if err != nil {
return false, err
}

c.lastCount = len(pods)
return c.lastCount <= c.ExpectedCount, nil
}

// Progress returns a human-readable progress string.
func (c *PodsAtCountCondition) Progress(_ context.Context) string {
return fmt.Sprintf("%d pods (target ≤%d)", c.lastCount, c.ExpectedCount)
}
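The poll loop that drives these conditions is not part of this diff. As a rough sketch of how a milestone runner might call Met and Progress between ticks, assuming a hypothetical waitFor helper and a condition interface shaped like the methods above (not the measurement package's actual API):

package sketch

import (
	"context"
	"fmt"
	"time"
)

// condition mirrors the Met/Progress method shape used by the conditions
// above; the name and the waitFor helper are illustrative assumptions.
type condition interface {
	Met(ctx context.Context) (bool, error)
	Progress(ctx context.Context) string
}

// waitFor polls c at the given interval until it is met, a poll errors, or
// ctx expires; on timeout it surfaces the last progress string, e.g.
// "990 pods (target ≤0)".
func waitFor(ctx context.Context, c condition, interval time.Duration) error {
	ticker := time.NewTicker(interval)
	defer ticker.Stop()
	for {
		select {
		case <-ctx.Done():
			return fmt.Errorf("condition not met: %s: %w", c.Progress(ctx), ctx.Err())
		case <-ticker.C:
			met, err := c.Met(ctx)
			if err != nil {
				return err
			}
			if met {
				return nil
			}
		}
	}
}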
operator/e2e/tests/scale/scale_down_test.go: 199 additions & 0 deletions
@@ -0,0 +1,199 @@
//go:build e2e

// /*
// Copyright 2026 The Grove Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// */

package scale

import (
"context"
"testing"
"time"

"github.com/ai-dynamo/grove/operator/e2e/grove/workload"
"github.com/ai-dynamo/grove/operator/e2e/k8s/resources"
"github.com/ai-dynamo/grove/operator/e2e/measurement"
"github.com/ai-dynamo/grove/operator/e2e/measurement/condition"
"github.com/ai-dynamo/grove/operator/e2e/testctx"
)

const (
scaleDownTimeout = 15 * time.Minute
// scaleDownWorkerNodes matches scaleUpWorkerNodes: ~1100 kwok pods on 30 nodes
// (~37 pods/node) is under the default 110-pod kubelet limit, so these tests
// run on smaller dev clusters.
scaleDownWorkerNodes = 30
)

// scaleDownVariant configures a single scale-down scenario. Each variant boots a
// PCS at the YAML-encoded initialReplicas and then patches spec.replicas to
// targetReplicas so the timeline isolates the marginal scale-down cost.
type scaleDownVariant struct {
name string
workloadName string
yamlPath string
initialPods int
targetReplicas int
targetPods int
// workerNodes overrides scaleDownWorkerNodes for variants that need fewer
// nodes (e.g. the tiny sanity test). Zero means "use the default".
workerNodes int
}

// Test_ScaleDown_Tiny is a sanity-check variant that scales from 5 replicas
// (10 pods) down to 0. It runs the same code paths as the real benchmarks but
// completes in seconds; use it to validate cluster setup and the new
// PodsAtCountCondition before running the 500/1000-pod scenarios.
func Test_ScaleDown_Tiny(t *testing.T) {
runScaleDownTest(t, scaleDownVariant{
name: "ScaleDown_Tiny",
workloadName: "scale-down-tiny",
yamlPath: "../../yaml/scale-down-tiny.yaml",
initialPods: 10,
targetReplicas: 0,
targetPods: 0,
workerNodes: 5,
})
}

// Test_ScaleDown_ToZero scales an existing 500-replica PCS (1000 pods) down to 0.
// Captures the cascade-delete-everything case where every child must be torn down.
func Test_ScaleDown_ToZero(t *testing.T) {
runScaleDownTest(t, scaleDownVariant{
name: "ScaleDown_ToZero",
workloadName: "scale-down-to-zero",
yamlPath: "../../yaml/scale-down-to-zero.yaml",
initialPods: 1000,
targetReplicas: 0,
targetPods: 0,
})
}

// Test_ScaleDown_SmallDelta scales an existing 1100-pod PCS down to 1000 pods
// (a ~9% shrink). Captures the steady-state-with-modest-shrink case.
func Test_ScaleDown_SmallDelta(t *testing.T) {
runScaleDownTest(t, scaleDownVariant{
name: "ScaleDown_SmallDelta",
workloadName: "scale-down-small-delta",
yamlPath: "../../yaml/scale-down-small-delta.yaml",
initialPods: 1100,
targetReplicas: 500,
targetPods: 1000,
})
}

// Test_ScaleDown_LargeDelta halves an existing 1000-pod PCS (to 500 pods).
// Captures the burst-shrink case where the controller has to tear down as many
// replicas as it keeps.
func Test_ScaleDown_LargeDelta(t *testing.T) {
runScaleDownTest(t, scaleDownVariant{
name: "ScaleDown_LargeDelta",
workloadName: "scale-down-large-delta",
yamlPath: "../../yaml/scale-down-large-delta.yaml",
initialPods: 1000,
targetReplicas: 250,
targetPods: 500,
})
}

// runScaleDownTest builds the deploy → scale-down → delete timeline for a variant.
// The deploy phase brings the PCS up at the YAML's initial replica count; the
// scale-down phase is the measurement of interest, and its milestone fires once
// the live pod count drops to the target.
func runScaleDownTest(t *testing.T, v scaleDownVariant) {
workerNodes := scaleDownWorkerNodes
if v.workerNodes > 0 {
workerNodes = v.workerNodes
}
// expectedPods is the upper bound used to size cluster fixtures; the scale-down
// phase shrinks below it.
runScaleTest(t, scaleTestConfig{
name: v.name,
workload: v.workloadName,
yamlPath: v.yamlPath,
expectedPods: v.initialPods,
pcsCount: defaultScalePCSCount,
workerNodes: workerNodes,
timeout: scaleDownTimeout,
pollInterval: defaultScalePollInterval,
}, func(tracker *measurement.TimelineTracker, tc *testctx.TestContext, _ string) {
tracker.AddPhase(measurement.PhaseDefinition{
Name: "deploy",
ActionFn: func(ctx context.Context) error {
_, err := resources.NewResourceManager(tc.Client, Logger).ApplyYAMLFile(ctx, tc.Workload.YAMLPath, tc.Namespace)
return err
},
Milestones: []measurement.MilestoneDefinition{
{
Name: "initial-pods-created",
Condition: &condition.PodsCreatedCondition{
Client: tc.Client.Client,
Namespace: tc.Namespace,
LabelSelector: tc.GetLabelSelector(),
ExpectedCount: v.initialPods,
},
},
{
Name: "initial-pods-ready",
Condition: &condition.PodsReadyCondition{
Client: tc.Client.Client,
Namespace: tc.Namespace,
LabelSelector: tc.GetLabelSelector(),
ExpectedCount: v.initialPods,
},
},
},
})

tracker.AddPhase(measurement.PhaseDefinition{
Name: "scale-down",
ActionFn: func(ctx context.Context) error {
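// In these fixtures each PCS replica owns two pods (e.g. 500 replicas =
// 1000 pods), so initialPods/2 recovers the starting replica count for
// this log line.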
Logger.Infof("scaling %s from %d to %d PCS replicas (target %d pods)",
tc.Workload.Name, v.initialPods/2, v.targetReplicas, v.targetPods)
return workload.NewWorkloadManager(tc.Client, Logger).ScalePCS(ctx, tc.Namespace, tc.Workload.Name, v.targetReplicas)
},
Milestones: []measurement.MilestoneDefinition{
{
Name: "pods-at-target",
Condition: &condition.PodsAtCountCondition{
Client: tc.Client.Client,
Namespace: tc.Namespace,
LabelSelector: tc.GetLabelSelector(),
ExpectedCount: v.targetPods,
},
},
},
})

tracker.AddPhase(measurement.PhaseDefinition{
Name: "delete",
ActionFn: func(ctx context.Context) error {
return workload.NewWorkloadManager(tc.Client, Logger).DeletePCS(ctx, tc.Namespace, tc.Workload.Name)
},
Milestones: []measurement.MilestoneDefinition{
{
Name: "pcs-deleted",
Condition: &condition.PCSAndSubresourcesDeletedCondition{
Client: tc.Client.Client,
Name: tc.Workload.Name,
Namespace: tc.Namespace,
LabelSelector: tc.GetLabelSelector(),
},
},
},
})
})
}
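With the TEST_PATTERN plumbing from the Makefile change above, each new variant can be run individually; for example, the sanity check alone:

make run-scale-test TEST_PATTERN=Test_ScaleDown_Tiny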