diff --git a/.github/workflows/build-check-test.yaml b/.github/workflows/build-check-test.yaml index 5d2723ff6..957bfabf1 100644 --- a/.github/workflows/build-check-test.yaml +++ b/.github/workflows/build-check-test.yaml @@ -93,6 +93,8 @@ jobs: # Matrix entries can set: # test_name (required) - name shown in the GitHub Actions UI # test_pattern (optional) - Go test -run pattern (standard e2e tests) + # create_flags (optional) - extra flags appended to E2E_CREATE_FLAGS; + # empty string means "use the base e2e.yaml preset (KAI)" # make_target (optional) - Makefile target, defaults to run-e2e-full e2e: needs: [test, build, check, changes] @@ -109,26 +111,49 @@ jobs: fail-fast: false matrix: include: + # --- kai-scheduler (primary backend: agnostic + sensitive + KAI-supported capabilities) --- - test_name: gang_scheduling test_pattern: "^Test_GS" + create_flags: "" - test_name: rolling_updates test_pattern: "^Test_RU" + create_flags: "" - test_name: ondelete_updates test_pattern: "^Test_OD" + create_flags: "" - test_name: startup_ordering test_pattern: "^Test_SO" + create_flags: "" make_target: "run-e2e-real-full" - test_name: Topology_Aware_Scheduling test_pattern: "^Test_TAS" + create_flags: "" - test_name: cert_management test_pattern: "^Test_CM" + create_flags: "" - test_name: auto_mnnvl test_pattern: "^Test_AutoMNNVL" + create_flags: "" make_target: "run-e2e-mnnvl-full" - test_name: crd_installer test_pattern: "^Test_CRD_Installer" + create_flags: "" - test_name: resource_sharing test_pattern: "^Test_RS" + create_flags: "" + + # --- default-scheduler (sensitive tests only; agnostic skipped by policy, + # capability-gated skipped by RequireCapability at runtime) --- + - test_name: rolling_updates_default-scheduler + test_pattern: "^Test_RU" + create_flags: "-f hack/e2e-default-scheduler.yaml" + - test_name: ondelete_updates_default-scheduler + test_pattern: "^Test_OD" + create_flags: "-f hack/e2e-default-scheduler.yaml" + - test_name: 
startup_ordering_default-scheduler + test_pattern: "^Test_SO" + create_flags: "-f hack/e2e-default-scheduler.yaml" + make_target: "run-e2e-real-full" name: E2E - ${{ matrix.test_name }} steps: # print runner specs so we have a record in case of failures @@ -150,7 +175,7 @@ jobs: - name: Run e2e tests - ${{ matrix.test_name }} run: | - make ${{ matrix.make_target || 'run-e2e-full' }} TEST_PATTERN='${{ matrix.test_pattern }}' E2E_CREATE_FLAGS='--dind-memory-mode' + make ${{ matrix.make_target || 'run-e2e-full' }} TEST_PATTERN='${{ matrix.test_pattern }}' E2E_CREATE_FLAGS='${{ matrix.create_flags }} --dind-memory-mode' working-directory: operator # The test code handles cleanup via Teardown(), but this step provides @@ -187,14 +212,21 @@ jobs: fail-fast: false matrix: include: + # Must mirror the e2e job's matrix.test_name list exactly so required + # branch-protection checks resolve the same set of names regardless + # of whether E2E ran or was skipped. - test_name: gang_scheduling - test_name: rolling_updates + - test_name: ondelete_updates - test_name: startup_ordering - test_name: Topology_Aware_Scheduling - test_name: cert_management - test_name: auto_mnnvl - test_name: crd_installer - test_name: resource_sharing + - test_name: rolling_updates_default-scheduler + - test_name: ondelete_updates_default-scheduler + - test_name: startup_ordering_default-scheduler name: E2E - ${{ matrix.test_name }} steps: - name: Skip E2E (no relevant changes) diff --git a/operator/e2e/tests/capabilities.go b/operator/e2e/tests/capabilities.go new file mode 100644 index 000000000..c32566d24 --- /dev/null +++ b/operator/e2e/tests/capabilities.go @@ -0,0 +1,109 @@ +// /* +// Copyright 2026 The Grove Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// */ + +package tests + +import ( + "sync" + "testing" +) + +// Capability is a scheduler feature the E2E suite may require (e.g. gang +// scheduling, topology-aware scheduling). Tests gate themselves with +// RequireCapability and auto-skip when the active backend does not provide it. +type Capability string + +const ( + // GangScheduling indicates the active backend treats a PodGang as an + // all-or-nothing scheduling unit. + GangScheduling Capability = "GangScheduling" + + // TopologyAwareScheduling indicates the active backend implements the + // scheduler.TopologyAwareSchedBackend interface AND the operator has + // topologyAwareScheduling.enabled=true. + TopologyAwareScheduling Capability = "TopologyAwareScheduling" + + // AutoMNNVL indicates the operator has network.autoMNNVLEnabled=true. + // Config-only — no backend coupling. + AutoMNNVL Capability = "AutoMNNVL" +) + +// CapabilitySet is the resolved set of capabilities for a single E2E run. +type CapabilitySet struct { + // ActiveBackend is the value of OperatorConfiguration.scheduler.defaultProfileName. + ActiveBackend string + // caps holds the capabilities resolved for this run (backend- or config-derived). + caps map[Capability]bool +} + +// Has reports whether the set contains c; a zero-value CapabilitySet contains none. +func (s CapabilitySet) Has(c Capability) bool { + return s.caps[c] +} + +// backendInterfaceCapabilities is the hardcoded map of backend → capabilities +// that depend on Go interface implementation in the operator. 
Entries here are +// what E2E cannot deduce from a live OperatorConfiguration alone (the operator +// uses Go type assertions; the test binary runs out-of-process and cannot). +// +// Capabilities derived purely from configuration flags (e.g. AutoMNNVL from +// network.autoMNNVLEnabled) are NOT listed here — they are resolved directly +// from OperatorConfiguration in DiscoverCapabilities. +// +// When adding a new backend, add a row here AND update the developer +// checklist in the design proposal. The capabilities_test.go cross-check +// fails the build if this table disagrees with the actual Go interfaces. +var backendInterfaceCapabilities = map[string]map[Capability]bool{ + "kai-scheduler": { + GangScheduling: true, + TopologyAwareScheduling: true, + }, + "default-scheduler": { + // KubeSchedulerConfig.GangScheduling is forward-looking — the kube + // backend does not yet read or act on it. When it does, set + // GangScheduling: true here. + }, +} + +// currentCapabilities holds the resolved CapabilitySet for the running e2e +// suite. DiscoverCapabilities (in capability_discovery.go, e2e build tag) +// populates it once at TestMain time; RequireCapability reads it on every +// gated test entry. +var ( + currentCapabilities CapabilitySet + currentCapabilitiesSet bool + currentCapabilitiesMu sync.RWMutex +) + +// RequireCapability skips t when the active backend does not provide c. +// Tests gated with RequireCapability are listed in the design proposal's +// Test Classification table as "Capability-gated". +// +// The function is a no-op if capabilities have not been discovered yet (e.g. when +// running unit tests with go test ./... without an e2e cluster); the e2e build +// flow guarantees discovery runs before any test that calls this. 
+func RequireCapability(t *testing.T, c Capability) { + t.Helper() + currentCapabilitiesMu.RLock() + defer currentCapabilitiesMu.RUnlock() + if !currentCapabilitiesSet { + return + } + if !currentCapabilities.Has(c) { + t.Skipf("skipping: active backend %q does not provide capability %q", + currentCapabilities.ActiveBackend, c) + } +} diff --git a/operator/e2e/tests/capabilities_test.go b/operator/e2e/tests/capabilities_test.go new file mode 100644 index 000000000..be1a21da7 --- /dev/null +++ b/operator/e2e/tests/capabilities_test.go @@ -0,0 +1,100 @@ +// /* +// Copyright 2026 The Grove Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// */ + +package tests + +import ( + "testing" + + configv1alpha1 "github.com/ai-dynamo/grove/operator/api/config/v1alpha1" + "github.com/ai-dynamo/grove/operator/internal/scheduler" + "github.com/ai-dynamo/grove/operator/internal/scheduler/kai" + "github.com/ai-dynamo/grove/operator/internal/scheduler/kube" + + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/tools/record" + "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +// backendConstructors mirrors the switch in +// operator/internal/scheduler/manager/manager.go newBackendForProfile. +// Adding a new backend means adding a row here AND in +// backendInterfaceCapabilities (in capabilities.go); TestCapabilityTableMatchesBackends +// fails the build if the two disagree. 
+var backendConstructors = map[configv1alpha1.SchedulerName]func() scheduler.Backend{ + configv1alpha1.SchedulerNameKai: func() scheduler.Backend { + return kai.New( + fake.NewClientBuilder().Build(), + runtime.NewScheme(), + record.NewFakeRecorder(1), + configv1alpha1.SchedulerProfile{Name: configv1alpha1.SchedulerNameKai}, + ) + }, + configv1alpha1.SchedulerNameKube: func() scheduler.Backend { + return kube.New( + fake.NewClientBuilder().Build(), + runtime.NewScheme(), + record.NewFakeRecorder(1), + configv1alpha1.SchedulerProfile{Name: configv1alpha1.SchedulerNameKube}, + ) + }, +} + +// TestCapabilityTableCoversAllSupportedBackends ensures the capability table +// and backendConstructors each have a row for every SupportedSchedulerNames +// entry. Catches the failure mode where a contributor adds a backend to +// SupportedSchedulerNames + manager.newBackendForProfile but forgets one of +// them — without this, a missing table row would only surface when an e2e +// run on that backend aborts in DiscoverCapabilities. +func TestCapabilityTableCoversAllSupportedBackends(t *testing.T) { + for _, name := range configv1alpha1.SupportedSchedulerNames { + if _, ok := backendInterfaceCapabilities[string(name)]; !ok { + t.Errorf("backend %q is in SupportedSchedulerNames but missing from "+ + "backendInterfaceCapabilities; add a row to "+ + "operator/e2e/tests/capabilities.go", name) + } + if _, ok := backendConstructors[name]; !ok { + t.Errorf("backend %q is in SupportedSchedulerNames but missing from "+ + "backendConstructors; add a row to "+ + "operator/e2e/tests/capabilities_test.go", name) + } + } +} + +// TestCapabilityTableMatchesBackends cross-checks the hardcoded capability +// table against actual Go interface implementation for each backend. Catches +// the failure mode where a backend's interface set changes (e.g. 
KAI drops +// TopologyAwareSchedBackend) but the table is not updated — without this, the +// E2E suite would either skip valid TAS tests or run them against a backend +// that no longer supports TAS. +func TestCapabilityTableMatchesBackends(t *testing.T) { + for name, ctor := range backendConstructors { + t.Run(string(name), func(t *testing.T) { + b := ctor() + table := backendInterfaceCapabilities[string(name)] + + // TopologyAwareScheduling: tied to the Go interface assertion the operator + // itself uses (clustertopology.go L46–54); a missing table row reads as false. + _, gotTAS := b.(scheduler.TopologyAwareSchedBackend) + wantTAS := table[TopologyAwareScheduling] + if gotTAS != wantTAS { + t.Errorf("backend %q: TopologyAwareScheduling table=%v but "+ + "interface assertion=%v; update either the backend "+ + "or backendInterfaceCapabilities", name, wantTAS, gotTAS) + } + }) + } +} diff --git a/operator/e2e/tests/capability_discovery.go b/operator/e2e/tests/capability_discovery.go new file mode 100644 index 000000000..ee04f9a50 --- /dev/null +++ b/operator/e2e/tests/capability_discovery.go @@ -0,0 +1,75 @@ +//go:build e2e + +// /* +// Copyright 2026 The Grove Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// */ + +package tests + +import ( + "context" + "fmt" + + "github.com/ai-dynamo/grove/operator/e2e/grove/config" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +// DiscoverCapabilities resolves the active backend and its capabilities from +// the live OperatorConfiguration plus the hardcoded interface table, stores +// them in the package-level currentCapabilities for RequireCapability, and +// returns the set. On error nothing is stored and the zero set is returned. +// +// Called once from TestMain before any test runs. +func DiscoverCapabilities(ctx context.Context, crClient client.Client) (CapabilitySet, error) { + md, err := config.NewOperatorConfig(crClient).ReadGroveMetadata(ctx) + if err != nil { + return CapabilitySet{}, fmt.Errorf("read OperatorConfiguration: %w", err) + } + + backend := md.Config.Scheduler.DefaultProfileName + table, ok := backendInterfaceCapabilities[backend] + if !ok { + return CapabilitySet{}, fmt.Errorf( + "active backend %q has no entry in backendInterfaceCapabilities; "+ + "please update operator/e2e/tests/capabilities.go", backend) + } + + set := CapabilitySet{ + ActiveBackend: backend, + caps: map[Capability]bool{}, + } + + // Backend-coupled capability: present iff backend is in the table for it. + if table[GangScheduling] { + set.caps[GangScheduling] = true + } + + // Backend-coupled capability gated by an additional config flag. + if md.Config.TopologyAwareScheduling.Enabled && table[TopologyAwareScheduling] { + set.caps[TopologyAwareScheduling] = true + } + + // Config-only capability: no interface-table lookup. 
+ if md.Config.Network.AutoMNNVLEnabled { + set.caps[AutoMNNVL] = true + } + + currentCapabilitiesMu.Lock() + currentCapabilities = set + currentCapabilitiesSet = true + currentCapabilitiesMu.Unlock() + + return set, nil +} diff --git a/operator/e2e/tests/gang_scheduling_test.go b/operator/e2e/tests/gang_scheduling_test.go index 3a971d6eb..b5095b143 100644 --- a/operator/e2e/tests/gang_scheduling_test.go +++ b/operator/e2e/tests/gang_scheduling_test.go @@ -32,6 +32,7 @@ import ( // 3. Verify all workload pods are pending due to insufficient resources // 4. Uncordon the node and verify all pods get scheduled func Test_GS1_GangSchedulingWithFullReplicas(t *testing.T) { + RequireCapability(t, GangScheduling) ctx := context.Background() Logger.Info("1. Initialize a 10-node Grove cluster, then cordon 1 node") @@ -83,6 +84,7 @@ func Test_GS1_GangSchedulingWithFullReplicas(t *testing.T) { // 6. Scale PCSG replicas to 3 and verify 4 new pending pods // 7. Uncordon remaining nodes and verify all pods get scheduled func Test_GS2_GangSchedulingWithScalingFullReplicas(t *testing.T) { + RequireCapability(t, GangScheduling) ctx := context.Background() // Setup cluster (shared or individual based on test run mode) @@ -152,6 +154,7 @@ func Test_GS2_GangSchedulingWithScalingFullReplicas(t *testing.T) { // 6. Scale PCS replicas to 2 and verify 10 new pending pods // 7. Uncordon remaining nodes and verify all pods get scheduled func Test_GS3_GangSchedulingWithPCSScalingFullReplicas(t *testing.T) { + RequireCapability(t, GangScheduling) ctx := context.Background() Logger.Info("1. Initialize a 20-node Grove cluster, then cordon 11 nodes") @@ -217,6 +220,7 @@ func Test_GS3_GangSchedulingWithPCSScalingFullReplicas(t *testing.T) { // 9. Scale PCSG replicas to 3 and verify 4 new pending pods // 10. 
Uncordon remaining nodes and verify all pods get scheduled func Test_GS4_GangSchedulingWithPCSAndPCSGScalingFullReplicas(t *testing.T) { + RequireCapability(t, GangScheduling) ctx := context.Background() Logger.Info("1. Initialize a 28-node Grove cluster, then cordon 19 nodes") @@ -283,6 +287,7 @@ func Test_GS4_GangSchedulingWithPCSAndPCSGScalingFullReplicas(t *testing.T) { // 5. Wait for scheduled pods to become ready // 6. Uncordon 7 nodes and verify all remaining workload pods get scheduled func Test_GS5_GangSchedulingWithMinReplicas(t *testing.T) { + RequireCapability(t, GangScheduling) ctx := context.Background() Logger.Info("1. Initialize a 10-node Grove cluster, then cordon 8 nodes") @@ -351,6 +356,7 @@ func Test_GS5_GangSchedulingWithMinReplicas(t *testing.T) { // 11. Wait for scheduled pods to become ready // 12. Uncordon 2 nodes and verify remaining workload pods get scheduled func Test_GS6_GangSchedulingWithPCSGScalingMinReplicas(t *testing.T) { + RequireCapability(t, GangScheduling) ctx := context.Background() Logger.Info("1. Initialize a 14-node Grove cluster, then cordon 12 nodes") @@ -463,6 +469,7 @@ func Test_GS6_GangSchedulingWithPCSGScalingMinReplicas(t *testing.T) { // 13. Wait for scheduled pods to become ready // 14. Uncordon 2 nodes and verify remaining workload pods get scheduled func Test_GS7_GangSchedulingWithPCSGScalingMinReplicasAdvanced1(t *testing.T) { + RequireCapability(t, GangScheduling) ctx := context.Background() Logger.Info("1. Initialize a 14-node Grove cluster, then cordon 12 nodes") @@ -584,6 +591,7 @@ func Test_GS7_GangSchedulingWithPCSGScalingMinReplicasAdvanced1(t *testing.T) { // 9. Wait for scheduled pods to become ready // 10. Uncordon 7 nodes and verify the remaining workload pods get scheduled func Test_GS8_GangSchedulingWithPCSGScalingMinReplicasAdvanced2(t *testing.T) { + RequireCapability(t, GangScheduling) ctx := context.Background() Logger.Info("1. 
Initialize a 14-node Grove cluster, then cordon 12 nodes") @@ -679,6 +687,7 @@ func Test_GS8_GangSchedulingWithPCSGScalingMinReplicasAdvanced2(t *testing.T) { // 10. Wait for scheduled pods to become ready // 11. Uncordon 7 nodes and verify the remaining workload pods get scheduled func Test_GS9_GangSchedulingWithPCSScalingMinReplicas(t *testing.T) { + RequireCapability(t, GangScheduling) ctx := context.Background() Logger.Info("1. Initialize a 20-node Grove cluster, then cordon 18 nodes") @@ -782,6 +791,7 @@ func Test_GS9_GangSchedulingWithPCSScalingMinReplicas(t *testing.T) { // 9. Wait for scheduled pods to become ready // 10. Uncordon 10 nodes and verify the remaining workload pods get scheduled func Test_GS10_GangSchedulingWithPCSScalingMinReplicasAdvanced(t *testing.T) { + RequireCapability(t, GangScheduling) ctx := context.Background() Logger.Info("1. Initialize a 20-node Grove cluster, then cordon 18 nodes") @@ -886,6 +896,7 @@ func Test_GS10_GangSchedulingWithPCSScalingMinReplicasAdvanced(t *testing.T) { // 19. Wait for 2 more pods to be scheduled (min-available for pcs-1-sg-x-2) // 20. Uncordon 2 nodes and verify remaining workload pods get scheduled func Test_GS11_GangSchedulingWithPCSAndPCSGScalingMinReplicas(t *testing.T) { + RequireCapability(t, GangScheduling) ctx := context.Background() Logger.Info("1. Initialize a 28-node Grove cluster, then cordon 26 nodes") @@ -1028,6 +1039,7 @@ func Test_GS11_GangSchedulingWithPCSAndPCSGScalingMinReplicas(t *testing.T) { // 11. Wait for scheduled pods to become ready // 12. Uncordon 14 nodes and verify the remaining workload pods get scheduled func Test_GS12_GangSchedulingWithComplexPCSGScaling(t *testing.T) { + RequireCapability(t, GangScheduling) ctx := context.Background() Logger.Info("1. 
Initialize a 28-node Grove cluster, then cordon 26 nodes") diff --git a/operator/e2e/tests/suite_test.go b/operator/e2e/tests/suite_test.go index a9a2e4847..7a0381324 100644 --- a/operator/e2e/tests/suite_test.go +++ b/operator/e2e/tests/suite_test.go @@ -45,6 +45,17 @@ func TestMain(m *testing.M) { os.Exit(1) } + // Discover scheduler capabilities from the live OperatorConfiguration + // before any test runs. RequireCapability uses the result to auto-skip + // tests whose required capability is not provided by the active backend. + caps, err := DiscoverCapabilities(ctx, sharedCluster.GetClient()) + if err != nil { + Logger.Errorf("failed to discover scheduler capabilities: %s", err) + sharedCluster.Teardown() + os.Exit(1) + } + Logger.Infof("Active backend: %s", caps.ActiveBackend) + // Run tests code := m.Run() diff --git a/operator/e2e/tests/topology_test.go b/operator/e2e/tests/topology_test.go index 204cef333..57678a17c 100644 --- a/operator/e2e/tests/topology_test.go +++ b/operator/e2e/tests/topology_test.go @@ -101,6 +101,7 @@ func GetPodGroupOrFail(t *testing.T, tc *testctx.TestContext, podGroupVerifier * // Note: grove-topology is NOT cleaned up after this test — it is shared cluster infrastructure // used by TAS2-TAS16. ensureGroveTopology() in each subsequent test is idempotent. func Test_TAS1_TopologyInfrastructure(t *testing.T) { + RequireCapability(t, TopologyAwareScheduling) ctx := context.Background() tc, cleanup := testctx.PrepareTest(ctx, t, 0) @@ -166,6 +167,7 @@ func Test_TAS1_TopologyInfrastructure(t *testing.T) { // 4. Verify worker-block pods (4) are in the same block // 5. Verify different cliques can have independent topology constraints func Test_TAS2_MultipleCliquesWithDifferentConstraints(t *testing.T) { + RequireCapability(t, TopologyAwareScheduling) ctx := context.Background() Logger.Info("1. 
Initialize a 28-node Grove cluster for topology testing") @@ -225,6 +227,7 @@ func Test_TAS2_MultipleCliquesWithDifferentConstraints(t *testing.T) { // 4. Verify router pods (2 standalone) // 5. Verify KAI PodGroup SubGroups: NO PCSG parent groups (because PCSG constraint is nil, per PR #357) func Test_TAS3_PCSOnlyConstraint(t *testing.T) { + RequireCapability(t, TopologyAwareScheduling) ctx := context.Background() Logger.Info("1. Initialize a 28-node Grove cluster for topology testing") @@ -279,6 +282,7 @@ func Test_TAS3_PCSOnlyConstraint(t *testing.T) { // 3. Verify PCSG worker pods (2 total) respect rack constraint // 4. Router pods (2 standalone) are unconstrained func Test_TAS4_PCSGOnlyConstraint(t *testing.T) { + RequireCapability(t, TopologyAwareScheduling) ctx := context.Background() Logger.Info("1. Initialize a 28-node Grove cluster for topology testing") @@ -344,6 +348,7 @@ func Test_TAS4_PCSGOnlyConstraint(t *testing.T) { // 2. PCS has NO explicit constraint // 3. Verify all 2 pods on same host (strictest constraint) func Test_TAS5_HostLevelConstraint(t *testing.T) { + RequireCapability(t, TopologyAwareScheduling) ctx := context.Background() Logger.Info("1. Initialize a 28-node Grove cluster for topology testing") @@ -406,6 +411,7 @@ func Test_TAS5_HostLevelConstraint(t *testing.T) { // 3. Verify KAI PodGroup has zone constraint at top level // 4. Verify 1 SubGroup (standalone PCLQ) with NO additional constraint func Test_TAS6_StandalonePCLQOnlyPCSZoneConstraint(t *testing.T) { + RequireCapability(t, TopologyAwareScheduling) ctx := context.Background() Logger.Info("1. Initialize a 28-node Grove cluster for topology testing") @@ -454,6 +460,7 @@ func Test_TAS6_StandalonePCLQOnlyPCSZoneConstraint(t *testing.T) { // 2. Verify all 4 pods scheduled (gang scheduling works) // 3. 
Verify KAI PodGroup has 4 SubGroups with NO topology constraints func Test_TAS7_NoTopologyConstraint(t *testing.T) { + RequireCapability(t, TopologyAwareScheduling) ctx := context.Background() Logger.Info("1. Initialize a 28-node Grove cluster for topology testing") @@ -506,6 +513,7 @@ func Test_TAS7_NoTopologyConstraint(t *testing.T) { // 5. Verify all pods in same block (PCS constraint) // 6. Verify KAI PodGroup hierarchy with correct topology constraints func Test_TAS8_FullHierarchyWithCascadingConstraints(t *testing.T) { + RequireCapability(t, TopologyAwareScheduling) ctx := context.Background() Logger.Info("1. Initialize an 8-node Grove cluster for topology testing") @@ -583,6 +591,7 @@ func Test_TAS8_FullHierarchyWithCascadingConstraints(t *testing.T) { // 3. Verify pods on same host (PCLQ constraint - strictest) // 4. Verify KAI PodGroup has block constraint at top level, host constraint at PCLQ level func Test_TAS9_PCSPlusPCLQConstraint(t *testing.T) { + RequireCapability(t, TopologyAwareScheduling) ctx := context.Background() Logger.Info("1. Initialize a 28-node Grove cluster for topology testing") @@ -633,6 +642,7 @@ func Test_TAS9_PCSPlusPCLQConstraint(t *testing.T) { // 5. Verify base PodGang KAI PodGroup topology constraints // 6. Verify scaled PodGangs' KAI PodGroups (replicas 1-2) func Test_TAS10_PCSGScalingWithTopologyConstraints(t *testing.T) { + RequireCapability(t, TopologyAwareScheduling) ctx := context.Background() Logger.Info("1. Initialize a 28-node Grove cluster for topology testing") @@ -713,6 +723,7 @@ func Test_TAS10_PCSGScalingWithTopologyConstraints(t *testing.T) { // 3. Verify each PCSG replica's pods on same host // 4. Verify KAI PodGroup has PCSG rack + PCLQ host constraints, NO top-level PCS constraint func Test_TAS11_PCSGPlusPCLQNoParentConstraint(t *testing.T) { + RequireCapability(t, TopologyAwareScheduling) ctx := context.Background() Logger.Info("1. 
Initialize a 28-node Grove cluster for topology testing") @@ -772,6 +783,7 @@ func Test_TAS11_PCSGPlusPCLQNoParentConstraint(t *testing.T) { // 5. Verify base PodGang KAI PodGroup contains minAvailable=3 replicas // 6. Verify 7 scaled PodGangs' KAI PodGroups (replicas 3-9) func Test_TAS12_LargeScalingRatio(t *testing.T) { + RequireCapability(t, TopologyAwareScheduling) ctx := context.Background() Logger.Info("1. Initialize a 28-node Grove cluster for topology testing") @@ -866,6 +878,7 @@ func Test_TAS12_LargeScalingRatio(t *testing.T) { // 4. Verify pod events show Unschedulable reason // 5. Verify KAI PodGroup exists with correct constraints even though pods are pending func Test_TAS13_InsufficientNodesForConstraint(t *testing.T) { + RequireCapability(t, TopologyAwareScheduling) ctx := context.Background() Logger.Info("1. Initialize a 28-node Grove cluster for topology testing") @@ -925,6 +938,7 @@ func Test_TAS13_InsufficientNodesForConstraint(t *testing.T) { // 3. Verify each PCS replica's pods in same rack // 4. Verify KAI PodGroups for both PCS replicas have correct topology constraints func Test_TAS14_MultiReplicaWithRackConstraint(t *testing.T) { + RequireCapability(t, TopologyAwareScheduling) ctx := context.Background() Logger.Info("1. Initialize a 28-node Grove cluster for topology testing") @@ -983,6 +997,7 @@ func Test_TAS14_MultiReplicaWithRackConstraint(t *testing.T) { // 6. Verify base PodGang KAI PodGroup topology for complex multi-PCSG workload // 7. Verify scaled PodGangs' KAI PodGroups (decoder replica 1, prefill replica 1) func Test_TAS15_DisaggregatedInferenceMultiplePCSGs(t *testing.T) { + RequireCapability(t, TopologyAwareScheduling) ctx := context.Background() Logger.Info("1. Initialize a 28-node Grove cluster for topology testing") @@ -1097,6 +1112,7 @@ func Test_TAS15_DisaggregatedInferenceMultiplePCSGs(t *testing.T) { // 4. Verify block constraint at PCS level, rack at PCSG, for both PCS replicas // 5. 
Similar to TAS15 but scaled across 2 PCS replicas func Test_TAS16_MultiReplicaPCSWithThreeLevelHierarchy(t *testing.T) { + RequireCapability(t, TopologyAwareScheduling) ctx := context.Background() Logger.Info("1. Initialize a 28-node Grove cluster for multi-replica PCS testing") @@ -1198,6 +1214,7 @@ func Test_TAS16_MultiReplicaPCSWithThreeLevelHierarchy(t *testing.T) { // 5. Verify KAI Topology CRs auto-created with correct keys // 6. Deploy H100 and GB200 workloads, verify pods packed at block level on correct node segments func Test_TAS17_HeterogeneousGPUCluster(t *testing.T) { + RequireCapability(t, TopologyAwareScheduling) ctx := context.Background() Logger.Info("1. Initialize a 28-node Grove cluster for heterogeneous GPU testing") @@ -1374,6 +1391,7 @@ func Test_TAS17_HeterogeneousGPUCluster(t *testing.T) { // 2. Verify SchedulerTopologyDrift condition becomes True/Drift // 3. Verify SchedulerTopologyStatuses shows InSync=false func Test_TAS18_ClusterTopologyDriftDetection(t *testing.T) { + RequireCapability(t, TopologyAwareScheduling) const ctName = "drift-detect-topo" const kaiTopoRef = "non-existent-kai-topo" ctx := context.Background() @@ -1434,6 +1452,7 @@ func Test_TAS18_ClusterTopologyDriftDetection(t *testing.T) { // 5. Verify KAI Topology recreated with 3 keys // 6. Verify SchedulerTopologyDrift remains False/InSync func Test_TAS19_AutoManagedCTLifecycle(t *testing.T) { + RequireCapability(t, TopologyAwareScheduling) const ctName = "lifecycle-topo" ctx := context.Background() @@ -1514,6 +1533,7 @@ func Test_TAS19_AutoManagedCTLifecycle(t *testing.T) { // 9. Re-create the ClusterTopology // 10. Verify TopologyLevelsUnavailable = False/AllClusterTopologyLevelsAvailable func Test_TAS20_PCSTopologyLevelsUnavailableCondition(t *testing.T) { + RequireCapability(t, TopologyAwareScheduling) ctx := context.Background() Logger.Info("1. 
Initialize a 2-node Grove cluster for PCS condition testing") @@ -1639,6 +1659,7 @@ func Test_TAS20_PCSTopologyLevelsUnavailableCondition(t *testing.T) { // Test_TAS21_ClusterTopologyValidationWebhook verifies that the ClusterTopology validating webhook // rejects invalid topology definitions and invalid schedulerTopologyReferences. func Test_TAS21_ClusterTopologyValidationWebhook(t *testing.T) { + RequireCapability(t, TopologyAwareScheduling) ctx := context.Background() Logger.Info("1. Initialize a Grove cluster for ClusterTopology webhook validation testing") diff --git a/operator/e2e/yaml/tas-hierarchy.yaml b/operator/e2e/yaml/tas-hierarchy.yaml index 9c6c0572a..2794efd1e 100644 --- a/operator/e2e/yaml/tas-hierarchy.yaml +++ b/operator/e2e/yaml/tas-hierarchy.yaml @@ -36,7 +36,6 @@ spec: minAvailable: 2 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -70,7 +69,6 @@ spec: minAvailable: 2 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: diff --git a/operator/e2e/yaml/tas-host-level.yaml b/operator/e2e/yaml/tas-host-level.yaml index b13bda54d..44bb968ae 100644 --- a/operator/e2e/yaml/tas-host-level.yaml +++ b/operator/e2e/yaml/tas-host-level.yaml @@ -23,7 +23,6 @@ spec: minAvailable: 2 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: diff --git a/operator/e2e/yaml/tas-indep-clq.yaml b/operator/e2e/yaml/tas-indep-clq.yaml index 614c867bb..b11b0e9f6 100644 --- a/operator/e2e/yaml/tas-indep-clq.yaml +++ b/operator/e2e/yaml/tas-indep-clq.yaml @@ -24,7 +24,6 @@ spec: minAvailable: 3 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -67,7 +66,6 @@ spec: minAvailable: 4 podSpec: 
terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: diff --git a/operator/e2e/yaml/tas-insuffic.yaml b/operator/e2e/yaml/tas-insuffic.yaml index f77278abc..cbc2cc0a9 100644 --- a/operator/e2e/yaml/tas-insuffic.yaml +++ b/operator/e2e/yaml/tas-insuffic.yaml @@ -23,7 +23,6 @@ spec: minAvailable: 10 # All-or-nothing gang scheduling podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: diff --git a/operator/e2e/yaml/tas-large-scale.yaml b/operator/e2e/yaml/tas-large-scale.yaml index 034d17769..5253f297a 100644 --- a/operator/e2e/yaml/tas-large-scale.yaml +++ b/operator/e2e/yaml/tas-large-scale.yaml @@ -33,7 +33,6 @@ spec: minAvailable: 2 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: diff --git a/operator/e2e/yaml/tas-multirep.yaml b/operator/e2e/yaml/tas-multirep.yaml index 71218e47e..aa2ed865f 100644 --- a/operator/e2e/yaml/tas-multirep.yaml +++ b/operator/e2e/yaml/tas-multirep.yaml @@ -23,7 +23,6 @@ spec: minAvailable: 2 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: diff --git a/operator/e2e/yaml/tas-no-constraint.yaml b/operator/e2e/yaml/tas-no-constraint.yaml index 111937d41..22bfa6623 100644 --- a/operator/e2e/yaml/tas-no-constraint.yaml +++ b/operator/e2e/yaml/tas-no-constraint.yaml @@ -26,7 +26,6 @@ spec: minAvailable: 2 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: diff --git a/operator/e2e/yaml/tas-pcs-multi-pcsg-multi-replica.yaml b/operator/e2e/yaml/tas-pcs-multi-pcsg-multi-replica.yaml index 24f7b69e7..e29e8aa91 100644 --- a/operator/e2e/yaml/tas-pcs-multi-pcsg-multi-replica.yaml +++ 
b/operator/e2e/yaml/tas-pcs-multi-pcsg-multi-replica.yaml @@ -42,7 +42,6 @@ spec: minAvailable: 1 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -73,7 +72,6 @@ spec: minAvailable: 1 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -107,7 +105,6 @@ spec: minAvailable: 1 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -138,7 +135,6 @@ spec: minAvailable: 1 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -169,7 +165,6 @@ spec: minAvailable: 2 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: diff --git a/operator/e2e/yaml/tas-pcs-multi-pcsg.yaml b/operator/e2e/yaml/tas-pcs-multi-pcsg.yaml index 115caf612..69bec7fe5 100644 --- a/operator/e2e/yaml/tas-pcs-multi-pcsg.yaml +++ b/operator/e2e/yaml/tas-pcs-multi-pcsg.yaml @@ -42,7 +42,6 @@ spec: minAvailable: 1 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -82,7 +81,6 @@ spec: minAvailable: 1 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -122,7 +120,6 @@ spec: minAvailable: 1 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -162,7 +159,6 @@ spec: minAvailable: 1 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -202,7 +198,6 @@ spec: 
minAvailable: 2 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: diff --git a/operator/e2e/yaml/tas-pcs-pclq.yaml b/operator/e2e/yaml/tas-pcs-pclq.yaml index ca90817fe..1034d0539 100644 --- a/operator/e2e/yaml/tas-pcs-pclq.yaml +++ b/operator/e2e/yaml/tas-pcs-pclq.yaml @@ -26,7 +26,6 @@ spec: minAvailable: 2 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: diff --git a/operator/e2e/yaml/tas-pcsg-pclq.yaml b/operator/e2e/yaml/tas-pcsg-pclq.yaml index c69cac669..97be1aa97 100644 --- a/operator/e2e/yaml/tas-pcsg-pclq.yaml +++ b/operator/e2e/yaml/tas-pcsg-pclq.yaml @@ -32,7 +32,6 @@ spec: minAvailable: 2 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: diff --git a/operator/e2e/yaml/tas-pcsg-scale.yaml b/operator/e2e/yaml/tas-pcsg-scale.yaml index ffcc06e65..6cc999249 100644 --- a/operator/e2e/yaml/tas-pcsg-scale.yaml +++ b/operator/e2e/yaml/tas-pcsg-scale.yaml @@ -32,7 +32,6 @@ spec: minAvailable: 2 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: diff --git a/operator/e2e/yaml/tas-sl-pcs-only.yaml b/operator/e2e/yaml/tas-sl-pcs-only.yaml index 3b71e2849..fac96f735 100644 --- a/operator/e2e/yaml/tas-sl-pcs-only.yaml +++ b/operator/e2e/yaml/tas-sl-pcs-only.yaml @@ -29,7 +29,6 @@ spec: minAvailable: 1 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -69,7 +68,6 @@ spec: minAvailable: 2 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: diff --git a/operator/e2e/yaml/tas-sl-pcsg-only.yaml 
b/operator/e2e/yaml/tas-sl-pcsg-only.yaml index 598bdff55..336a587ee 100644 --- a/operator/e2e/yaml/tas-sl-pcsg-only.yaml +++ b/operator/e2e/yaml/tas-sl-pcsg-only.yaml @@ -29,7 +29,6 @@ spec: minAvailable: 1 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -69,7 +68,6 @@ spec: minAvailable: 2 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: diff --git a/operator/e2e/yaml/tas-standalone-pclq-only-pcs-zone.yaml b/operator/e2e/yaml/tas-standalone-pclq-only-pcs-zone.yaml index 77a1887ca..d5231b034 100644 --- a/operator/e2e/yaml/tas-standalone-pclq-only-pcs-zone.yaml +++ b/operator/e2e/yaml/tas-standalone-pclq-only-pcs-zone.yaml @@ -23,7 +23,6 @@ spec: minAvailable: 4 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: diff --git a/operator/e2e/yaml/workload-ondelete.yaml b/operator/e2e/yaml/workload-ondelete.yaml index b8a94162d..de77b56f3 100644 --- a/operator/e2e/yaml/workload-ondelete.yaml +++ b/operator/e2e/yaml/workload-ondelete.yaml @@ -21,7 +21,6 @@ spec: minAvailable: 1 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -52,7 +51,6 @@ spec: minAvailable: 1 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -83,7 +81,6 @@ spec: minAvailable: 3 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: diff --git a/operator/e2e/yaml/workload1.yaml b/operator/e2e/yaml/workload1.yaml index cfd254429..b81de1414 100644 --- a/operator/e2e/yaml/workload1.yaml +++ b/operator/e2e/yaml/workload1.yaml @@ -19,7 +19,6 
@@ spec: minAvailable: 2 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -50,7 +49,6 @@ spec: minAvailable: 1 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -81,7 +79,6 @@ spec: minAvailable: 3 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: diff --git a/operator/e2e/yaml/workload2.yaml b/operator/e2e/yaml/workload2.yaml index 0d3a6b22a..01b15c5dd 100644 --- a/operator/e2e/yaml/workload2.yaml +++ b/operator/e2e/yaml/workload2.yaml @@ -20,7 +20,6 @@ spec: minAvailable: 1 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -51,7 +50,6 @@ spec: minAvailable: 1 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -82,7 +80,6 @@ spec: minAvailable: 1 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: diff --git a/operator/e2e/yaml/workload3.yaml b/operator/e2e/yaml/workload3.yaml index ccc7b8fae..d7c7bef8d 100644 --- a/operator/e2e/yaml/workload3.yaml +++ b/operator/e2e/yaml/workload3.yaml @@ -20,7 +20,6 @@ spec: minAvailable: 2 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -51,7 +50,6 @@ spec: minAvailable: 1 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -82,7 +80,6 @@ spec: minAvailable: 3 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: 
requiredDuringSchedulingIgnoredDuringExecution: diff --git a/operator/e2e/yaml/workload4.yaml b/operator/e2e/yaml/workload4.yaml index 9e63f3e33..80bdcaac5 100644 --- a/operator/e2e/yaml/workload4.yaml +++ b/operator/e2e/yaml/workload4.yaml @@ -20,7 +20,6 @@ spec: minAvailable: 1 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -51,7 +50,6 @@ spec: minAvailable: 1 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -82,7 +80,6 @@ spec: minAvailable: 1 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: diff --git a/operator/e2e/yaml/workload5.yaml b/operator/e2e/yaml/workload5.yaml index 5c760a088..1c693229f 100644 --- a/operator/e2e/yaml/workload5.yaml +++ b/operator/e2e/yaml/workload5.yaml @@ -20,7 +20,6 @@ spec: minAvailable: 2 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -53,7 +52,6 @@ spec: - pc-c podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -86,7 +84,6 @@ spec: - pc-a podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: diff --git a/operator/e2e/yaml/workload6.yaml b/operator/e2e/yaml/workload6.yaml index 7c11982cb..4d7570e95 100644 --- a/operator/e2e/yaml/workload6.yaml +++ b/operator/e2e/yaml/workload6.yaml @@ -20,7 +20,6 @@ spec: minAvailable: 1 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -53,7 +52,6 @@ spec: - pc-a podSpec: terminationGracePeriodSeconds: 5 - schedulerName: 
kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -86,7 +84,6 @@ spec: - pc-b podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: diff --git a/operator/hack/README.md b/operator/hack/README.md index 8417510d1..fa58ed7a5 100644 --- a/operator/hack/README.md +++ b/operator/hack/README.md @@ -89,7 +89,7 @@ All configuration can be overridden via `E2E_*` environment variables (used by ` **Components (ComponentConfig):** - `E2E_KAI_VERSION` - Kai Scheduler version (default: from `dependencies.yaml`) -- `E2E_SKAFFOLD_PROFILE` - Skaffold profile for Grove (default: `topology-test`) +- `E2E_SKAFFOLD_PROFILE` - Skaffold profile for Grove (default: `e2e-kai`) - `E2E_GROVE_NAMESPACE` - Grove operator namespace (default: `grove-system`) - `E2E_REGISTRY` - Container registry override (default: none) diff --git a/operator/hack/e2e-autoMNNVL/README.md b/operator/hack/e2e-autoMNNVL/README.md index 3670eea74..4a1d4de6b 100644 --- a/operator/hack/e2e-autoMNNVL/README.md +++ b/operator/hack/e2e-autoMNNVL/README.md @@ -93,5 +93,5 @@ make e2e-cluster-down - **Cluster name:** `shared-e2e-test-cluster` (same as standard e2e) - **Nodes:** 1 server + 2 agents (lightweight — standard e2e uses 30) - **Registry:** local registry on port 5001 -- **Skaffold profile:** `topology-test` (same as standard e2e; Kai and topology are installed, only worker count and prepull are reduced) +- **Skaffold profile:** `e2e-kai` (same as standard e2e; Kai and topology are installed, only worker count and prepull are reduced) - **Fake GPU:** [fake-gpu-operator](https://github.com/run-ai/fake-gpu-operator) v0.0.72 (provides ComputeDomain CRD) diff --git a/operator/hack/e2e-cluster/create-e2e-cluster.py b/operator/hack/e2e-cluster/create-e2e-cluster.py index 54af806dd..7f710f98b 100755 --- a/operator/hack/e2e-cluster/create-e2e-cluster.py +++ 
b/operator/hack/e2e-cluster/create-e2e-cluster.py @@ -129,7 +129,7 @@ class ClusterConfig(BaseSettings): worker_memory: Optional[str] = Field(default=DEFAULT_WORKER_MEMORY, pattern=r"^\d+[mMgG]?$") k3s_image: str = "rancher/k3s:v1.34.2-k3s1" kai_version: str = Field(default=DEPENDENCIES['kai_scheduler']['version'], pattern=r"^v[\d.]+(-[\w.]+)?$") - skaffold_profile: str = "topology-test" + skaffold_profile: str = "e2e-kai" max_retries: int = Field(default=3, ge=1, le=10) # Constants (not configurable via environment variables) diff --git a/operator/hack/e2e-default-scheduler.yaml b/operator/hack/e2e-default-scheduler.yaml new file mode 100644 index 000000000..0214b5566 --- /dev/null +++ b/operator/hack/e2e-default-scheduler.yaml @@ -0,0 +1,14 @@ +# E2E preset overlay for the in-tree default-scheduler backend. +# Layers on top of e2e.yaml; cluster shape and KWOK config are inherited so +# kai vs default-scheduler comparisons stay fair. +# +# Activated via: infra-manager.py setup -f hack/e2e-default-scheduler.yaml +# (threaded through E2E_CREATE_FLAGS in the CI matrix; see build-check-test.yaml). 
+ +scheduler: + kai: + enabled: false + +grove: + local: + skaffold_profile: e2e-default-scheduler diff --git a/operator/hack/e2e.yaml b/operator/hack/e2e.yaml index e09cedd40..a50290b9b 100644 --- a/operator/hack/e2e.yaml +++ b/operator/hack/e2e.yaml @@ -17,3 +17,5 @@ scheduler: grove: enabled: true profiling: false + local: + skaffold_profile: e2e-kai diff --git a/operator/hack/infra_manager/constants.py b/operator/hack/infra_manager/constants.py index 10a29d2b0..14b3be285 100644 --- a/operator/hack/infra_manager/constants.py +++ b/operator/hack/infra_manager/constants.py @@ -186,7 +186,7 @@ def parse_memory_mb(mem_str: str) -> int: DEFAULT_CLUSTER_CREATE_MAX_RETRIES = 3 # -- Component defaults -- -DEFAULT_SKAFFOLD_PROFILE = "topology-test" +DEFAULT_SKAFFOLD_PROFILE = "e2e-kai" DEFAULT_GROVE_NAMESPACE = "grove-system" # -- KWOK defaults -- diff --git a/operator/hack/infra_manager/orchestrator.py b/operator/hack/infra_manager/orchestrator.py index 198fde710..ae7454109 100644 --- a/operator/hack/infra_manager/orchestrator.py +++ b/operator/hack/infra_manager/orchestrator.py @@ -126,16 +126,21 @@ def _run_task(name: str, fn: Callable) -> None: console.print(outputs[name], end="") -def _run_prepull(registry_port: int) -> None: +def _run_prepull(registry_port: int, kai_enabled: bool) -> None: """Pre-pull images to local registry in a single batch. Args: registry_port: Port for the local container registry. + kai_enabled: Whether the KAI scheduler is enabled. When false, the KAI + image group is skipped because no workload will reference it. 
""" groups: list[tuple[list[str], str]] = [ - (DEPENDENCIES["kai_scheduler"]["images"], DEPENDENCIES["kai_scheduler"]["version"]), (DEPENDENCIES["cert_manager"]["images"], DEPENDENCIES["cert_manager"]["version"]), ] + if kai_enabled: + groups.insert( + 0, (DEPENDENCIES["kai_scheduler"]["images"], DEPENDENCIES["kai_scheduler"]["version"]) + ) busybox_images = dep_value("test_images", "busybox") if busybox_images: groups.append((busybox_images, "latest")) @@ -211,7 +216,9 @@ def run_setup(cfg: SetupConfig) -> None: if cfg.cluster.create: parallel_tasks["topology"] = apply_topology_labels if do_prepull: - parallel_tasks["prepull"] = lambda: _run_prepull(cfg.cluster.registry_port) + parallel_tasks["prepull"] = lambda: _run_prepull( + cfg.cluster.registry_port, cfg.scheduler.kai.enabled + ) if cfg.scheduler.kai.enabled: parallel_tasks["kai"] = lambda: install_kai_scheduler(cfg.scheduler.kai) if cfg.grove.enabled: diff --git a/operator/skaffold.yaml b/operator/skaffold.yaml index 72e403cc9..8a099ac46 100644 --- a/operator/skaffold.yaml +++ b/operator/skaffold.yaml @@ -73,7 +73,7 @@ profiles: config: leaderElection: enabled: false - - name: topology-test + - name: e2e-kai patches: - op: add path: /deploy/helm/releases/0/setValues @@ -88,6 +88,17 @@ profiles: enabled: false topologyAwareScheduling: enabled: true + - name: e2e-default-scheduler + patches: + - op: add + path: /deploy/helm/releases/0/setValues + value: + replicaCount: 1 + config: + scheduler: + defaultProfileName: default-scheduler + leaderElection: + enabled: false - name: mnnvl-test patches: - op: add