ai-dynamo · brluobt · May 6, 2026 · May 9, 2026 · kangclzjc · May 18, 2026
@@ -93,6 +93,8 @@ jobs:
   # Matrix entries can set:
   #   test_name     (required) - name shown in the GitHub Actions UI
   #   test_pattern  (optional) - Go test -run pattern (standard e2e tests)
+  #   create_flags  (optional) - extra flags prepended to E2E_CREATE_FLAGS;
+  #                              empty string means "use the base e2e.yaml preset (KAI)"
   #   make_target   (optional) - Makefile target, defaults to run-e2e-full
   e2e:
     needs: [test, build, check, changes]
@@ -109,26 +111,49 @@ jobs:
       fail-fast: false
       matrix:
         include:
+          # --- kai-scheduler (primary backend: agnostic + sensitive + KAI-supported capabilities) ---
           - test_name: gang_scheduling
             test_pattern: "^Test_GS"
+            create_flags: ""
           - test_name: rolling_updates
             test_pattern: "^Test_RU"
+            create_flags: ""
           - test_name: ondelete_updates
             test_pattern: "^Test_OD"
+            create_flags: ""
           - test_name: startup_ordering
             test_pattern: "^Test_SO"
+            create_flags: ""
             make_target: "run-e2e-real-full"
           - test_name: Topology_Aware_Scheduling
             test_pattern: "^Test_TAS"
+            create_flags: ""
           - test_name: cert_management
             test_pattern: "^Test_CM"
+            create_flags: ""
           - test_name: auto_mnnvl
             test_pattern: "^Test_AutoMNNVL"
+            create_flags: ""
             make_target: "run-e2e-mnnvl-full"
           - test_name: crd_installer
             test_pattern: "^Test_CRD_Installer"
+            create_flags: ""
           - test_name: resource_sharing
             test_pattern: "^Test_RS"
+            create_flags: ""
+
+          # --- default-scheduler (sensitive tests only; agnostic skipped by policy,
+          #     capability-gated skipped by RequireCapability at runtime) ---
+          - test_name: rolling_updates_default-scheduler
+            test_pattern: "^Test_RU"
+            create_flags: "-f hack/e2e-default-scheduler.yaml"
+          - test_name: ondelete_updates_default-scheduler
+            test_pattern: "^Test_OD"
+            create_flags: "-f hack/e2e-default-scheduler.yaml"
+          - test_name: startup_ordering_default-scheduler
+            test_pattern: "^Test_SO"
+            create_flags: "-f hack/e2e-default-scheduler.yaml"
+            make_target: "run-e2e-real-full"
     name: E2E - ${{ matrix.test_name }}
     steps:
       # print runner specs so we have a record in case of failures
@@ -150,7 +175,7 @@ jobs:
 
       - name: Run e2e tests - ${{ matrix.test_name }}
         run: |
-          make ${{ matrix.make_target || 'run-e2e-full' }} TEST_PATTERN='${{ matrix.test_pattern }}' E2E_CREATE_FLAGS='--dind-memory-mode'
+          make ${{ matrix.make_target || 'run-e2e-full' }} TEST_PATTERN='${{ matrix.test_pattern }}' E2E_CREATE_FLAGS='${{ matrix.create_flags }} --dind-memory-mode'
         working-directory: operator
 
       # The test code handles cleanup via Teardown(), but this step provides
@@ -187,14 +212,21 @@ jobs:
       fail-fast: false
       matrix:
         include:
+          # Must mirror the e2e job's matrix.test_name list exactly so required
+          # branch-protection checks resolve the same set of names regardless
+          # of whether E2E ran or was skipped.
           - test_name: gang_scheduling
           - test_name: rolling_updates
+          - test_name: ondelete_updates
           - test_name: startup_ordering
           - test_name: Topology_Aware_Scheduling
           - test_name: cert_management
           - test_name: auto_mnnvl
           - test_name: crd_installer
           - test_name: resource_sharing
+          - test_name: rolling_updates_default-scheduler
+          - test_name: ondelete_updates_default-scheduler
+          - test_name: startup_ordering_default-scheduler
     name: E2E - ${{ matrix.test_name }}
     steps:
       - name: Skip E2E (no relevant changes)

@@ -0,0 +1,109 @@
+// /*
+// Copyright 2026 The Grove Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// */
+
+package tests
+
+import (
+	"sync"
+	"testing"
+)
+
+// Capability is a scheduler or operator feature the E2E suite may require
+// (e.g. gang scheduling, topology-aware scheduling, auto-MNNVL). Tests gate
+// themselves with RequireCapability and auto-skip when the run lacks it.
+type Capability string
+
+const (
+	// GangScheduling indicates the active backend treats a PodGang as an
+	// all-or-nothing scheduling unit.
+	GangScheduling Capability = "GangScheduling"
+
+	// TopologyAwareScheduling indicates the active backend implements the
+	// scheduler.TopologyAwareSchedBackend interface AND the operator has
+	// topologyAwareScheduling.enabled=true.
+	TopologyAwareScheduling Capability = "TopologyAwareScheduling"
+
+	// AutoMNNVL indicates the operator has network.autoMNNVLEnabled=true.
+	// Config-only — no backend coupling.
+	AutoMNNVL Capability = "AutoMNNVL"
+)
+
+// CapabilitySet is the resolved set of capabilities for a single E2E run.
+type CapabilitySet struct {
+	// ActiveBackend is the value of OperatorConfiguration.scheduler.defaultProfileName.
+	ActiveBackend string
+	// caps holds the capabilities resolved for this run; a missing key means "not provided".
+	caps map[Capability]bool
+}
+
+// Has reports whether c is in the set; the zero-value CapabilitySet has none.
+func (s CapabilitySet) Has(c Capability) bool {
+	return s.caps[c] // indexing a nil or empty map yields false
+}
+
+// backendInterfaceCapabilities is the hardcoded map of backend → capabilities
+// that depend on Go interface implementation in the operator. Entries here are
+// what E2E cannot deduce from a live OperatorConfiguration alone (the operator
+// uses Go type assertions; the test binary runs out-of-process and cannot).
+//
+// Capabilities derived purely from configuration flags (e.g. AutoMNNVL from
+// network.autoMNNVLEnabled) are NOT listed here — they are resolved directly
+// from OperatorConfiguration in DiscoverCapabilities.
+//
+// When adding a new backend, add a row here AND update the developer checklist
+// in the design proposal. The capabilities_test.go cross-check fails when a
+// TopologyAwareScheduling entry disagrees with the backend's Go interfaces.
+var backendInterfaceCapabilities = map[string]map[Capability]bool{
+	"kai-scheduler": {
+		GangScheduling:          true,
+		TopologyAwareScheduling: true,
+	},
+	"default-scheduler": {
+		// KubeSchedulerConfig.GangScheduling is forward-looking — the kube
+		// backend does not yet read or act on it. When it does, set
+		// GangScheduling: true here.
+	},
+}
+
+// currentCapabilities holds the resolved CapabilitySet for the running e2e
+// suite. DiscoverCapabilities (in capability_discovery.go, e2e build tag)
+// populates it once at TestMain time; RequireCapability reads it on every
+// gated test entry.
+var (
+	currentCapabilities    CapabilitySet // last set stored by DiscoverCapabilities
+	currentCapabilitiesSet bool          // true once a set has been stored
+	currentCapabilitiesMu  sync.RWMutex  // guards the two variables above
+)
+
+// RequireCapability skips t when the active backend does not provide cap.
+// Tests gated with RequireCapability are listed in the design proposal's
+// Test Classification table as "Capability-gated".
+//
+// It is a no-op if capabilities have not been discovered yet (e.g. when running
+// unit tests with go test ./... without an e2e cluster); the e2e build flow
+// guarantees discovery runs before any test that calls this.
+func RequireCapability(t *testing.T, cap Capability) { // NOTE(review): cap shadows the builtin cap
+	t.Helper()
+	currentCapabilitiesMu.RLock()
+	defer currentCapabilitiesMu.RUnlock() // still runs when t.Skipf exits via runtime.Goexit
+	if !currentCapabilitiesSet {
+		return // discovery has not run; do not gate the test
+	}
+	if !currentCapabilities.Has(cap) {
+		t.Skipf("skipping: active backend %q does not provide capability %q",
+			currentCapabilities.ActiveBackend, cap)
+	}
+}
@@ -0,0 +1,100 @@
+// /*
+// Copyright 2026 The Grove Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// */
+
+package tests
+
+import (
+	"testing"
+
+	configv1alpha1 "github.com/ai-dynamo/grove/operator/api/config/v1alpha1"
+	"github.com/ai-dynamo/grove/operator/internal/scheduler"
+	"github.com/ai-dynamo/grove/operator/internal/scheduler/kai"
+	"github.com/ai-dynamo/grove/operator/internal/scheduler/kube"
+
+	"k8s.io/apimachinery/pkg/runtime"
+	"k8s.io/client-go/tools/record"
+	"sigs.k8s.io/controller-runtime/pkg/client/fake"
+)
+
+// backendConstructors mirrors the switch in
+// operator/internal/scheduler/manager/manager.go newBackendForProfile.
+// Adding a new backend means adding a row here AND in
+// backendInterfaceCapabilities (in capabilities.go);
+// TestCapabilityTableCoversAllSupportedBackends fails if either row is missing.
+var backendConstructors = map[configv1alpha1.SchedulerName]func() scheduler.Backend{
+	configv1alpha1.SchedulerNameKai: func() scheduler.Backend {
+		return kai.New(
+			fake.NewClientBuilder().Build(),
+			runtime.NewScheme(),
+			record.NewFakeRecorder(1),
+			configv1alpha1.SchedulerProfile{Name: configv1alpha1.SchedulerNameKai},
+		)
+	},
+	configv1alpha1.SchedulerNameKube: func() scheduler.Backend {
+		return kube.New(
+			fake.NewClientBuilder().Build(),
+			runtime.NewScheme(),
+			record.NewFakeRecorder(1),
+			configv1alpha1.SchedulerProfile{Name: configv1alpha1.SchedulerNameKube},
+		)
+	},
+}
+
+// TestCapabilityTableCoversAllSupportedBackends ensures the hardcoded
+// capability table and backendConstructors each have a row for every backend
+// in SupportedSchedulerNames. Catches the failure mode where a contributor
+// adds a backend to SupportedSchedulerNames + manager.newBackendForProfile but
+// forgets these tables — without this, the new backend's capability-gated
+// tests would silently skip rather than fail.
+func TestCapabilityTableCoversAllSupportedBackends(t *testing.T) {
+	for _, name := range configv1alpha1.SupportedSchedulerNames {
+		if _, ok := backendInterfaceCapabilities[string(name)]; !ok {
+			t.Errorf("backend %q is in SupportedSchedulerNames but missing from "+
+				"backendInterfaceCapabilities; add a row to "+
+				"operator/e2e/tests/capabilities.go", name)
+		}
+		if _, ok := backendConstructors[name]; !ok {
+			t.Errorf("backend %q is in SupportedSchedulerNames but missing from "+
+				"backendConstructors; add a row to "+
+				"operator/e2e/tests/capabilities_test.go", name)
+		}
+	}
+}
+
+// TestCapabilityTableMatchesBackends cross-checks the hardcoded capability
+// table against actual Go interface implementation for each backend. Only
+// TopologyAwareScheduling is asserted here. Catches the failure mode where a
+// backend's interface set changes (e.g. KAI drops TopologyAwareSchedBackend)
+// but the table is not updated — without this, the E2E suite would either skip
+// valid TAS tests or run them against a backend that no longer supports TAS.
+func TestCapabilityTableMatchesBackends(t *testing.T) {
+	for name, ctor := range backendConstructors {
+		t.Run(string(name), func(t *testing.T) {
+			b := ctor()
+			table := backendInterfaceCapabilities[string(name)] // nil for a missing row, so lookups read false
+
+			// TopologyAwareScheduling: tied to the Go interface assertion
+			// the operator itself uses (clustertopology.go L46–54).
+			_, gotTAS := b.(scheduler.TopologyAwareSchedBackend)
+			wantTAS := table[TopologyAwareScheduling]
+			if gotTAS != wantTAS {
+				t.Errorf("backend %q: TopologyAwareScheduling table=%v but "+
+					"interface assertion=%v; update either the backend "+
+					"or backendInterfaceCapabilities", name, wantTAS, gotTAS)
+			}
+		})
+	}
+}
@@ -0,0 +1,75 @@
+//go:build e2e
+
+// /*
+// Copyright 2026 The Grove Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// */
+
+package tests
+
+import (
+	"context"
+	"fmt"
+
+	"github.com/ai-dynamo/grove/operator/e2e/grove/config"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+)
+
+// DiscoverCapabilities resolves the active backend and its capabilities from
+// the live OperatorConfiguration plus the hardcoded interface table, stores
+// the result in the package-level currentCapabilities so RequireCapability
+// can read it, and returns the resolved set so callers may log/inspect it.
+// On error nothing is stored and the zero CapabilitySet is returned.
+// Called once from TestMain before any test runs.
+func DiscoverCapabilities(ctx context.Context, crClient client.Client) (CapabilitySet, error) {
+	md, err := config.NewOperatorConfig(crClient).ReadGroveMetadata(ctx)
+	if err != nil {
+		return CapabilitySet{}, fmt.Errorf("read OperatorConfiguration: %w", err)
+	}
+
+	backend := md.Config.Scheduler.DefaultProfileName
+	table, ok := backendInterfaceCapabilities[backend]
+	if !ok {
+		return CapabilitySet{}, fmt.Errorf(
+			"active backend %q has no entry in backendInterfaceCapabilities; "+
+				"please update operator/e2e/tests/capabilities.go", backend)
+	}
+
+	set := CapabilitySet{
+		ActiveBackend: backend,
+		caps:          map[Capability]bool{},
+	}
+
+	// Backend-coupled capability: present iff the backend's table row lists it.
+	if table[GangScheduling] {
+		set.caps[GangScheduling] = true
+	}
+
+	// Backend-coupled capability gated by an additional config flag.
+	if md.Config.TopologyAwareScheduling.Enabled && table[TopologyAwareScheduling] {
+		set.caps[TopologyAwareScheduling] = true
+	}
+
+	// Config-only capability: no interface-table lookup.
+	if md.Config.Network.AutoMNNVLEnabled {
+		set.caps[AutoMNNVL] = true
+	}
+
+	currentCapabilitiesMu.Lock()
+	currentCapabilities = set
+	currentCapabilitiesSet = true
+	currentCapabilitiesMu.Unlock()
+
+	return set, nil
+}