diff --git a/.github/workflows/build-check-test.yaml b/.github/workflows/build-check-test.yaml
index 5d2723ff6..66928bca4 100644
--- a/.github/workflows/build-check-test.yaml
+++ b/.github/workflows/build-check-test.yaml
@@ -32,21 +32,59 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
-  # Detect which paths have changed to conditionally run E2E tests
-  changes:
+  # E2E matrix selector. Computes which (suite × backend) rows actually run
+  # and which fall to the e2e-skip mirror, based on changed paths, PR labels,
+  # and draft state. See hack/e2e-select/main.py + testdata/ for the rules
+  # and golden samples.
+  e2e-select:
     runs-on: ubuntu-latest
     if: github.event_name == 'pull_request'
     outputs:
-      e2e-relevant: ${{ steps.filter.outputs.e2e-relevant }}
+      run: ${{ steps.select.outputs.run }}
+      skip: ${{ steps.select.outputs.skip }}
+      has_run: ${{ steps.select.outputs.has_run }}
+      has_skip: ${{ steps.select.outputs.has_skip }}
+      reason: ${{ steps.select.outputs.reason }}
     steps:
       - uses: actions/checkout@v4
-      - uses: dorny/paths-filter@v3
-        id: filter
         with:
-          filters: |
-            e2e-relevant:
-              - 'operator/**'
-              - '.github/**'
+          fetch-depth: 0
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+      - name: Compute changed files
+        run: |
+          git diff --name-only \
+            "${{ github.event.pull_request.base.sha }}...${{ github.event.pull_request.head.sha }}" \
+            > /tmp/changed-files.txt  # three-dot range: diff from the merge base, so only this PR's own changes are listed
+          echo "Changed files in this PR:"
+          cat /tmp/changed-files.txt
+      - name: Run selector
+        id: select
+        env:
+          LABELS: ${{ join(github.event.pull_request.labels.*.name, ',') }}  # passed via env, never inlined: label text must not be parsed as shell
+        run: |
+          DRAFT_FLAG=""
+          if [ "${{ github.event.pull_request.draft }}" = "true" ]; then
+            DRAFT_FLAG="--draft"
+          fi
+          # Print full result to logs for transparency.
+ echo "=== Selector result ===" + python3 hack/e2e-select/main.py \ + --mode pr \ + --changed-files /tmp/changed-files.txt \ + --labels "$LABELS" \ + $DRAFT_FLAG | tee /tmp/select.json + echo "=======================" + + # Slice into individual job outputs (one Python invocation, jq slices). + { + echo "run=$(jq -c .run /tmp/select.json)" + echo "skip=$(jq -c .skip /tmp/select.json)" + echo "has_run=$(jq -r .has_run /tmp/select.json)" + echo "has_skip=$(jq -r .has_skip /tmp/select.json)" + echo "reason=$(jq -r .reason /tmp/select.json)" + } >> "$GITHUB_OUTPUT" test: runs-on: ubuntu-latest @@ -87,48 +125,30 @@ jobs: - name: check run: make check - # E2E tests - only run after build, check, and test jobs succeed - # Only triggered by changes to operator or .github folders + # E2E tests — matrix comes from the selector job. + # + # Matrix entries (defined in hack/e2e-select/main.py ALL_ROWS): + # test_name - name shown in the GitHub Actions UI + # test_pattern - Go test -run regex + # backend - kai-scheduler | default-scheduler | ... + # create_flags - extra flags appended to E2E_CREATE_FLAGS + # (empty string means "use the base e2e.yaml preset, KAI") + # make_target - Makefile target (run-e2e-full | run-e2e-real-full | run-e2e-mnnvl-full) # - # Matrix entries can set: - # test_name (required) - name shown in the GitHub Actions UI - # test_pattern (optional) - Go test -run pattern (standard e2e tests) - # make_target (optional) - Makefile target, defaults to run-e2e-full + # The selector handles draft policy: a draft PR without the 'run-e2e' label + # gets has_run=false (all rows fall to e2e-skip). The 'run-e2e' label forces + # the full matrix (safety escape for reviewers). 
e2e: - needs: [test, build, check, changes] - # Run on non-draft PRs (or draft PRs with 'run-e2e' label) - # AND only when operator or .github files are changed + needs: [test, build, check, e2e-select] if: | github.event_name == 'pull_request' && - needs.changes.outputs.e2e-relevant == 'true' && - (github.event.pull_request.draft == false || contains(github.event.pull_request.labels.*.name, 'run-e2e')) + needs.e2e-select.outputs.has_run == 'true' # use NVIDIA self-hosted runner setting is on Velonix repository runs-on: prod-grove-e2e-v1 timeout-minutes: 60 strategy: fail-fast: false - matrix: - include: - - test_name: gang_scheduling - test_pattern: "^Test_GS" - - test_name: rolling_updates - test_pattern: "^Test_RU" - - test_name: ondelete_updates - test_pattern: "^Test_OD" - - test_name: startup_ordering - test_pattern: "^Test_SO" - make_target: "run-e2e-real-full" - - test_name: Topology_Aware_Scheduling - test_pattern: "^Test_TAS" - - test_name: cert_management - test_pattern: "^Test_CM" - - test_name: auto_mnnvl - test_pattern: "^Test_AutoMNNVL" - make_target: "run-e2e-mnnvl-full" - - test_name: crd_installer - test_pattern: "^Test_CRD_Installer" - - test_name: resource_sharing - test_pattern: "^Test_RS" + matrix: ${{ fromJSON(needs.e2e-select.outputs.run) }} name: E2E - ${{ matrix.test_name }} steps: # print runner specs so we have a record in case of failures @@ -136,6 +156,7 @@ jobs: run: | echo "CPUs: $(nproc)" echo "RAM: $(free -h | awk '/^Mem:/ {print $2}')" + echo "Selection reason: ${{ needs.e2e-select.outputs.reason }}" - name: Checkout code uses: actions/checkout@v4 @@ -150,7 +171,7 @@ jobs: - name: Run e2e tests - ${{ matrix.test_name }} run: | - make ${{ matrix.make_target || 'run-e2e-full' }} TEST_PATTERN='${{ matrix.test_pattern }}' E2E_CREATE_FLAGS='--dind-memory-mode' + make ${{ matrix.make_target }} TEST_PATTERN='${{ matrix.test_pattern }}' E2E_CREATE_FLAGS='${{ matrix.create_flags }} --dind-memory-mode' working-directory: operator # The test 
code handles cleanup via Teardown(), but this step provides @@ -173,29 +194,22 @@ jobs: if-no-files-found: warn retention-days: 7 - # This job runs with the same matrix as 'e2e' when E2E tests are skipped (no relevant - # file changes and no 'run-e2e' label). It reports a passing status so that required - # branch protection checks are satisfied even for documentation-only PRs. + # Mirror that emits synthetic passes for matrix rows the selector excluded + # (path-filtered out, or all-rows when the PR is in draft state without the + # 'run-e2e' label). This keeps the required branch-protection check names + # (E2E - <test_name>) resolvable even when nothing real runs. e2e-skip: - needs: [changes] + needs: [e2e-select] if: | github.event_name == 'pull_request' && - needs.changes.outputs.e2e-relevant != 'true' && - !contains(github.event.pull_request.labels.*.name, 'run-e2e') + needs.e2e-select.outputs.has_skip == 'true' runs-on: ubuntu-latest strategy: fail-fast: false - matrix: - include: - - test_name: gang_scheduling - - test_name: rolling_updates - - test_name: startup_ordering - - test_name: Topology_Aware_Scheduling - - test_name: cert_management - - test_name: auto_mnnvl - - test_name: crd_installer - - test_name: resource_sharing + matrix: ${{ fromJSON(needs.e2e-select.outputs.skip) }} name: E2E - ${{ matrix.test_name }} steps: - - name: Skip E2E (no relevant changes) - run: echo "E2E skipped — no changes to operator/ or .github/ and 'run-e2e' label not set" + - name: Skip E2E + run: | + echo "Skipped: ${{ matrix.test_name }} on ${{ matrix.backend }}" + echo "Selection reason: ${{ needs.e2e-select.outputs.reason }}" diff --git a/.github/workflows/e2e-nightly.yaml b/.github/workflows/e2e-nightly.yaml new file mode 100644 index 000000000..636022d8e --- /dev/null +++ b/.github/workflows/e2e-nightly.yaml @@ -0,0 +1,130 @@ +# /* +# Copyright 2026 The Grove Authors.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# */ + +# Nightly E2E — runs the full (suite × capable backend) matrix on schedule. +# +# Rationale: the PR matrix (build-check-test.yaml) is path-filtered to keep +# per-PR cost bounded as more backends land. That means cross-backend +# regressions in code paths neither PR touched can slip in. This workflow +# catches them by running the exhaustive matrix once a day. +# +# Failure routing: this initial cut uploads per-row diagnostic artifacts and +# writes a job-summary report. Issue auto-open / Slack notification is left +# as a follow-up so the first weeks of nightly runs do not spam the repo +# while the matrix is stabilising. + +name: Nightly E2E + +on: + schedule: + # 07:00 UTC = 15:00 Beijing / 23:00 PST (00:00 PDT). Adjust to maintainer preference. + - cron: "0 7 * * *" + # Allow manual trigger for ad-hoc verification (e.g. after a flaky run). + workflow_dispatch: + +# Don't run multiple nightlies in parallel (matrix already consumes the +# self-hosted runner pool; concurrent runs would queue and bleed into the +# next day's window). +concurrency: + group: e2e-nightly + cancel-in-progress: false + +jobs: + # Compute the full matrix via the same selector the PR workflow uses, in + # nightly mode (ignores path filter / labels / draft state).
+ matrix: + runs-on: ubuntu-latest + outputs: + run: ${{ steps.select.outputs.run }} + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: '3.11' + - name: Run selector (nightly mode) + id: select + run: | + echo "=== Selector result ===" + python3 hack/e2e-select/main.py --mode nightly | tee /tmp/select.json + echo "=======================" + echo "run=$(jq -c .run /tmp/select.json)" >> "$GITHUB_OUTPUT" + + e2e: + needs: matrix + runs-on: prod-grove-e2e-v1 + timeout-minutes: 90 + strategy: + fail-fast: false + matrix: ${{ fromJSON(needs.matrix.outputs.run) }} + name: Nightly E2E - ${{ matrix.test_name }} + steps: + - name: Print runner specs + run: | + echo "CPUs: $(nproc)" + echo "RAM: $(free -h | awk '/^Mem:/ {print $2}')" + + - name: Checkout code + uses: actions/checkout@v4 + + - name: Pull registry image from GHCR to avoid Docker Hub rate limits + run: | + docker pull ghcr.io/distribution/distribution:3.1.0 + docker tag ghcr.io/distribution/distribution:3.1.0 registry:2 + + - name: E2E Setup + uses: ./.github/actions/e2e-setup + + - name: Run e2e tests - ${{ matrix.test_name }} + run: | + make ${{ matrix.make_target }} TEST_PATTERN='${{ matrix.test_pattern }}' E2E_CREATE_FLAGS='${{ matrix.create_flags }} --dind-memory-mode' + working-directory: operator + + - name: Cleanup k3d cluster + if: always() + working-directory: operator + run: make e2e-cluster-down || true + + - name: Upload test logs on failure + if: failure() + uses: actions/upload-artifact@v4 + with: + name: nightly-e2e-logs-${{ matrix.test_name }} + path: operator/e2e-diagnostics/ + if-no-files-found: warn + retention-days: 14 + + # Aggregate report. Runs unconditionally after the matrix so we always have + # a single place to look at the night's results. 
+ summary: + needs: e2e + if: always() + runs-on: ubuntu-latest + steps: + - name: Write summary + run: | + { + echo "## Nightly E2E summary" + echo "" + echo "Run: ${{ github.run_id }}" + echo "Trigger: ${{ github.event_name }}" + echo "Status: ${{ needs.e2e.result }}" + echo "" + if [ "${{ needs.e2e.result }}" != "success" ]; then + echo "⚠️ One or more matrix rows failed. See the [run page](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) and the per-row \`nightly-e2e-logs-*\` artifacts." + else + echo "✅ All matrix rows passed." + fi + } >> "$GITHUB_STEP_SUMMARY" diff --git a/hack/e2e-select/main.py b/hack/e2e-select/main.py new file mode 100644 index 000000000..6cf16cb3f --- /dev/null +++ b/hack/e2e-select/main.py @@ -0,0 +1,276 @@ +#!/usr/bin/env python3 +# /* +# Copyright 2026 The Grove Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# */ +""" +E2E test matrix selector for GitHub Actions. + +Computes which (suite × backend) rows the e2e job should run on a given +PR (mode=pr) or scheduled run (mode=nightly), based on changed file paths +and PR labels. + +The selector emits the matrix as JSON in the GHA ``{"include": [...]}`` +shape. 
Two outputs are emitted in a single call: + + - ``run``: rows the matrix should actually execute + - ``skip``: rows the e2e-skip mirror should emit as synthetic passes, + so that branch-protection-required check names stay stable across PRs + +The union of ``run`` + ``skip`` always equals the full matrix +(``ALL_ROWS``); the two are disjoint. + +Selection logic +--------------- +- ``mode=nightly``: ``run = ALL_ROWS``, ``skip = []``. Path filter and + labels are ignored. +- ``mode=pr`` with ``--labels run-e2e``: same as nightly. This is the + "safety escape" — a reviewer can force the full matrix without having + to figure out which path triggers which rows. +- ``mode=pr`` with ``--draft`` and no ``run-e2e`` label: ``run = []``, + ``skip = ALL_ROWS``. Draft PRs do not gate merges, so we emit the full + set as synthetic passes; the contributor can add the label to force + real runs. +- ``mode=pr`` otherwise: changed files are matched against ``PATH_RULES`` + in order; the union of matched "affected" sets selects which rows run. + Unselected rows go to ``skip``. + +Adding a new backend +-------------------- +1. Add the per-backend rows to ``ALL_ROWS`` (test_name must be unique). +2. If the backend has its own scheduler package subdir + (``operator/internal/scheduler/<backend>/``), add a path rule above the + generic ``scheduler/**`` shared-framework rule. +3. Add testdata samples covering the new backend's path-filter case and + re-run the unit tests. +""" + +import argparse +import fnmatch +import json +import sys +from typing import Any + +# --------------------------------------------------------------------------- +# Matrix definition. Keep test_name unique across rows.
+# --------------------------------------------------------------------------- +ALL_ROWS: list[dict[str, Any]] = [ + # ---- kai-scheduler (primary backend) ---- + {"test_name": "gang_scheduling", "test_pattern": "^Test_GS", + "backend": "kai-scheduler", "create_flags": "", + "make_target": "run-e2e-full", "tier": "capability"}, + {"test_name": "rolling_updates", "test_pattern": "^Test_RU", + "backend": "kai-scheduler", "create_flags": "", + "make_target": "run-e2e-full", "tier": "sensitive"}, + {"test_name": "ondelete_updates", "test_pattern": "^Test_OD", + "backend": "kai-scheduler", "create_flags": "", + "make_target": "run-e2e-full", "tier": "sensitive"}, + {"test_name": "startup_ordering", "test_pattern": "^Test_SO", + "backend": "kai-scheduler", "create_flags": "", + "make_target": "run-e2e-real-full", "tier": "sensitive"}, + {"test_name": "Topology_Aware_Scheduling", "test_pattern": "^Test_TAS", + "backend": "kai-scheduler", "create_flags": "", + "make_target": "run-e2e-full", "tier": "capability"}, + {"test_name": "cert_management", "test_pattern": "^Test_CM", + "backend": "kai-scheduler", "create_flags": "", + "make_target": "run-e2e-full", "tier": "agnostic"}, + {"test_name": "auto_mnnvl", "test_pattern": "^Test_AutoMNNVL", + "backend": "kai-scheduler", "create_flags": "", + "make_target": "run-e2e-mnnvl-full", "tier": "capability"}, + {"test_name": "crd_installer", "test_pattern": "^Test_CRD_Installer", + "backend": "kai-scheduler", "create_flags": "", + "make_target": "run-e2e-full", "tier": "agnostic"}, + {"test_name": "resource_sharing", "test_pattern": "^Test_RS", + "backend": "kai-scheduler", "create_flags": "", + "make_target": "run-e2e-full", "tier": "capability"}, + # ---- default-scheduler ---- + {"test_name": "rolling_updates_default-scheduler", "test_pattern": "^Test_RU", + "backend": "default-scheduler", + "create_flags": "-f hack/e2e-default-scheduler.yaml", + "make_target": "run-e2e-full", "tier": "sensitive"}, + {"test_name": 
"ondelete_updates_default-scheduler", "test_pattern": "^Test_OD", + "backend": "default-scheduler", + "create_flags": "-f hack/e2e-default-scheduler.yaml", + "make_target": "run-e2e-full", "tier": "sensitive"}, + {"test_name": "startup_ordering_default-scheduler", "test_pattern": "^Test_SO", + "backend": "default-scheduler", + "create_flags": "-f hack/e2e-default-scheduler.yaml", + "make_target": "run-e2e-real-full", "tier": "sensitive"}, +] + +# --------------------------------------------------------------------------- +# Path filter rules. Order matters: first matching rule per file wins. +# Each rule maps a glob set to an "affected" set; "all" means all backends, +# "agnostic" means include agnostic-tier rows even when no specific backend +# matched. The selector unions affected sets across all changed files. +# --------------------------------------------------------------------------- +PATH_RULES: list[dict[str, Any]] = [ + # Docs / pure markdown / top-level metadata: never trigger e2e. + {"globs": ["docs/**", "*.md", "**/*.md", + "ATTRIBUTION.md", "LICENSE", "OWNERS", + "MAINTAINERS.md", "code-of-conduct.md", + "SECURITY.md", "CONTRIBUTING.md"], + "affected": set()}, + # Backend-specific subpaths under the scheduler package. + {"globs": ["operator/internal/scheduler/kai/**"], + "affected": {"kai-scheduler"}}, + {"globs": ["operator/internal/scheduler/kube/**"], + "affected": {"default-scheduler"}}, + # Shared scheduler framework (anything else under scheduler/). + {"globs": ["operator/internal/scheduler/**"], + "affected": {"all"}}, + # API surface and Helm charts: broad — affects every backend's deploy. + {"globs": ["operator/api/**", "operator/charts/**"], + "affected": {"all", "agnostic"}}, + # E2E infra, CI workflows, hack scripts: broad. + {"globs": ["operator/e2e/**", "operator/hack/**", + ".github/**", "hack/**"], + "affected": {"all", "agnostic"}}, + # Fallback for anything else under operator/: treat as broad change. 
+    {"globs": ["operator/**"],
+     "affected": {"all"}},
+]
+
+
+def _match(path: str, globs: list[str]) -> bool:
+    """Return True if ``path`` matches any glob; case-sensitive on every platform."""
+    return any(fnmatch.fnmatchcase(path, g) for g in globs)
+
+
+def compute_affected(changed_files: list[str]) -> set[str]:
+    """Walk PATH_RULES; first match per file contributes to the affected set."""
+    affected: set[str] = set()
+    for path in changed_files:
+        for rule in PATH_RULES:
+            if _match(path, rule["globs"]):
+                affected |= rule["affected"]
+                break
+        # No-match files (e.g. unknown top-level paths) are ignored intentionally.
+    return affected
+
+
+def select_rows(affected: set[str]) -> list[dict[str, Any]]:
+    """Filter ALL_ROWS by an affected set.
+
+    'all' matches every row; otherwise a row matches if its backend is in the
+    set, or if 'agnostic' is in the set and the row is agnostic-tier.
+    """
+    if not affected:
+        return []
+    if "all" in affected:
+        return list(ALL_ROWS)
+    include_agnostic = "agnostic" in affected
+    return [
+        row for row in ALL_ROWS
+        if row["backend"] in affected
+        or (include_agnostic and row["tier"] == "agnostic")
+    ]
+
+
+def split(rows_run: list[dict[str, Any]]) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
+    """Return (run, skip) where skip = ALL_ROWS - run, preserving order."""
+    run_names = {r["test_name"] for r in rows_run}
+    skip = [r for r in ALL_ROWS if r["test_name"] not in run_names]
+    return rows_run, skip
+
+
+def _strip_internal(rows: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    """Drop selector-internal fields before emitting JSON."""
+    return [{k: v for k, v in r.items() if k != "tier"} for r in rows]
+
+
+def compute(mode: str, changed_files: list[str], labels: list[str],
+            draft: bool) -> dict[str, Any]:
+    """Top-level decision tree. Returns dict with 'run', 'skip', 'has_run',
+    'has_skip', 'reason' (a short string for logging)."""
+    has_run_e2e_label = "run-e2e" in labels
+
+    if mode == "nightly":
+        run = list(ALL_ROWS)
+        reason = "nightly: full matrix"
+    elif has_run_e2e_label:
+        run = list(ALL_ROWS)
+        reason = "pr+run-e2e label: full matrix (safety escape)"
+    elif draft:
+        # Draft PR with no label: do not run e2e; all rows go to skip mirror.
+        run = []
+        reason = "pr+draft+no run-e2e label: skip all"
+    else:
+        affected = compute_affected(changed_files)
+        run = select_rows(affected)
+        if not run:
+            reason = f"pr: no rows affected (affected={sorted(affected) or '∅'})"
+        else:
+            reason = f"pr: affected={sorted(affected)}"
+
+    run, skip = split(run)
+    return {
+        "run": {"include": _strip_internal(run)},
+        "skip": {"include": _strip_internal(skip)},
+        "has_run": len(run) > 0,
+        "has_skip": len(skip) > 0,
+        "reason": reason,
+    }
+
+
+def _read_changed_files(arg: str) -> list[str]:
+    """Read one path per line from file ``arg`` ('-' = stdin); blank lines are dropped."""
+    if arg == "-":
+        lines = sys.stdin.read().splitlines()
+    else:
+        with open(arg, encoding="utf-8") as f:
+            lines = f.read().splitlines()
+    return [line.strip() for line in lines if line.strip()]
+
+
+def main(argv: list[str] | None = None) -> int:
+    """CLI entry point: parse flags, run ``compute``, print the part chosen by --show."""
+    ap = argparse.ArgumentParser(description=__doc__,
+                                 formatter_class=argparse.RawDescriptionHelpFormatter)
+    ap.add_argument("--mode", required=True, choices=["pr", "nightly"])
+    ap.add_argument("--changed-files", default="-",
+                    help="path to file containing one changed path per line, "
+                         "or '-' for stdin. ignored for mode=nightly.")
+    ap.add_argument("--labels", default="",
+                    help="comma-separated list of PR labels. "
+                         "'run-e2e' triggers full matrix in pr mode.")
+    ap.add_argument("--draft", action="store_true",
+                    help="set if the PR is in draft state.")
+    ap.add_argument("--show", default="all",
+                    choices=["all", "run", "skip", "has_run", "has_skip", "reason"],
+                    help="which part of the result to print "
+                         "(default: full JSON object).")
+    args = ap.parse_args(argv)
+
+    if args.mode == "nightly":
+        changed_files: list[str] = []
+    else:
+        changed_files = _read_changed_files(args.changed_files)
+
+    labels = [s.strip() for s in args.labels.split(",") if s.strip()]
+    result = compute(args.mode, changed_files, labels, args.draft)
+
+    if args.show == "all":
+        print(json.dumps(result, indent=2))
+    elif args.show == "reason":
+        print(result["reason"])
+    else:
+        # run/skip/has_run/has_skip go out as compact JSON, so the booleans print
+        # as true/false (what the workflows compare against), not True/False.
+        print(json.dumps(result[args.show], separators=(",", ":")))
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/hack/e2e-select/testdata/nightly-mode-full.json b/hack/e2e-select/testdata/nightly-mode-full.json
new file mode 100644
index 000000000..dcb333c60
--- /dev/null
+++ b/hack/e2e-select/testdata/nightly-mode-full.json
@@ -0,0 +1,95 @@
+{
+  "run": {
+    "include": [
+      {
+        "test_name": "gang_scheduling",
+        "test_pattern": "^Test_GS",
+        "backend": "kai-scheduler",
+        "create_flags": "",
+        "make_target": "run-e2e-full"
+      },
+      {
+        "test_name": "rolling_updates",
+        "test_pattern": "^Test_RU",
+        "backend": "kai-scheduler",
+        "create_flags": "",
+        "make_target": "run-e2e-full"
+      },
+      {
+        "test_name": "ondelete_updates",
+        "test_pattern": "^Test_OD",
+        "backend": "kai-scheduler",
+        "create_flags": "",
+        "make_target": "run-e2e-full"
+      },
+      {
+        "test_name": "startup_ordering",
+        "test_pattern": "^Test_SO",
+        "backend": "kai-scheduler",
+        "create_flags": "",
+        "make_target": "run-e2e-real-full"
+      },
+      {
+        "test_name": "Topology_Aware_Scheduling",
+        "test_pattern": "^Test_TAS",
+        "backend": "kai-scheduler",
+        "create_flags": "",
+        "make_target": "run-e2e-full"
+      },
+      {
+        "test_name": "cert_management",
+ "test_pattern": "^Test_CM", + "backend": "kai-scheduler", + "create_flags": "", + "make_target": "run-e2e-full" + }, + { + "test_name": "auto_mnnvl", + "test_pattern": "^Test_AutoMNNVL", + "backend": "kai-scheduler", + "create_flags": "", + "make_target": "run-e2e-mnnvl-full" + }, + { + "test_name": "crd_installer", + "test_pattern": "^Test_CRD_Installer", + "backend": "kai-scheduler", + "create_flags": "", + "make_target": "run-e2e-full" + }, + { + "test_name": "resource_sharing", + "test_pattern": "^Test_RS", + "backend": "kai-scheduler", + "create_flags": "", + "make_target": "run-e2e-full" + }, + { + "test_name": "rolling_updates_default-scheduler", + "test_pattern": "^Test_RU", + "backend": "default-scheduler", + "create_flags": "-f hack/e2e-default-scheduler.yaml", + "make_target": "run-e2e-full" + }, + { + "test_name": "ondelete_updates_default-scheduler", + "test_pattern": "^Test_OD", + "backend": "default-scheduler", + "create_flags": "-f hack/e2e-default-scheduler.yaml", + "make_target": "run-e2e-full" + }, + { + "test_name": "startup_ordering_default-scheduler", + "test_pattern": "^Test_SO", + "backend": "default-scheduler", + "create_flags": "-f hack/e2e-default-scheduler.yaml", + "make_target": "run-e2e-real-full" + } + ] + }, + "skip": { + "include": [] + }, + "has_run": true, + "has_skip": false +} diff --git a/hack/e2e-select/testdata/pr-mode-docs-only.json b/hack/e2e-select/testdata/pr-mode-docs-only.json new file mode 100644 index 000000000..535173844 --- /dev/null +++ b/hack/e2e-select/testdata/pr-mode-docs-only.json @@ -0,0 +1,95 @@ +{ + "run": { + "include": [] + }, + "skip": { + "include": [ + { + "test_name": "gang_scheduling", + "test_pattern": "^Test_GS", + "backend": "kai-scheduler", + "create_flags": "", + "make_target": "run-e2e-full" + }, + { + "test_name": "rolling_updates", + "test_pattern": "^Test_RU", + "backend": "kai-scheduler", + "create_flags": "", + "make_target": "run-e2e-full" + }, + { + "test_name": "ondelete_updates", + 
"test_pattern": "^Test_OD", + "backend": "kai-scheduler", + "create_flags": "", + "make_target": "run-e2e-full" + }, + { + "test_name": "startup_ordering", + "test_pattern": "^Test_SO", + "backend": "kai-scheduler", + "create_flags": "", + "make_target": "run-e2e-real-full" + }, + { + "test_name": "Topology_Aware_Scheduling", + "test_pattern": "^Test_TAS", + "backend": "kai-scheduler", + "create_flags": "", + "make_target": "run-e2e-full" + }, + { + "test_name": "cert_management", + "test_pattern": "^Test_CM", + "backend": "kai-scheduler", + "create_flags": "", + "make_target": "run-e2e-full" + }, + { + "test_name": "auto_mnnvl", + "test_pattern": "^Test_AutoMNNVL", + "backend": "kai-scheduler", + "create_flags": "", + "make_target": "run-e2e-mnnvl-full" + }, + { + "test_name": "crd_installer", + "test_pattern": "^Test_CRD_Installer", + "backend": "kai-scheduler", + "create_flags": "", + "make_target": "run-e2e-full" + }, + { + "test_name": "resource_sharing", + "test_pattern": "^Test_RS", + "backend": "kai-scheduler", + "create_flags": "", + "make_target": "run-e2e-full" + }, + { + "test_name": "rolling_updates_default-scheduler", + "test_pattern": "^Test_RU", + "backend": "default-scheduler", + "create_flags": "-f hack/e2e-default-scheduler.yaml", + "make_target": "run-e2e-full" + }, + { + "test_name": "ondelete_updates_default-scheduler", + "test_pattern": "^Test_OD", + "backend": "default-scheduler", + "create_flags": "-f hack/e2e-default-scheduler.yaml", + "make_target": "run-e2e-full" + }, + { + "test_name": "startup_ordering_default-scheduler", + "test_pattern": "^Test_SO", + "backend": "default-scheduler", + "create_flags": "-f hack/e2e-default-scheduler.yaml", + "make_target": "run-e2e-real-full" + } + ] + }, + "has_run": false, + "has_skip": true +} diff --git a/hack/e2e-select/testdata/pr-mode-draft-no-label.json b/hack/e2e-select/testdata/pr-mode-draft-no-label.json new file mode 100644 index 000000000..535173844 --- /dev/null +++ 
b/hack/e2e-select/testdata/pr-mode-draft-no-label.json @@ -0,0 +1,95 @@ +{ + "run": { + "include": [] + }, + "skip": { + "include": [ + { + "test_name": "gang_scheduling", + "test_pattern": "^Test_GS", + "backend": "kai-scheduler", + "create_flags": "", + "make_target": "run-e2e-full" + }, + { + "test_name": "rolling_updates", + "test_pattern": "^Test_RU", + "backend": "kai-scheduler", + "create_flags": "", + "make_target": "run-e2e-full" + }, + { + "test_name": "ondelete_updates", + "test_pattern": "^Test_OD", + "backend": "kai-scheduler", + "create_flags": "", + "make_target": "run-e2e-full" + }, + { + "test_name": "startup_ordering", + "test_pattern": "^Test_SO", + "backend": "kai-scheduler", + "create_flags": "", + "make_target": "run-e2e-real-full" + }, + { + "test_name": "Topology_Aware_Scheduling", + "test_pattern": "^Test_TAS", + "backend": "kai-scheduler", + "create_flags": "", + "make_target": "run-e2e-full" + }, + { + "test_name": "cert_management", + "test_pattern": "^Test_CM", + "backend": "kai-scheduler", + "create_flags": "", + "make_target": "run-e2e-full" + }, + { + "test_name": "auto_mnnvl", + "test_pattern": "^Test_AutoMNNVL", + "backend": "kai-scheduler", + "create_flags": "", + "make_target": "run-e2e-mnnvl-full" + }, + { + "test_name": "crd_installer", + "test_pattern": "^Test_CRD_Installer", + "backend": "kai-scheduler", + "create_flags": "", + "make_target": "run-e2e-full" + }, + { + "test_name": "resource_sharing", + "test_pattern": "^Test_RS", + "backend": "kai-scheduler", + "create_flags": "", + "make_target": "run-e2e-full" + }, + { + "test_name": "rolling_updates_default-scheduler", + "test_pattern": "^Test_RU", + "backend": "default-scheduler", + "create_flags": "-f hack/e2e-default-scheduler.yaml", + "make_target": "run-e2e-full" + }, + { + "test_name": "ondelete_updates_default-scheduler", + "test_pattern": "^Test_OD", + "backend": "default-scheduler", + "create_flags": "-f hack/e2e-default-scheduler.yaml", + "make_target": 
"run-e2e-full" + }, + { + "test_name": "startup_ordering_default-scheduler", + "test_pattern": "^Test_SO", + "backend": "default-scheduler", + "create_flags": "-f hack/e2e-default-scheduler.yaml", + "make_target": "run-e2e-real-full" + } + ] + }, + "has_run": false, + "has_skip": true +} diff --git a/hack/e2e-select/testdata/pr-mode-kai-only.json b/hack/e2e-select/testdata/pr-mode-kai-only.json new file mode 100644 index 000000000..269a03806 --- /dev/null +++ b/hack/e2e-select/testdata/pr-mode-kai-only.json @@ -0,0 +1,96 @@ +{ + "run": { + "include": [ + { + "test_name": "gang_scheduling", + "test_pattern": "^Test_GS", + "backend": "kai-scheduler", + "create_flags": "", + "make_target": "run-e2e-full" + }, + { + "test_name": "rolling_updates", + "test_pattern": "^Test_RU", + "backend": "kai-scheduler", + "create_flags": "", + "make_target": "run-e2e-full" + }, + { + "test_name": "ondelete_updates", + "test_pattern": "^Test_OD", + "backend": "kai-scheduler", + "create_flags": "", + "make_target": "run-e2e-full" + }, + { + "test_name": "startup_ordering", + "test_pattern": "^Test_SO", + "backend": "kai-scheduler", + "create_flags": "", + "make_target": "run-e2e-real-full" + }, + { + "test_name": "Topology_Aware_Scheduling", + "test_pattern": "^Test_TAS", + "backend": "kai-scheduler", + "create_flags": "", + "make_target": "run-e2e-full" + }, + { + "test_name": "cert_management", + "test_pattern": "^Test_CM", + "backend": "kai-scheduler", + "create_flags": "", + "make_target": "run-e2e-full" + }, + { + "test_name": "auto_mnnvl", + "test_pattern": "^Test_AutoMNNVL", + "backend": "kai-scheduler", + "create_flags": "", + "make_target": "run-e2e-mnnvl-full" + }, + { + "test_name": "crd_installer", + "test_pattern": "^Test_CRD_Installer", + "backend": "kai-scheduler", + "create_flags": "", + "make_target": "run-e2e-full" + }, + { + "test_name": "resource_sharing", + "test_pattern": "^Test_RS", + "backend": "kai-scheduler", + "create_flags": "", + "make_target": 
"run-e2e-full" + } + ] + }, + "skip": { + "include": [ + { + "test_name": "rolling_updates_default-scheduler", + "test_pattern": "^Test_RU", + "backend": "default-scheduler", + "create_flags": "-f hack/e2e-default-scheduler.yaml", + "make_target": "run-e2e-full" + }, + { + "test_name": "ondelete_updates_default-scheduler", + "test_pattern": "^Test_OD", + "backend": "default-scheduler", + "create_flags": "-f hack/e2e-default-scheduler.yaml", + "make_target": "run-e2e-full" + }, + { + "test_name": "startup_ordering_default-scheduler", + "test_pattern": "^Test_SO", + "backend": "default-scheduler", + "create_flags": "-f hack/e2e-default-scheduler.yaml", + "make_target": "run-e2e-real-full" + } + ] + }, + "has_run": true, + "has_skip": true +} diff --git a/hack/e2e-select/testdata/pr-mode-kube-only.json b/hack/e2e-select/testdata/pr-mode-kube-only.json new file mode 100644 index 000000000..542927c69 --- /dev/null +++ b/hack/e2e-select/testdata/pr-mode-kube-only.json @@ -0,0 +1,96 @@ +{ + "run": { + "include": [ + { + "test_name": "rolling_updates_default-scheduler", + "test_pattern": "^Test_RU", + "backend": "default-scheduler", + "create_flags": "-f hack/e2e-default-scheduler.yaml", + "make_target": "run-e2e-full" + }, + { + "test_name": "ondelete_updates_default-scheduler", + "test_pattern": "^Test_OD", + "backend": "default-scheduler", + "create_flags": "-f hack/e2e-default-scheduler.yaml", + "make_target": "run-e2e-full" + }, + { + "test_name": "startup_ordering_default-scheduler", + "test_pattern": "^Test_SO", + "backend": "default-scheduler", + "create_flags": "-f hack/e2e-default-scheduler.yaml", + "make_target": "run-e2e-real-full" + } + ] + }, + "skip": { + "include": [ + { + "test_name": "gang_scheduling", + "test_pattern": "^Test_GS", + "backend": "kai-scheduler", + "create_flags": "", + "make_target": "run-e2e-full" + }, + { + "test_name": "rolling_updates", + "test_pattern": "^Test_RU", + "backend": "kai-scheduler", + "create_flags": "", + "make_target": 
"run-e2e-full" + }, + { + "test_name": "ondelete_updates", + "test_pattern": "^Test_OD", + "backend": "kai-scheduler", + "create_flags": "", + "make_target": "run-e2e-full" + }, + { + "test_name": "startup_ordering", + "test_pattern": "^Test_SO", + "backend": "kai-scheduler", + "create_flags": "", + "make_target": "run-e2e-real-full" + }, + { + "test_name": "Topology_Aware_Scheduling", + "test_pattern": "^Test_TAS", + "backend": "kai-scheduler", + "create_flags": "", + "make_target": "run-e2e-full" + }, + { + "test_name": "cert_management", + "test_pattern": "^Test_CM", + "backend": "kai-scheduler", + "create_flags": "", + "make_target": "run-e2e-full" + }, + { + "test_name": "auto_mnnvl", + "test_pattern": "^Test_AutoMNNVL", + "backend": "kai-scheduler", + "create_flags": "", + "make_target": "run-e2e-mnnvl-full" + }, + { + "test_name": "crd_installer", + "test_pattern": "^Test_CRD_Installer", + "backend": "kai-scheduler", + "create_flags": "", + "make_target": "run-e2e-full" + }, + { + "test_name": "resource_sharing", + "test_pattern": "^Test_RS", + "backend": "kai-scheduler", + "create_flags": "", + "make_target": "run-e2e-full" + } + ] + }, + "has_run": true, + "has_skip": true +} diff --git a/hack/e2e-select/testdata/pr-mode-run-e2e-label.json b/hack/e2e-select/testdata/pr-mode-run-e2e-label.json new file mode 100644 index 000000000..dcb333c60 --- /dev/null +++ b/hack/e2e-select/testdata/pr-mode-run-e2e-label.json @@ -0,0 +1,95 @@ +{ + "run": { + "include": [ + { + "test_name": "gang_scheduling", + "test_pattern": "^Test_GS", + "backend": "kai-scheduler", + "create_flags": "", + "make_target": "run-e2e-full" + }, + { + "test_name": "rolling_updates", + "test_pattern": "^Test_RU", + "backend": "kai-scheduler", + "create_flags": "", + "make_target": "run-e2e-full" + }, + { + "test_name": "ondelete_updates", + "test_pattern": "^Test_OD", + "backend": "kai-scheduler", + "create_flags": "", + "make_target": "run-e2e-full" + }, + { + "test_name": "startup_ordering", 
+ "test_pattern": "^Test_SO", + "backend": "kai-scheduler", + "create_flags": "", + "make_target": "run-e2e-real-full" + }, + { + "test_name": "Topology_Aware_Scheduling", + "test_pattern": "^Test_TAS", + "backend": "kai-scheduler", + "create_flags": "", + "make_target": "run-e2e-full" + }, + { + "test_name": "cert_management", + "test_pattern": "^Test_CM", + "backend": "kai-scheduler", + "create_flags": "", + "make_target": "run-e2e-full" + }, + { + "test_name": "auto_mnnvl", + "test_pattern": "^Test_AutoMNNVL", + "backend": "kai-scheduler", + "create_flags": "", + "make_target": "run-e2e-mnnvl-full" + }, + { + "test_name": "crd_installer", + "test_pattern": "^Test_CRD_Installer", + "backend": "kai-scheduler", + "create_flags": "", + "make_target": "run-e2e-full" + }, + { + "test_name": "resource_sharing", + "test_pattern": "^Test_RS", + "backend": "kai-scheduler", + "create_flags": "", + "make_target": "run-e2e-full" + }, + { + "test_name": "rolling_updates_default-scheduler", + "test_pattern": "^Test_RU", + "backend": "default-scheduler", + "create_flags": "-f hack/e2e-default-scheduler.yaml", + "make_target": "run-e2e-full" + }, + { + "test_name": "ondelete_updates_default-scheduler", + "test_pattern": "^Test_OD", + "backend": "default-scheduler", + "create_flags": "-f hack/e2e-default-scheduler.yaml", + "make_target": "run-e2e-full" + }, + { + "test_name": "startup_ordering_default-scheduler", + "test_pattern": "^Test_SO", + "backend": "default-scheduler", + "create_flags": "-f hack/e2e-default-scheduler.yaml", + "make_target": "run-e2e-real-full" + } + ] + }, + "skip": { + "include": [] + }, + "has_run": true, + "has_skip": false +} diff --git a/hack/e2e-select/testdata/pr-mode-shared-scheduler.json b/hack/e2e-select/testdata/pr-mode-shared-scheduler.json new file mode 100644 index 000000000..dcb333c60 --- /dev/null +++ b/hack/e2e-select/testdata/pr-mode-shared-scheduler.json @@ -0,0 +1,95 @@ +{ + "run": { + "include": [ + { + "test_name": "gang_scheduling", 
+ "test_pattern": "^Test_GS", + "backend": "kai-scheduler", + "create_flags": "", + "make_target": "run-e2e-full" + }, + { + "test_name": "rolling_updates", + "test_pattern": "^Test_RU", + "backend": "kai-scheduler", + "create_flags": "", + "make_target": "run-e2e-full" + }, + { + "test_name": "ondelete_updates", + "test_pattern": "^Test_OD", + "backend": "kai-scheduler", + "create_flags": "", + "make_target": "run-e2e-full" + }, + { + "test_name": "startup_ordering", + "test_pattern": "^Test_SO", + "backend": "kai-scheduler", + "create_flags": "", + "make_target": "run-e2e-real-full" + }, + { + "test_name": "Topology_Aware_Scheduling", + "test_pattern": "^Test_TAS", + "backend": "kai-scheduler", + "create_flags": "", + "make_target": "run-e2e-full" + }, + { + "test_name": "cert_management", + "test_pattern": "^Test_CM", + "backend": "kai-scheduler", + "create_flags": "", + "make_target": "run-e2e-full" + }, + { + "test_name": "auto_mnnvl", + "test_pattern": "^Test_AutoMNNVL", + "backend": "kai-scheduler", + "create_flags": "", + "make_target": "run-e2e-mnnvl-full" + }, + { + "test_name": "crd_installer", + "test_pattern": "^Test_CRD_Installer", + "backend": "kai-scheduler", + "create_flags": "", + "make_target": "run-e2e-full" + }, + { + "test_name": "resource_sharing", + "test_pattern": "^Test_RS", + "backend": "kai-scheduler", + "create_flags": "", + "make_target": "run-e2e-full" + }, + { + "test_name": "rolling_updates_default-scheduler", + "test_pattern": "^Test_RU", + "backend": "default-scheduler", + "create_flags": "-f hack/e2e-default-scheduler.yaml", + "make_target": "run-e2e-full" + }, + { + "test_name": "ondelete_updates_default-scheduler", + "test_pattern": "^Test_OD", + "backend": "default-scheduler", + "create_flags": "-f hack/e2e-default-scheduler.yaml", + "make_target": "run-e2e-full" + }, + { + "test_name": "startup_ordering_default-scheduler", + "test_pattern": "^Test_SO", + "backend": "default-scheduler", + "create_flags": "-f 
hack/e2e-default-scheduler.yaml", + "make_target": "run-e2e-real-full" + } + ] + }, + "skip": { + "include": [] + }, + "has_run": true, + "has_skip": false +} diff --git a/hack/e2e-select/tests/test_selector.py b/hack/e2e-select/tests/test_selector.py new file mode 100644 index 000000000..a4b6a6fe1 --- /dev/null +++ b/hack/e2e-select/tests/test_selector.py @@ -0,0 +1,252 @@ +# /* +# Copyright 2026 The Grove Authors. +# SPDX-License-Identifier: Apache-2.0 +# */ +"""Unit tests for the e2e matrix selector. + +Run from repo root: + + python3 -m pytest hack/e2e-select/tests/ -v + +or without pytest: + + python3 hack/e2e-select/tests/test_selector.py +""" + +import json +import os +import sys +import unittest +from pathlib import Path + +# Make the selector importable when invoked from the repo root. +HERE = Path(__file__).resolve().parent +sys.path.insert(0, str(HERE.parent)) + +import main as selector # noqa: E402 + + +REPO_ROOT = HERE.parent.parent.parent +TESTDATA = HERE.parent / "testdata" + + +def _names(rows: list[dict]) -> list[str]: + return [r["test_name"] for r in rows] + + +def _backends(rows: list[dict]) -> set[str]: + return {r["backend"] for r in rows} + + +class TestModes(unittest.TestCase): + """Top-level mode selection.""" + + def test_nightly_runs_full_matrix(self): + r = selector.compute("nightly", changed_files=[], labels=[], draft=False) + self.assertEqual(len(r["run"]["include"]), len(selector.ALL_ROWS)) + self.assertEqual(r["skip"]["include"], []) + self.assertTrue(r["has_run"]) + self.assertFalse(r["has_skip"]) + + def test_nightly_ignores_changed_files_and_labels(self): + r = selector.compute("nightly", + changed_files=["docs/foo.md"], + labels=["run-e2e"], + draft=True) + self.assertEqual(len(r["run"]["include"]), len(selector.ALL_ROWS)) + + def test_pr_run_e2e_label_forces_full_matrix(self): + # Even with a docs-only change, run-e2e label overrides. 
+ r = selector.compute("pr", + changed_files=["docs/foo.md"], + labels=["run-e2e"], + draft=False) + self.assertEqual(len(r["run"]["include"]), len(selector.ALL_ROWS)) + self.assertEqual(r["skip"]["include"], []) + + def test_pr_draft_no_label_emits_no_run(self): + r = selector.compute("pr", + changed_files=["operator/internal/scheduler/kai/foo.go"], + labels=[], + draft=True) + self.assertEqual(r["run"]["include"], []) + self.assertEqual(len(r["skip"]["include"]), len(selector.ALL_ROWS)) + + +class TestPathFilters(unittest.TestCase): + """Path-rule logic for mode=pr (non-draft, no label).""" + + def _run(self, files): + return selector.compute("pr", changed_files=files, + labels=[], draft=False) + + def test_docs_only_runs_nothing(self): + r = self._run(["docs/foo.md", "README.md"]) + self.assertEqual(r["run"]["include"], []) + self.assertEqual(len(r["skip"]["include"]), len(selector.ALL_ROWS)) + + def test_kai_subpath_runs_only_kai_rows(self): + r = self._run(["operator/internal/scheduler/kai/backend.go"]) + backends = _backends(r["run"]["include"]) + self.assertEqual(backends, {"kai-scheduler"}) + # Ensure default-scheduler rows landed in skip. + skip_backends = _backends(r["skip"]["include"]) + self.assertIn("default-scheduler", skip_backends) + + def test_kube_subpath_runs_only_default_scheduler(self): + r = self._run(["operator/internal/scheduler/kube/backend.go"]) + backends = _backends(r["run"]["include"]) + self.assertEqual(backends, {"default-scheduler"}) + + def test_shared_scheduler_runs_all(self): + # File under scheduler/ but NOT under kai/ or kube/ → all backends. 
+ r = self._run(["operator/internal/scheduler/types.go"]) + self.assertEqual(len(r["run"]["include"]), len(selector.ALL_ROWS)) + + def test_charts_runs_all_plus_agnostic(self): + r = self._run(["operator/charts/values.yaml"]) + self.assertEqual(len(r["run"]["include"]), len(selector.ALL_ROWS)) + + def test_e2e_infra_runs_all(self): + r = self._run(["operator/e2e/tests/foo_test.go"]) + self.assertEqual(len(r["run"]["include"]), len(selector.ALL_ROWS)) + + def test_workflow_change_runs_all(self): + r = self._run([".github/workflows/build-check-test.yaml"]) + self.assertEqual(len(r["run"]["include"]), len(selector.ALL_ROWS)) + + def test_mixed_kai_plus_docs_keeps_only_kai(self): + r = self._run(["operator/internal/scheduler/kai/x.go", + "docs/proposals/foo.md"]) + backends = _backends(r["run"]["include"]) + self.assertEqual(backends, {"kai-scheduler"}) + + def test_mixed_kai_plus_kube_runs_both(self): + r = self._run(["operator/internal/scheduler/kai/x.go", + "operator/internal/scheduler/kube/y.go"]) + backends = _backends(r["run"]["include"]) + self.assertEqual(backends, {"kai-scheduler", "default-scheduler"}) + + def test_empty_changed_files_emits_no_run(self): + r = self._run([]) + self.assertEqual(r["run"]["include"], []) + self.assertEqual(len(r["skip"]["include"]), len(selector.ALL_ROWS)) + + def test_unknown_top_level_path_is_ignored(self): + # Some random top-level file no rule matches → contributes nothing. + r = self._run(["unrelated-top-level-file.txt"]) + self.assertEqual(r["run"]["include"], []) + + def test_fallback_rule_for_other_operator_paths(self): + # A hypothetical file directly under operator/ — not under api/, charts/ or + # e2e/, but still under operator/ → fallback rule fires. 
+ r = self._run(["operator/some-toplevel-go-file.go"]) + self.assertEqual(len(r["run"]["include"]), len(selector.ALL_ROWS)) + + +class TestSplitInvariants(unittest.TestCase): + """Invariants of the split() helper.""" + + def test_run_plus_skip_equals_all(self): + for files in ( + [], + ["operator/internal/scheduler/kai/x.go"], + ["operator/internal/scheduler/kube/x.go"], + ["operator/internal/scheduler/types.go"], + ["docs/foo.md"], + ["operator/charts/values.yaml"], + ): + r = selector.compute("pr", files, labels=[], draft=False) + run = _names(r["run"]["include"]) + skip = _names(r["skip"]["include"]) + self.assertEqual(set(run) | set(skip), + {row["test_name"] for row in selector.ALL_ROWS}, + msg=f"files={files}") + self.assertEqual(set(run) & set(skip), set(), + msg=f"run/skip not disjoint for files={files}") + + def test_emitted_rows_have_no_tier_field(self): + r = selector.compute("nightly", [], [], draft=False) + for row in r["run"]["include"]: + self.assertNotIn("tier", row, + msg="tier is selector-internal, must not leak to GHA") + + +class TestRowsConsistency(unittest.TestCase): + """Sanity checks on ALL_ROWS itself.""" + + def test_test_names_are_unique(self): + names = [r["test_name"] for r in selector.ALL_ROWS] + self.assertEqual(len(names), len(set(names))) + + def test_required_fields_present(self): + required = {"test_name", "test_pattern", "backend", + "create_flags", "make_target", "tier"} + for row in selector.ALL_ROWS: + self.assertEqual(set(row.keys()) & required, required, + msg=f"missing fields in row: {row}") + + def test_only_known_tiers(self): + for row in selector.ALL_ROWS: + self.assertIn(row["tier"], + {"agnostic", "sensitive", "capability"}, + msg=f"unknown tier in row: {row}") + + +class TestGoldenSamples(unittest.TestCase): + """Regenerate samples in testdata/ and assert they match committed files. + + Run with E2E_SELECT_REGENERATE=1 to update the golden files instead. 
+ """ + + SAMPLES = { + "pr-mode-kai-only.json": dict( + mode="pr", + changed_files=["operator/internal/scheduler/kai/backend.go"], + labels=[], draft=False), + "pr-mode-kube-only.json": dict( + mode="pr", + changed_files=["operator/internal/scheduler/kube/backend.go"], + labels=[], draft=False), + "pr-mode-shared-scheduler.json": dict( + mode="pr", + changed_files=["operator/internal/scheduler/types.go"], + labels=[], draft=False), + "pr-mode-docs-only.json": dict( + mode="pr", + changed_files=["docs/proposals/foo.md", "README.md"], + labels=[], draft=False), + "pr-mode-run-e2e-label.json": dict( + mode="pr", + changed_files=["docs/proposals/foo.md"], + labels=["run-e2e"], draft=False), + "pr-mode-draft-no-label.json": dict( + mode="pr", + changed_files=["operator/internal/scheduler/kai/backend.go"], + labels=[], draft=True), + "nightly-mode-full.json": dict( + mode="nightly", + changed_files=[], labels=[], draft=False), + } + + def test_golden_samples_match(self): + regenerate = os.environ.get("E2E_SELECT_REGENERATE") == "1" + for fname, kwargs in self.SAMPLES.items(): + with self.subTest(sample=fname): + got = selector.compute(**kwargs) + # Drop 'reason' from golden files — it's diagnostic, not contract. + got_stripped = {k: v for k, v in got.items() if k != "reason"} + path = TESTDATA / fname + if regenerate: + path.write_text(json.dumps(got_stripped, indent=2) + "\n") + continue + self.assertTrue(path.exists(), + f"missing golden file {path}; " + f"run with E2E_SELECT_REGENERATE=1 to create") + expected = json.loads(path.read_text()) + self.assertEqual(got_stripped, expected, + f"selector output diverged from {fname}") + + +if __name__ == "__main__": + unittest.main(verbosity=2) diff --git a/operator/e2e/tests/capabilities.go b/operator/e2e/tests/capabilities.go new file mode 100644 index 000000000..c32566d24 --- /dev/null +++ b/operator/e2e/tests/capabilities.go @@ -0,0 +1,109 @@ +// /* +// Copyright 2026 The Grove Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// */ + +package tests + +import ( + "sync" + "testing" +) + +// Capability is a scheduler feature the E2E suite may require (e.g. gang +// scheduling, topology-aware scheduling). Tests gate themselves with +// RequireCapability and auto-skip when the active backend does not provide it. +type Capability string + +const ( + // GangScheduling indicates the active backend treats a PodGang as an + // all-or-nothing scheduling unit. + GangScheduling Capability = "GangScheduling" + + // TopologyAwareScheduling indicates the active backend implements the + // scheduler.TopologyAwareSchedBackend interface AND the operator has + // topologyAwareScheduling.enabled=true. + TopologyAwareScheduling Capability = "TopologyAwareScheduling" + + // AutoMNNVL indicates the operator has network.autoMNNVLEnabled=true. + // Config-only — no backend coupling. + AutoMNNVL Capability = "AutoMNNVL" +) + +// CapabilitySet is the resolved set of capabilities for a single E2E run. +type CapabilitySet struct { + // ActiveBackend is the value of OperatorConfiguration.scheduler.defaultProfileName. + ActiveBackend string + // caps is the set of capabilities present on the active backend. + caps map[Capability]bool +} + +// Has reports whether the set contains the given capability. 
+func (s CapabilitySet) Has(c Capability) bool { + return s.caps[c] +} + +// backendInterfaceCapabilities is the hardcoded map of backend → capabilities +// that depend on Go interface implementation in the operator. Entries here are +// what E2E cannot deduce from a live OperatorConfiguration alone (the operator +// uses Go type assertions; the test binary runs out-of-process and cannot). +// +// Capabilities derived purely from configuration flags (e.g. AutoMNNVL from +// network.autoMNNVLEnabled) are NOT listed here — they are resolved directly +// from OperatorConfiguration in DiscoverCapabilities. +// +// When adding a new backend, add a row here AND update the developer +// checklist in the design proposal. The capabilities_test.go cross-check +// fails the build if this table disagrees with the actual Go interfaces. +var backendInterfaceCapabilities = map[string]map[Capability]bool{ + "kai-scheduler": { + GangScheduling: true, + TopologyAwareScheduling: true, + }, + "default-scheduler": { + // KubeSchedulerConfig.GangScheduling is forward-looking — the kube + // backend does not yet read or act on it. When it does, set + // GangScheduling: true here. + }, +} + +// currentCapabilities holds the resolved CapabilitySet for the running e2e +// suite. DiscoverCapabilities (in capability_discovery.go, e2e build tag) +// populates it once at TestMain time; RequireCapability reads it on every +// gated test entry. +var ( + currentCapabilities CapabilitySet + currentCapabilitiesSet bool + currentCapabilitiesMu sync.RWMutex +) + +// RequireCapability skips t when the active backend does not provide cap. +// Tests gated with RequireCapability are listed in the design proposal's +// Test Classification table as "Capability-gated". +// +// The function is a no-op if capabilities have not been discovered yet (e.g. when +// running unit tests with go test ./... without an e2e cluster); the e2e build +// flow guarantees discovery runs before any test that calls this. 
+func RequireCapability(t *testing.T, cap Capability) { + t.Helper() + currentCapabilitiesMu.RLock() + defer currentCapabilitiesMu.RUnlock() + if !currentCapabilitiesSet { + return + } + if !currentCapabilities.Has(cap) { + t.Skipf("skipping: active backend %q does not provide capability %q", + currentCapabilities.ActiveBackend, cap) + } +} diff --git a/operator/e2e/tests/capabilities_test.go b/operator/e2e/tests/capabilities_test.go new file mode 100644 index 000000000..be1a21da7 --- /dev/null +++ b/operator/e2e/tests/capabilities_test.go @@ -0,0 +1,100 @@ +// /* +// Copyright 2026 The Grove Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// */ + +package tests + +import ( + "testing" + + configv1alpha1 "github.com/ai-dynamo/grove/operator/api/config/v1alpha1" + "github.com/ai-dynamo/grove/operator/internal/scheduler" + "github.com/ai-dynamo/grove/operator/internal/scheduler/kai" + "github.com/ai-dynamo/grove/operator/internal/scheduler/kube" + + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/tools/record" + "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +// backendConstructors mirrors the switch in +// operator/internal/scheduler/manager/manager.go newBackendForProfile. +// Adding a new backend means adding a row here AND in +// backendInterfaceCapabilities (in capabilities.go); TestCapabilityTableMatchesBackends +// fails the build if the two disagree. 
+var backendConstructors = map[configv1alpha1.SchedulerName]func() scheduler.Backend{ + configv1alpha1.SchedulerNameKai: func() scheduler.Backend { + return kai.New( + fake.NewClientBuilder().Build(), + runtime.NewScheme(), + record.NewFakeRecorder(1), + configv1alpha1.SchedulerProfile{Name: configv1alpha1.SchedulerNameKai}, + ) + }, + configv1alpha1.SchedulerNameKube: func() scheduler.Backend { + return kube.New( + fake.NewClientBuilder().Build(), + runtime.NewScheme(), + record.NewFakeRecorder(1), + configv1alpha1.SchedulerProfile{Name: configv1alpha1.SchedulerNameKube}, + ) + }, +} + +// TestCapabilityTableCoversAllSupportedBackends ensures the hardcoded +// capability table has a row for every backend the operator can construct. +// Catches the failure mode where a contributor adds a backend to +// SupportedSchedulerNames + manager.newBackendForProfile but forgets the +// capability table — without this, the new backend's capability-gated tests +// would silently skip rather than fail. +func TestCapabilityTableCoversAllSupportedBackends(t *testing.T) { + for _, name := range configv1alpha1.SupportedSchedulerNames { + if _, ok := backendInterfaceCapabilities[string(name)]; !ok { + t.Errorf("backend %q is in SupportedSchedulerNames but missing from "+ + "backendInterfaceCapabilities; add a row to "+ + "operator/e2e/tests/capabilities.go", name) + } + if _, ok := backendConstructors[name]; !ok { + t.Errorf("backend %q is in SupportedSchedulerNames but missing from "+ + "backendConstructors; add a row to "+ + "operator/e2e/tests/capabilities_test.go", name) + } + } +} + +// TestCapabilityTableMatchesBackends cross-checks the hardcoded capability +// table against actual Go interface implementation for each backend. Catches +// the failure mode where a backend's interface set changes (e.g. 
KAI drops +// TopologyAwareSchedBackend) but the table is not updated — without this, the +// E2E suite would either skip valid TAS tests or run them against a backend +// that no longer supports TAS. +func TestCapabilityTableMatchesBackends(t *testing.T) { + for name, ctor := range backendConstructors { + t.Run(string(name), func(t *testing.T) { + b := ctor() + table := backendInterfaceCapabilities[string(name)] + + // TopologyAwareScheduling: tied to the Go interface assertion + // the operator itself uses (clustertopology.go L46–54). + _, gotTAS := b.(scheduler.TopologyAwareSchedBackend) + wantTAS := table[TopologyAwareScheduling] + if gotTAS != wantTAS { + t.Errorf("backend %q: TopologyAwareScheduling table=%v but "+ + "interface assertion=%v; update either the backend "+ + "or backendInterfaceCapabilities", name, wantTAS, gotTAS) + } + }) + } +} diff --git a/operator/e2e/tests/capability_discovery.go b/operator/e2e/tests/capability_discovery.go new file mode 100644 index 000000000..ee04f9a50 --- /dev/null +++ b/operator/e2e/tests/capability_discovery.go @@ -0,0 +1,75 @@ +//go:build e2e + +// /* +// Copyright 2026 The Grove Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// */ + +package tests + +import ( + "context" + "fmt" + + "github.com/ai-dynamo/grove/operator/e2e/grove/config" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +// DiscoverCapabilities resolves the active backend and its capabilities from +// the live OperatorConfiguration plus the hardcoded interface table, stores +// the result in the package-level currentCapabilities so RequireCapability +// can read it, and returns the resolved set so callers may log/inspect it. +// +// Called once from TestMain before any test runs. +func DiscoverCapabilities(ctx context.Context, crClient client.Client) (CapabilitySet, error) { + md, err := config.NewOperatorConfig(crClient).ReadGroveMetadata(ctx) + if err != nil { + return CapabilitySet{}, fmt.Errorf("read OperatorConfiguration: %w", err) + } + + backend := md.Config.Scheduler.DefaultProfileName + table, ok := backendInterfaceCapabilities[backend] + if !ok { + return CapabilitySet{}, fmt.Errorf( + "active backend %q has no entry in backendInterfaceCapabilities; "+ + "please update operator/e2e/tests/capabilities.go", backend) + } + + set := CapabilitySet{ + ActiveBackend: backend, + caps: map[Capability]bool{}, + } + + // Backend-coupled capability: present iff backend is in the table for it. + if table[GangScheduling] { + set.caps[GangScheduling] = true + } + + // Backend-coupled capability gated by an additional config flag. + if md.Config.TopologyAwareScheduling.Enabled && table[TopologyAwareScheduling] { + set.caps[TopologyAwareScheduling] = true + } + + // Config-only capability: no interface-table lookup. 
+ if md.Config.Network.AutoMNNVLEnabled { + set.caps[AutoMNNVL] = true + } + + currentCapabilitiesMu.Lock() + currentCapabilities = set + currentCapabilitiesSet = true + currentCapabilitiesMu.Unlock() + + return set, nil +} diff --git a/operator/e2e/tests/gang_scheduling_test.go b/operator/e2e/tests/gang_scheduling_test.go index 3a971d6eb..b5095b143 100644 --- a/operator/e2e/tests/gang_scheduling_test.go +++ b/operator/e2e/tests/gang_scheduling_test.go @@ -32,6 +32,7 @@ import ( // 3. Verify all workload pods are pending due to insufficient resources // 4. Uncordon the node and verify all pods get scheduled func Test_GS1_GangSchedulingWithFullReplicas(t *testing.T) { + RequireCapability(t, GangScheduling) ctx := context.Background() Logger.Info("1. Initialize a 10-node Grove cluster, then cordon 1 node") @@ -83,6 +84,7 @@ func Test_GS1_GangSchedulingWithFullReplicas(t *testing.T) { // 6. Scale PCSG replicas to 3 and verify 4 new pending pods // 7. Uncordon remaining nodes and verify all pods get scheduled func Test_GS2_GangSchedulingWithScalingFullReplicas(t *testing.T) { + RequireCapability(t, GangScheduling) ctx := context.Background() // Setup cluster (shared or individual based on test run mode) @@ -152,6 +154,7 @@ func Test_GS2_GangSchedulingWithScalingFullReplicas(t *testing.T) { // 6. Scale PCS replicas to 2 and verify 10 new pending pods // 7. Uncordon remaining nodes and verify all pods get scheduled func Test_GS3_GangSchedulingWithPCSScalingFullReplicas(t *testing.T) { + RequireCapability(t, GangScheduling) ctx := context.Background() Logger.Info("1. Initialize a 20-node Grove cluster, then cordon 11 nodes") @@ -217,6 +220,7 @@ func Test_GS3_GangSchedulingWithPCSScalingFullReplicas(t *testing.T) { // 9. Scale PCSG replicas to 3 and verify 4 new pending pods // 10. 
Uncordon remaining nodes and verify all pods get scheduled func Test_GS4_GangSchedulingWithPCSAndPCSGScalingFullReplicas(t *testing.T) { + RequireCapability(t, GangScheduling) ctx := context.Background() Logger.Info("1. Initialize a 28-node Grove cluster, then cordon 19 nodes") @@ -283,6 +287,7 @@ func Test_GS4_GangSchedulingWithPCSAndPCSGScalingFullReplicas(t *testing.T) { // 5. Wait for scheduled pods to become ready // 6. Uncordon 7 nodes and verify all remaining workload pods get scheduled func Test_GS5_GangSchedulingWithMinReplicas(t *testing.T) { + RequireCapability(t, GangScheduling) ctx := context.Background() Logger.Info("1. Initialize a 10-node Grove cluster, then cordon 8 nodes") @@ -351,6 +356,7 @@ func Test_GS5_GangSchedulingWithMinReplicas(t *testing.T) { // 11. Wait for scheduled pods to become ready // 12. Uncordon 2 nodes and verify remaining workload pods get scheduled func Test_GS6_GangSchedulingWithPCSGScalingMinReplicas(t *testing.T) { + RequireCapability(t, GangScheduling) ctx := context.Background() Logger.Info("1. Initialize a 14-node Grove cluster, then cordon 12 nodes") @@ -463,6 +469,7 @@ func Test_GS6_GangSchedulingWithPCSGScalingMinReplicas(t *testing.T) { // 13. Wait for scheduled pods to become ready // 14. Uncordon 2 nodes and verify remaining workload pods get scheduled func Test_GS7_GangSchedulingWithPCSGScalingMinReplicasAdvanced1(t *testing.T) { + RequireCapability(t, GangScheduling) ctx := context.Background() Logger.Info("1. Initialize a 14-node Grove cluster, then cordon 12 nodes") @@ -584,6 +591,7 @@ func Test_GS7_GangSchedulingWithPCSGScalingMinReplicasAdvanced1(t *testing.T) { // 9. Wait for scheduled pods to become ready // 10. Uncordon 7 nodes and verify the remaining workload pods get scheduled func Test_GS8_GangSchedulingWithPCSGScalingMinReplicasAdvanced2(t *testing.T) { + RequireCapability(t, GangScheduling) ctx := context.Background() Logger.Info("1. 
Initialize a 14-node Grove cluster, then cordon 12 nodes") @@ -679,6 +687,7 @@ func Test_GS8_GangSchedulingWithPCSGScalingMinReplicasAdvanced2(t *testing.T) { // 10. Wait for scheduled pods to become ready // 11. Uncordon 7 nodes and verify the remaining workload pods get scheduled func Test_GS9_GangSchedulingWithPCSScalingMinReplicas(t *testing.T) { + RequireCapability(t, GangScheduling) ctx := context.Background() Logger.Info("1. Initialize a 20-node Grove cluster, then cordon 18 nodes") @@ -782,6 +791,7 @@ func Test_GS9_GangSchedulingWithPCSScalingMinReplicas(t *testing.T) { // 9. Wait for scheduled pods to become ready // 10. Uncordon 10 nodes and verify the remaining workload pods get scheduled func Test_GS10_GangSchedulingWithPCSScalingMinReplicasAdvanced(t *testing.T) { + RequireCapability(t, GangScheduling) ctx := context.Background() Logger.Info("1. Initialize a 20-node Grove cluster, then cordon 18 nodes") @@ -886,6 +896,7 @@ func Test_GS10_GangSchedulingWithPCSScalingMinReplicasAdvanced(t *testing.T) { // 19. Wait for 2 more pods to be scheduled (min-available for pcs-1-sg-x-2) // 20. Uncordon 2 nodes and verify remaining workload pods get scheduled func Test_GS11_GangSchedulingWithPCSAndPCSGScalingMinReplicas(t *testing.T) { + RequireCapability(t, GangScheduling) ctx := context.Background() Logger.Info("1. Initialize a 28-node Grove cluster, then cordon 26 nodes") @@ -1028,6 +1039,7 @@ func Test_GS11_GangSchedulingWithPCSAndPCSGScalingMinReplicas(t *testing.T) { // 11. Wait for scheduled pods to become ready // 12. Uncordon 14 nodes and verify the remaining workload pods get scheduled func Test_GS12_GangSchedulingWithComplexPCSGScaling(t *testing.T) { + RequireCapability(t, GangScheduling) ctx := context.Background() Logger.Info("1. 
Initialize a 28-node Grove cluster, then cordon 26 nodes") diff --git a/operator/e2e/tests/suite_test.go b/operator/e2e/tests/suite_test.go index a9a2e4847..7a0381324 100644 --- a/operator/e2e/tests/suite_test.go +++ b/operator/e2e/tests/suite_test.go @@ -45,6 +45,17 @@ func TestMain(m *testing.M) { os.Exit(1) } + // Discover scheduler capabilities from the live OperatorConfiguration + // before any test runs. RequireCapability uses the result to auto-skip + // tests whose required capability is not provided by the active backend. + caps, err := DiscoverCapabilities(ctx, sharedCluster.GetClient()) + if err != nil { + Logger.Errorf("failed to discover scheduler capabilities: %s", err) + sharedCluster.Teardown() + os.Exit(1) + } + Logger.Infof("Active backend: %s", caps.ActiveBackend) + // Run tests code := m.Run() diff --git a/operator/e2e/tests/topology_test.go b/operator/e2e/tests/topology_test.go index 204cef333..57678a17c 100644 --- a/operator/e2e/tests/topology_test.go +++ b/operator/e2e/tests/topology_test.go @@ -101,6 +101,7 @@ func GetPodGroupOrFail(t *testing.T, tc *testctx.TestContext, podGroupVerifier * // Note: grove-topology is NOT cleaned up after this test — it is shared cluster infrastructure // used by TAS2-TAS16. ensureGroveTopology() in each subsequent test is idempotent. func Test_TAS1_TopologyInfrastructure(t *testing.T) { + RequireCapability(t, TopologyAwareScheduling) ctx := context.Background() tc, cleanup := testctx.PrepareTest(ctx, t, 0) @@ -166,6 +167,7 @@ func Test_TAS1_TopologyInfrastructure(t *testing.T) { // 4. Verify worker-block pods (4) are in the same block // 5. Verify different cliques can have independent topology constraints func Test_TAS2_MultipleCliquesWithDifferentConstraints(t *testing.T) { + RequireCapability(t, TopologyAwareScheduling) ctx := context.Background() Logger.Info("1. 
Initialize a 28-node Grove cluster for topology testing") @@ -225,6 +227,7 @@ func Test_TAS2_MultipleCliquesWithDifferentConstraints(t *testing.T) { // 4. Verify router pods (2 standalone) // 5. Verify KAI PodGroup SubGroups: NO PCSG parent groups (because PCSG constraint is nil, per PR #357) func Test_TAS3_PCSOnlyConstraint(t *testing.T) { + RequireCapability(t, TopologyAwareScheduling) ctx := context.Background() Logger.Info("1. Initialize a 28-node Grove cluster for topology testing") @@ -279,6 +282,7 @@ func Test_TAS3_PCSOnlyConstraint(t *testing.T) { // 3. Verify PCSG worker pods (2 total) respect rack constraint // 4. Router pods (2 standalone) are unconstrained func Test_TAS4_PCSGOnlyConstraint(t *testing.T) { + RequireCapability(t, TopologyAwareScheduling) ctx := context.Background() Logger.Info("1. Initialize a 28-node Grove cluster for topology testing") @@ -344,6 +348,7 @@ func Test_TAS4_PCSGOnlyConstraint(t *testing.T) { // 2. PCS has NO explicit constraint // 3. Verify all 2 pods on same host (strictest constraint) func Test_TAS5_HostLevelConstraint(t *testing.T) { + RequireCapability(t, TopologyAwareScheduling) ctx := context.Background() Logger.Info("1. Initialize a 28-node Grove cluster for topology testing") @@ -406,6 +411,7 @@ func Test_TAS5_HostLevelConstraint(t *testing.T) { // 3. Verify KAI PodGroup has zone constraint at top level // 4. Verify 1 SubGroup (standalone PCLQ) with NO additional constraint func Test_TAS6_StandalonePCLQOnlyPCSZoneConstraint(t *testing.T) { + RequireCapability(t, TopologyAwareScheduling) ctx := context.Background() Logger.Info("1. Initialize a 28-node Grove cluster for topology testing") @@ -454,6 +460,7 @@ func Test_TAS6_StandalonePCLQOnlyPCSZoneConstraint(t *testing.T) { // 2. Verify all 4 pods scheduled (gang scheduling works) // 3. 
Verify KAI PodGroup has 4 SubGroups with NO topology constraints func Test_TAS7_NoTopologyConstraint(t *testing.T) { + RequireCapability(t, TopologyAwareScheduling) ctx := context.Background() Logger.Info("1. Initialize a 28-node Grove cluster for topology testing") @@ -506,6 +513,7 @@ func Test_TAS7_NoTopologyConstraint(t *testing.T) { // 5. Verify all pods in same block (PCS constraint) // 6. Verify KAI PodGroup hierarchy with correct topology constraints func Test_TAS8_FullHierarchyWithCascadingConstraints(t *testing.T) { + RequireCapability(t, TopologyAwareScheduling) ctx := context.Background() Logger.Info("1. Initialize an 8-node Grove cluster for topology testing") @@ -583,6 +591,7 @@ func Test_TAS8_FullHierarchyWithCascadingConstraints(t *testing.T) { // 3. Verify pods on same host (PCLQ constraint - strictest) // 4. Verify KAI PodGroup has block constraint at top level, host constraint at PCLQ level func Test_TAS9_PCSPlusPCLQConstraint(t *testing.T) { + RequireCapability(t, TopologyAwareScheduling) ctx := context.Background() Logger.Info("1. Initialize a 28-node Grove cluster for topology testing") @@ -633,6 +642,7 @@ func Test_TAS9_PCSPlusPCLQConstraint(t *testing.T) { // 5. Verify base PodGang KAI PodGroup topology constraints // 6. Verify scaled PodGangs' KAI PodGroups (replicas 1-2) func Test_TAS10_PCSGScalingWithTopologyConstraints(t *testing.T) { + RequireCapability(t, TopologyAwareScheduling) ctx := context.Background() Logger.Info("1. Initialize a 28-node Grove cluster for topology testing") @@ -713,6 +723,7 @@ func Test_TAS10_PCSGScalingWithTopologyConstraints(t *testing.T) { // 3. Verify each PCSG replica's pods on same host // 4. Verify KAI PodGroup has PCSG rack + PCLQ host constraints, NO top-level PCS constraint func Test_TAS11_PCSGPlusPCLQNoParentConstraint(t *testing.T) { + RequireCapability(t, TopologyAwareScheduling) ctx := context.Background() Logger.Info("1. 
Initialize a 28-node Grove cluster for topology testing") @@ -772,6 +783,7 @@ func Test_TAS11_PCSGPlusPCLQNoParentConstraint(t *testing.T) { // 5. Verify base PodGang KAI PodGroup contains minAvailable=3 replicas // 6. Verify 7 scaled PodGangs' KAI PodGroups (replicas 3-9) func Test_TAS12_LargeScalingRatio(t *testing.T) { + RequireCapability(t, TopologyAwareScheduling) ctx := context.Background() Logger.Info("1. Initialize a 28-node Grove cluster for topology testing") @@ -866,6 +878,7 @@ func Test_TAS12_LargeScalingRatio(t *testing.T) { // 4. Verify pod events show Unschedulable reason // 5. Verify KAI PodGroup exists with correct constraints even though pods are pending func Test_TAS13_InsufficientNodesForConstraint(t *testing.T) { + RequireCapability(t, TopologyAwareScheduling) ctx := context.Background() Logger.Info("1. Initialize a 28-node Grove cluster for topology testing") @@ -925,6 +938,7 @@ func Test_TAS13_InsufficientNodesForConstraint(t *testing.T) { // 3. Verify each PCS replica's pods in same rack // 4. Verify KAI PodGroups for both PCS replicas have correct topology constraints func Test_TAS14_MultiReplicaWithRackConstraint(t *testing.T) { + RequireCapability(t, TopologyAwareScheduling) ctx := context.Background() Logger.Info("1. Initialize a 28-node Grove cluster for topology testing") @@ -983,6 +997,7 @@ func Test_TAS14_MultiReplicaWithRackConstraint(t *testing.T) { // 6. Verify base PodGang KAI PodGroup topology for complex multi-PCSG workload // 7. Verify scaled PodGangs' KAI PodGroups (decoder replica 1, prefill replica 1) func Test_TAS15_DisaggregatedInferenceMultiplePCSGs(t *testing.T) { + RequireCapability(t, TopologyAwareScheduling) ctx := context.Background() Logger.Info("1. Initialize a 28-node Grove cluster for topology testing") @@ -1097,6 +1112,7 @@ func Test_TAS15_DisaggregatedInferenceMultiplePCSGs(t *testing.T) { // 4. Verify block constraint at PCS level, rack at PCSG, for both PCS replicas // 5. 
Similar to TAS15 but scaled across 2 PCS replicas func Test_TAS16_MultiReplicaPCSWithThreeLevelHierarchy(t *testing.T) { + RequireCapability(t, TopologyAwareScheduling) ctx := context.Background() Logger.Info("1. Initialize a 28-node Grove cluster for multi-replica PCS testing") @@ -1198,6 +1214,7 @@ func Test_TAS16_MultiReplicaPCSWithThreeLevelHierarchy(t *testing.T) { // 5. Verify KAI Topology CRs auto-created with correct keys // 6. Deploy H100 and GB200 workloads, verify pods packed at block level on correct node segments func Test_TAS17_HeterogeneousGPUCluster(t *testing.T) { + RequireCapability(t, TopologyAwareScheduling) ctx := context.Background() Logger.Info("1. Initialize a 28-node Grove cluster for heterogeneous GPU testing") @@ -1374,6 +1391,7 @@ func Test_TAS17_HeterogeneousGPUCluster(t *testing.T) { // 2. Verify SchedulerTopologyDrift condition becomes True/Drift // 3. Verify SchedulerTopologyStatuses shows InSync=false func Test_TAS18_ClusterTopologyDriftDetection(t *testing.T) { + RequireCapability(t, TopologyAwareScheduling) const ctName = "drift-detect-topo" const kaiTopoRef = "non-existent-kai-topo" ctx := context.Background() @@ -1434,6 +1452,7 @@ func Test_TAS18_ClusterTopologyDriftDetection(t *testing.T) { // 5. Verify KAI Topology recreated with 3 keys // 6. Verify SchedulerTopologyDrift remains False/InSync func Test_TAS19_AutoManagedCTLifecycle(t *testing.T) { + RequireCapability(t, TopologyAwareScheduling) const ctName = "lifecycle-topo" ctx := context.Background() @@ -1514,6 +1533,7 @@ func Test_TAS19_AutoManagedCTLifecycle(t *testing.T) { // 9. Re-create the ClusterTopology // 10. Verify TopologyLevelsUnavailable = False/AllClusterTopologyLevelsAvailable func Test_TAS20_PCSTopologyLevelsUnavailableCondition(t *testing.T) { + RequireCapability(t, TopologyAwareScheduling) ctx := context.Background() Logger.Info("1. 
Initialize a 2-node Grove cluster for PCS condition testing") @@ -1639,6 +1659,7 @@ func Test_TAS20_PCSTopologyLevelsUnavailableCondition(t *testing.T) { // Test_TAS21_ClusterTopologyValidationWebhook verifies that the ClusterTopology validating webhook // rejects invalid topology definitions and invalid schedulerTopologyReferences. func Test_TAS21_ClusterTopologyValidationWebhook(t *testing.T) { + RequireCapability(t, TopologyAwareScheduling) ctx := context.Background() Logger.Info("1. Initialize a Grove cluster for ClusterTopology webhook validation testing") diff --git a/operator/e2e/yaml/tas-hierarchy.yaml b/operator/e2e/yaml/tas-hierarchy.yaml index 9c6c0572a..2794efd1e 100644 --- a/operator/e2e/yaml/tas-hierarchy.yaml +++ b/operator/e2e/yaml/tas-hierarchy.yaml @@ -36,7 +36,6 @@ spec: minAvailable: 2 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -70,7 +69,6 @@ spec: minAvailable: 2 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: diff --git a/operator/e2e/yaml/tas-host-level.yaml b/operator/e2e/yaml/tas-host-level.yaml index b13bda54d..44bb968ae 100644 --- a/operator/e2e/yaml/tas-host-level.yaml +++ b/operator/e2e/yaml/tas-host-level.yaml @@ -23,7 +23,6 @@ spec: minAvailable: 2 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: diff --git a/operator/e2e/yaml/tas-indep-clq.yaml b/operator/e2e/yaml/tas-indep-clq.yaml index 614c867bb..b11b0e9f6 100644 --- a/operator/e2e/yaml/tas-indep-clq.yaml +++ b/operator/e2e/yaml/tas-indep-clq.yaml @@ -24,7 +24,6 @@ spec: minAvailable: 3 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -67,7 +66,6 @@ spec: minAvailable: 4 podSpec: 
terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: diff --git a/operator/e2e/yaml/tas-insuffic.yaml b/operator/e2e/yaml/tas-insuffic.yaml index f77278abc..cbc2cc0a9 100644 --- a/operator/e2e/yaml/tas-insuffic.yaml +++ b/operator/e2e/yaml/tas-insuffic.yaml @@ -23,7 +23,6 @@ spec: minAvailable: 10 # All-or-nothing gang scheduling podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: diff --git a/operator/e2e/yaml/tas-large-scale.yaml b/operator/e2e/yaml/tas-large-scale.yaml index 034d17769..5253f297a 100644 --- a/operator/e2e/yaml/tas-large-scale.yaml +++ b/operator/e2e/yaml/tas-large-scale.yaml @@ -33,7 +33,6 @@ spec: minAvailable: 2 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: diff --git a/operator/e2e/yaml/tas-multirep.yaml b/operator/e2e/yaml/tas-multirep.yaml index 71218e47e..aa2ed865f 100644 --- a/operator/e2e/yaml/tas-multirep.yaml +++ b/operator/e2e/yaml/tas-multirep.yaml @@ -23,7 +23,6 @@ spec: minAvailable: 2 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: diff --git a/operator/e2e/yaml/tas-no-constraint.yaml b/operator/e2e/yaml/tas-no-constraint.yaml index 111937d41..22bfa6623 100644 --- a/operator/e2e/yaml/tas-no-constraint.yaml +++ b/operator/e2e/yaml/tas-no-constraint.yaml @@ -26,7 +26,6 @@ spec: minAvailable: 2 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: diff --git a/operator/e2e/yaml/tas-pcs-multi-pcsg-multi-replica.yaml b/operator/e2e/yaml/tas-pcs-multi-pcsg-multi-replica.yaml index 24f7b69e7..e29e8aa91 100644 --- a/operator/e2e/yaml/tas-pcs-multi-pcsg-multi-replica.yaml +++ 
b/operator/e2e/yaml/tas-pcs-multi-pcsg-multi-replica.yaml @@ -42,7 +42,6 @@ spec: minAvailable: 1 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -73,7 +72,6 @@ spec: minAvailable: 1 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -107,7 +105,6 @@ spec: minAvailable: 1 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -138,7 +135,6 @@ spec: minAvailable: 1 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -169,7 +165,6 @@ spec: minAvailable: 2 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: diff --git a/operator/e2e/yaml/tas-pcs-multi-pcsg.yaml b/operator/e2e/yaml/tas-pcs-multi-pcsg.yaml index 115caf612..69bec7fe5 100644 --- a/operator/e2e/yaml/tas-pcs-multi-pcsg.yaml +++ b/operator/e2e/yaml/tas-pcs-multi-pcsg.yaml @@ -42,7 +42,6 @@ spec: minAvailable: 1 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -82,7 +81,6 @@ spec: minAvailable: 1 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -122,7 +120,6 @@ spec: minAvailable: 1 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -162,7 +159,6 @@ spec: minAvailable: 1 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -202,7 +198,6 @@ spec: 
minAvailable: 2 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: diff --git a/operator/e2e/yaml/tas-pcs-pclq.yaml b/operator/e2e/yaml/tas-pcs-pclq.yaml index ca90817fe..1034d0539 100644 --- a/operator/e2e/yaml/tas-pcs-pclq.yaml +++ b/operator/e2e/yaml/tas-pcs-pclq.yaml @@ -26,7 +26,6 @@ spec: minAvailable: 2 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: diff --git a/operator/e2e/yaml/tas-pcsg-pclq.yaml b/operator/e2e/yaml/tas-pcsg-pclq.yaml index c69cac669..97be1aa97 100644 --- a/operator/e2e/yaml/tas-pcsg-pclq.yaml +++ b/operator/e2e/yaml/tas-pcsg-pclq.yaml @@ -32,7 +32,6 @@ spec: minAvailable: 2 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: diff --git a/operator/e2e/yaml/tas-pcsg-scale.yaml b/operator/e2e/yaml/tas-pcsg-scale.yaml index ffcc06e65..6cc999249 100644 --- a/operator/e2e/yaml/tas-pcsg-scale.yaml +++ b/operator/e2e/yaml/tas-pcsg-scale.yaml @@ -32,7 +32,6 @@ spec: minAvailable: 2 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: diff --git a/operator/e2e/yaml/tas-sl-pcs-only.yaml b/operator/e2e/yaml/tas-sl-pcs-only.yaml index 3b71e2849..fac96f735 100644 --- a/operator/e2e/yaml/tas-sl-pcs-only.yaml +++ b/operator/e2e/yaml/tas-sl-pcs-only.yaml @@ -29,7 +29,6 @@ spec: minAvailable: 1 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -69,7 +68,6 @@ spec: minAvailable: 2 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: diff --git a/operator/e2e/yaml/tas-sl-pcsg-only.yaml 
b/operator/e2e/yaml/tas-sl-pcsg-only.yaml index 598bdff55..336a587ee 100644 --- a/operator/e2e/yaml/tas-sl-pcsg-only.yaml +++ b/operator/e2e/yaml/tas-sl-pcsg-only.yaml @@ -29,7 +29,6 @@ spec: minAvailable: 1 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -69,7 +68,6 @@ spec: minAvailable: 2 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: diff --git a/operator/e2e/yaml/tas-standalone-pclq-only-pcs-zone.yaml b/operator/e2e/yaml/tas-standalone-pclq-only-pcs-zone.yaml index 77a1887ca..d5231b034 100644 --- a/operator/e2e/yaml/tas-standalone-pclq-only-pcs-zone.yaml +++ b/operator/e2e/yaml/tas-standalone-pclq-only-pcs-zone.yaml @@ -23,7 +23,6 @@ spec: minAvailable: 4 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: diff --git a/operator/e2e/yaml/workload-ondelete.yaml b/operator/e2e/yaml/workload-ondelete.yaml index b8a94162d..de77b56f3 100644 --- a/operator/e2e/yaml/workload-ondelete.yaml +++ b/operator/e2e/yaml/workload-ondelete.yaml @@ -21,7 +21,6 @@ spec: minAvailable: 1 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -52,7 +51,6 @@ spec: minAvailable: 1 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -83,7 +81,6 @@ spec: minAvailable: 3 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: diff --git a/operator/e2e/yaml/workload1.yaml b/operator/e2e/yaml/workload1.yaml index cfd254429..b81de1414 100644 --- a/operator/e2e/yaml/workload1.yaml +++ b/operator/e2e/yaml/workload1.yaml @@ -19,7 +19,6 
@@ spec: minAvailable: 2 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -50,7 +49,6 @@ spec: minAvailable: 1 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -81,7 +79,6 @@ spec: minAvailable: 3 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: diff --git a/operator/e2e/yaml/workload2.yaml b/operator/e2e/yaml/workload2.yaml index 0d3a6b22a..01b15c5dd 100644 --- a/operator/e2e/yaml/workload2.yaml +++ b/operator/e2e/yaml/workload2.yaml @@ -20,7 +20,6 @@ spec: minAvailable: 1 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -51,7 +50,6 @@ spec: minAvailable: 1 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -82,7 +80,6 @@ spec: minAvailable: 1 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: diff --git a/operator/e2e/yaml/workload3.yaml b/operator/e2e/yaml/workload3.yaml index ccc7b8fae..d7c7bef8d 100644 --- a/operator/e2e/yaml/workload3.yaml +++ b/operator/e2e/yaml/workload3.yaml @@ -20,7 +20,6 @@ spec: minAvailable: 2 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -51,7 +50,6 @@ spec: minAvailable: 1 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -82,7 +80,6 @@ spec: minAvailable: 3 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: 
requiredDuringSchedulingIgnoredDuringExecution: diff --git a/operator/e2e/yaml/workload4.yaml b/operator/e2e/yaml/workload4.yaml index 9e63f3e33..80bdcaac5 100644 --- a/operator/e2e/yaml/workload4.yaml +++ b/operator/e2e/yaml/workload4.yaml @@ -20,7 +20,6 @@ spec: minAvailable: 1 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -51,7 +50,6 @@ spec: minAvailable: 1 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -82,7 +80,6 @@ spec: minAvailable: 1 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: diff --git a/operator/e2e/yaml/workload5.yaml b/operator/e2e/yaml/workload5.yaml index 5c760a088..1c693229f 100644 --- a/operator/e2e/yaml/workload5.yaml +++ b/operator/e2e/yaml/workload5.yaml @@ -20,7 +20,6 @@ spec: minAvailable: 2 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -53,7 +52,6 @@ spec: - pc-c podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -86,7 +84,6 @@ spec: - pc-a podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: diff --git a/operator/e2e/yaml/workload6.yaml b/operator/e2e/yaml/workload6.yaml index 7c11982cb..4d7570e95 100644 --- a/operator/e2e/yaml/workload6.yaml +++ b/operator/e2e/yaml/workload6.yaml @@ -20,7 +20,6 @@ spec: minAvailable: 1 podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -53,7 +52,6 @@ spec: - pc-a podSpec: terminationGracePeriodSeconds: 5 - schedulerName: 
kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -86,7 +84,6 @@ spec: - pc-b podSpec: terminationGracePeriodSeconds: 5 - schedulerName: kai-scheduler affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: diff --git a/operator/hack/README.md b/operator/hack/README.md index 8417510d1..fa58ed7a5 100644 --- a/operator/hack/README.md +++ b/operator/hack/README.md @@ -89,7 +89,7 @@ All configuration can be overridden via `E2E_*` environment variables (used by ` **Components (ComponentConfig):** - `E2E_KAI_VERSION` - Kai Scheduler version (default: from `dependencies.yaml`) -- `E2E_SKAFFOLD_PROFILE` - Skaffold profile for Grove (default: `topology-test`) +- `E2E_SKAFFOLD_PROFILE` - Skaffold profile for Grove (default: `e2e-kai`) - `E2E_GROVE_NAMESPACE` - Grove operator namespace (default: `grove-system`) - `E2E_REGISTRY` - Container registry override (default: none) diff --git a/operator/hack/e2e-autoMNNVL/README.md b/operator/hack/e2e-autoMNNVL/README.md index 3670eea74..4a1d4de6b 100644 --- a/operator/hack/e2e-autoMNNVL/README.md +++ b/operator/hack/e2e-autoMNNVL/README.md @@ -93,5 +93,5 @@ make e2e-cluster-down - **Cluster name:** `shared-e2e-test-cluster` (same as standard e2e) - **Nodes:** 1 server + 2 agents (lightweight — standard e2e uses 30) - **Registry:** local registry on port 5001 -- **Skaffold profile:** `topology-test` (same as standard e2e; Kai and topology are installed, only worker count and prepull are reduced) +- **Skaffold profile:** `e2e-kai` (same as standard e2e; Kai and topology are installed, only worker count and prepull are reduced) - **Fake GPU:** [fake-gpu-operator](https://github.com/run-ai/fake-gpu-operator) v0.0.72 (provides ComputeDomain CRD) diff --git a/operator/hack/e2e-cluster/create-e2e-cluster.py b/operator/hack/e2e-cluster/create-e2e-cluster.py index 54af806dd..7f710f98b 100755 --- a/operator/hack/e2e-cluster/create-e2e-cluster.py +++ 
b/operator/hack/e2e-cluster/create-e2e-cluster.py @@ -129,7 +129,7 @@ class ClusterConfig(BaseSettings): worker_memory: Optional[str] = Field(default=DEFAULT_WORKER_MEMORY, pattern=r"^\d+[mMgG]?$") k3s_image: str = "rancher/k3s:v1.34.2-k3s1" kai_version: str = Field(default=DEPENDENCIES['kai_scheduler']['version'], pattern=r"^v[\d.]+(-[\w.]+)?$") - skaffold_profile: str = "topology-test" + skaffold_profile: str = "e2e-kai" max_retries: int = Field(default=3, ge=1, le=10) # Constants (not configurable via environment variables) diff --git a/operator/hack/e2e-default-scheduler.yaml b/operator/hack/e2e-default-scheduler.yaml new file mode 100644 index 000000000..0214b5566 --- /dev/null +++ b/operator/hack/e2e-default-scheduler.yaml @@ -0,0 +1,14 @@ +# E2E preset overlay for the in-tree default-scheduler backend. +# Layers on top of e2e.yaml; cluster shape and KWOK config are inherited so +# kai vs default-scheduler comparisons stay fair. +# +# Activated via: infra-manager.py setup -f hack/e2e-default-scheduler.yaml +# (threaded through E2E_CREATE_FLAGS in the CI matrix; see build-check-test.yaml). 
+ +scheduler: + kai: + enabled: false + +grove: + local: + skaffold_profile: e2e-default-scheduler diff --git a/operator/hack/e2e.yaml b/operator/hack/e2e.yaml index e09cedd40..a50290b9b 100644 --- a/operator/hack/e2e.yaml +++ b/operator/hack/e2e.yaml @@ -17,3 +17,5 @@ scheduler: grove: enabled: true profiling: false + local: + skaffold_profile: e2e-kai diff --git a/operator/hack/infra_manager/constants.py b/operator/hack/infra_manager/constants.py index 10a29d2b0..14b3be285 100644 --- a/operator/hack/infra_manager/constants.py +++ b/operator/hack/infra_manager/constants.py @@ -186,7 +186,7 @@ def parse_memory_mb(mem_str: str) -> int: DEFAULT_CLUSTER_CREATE_MAX_RETRIES = 3 # -- Component defaults -- -DEFAULT_SKAFFOLD_PROFILE = "topology-test" +DEFAULT_SKAFFOLD_PROFILE = "e2e-kai" DEFAULT_GROVE_NAMESPACE = "grove-system" # -- KWOK defaults -- diff --git a/operator/hack/infra_manager/orchestrator.py b/operator/hack/infra_manager/orchestrator.py index 198fde710..ae7454109 100644 --- a/operator/hack/infra_manager/orchestrator.py +++ b/operator/hack/infra_manager/orchestrator.py @@ -126,16 +126,21 @@ def _run_task(name: str, fn: Callable) -> None: console.print(outputs[name], end="") -def _run_prepull(registry_port: int) -> None: +def _run_prepull(registry_port: int, kai_enabled: bool) -> None: """Pre-pull images to local registry in a single batch. Args: registry_port: Port for the local container registry. + kai_enabled: Whether the KAI scheduler is enabled. When false, the KAI + image group is skipped because no workload will reference it. 
""" groups: list[tuple[list[str], str]] = [ - (DEPENDENCIES["kai_scheduler"]["images"], DEPENDENCIES["kai_scheduler"]["version"]), (DEPENDENCIES["cert_manager"]["images"], DEPENDENCIES["cert_manager"]["version"]), ] + if kai_enabled: + groups.insert( + 0, (DEPENDENCIES["kai_scheduler"]["images"], DEPENDENCIES["kai_scheduler"]["version"]) + ) busybox_images = dep_value("test_images", "busybox") if busybox_images: groups.append((busybox_images, "latest")) @@ -211,7 +216,9 @@ def run_setup(cfg: SetupConfig) -> None: if cfg.cluster.create: parallel_tasks["topology"] = apply_topology_labels if do_prepull: - parallel_tasks["prepull"] = lambda: _run_prepull(cfg.cluster.registry_port) + parallel_tasks["prepull"] = lambda: _run_prepull( + cfg.cluster.registry_port, cfg.scheduler.kai.enabled + ) if cfg.scheduler.kai.enabled: parallel_tasks["kai"] = lambda: install_kai_scheduler(cfg.scheduler.kai) if cfg.grove.enabled: diff --git a/operator/skaffold.yaml b/operator/skaffold.yaml index 72e403cc9..8a099ac46 100644 --- a/operator/skaffold.yaml +++ b/operator/skaffold.yaml @@ -73,7 +73,7 @@ profiles: config: leaderElection: enabled: false - - name: topology-test + - name: e2e-kai patches: - op: add path: /deploy/helm/releases/0/setValues @@ -88,6 +88,17 @@ profiles: enabled: false topologyAwareScheduling: enabled: true + - name: e2e-default-scheduler + patches: + - op: add + path: /deploy/helm/releases/0/setValues + value: + replicaCount: 1 + config: + scheduler: + defaultProfileName: default-scheduler + leaderElection: + enabled: false - name: mnnvl-test patches: - op: add