Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
136 changes: 75 additions & 61 deletions .github/workflows/build-check-test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,21 +32,59 @@ concurrency:
cancel-in-progress: true

jobs:
# Detect which paths have changed to conditionally run E2E tests
changes:
# E2E matrix selector. Computes which (suite × backend) rows actually run
# and which fall to the e2e-skip mirror, based on changed paths, PR labels,
# and draft state. See hack/e2e-select/main.py + testdata/ for the rules
# and golden samples.
e2e-select:
runs-on: ubuntu-latest
if: github.event_name == 'pull_request'
outputs:
e2e-relevant: ${{ steps.filter.outputs.e2e-relevant }}
run: ${{ steps.select.outputs.run }}
skip: ${{ steps.select.outputs.skip }}
has_run: ${{ steps.select.outputs.has_run }}
has_skip: ${{ steps.select.outputs.has_skip }}
reason: ${{ steps.select.outputs.reason }}
steps:
- uses: actions/checkout@v4
- uses: dorny/paths-filter@v3
id: filter
with:
filters: |
e2e-relevant:
- 'operator/**'
- '.github/**'
fetch-depth: 0
- uses: actions/setup-python@v5
with:
python-version: '3.11'
- name: Compute changed files
  # Writes the list of files this PR changes to /tmp/changed-files.txt,
  # which the selector step reads via --changed-files.
  env:
    BASE_SHA: ${{ github.event.pull_request.base.sha }}
    HEAD_SHA: ${{ github.event.pull_request.head.sha }}
  run: |
    # Three-dot form diffs HEAD against the merge base of BASE and HEAD, so
    # only changes introduced by the PR are listed. The two-dot form also
    # reports files that changed on the base branch after the PR forked,
    # which would select E2E rows the PR never touched.
    # Requires full history (checkout with fetch-depth: 0).
    git diff --name-only "${BASE_SHA}...${HEAD_SHA}" > /tmp/changed-files.txt
    echo "Changed files in this PR:"
    cat /tmp/changed-files.txt
- name: Run selector
  id: select
  # PR-controlled values are handed to the script through the environment
  # instead of being expanded into the script text, so a label name that
  # contains quotes or shell metacharacters cannot alter the command.
  env:
    PR_LABELS: ${{ join(github.event.pull_request.labels.*.name, ',') }}
    PR_IS_DRAFT: ${{ github.event.pull_request.draft }}
  run: |
    # The implicit default shell is `bash -e` without pipefail. Enable it so
    # a selector crash is not masked by `tee` and turned into empty outputs.
    set -o pipefail

    DRAFT_FLAG=""
    if [ "$PR_IS_DRAFT" = "true" ]; then
      DRAFT_FLAG="--draft"
    fi

    # Print full result to logs for transparency.
    # $DRAFT_FLAG is intentionally unquoted so it disappears when empty.
    echo "=== Selector result ==="
    python3 hack/e2e-select/main.py \
      --mode pr \
      --changed-files /tmp/changed-files.txt \
      --labels "$PR_LABELS" \
      $DRAFT_FLAG | tee /tmp/select.json
    echo "======================="

    # Slice into individual job outputs (one Python invocation, jq slices).
    {
      echo "run=$(jq -c .run /tmp/select.json)"
      echo "skip=$(jq -c .skip /tmp/select.json)"
      echo "has_run=$(jq -r .has_run /tmp/select.json)"
      echo "has_skip=$(jq -r .has_skip /tmp/select.json)"
      echo "reason=$(jq -r .reason /tmp/select.json)"
    } >> "$GITHUB_OUTPUT"

test:
runs-on: ubuntu-latest
Expand Down Expand Up @@ -87,55 +125,38 @@ jobs:
- name: check
run: make check

# E2E tests - only run after build, check, and test jobs succeed
# Only triggered by changes to operator or .github folders
# E2E tests — matrix comes from the selector job.
#
# Matrix entries (defined in hack/e2e-select/main.py ALL_ROWS):
# test_name - name shown in the GitHub Actions UI
# test_pattern - Go test -run regex
# backend - kai-scheduler | default-scheduler | ...
# create_flags - extra flags appended to E2E_CREATE_FLAGS
# (empty string means "use the base e2e.yaml preset, KAI")
# make_target - Makefile target (run-e2e-full | run-e2e-real-full | run-e2e-mnnvl-full)
#
# Matrix entries can set:
# test_name (required) - name shown in the GitHub Actions UI
# test_pattern (optional) - Go test -run pattern (standard e2e tests)
# make_target (optional) - Makefile target, defaults to run-e2e-full
# The selector handles draft policy: a draft PR without the 'run-e2e' label
# gets has_run=false (all rows fall to e2e-skip). The 'run-e2e' label forces
# the full matrix (safety escape for reviewers).
e2e:
needs: [test, build, check, changes]
# Run on non-draft PRs (or draft PRs with 'run-e2e' label)
# AND only when operator or .github files are changed
needs: [test, build, check, e2e-select]
if: |
github.event_name == 'pull_request' &&
needs.changes.outputs.e2e-relevant == 'true' &&
(github.event.pull_request.draft == false || contains(github.event.pull_request.labels.*.name, 'run-e2e'))
needs.e2e-select.outputs.has_run == 'true'
# Uses the NVIDIA self-hosted runner (runner settings are configured on the Velonix repository)
runs-on: prod-grove-e2e-v1
timeout-minutes: 60
strategy:
fail-fast: false
matrix:
include:
- test_name: gang_scheduling
test_pattern: "^Test_GS"
- test_name: rolling_updates
test_pattern: "^Test_RU"
- test_name: ondelete_updates
test_pattern: "^Test_OD"
- test_name: startup_ordering
test_pattern: "^Test_SO"
make_target: "run-e2e-real-full"
- test_name: Topology_Aware_Scheduling
test_pattern: "^Test_TAS"
- test_name: cert_management
test_pattern: "^Test_CM"
- test_name: auto_mnnvl
test_pattern: "^Test_AutoMNNVL"
make_target: "run-e2e-mnnvl-full"
- test_name: crd_installer
test_pattern: "^Test_CRD_Installer"
- test_name: resource_sharing
test_pattern: "^Test_RS"
matrix: ${{ fromJSON(needs.e2e-select.outputs.run) }}
name: E2E - ${{ matrix.test_name }}
steps:
# print runner specs so we have a record in case of failures
- name: Print runner specs
  # The selector's reason string is passed via env rather than expanded
  # into the script, so its content is never parsed as shell code on the
  # runner.
  env:
    SELECTION_REASON: ${{ needs.e2e-select.outputs.reason }}
  run: |
    echo "CPUs: $(nproc)"
    echo "RAM: $(free -h | awk '/^Mem:/ {print $2}')"
    echo "Selection reason: ${SELECTION_REASON}"

- name: Checkout code
uses: actions/checkout@v4
Expand All @@ -150,7 +171,7 @@ jobs:

- name: Run e2e tests - ${{ matrix.test_name }}
run: |
make ${{ matrix.make_target || 'run-e2e-full' }} TEST_PATTERN='${{ matrix.test_pattern }}' E2E_CREATE_FLAGS='--dind-memory-mode'
make ${{ matrix.make_target }} TEST_PATTERN='${{ matrix.test_pattern }}' E2E_CREATE_FLAGS='${{ matrix.create_flags }} --dind-memory-mode'
working-directory: operator

# The test code handles cleanup via Teardown(), but this step provides
Expand All @@ -173,29 +194,22 @@ jobs:
if-no-files-found: warn
retention-days: 7

# This job runs with the same matrix as 'e2e' when E2E tests are skipped (no relevant
# file changes and no 'run-e2e' label). It reports a passing status so that required
# branch protection checks are satisfied even for documentation-only PRs.
# Mirror that emits synthetic passes for matrix rows the selector excluded
# (path-filtered out, or all-rows when the PR is in draft state without the
# 'run-e2e' label). This keeps the required branch-protection check names
# (E2E - <test_name>) resolvable even when nothing real runs.
e2e-skip:
needs: [changes]
needs: [e2e-select]
if: |
github.event_name == 'pull_request' &&
needs.changes.outputs.e2e-relevant != 'true' &&
!contains(github.event.pull_request.labels.*.name, 'run-e2e')
needs.e2e-select.outputs.has_skip == 'true'
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
include:
- test_name: gang_scheduling
- test_name: rolling_updates
- test_name: startup_ordering
- test_name: Topology_Aware_Scheduling
- test_name: cert_management
- test_name: auto_mnnvl
- test_name: crd_installer
- test_name: resource_sharing
matrix: ${{ fromJSON(needs.e2e-select.outputs.skip) }}
name: E2E - ${{ matrix.test_name }}
steps:
- name: Skip E2E (no relevant changes)
run: echo "E2E skipped — no changes to operator/ or .github/ and 'run-e2e' label not set"
- name: Skip E2E
  # Emits a passing no-op for a matrix row the selector excluded. Values are
  # passed through env so the selector's output is never interpreted as
  # shell code.
  env:
    TEST_NAME: ${{ matrix.test_name }}
    BACKEND: ${{ matrix.backend }}
    SELECTION_REASON: ${{ needs.e2e-select.outputs.reason }}
  run: |
    echo "Skipped: ${TEST_NAME} on ${BACKEND}"
    echo "Selection reason: ${SELECTION_REASON}"
130 changes: 130 additions & 0 deletions .github/workflows/e2e-nightly.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
# /*
# Copyright 2026 The Grove Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# */

# Nightly E2E — runs the full (suite × capable backend) matrix on schedule.
#
# Rationale: the PR matrix (build-check-test.yaml) is path-filtered to keep
# per-PR cost bounded as more backends land. That means cross-backend
# regressions in code paths neither PR touched can slip in. This workflow
# catches them by running the exhaustive matrix once a day.
#
# Failure routing: this initial cut uploads per-row diagnostic artifacts and
# writes a job-summary report. Issue auto-open / Slack notification is left
# as a follow-up so the first weeks of nightly runs do not spam the repo
# while the matrix is stabilising.

name: Nightly E2E

on:
schedule:
# 07:00 UTC = 15:00 Beijing / 23:00 PST (00:00 PDT). Adjust to maintainer preference.
- cron: "0 7 * * *"
# Allow manual trigger for ad-hoc verification (e.g. after a flaky run).
workflow_dispatch:

# Don't run multiple nightlies in parallel (matrix already consumes the
# self-hosted runner pool; concurrent runs would queue and bleed into the
# next day's window).
concurrency:
group: e2e-nightly
cancel-in-progress: false

jobs:
# Compute the full matrix via the same selector the PR workflow uses, in
# nightly mode (ignores path filter / labels / draft state).
matrix:
  # Runs the selector in nightly mode and exposes its `.run` matrix as the
  # job output `run`, which the e2e job expands with fromJSON.
  runs-on: ubuntu-latest
  outputs:
    run: ${{ steps.select.outputs.run }}
  steps:
    - uses: actions/checkout@v4
    - uses: actions/setup-python@v5
      with:
        python-version: '3.11'
    - name: Run selector (nightly mode)
      id: select
      run: |
        # The implicit default shell is `bash -e` without pipefail. Enable
        # it so a selector failure stops this step instead of being hidden
        # behind `tee` and surfacing later as an empty matrix.
        set -o pipefail
        echo "=== Selector result ==="
        python3 hack/e2e-select/main.py --mode nightly | tee /tmp/select.json
        echo "======================="
        echo "run=$(jq -c .run /tmp/select.json)" >> "$GITHUB_OUTPUT"

e2e:
  # Runs one E2E job per row of the matrix computed by the `matrix` job.
  needs: matrix
  runs-on: prod-grove-e2e-v1
  timeout-minutes: 90
  strategy:
    # Let every row finish so one failure does not hide the others.
    fail-fast: false
    matrix: ${{ fromJSON(needs.matrix.outputs.run) }}
  name: Nightly E2E - ${{ matrix.test_name }}
  steps:
    - name: Print runner specs
      run: |
        echo "CPUs: $(nproc)"
        echo "RAM: $(free -h | awk '/^Mem:/ {print $2}')"

    - name: Checkout code
      uses: actions/checkout@v4

    - name: Pull registry image from GHCR to avoid Docker Hub rate limits
      run: |
        docker pull ghcr.io/distribution/distribution:3.1.0
        docker tag ghcr.io/distribution/distribution:3.1.0 registry:2

    - name: E2E Setup
      uses: ./.github/actions/e2e-setup

    - name: Run e2e tests - ${{ matrix.test_name }}
      # Matrix values go through env so quoting inside a pattern or flag
      # cannot break the command line. An absent make_target falls back to
      # run-e2e-full instead of silently running make's default goal.
      env:
        MATRIX_MAKE_TARGET: ${{ matrix.make_target || 'run-e2e-full' }}
        MATRIX_TEST_PATTERN: ${{ matrix.test_pattern }}
        MATRIX_CREATE_FLAGS: ${{ matrix.create_flags }}
      run: |
        make "${MATRIX_MAKE_TARGET}" TEST_PATTERN="${MATRIX_TEST_PATTERN}" E2E_CREATE_FLAGS="${MATRIX_CREATE_FLAGS} --dind-memory-mode"
      working-directory: operator

    - name: Cleanup k3d cluster
      # Best-effort teardown: runs even after a failure and never fails the job.
      if: always()
      working-directory: operator
      run: make e2e-cluster-down || true

    - name: Upload test logs on failure
      if: failure()
      uses: actions/upload-artifact@v4
      with:
        name: nightly-e2e-logs-${{ matrix.test_name }}
        path: operator/e2e-diagnostics/
        if-no-files-found: warn
        retention-days: 14

# Aggregate report. Runs unconditionally after the matrix so we always have
# a single place to look at the night's results.
summary:
  # Writes a single job-summary report for the night's run. `if: always()`
  # makes it run even when the e2e matrix failed, was cancelled, or was
  # skipped because the matrix job itself failed.
  needs: e2e
  if: always()
  runs-on: ubuntu-latest
  steps:
    - name: Write summary
      env:
        E2E_RESULT: ${{ needs.e2e.result }}
        RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
      run: |
        {
          echo "## Nightly E2E summary"
          echo ""
          echo "Run: ${{ github.run_id }}"
          echo "Trigger: ${{ github.event_name }}"
          echo "Status: ${E2E_RESULT}"
          echo ""
          # needs.<job>.result is one of success, failure, cancelled, skipped.
          # Only `failure` means that matrix rows actually failed.
          case "${E2E_RESULT}" in
            success)
              echo "✅ All matrix rows passed."
              ;;
            failure)
              echo "⚠️ One or more matrix rows failed. See the [run page](${RUN_URL}) and the per-row \`nightly-e2e-logs-*\` artifacts."
              ;;
            *)
              echo "⚠️ The E2E matrix did not complete (result: ${E2E_RESULT}). See the [run page](${RUN_URL})."
              ;;
          esac
        } >> "$GITHUB_STEP_SUMMARY"
Loading
Loading