From fe1d5751c24409ac2d8b8022ea1f92af4d0b3f76 Mon Sep 17 00:00:00 2001
From: Bill Berry
Date: Mon, 1 Jun 2026 17:27:37 -0700
Subject: [PATCH] ci(workflows): add Vally eval workflows and update PR automation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- add six eval-* and evals-agent-matrix workflows
- update pr-review and pr-validation automation
- update .vally.yaml configuration

🔧 - Generated by Copilot
---
 .github/workflows/eval-corpus-moderation.yml |  89 ++++++++
 .github/workflows/eval-spec-lint.yml         | 100 +++++++++
 .github/workflows/eval-stimulus-presence.yml |  77 +++++++
 .github/workflows/eval-text-moderation.yml   |  61 ++++++
 .github/workflows/eval-vally.yml             | 206 +++++++++++++++++++
 .github/workflows/evals-agent-matrix.yml     | 106 ++++++++++
 .github/workflows/pr-review.md               |   5 +
 .github/workflows/pr-validation.yml          |  47 +++++
 .vally.yaml                                  |  19 +-
 9 files changed, 699 insertions(+), 11 deletions(-)
 create mode 100644 .github/workflows/eval-corpus-moderation.yml
 create mode 100644 .github/workflows/eval-spec-lint.yml
 create mode 100644 .github/workflows/eval-stimulus-presence.yml
 create mode 100644 .github/workflows/eval-text-moderation.yml
 create mode 100644 .github/workflows/eval-vally.yml
 create mode 100644 .github/workflows/evals-agent-matrix.yml

diff --git a/.github/workflows/eval-corpus-moderation.yml b/.github/workflows/eval-corpus-moderation.yml
new file mode 100644
index 000000000..2e7fc1661
--- /dev/null
+++ b/.github/workflows/eval-corpus-moderation.yml
@@ -0,0 +1,89 @@
+name: Evals - Corpus Content Moderation
+
+on:
+  workflow_call:
+    inputs:
+      base-sha:
+        description: 'Base SHA for changed-artifact detection'
+        required: true
+        type: string
+      head-sha:
+        description: 'Head SHA for changed-artifact detection'
+        required: true
+        type: string
+      soft-fail:
+        description: 'Whether to continue on content moderation failures'
+        required: false
+        type: boolean
+        default: false
+
+permissions:
+  contents: read
+
+jobs:
+  content-moderation:
+    name: Evals - Corpus Content Moderation
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        with:
+          persist-credentials: false
+          fetch-depth: 0 # full history; the changed-artifact step is given base-sha and head-sha
+
+      - name: Setup Node.js
+        uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
+        with:
+          node-version: "24"
+
+      - name: Set up Python
+        uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
+        with:
+          python-version: "3.11"
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b # v8.1.0
+        with:
+          version: "0.10.9"
+
+      - name: Install moderation dependencies
+        run: uv pip install --system -r scripts/evals/moderation/requirements.txt
+
+      - name: Cache Detoxify model # needs actions/cache >= v4.2.0; older pins target the retired legacy cache service
+        uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3
+        with:
+          path: ~/.cache/torch/hub/checkpoints
+          key: detoxify-unbiased-${{ hashFiles('scripts/evals/moderation/requirements.txt') }}
+
+      - name: Create logs directory
+        shell: pwsh
+        run: New-Item -ItemType Directory -Force -Path logs | Out-Null
+
+      - name: Detect changed AI artifacts
+        shell: pwsh
+        run: |
+          pwsh -NoProfile -File scripts/evals/Get-ChangedAIArtifact.ps1 `
+            -BaseRef "${{ inputs.base-sha }}" `
+            -HeadRef "${{ inputs.head-sha }}" `
+            -OutFile logs/changed-ai-artifacts.json
+
+      - name: Moderate changed corpus
+        shell: pwsh
+        continue-on-error: ${{ inputs.soft-fail }} # soft-fail=true records the failure without failing the job
+        run: |
+          pwsh -NoProfile -File scripts/evals/Invoke-CorpusModeration.ps1 `
+            -ManifestPath logs/changed-ai-artifacts.json `
+            -OutFile logs/moderation-corpus.json
+
+      - name: Upload moderation artifacts on failure
+        if: failure()
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
+        with:
+          name: content-moderation-logs
+          path: |
+            logs/changed-ai-artifacts.json
+            logs/moderation-corpus.json
+          if-no-files-found: ignore
+          retention-days: 7
diff --git a/.github/workflows/eval-spec-lint.yml b/.github/workflows/eval-spec-lint.yml
new file mode 100644
index 000000000..78bc458db
--- /dev/null
+++ b/.github/workflows/eval-spec-lint.yml
@@ -0,0 +1,100 @@
+name: Evals - Spec Lint and Skill Hygiene
+
+on:
+  workflow_call:
+    inputs:
+      base-sha:
+        description: "Base commit SHA for changed-artifact detection."
+        required: true
+        type: string
+      head-sha:
+        description: "Head commit SHA for changed-artifact detection."
+        required: true
+        type: string
+      soft-fail:
+        description: "When true, lint failures do not fail the job."
+        required: false
+        type: boolean
+        default: false
+
+permissions:
+  contents: read
+
+jobs:
+  eval-lint:
+    name: Evals - Spec Lint and Skill Hygiene
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        with:
+          persist-credentials: false
+          fetch-depth: 0
+
+      - name: Setup Node.js
+        uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
+        with:
+          node-version: "24"
+          cache: "npm"
+
+      - name: Install npm dependencies
+        run: npm ci
+
+      - name: Install PowerShell-Yaml
+        shell: pwsh
+        run: |
+          Install-Module -Name PowerShell-Yaml -RequiredVersion 0.4.7 -Force -Scope CurrentUser
+
+      - name: Create logs directory
+        shell: pwsh
+        run: New-Item -ItemType Directory -Force -Path logs | Out-Null
+
+      - name: Detect changed AI artifacts
+        shell: pwsh
+        run: |
+          pwsh -NoProfile -File scripts/evals/Get-ChangedAIArtifact.ps1 `
+            -BaseRef "${{ inputs.base-sha }}" `
+            -HeadRef "${{ inputs.head-sha }}" `
+            -OutFile logs/changed-ai-artifacts.json
+
+      - name: Validate eval spec schema
+        shell: pwsh
+        continue-on-error: ${{ inputs.soft-fail }}
+        run: |
+          pwsh -NoProfile -File scripts/evals/Test-EvalSpec.ps1 `
+            -Root evals/ `
+            -OutputPath logs/eval-spec-lint.json
+
+      - name: Run skill hygiene lint
+        shell: pwsh
+        continue-on-error: ${{ inputs.soft-fail }}
+        run: |
+          $manifestPath = 'logs/changed-ai-artifacts.json'
+          if (-not (Test-Path -LiteralPath $manifestPath)) {
+            Write-Host "No changed-artifact manifest found; skipping skill hygiene lint."
+            return
+          }
+          $manifest = Get-Content -LiteralPath $manifestPath -Raw | ConvertFrom-Json
+          $skillChanges = @(($manifest.artifacts ?? $manifest) | Where-Object { $_.kind -eq 'skill' }) # accept a flat array or an object with an 'artifacts' list
+          if ($skillChanges.Count -eq 0) {
+            Write-Host "No skill artifacts changed; skipping skill hygiene lint."
+            return
+          }
+          Write-Host "Detected $($skillChanges.Count) changed skill artifact(s); running vally lint."
+          npm run eval:lint:skills
+          if ($LASTEXITCODE -ne 0) {
+            throw "Skill hygiene lint failed with exit code $LASTEXITCODE."
+          }
+
+      - name: Upload eval-lint artifacts on failure
+        if: failure()
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
+        with:
+          name: eval-lint-logs
+          path: |
+            logs/eval-spec-lint.json
+            logs/changed-ai-artifacts.json
+          if-no-files-found: ignore
+          retention-days: 7
diff --git a/.github/workflows/eval-stimulus-presence.yml b/.github/workflows/eval-stimulus-presence.yml
new file mode 100644
index 000000000..e771c67a1
--- /dev/null
+++ b/.github/workflows/eval-stimulus-presence.yml
@@ -0,0 +1,77 @@
+name: Evals - Stimulus Presence
+
+on:
+  workflow_call:
+    inputs:
+      base-sha:
+        description: "Base commit SHA for change detection"
+        required: true
+        type: string
+      head-sha:
+        description: "Head commit SHA for change detection"
+        required: true
+        type: string
+      soft-fail:
+        description: "Whether to continue on validation failures"
+        required: false
+        type: boolean
+        default: false
+
+permissions:
+  contents: read
+
+jobs:
+  eval-presence:
+    name: Evals - Stimulus Presence
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        with:
+          persist-credentials: false
+          fetch-depth: 0
+
+      - name: Setup Node.js
+        uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
+        with:
+          node-version: "24"
+          cache: "npm"
+
+      - name: Install PowerShell-Yaml
+        shell: pwsh
+        run: |
+          Install-Module -Name PowerShell-Yaml -RequiredVersion 0.4.7 -Force -Scope CurrentUser
+
+      - name: Create logs directory
+        shell: pwsh
+        run: New-Item -ItemType Directory -Force -Path logs | Out-Null
+
+      - name: Detect changed AI artifacts
+        shell: pwsh
+        run: |
+          pwsh -NoProfile -File scripts/evals/Get-ChangedAIArtifact.ps1 `
+            -BaseRef "${{ inputs.base-sha }}" `
+            -HeadRef "${{ inputs.head-sha }}" `
+            -OutFile logs/changed-ai-artifacts.json
+
+      - name: Enforce stimulus presence
+        shell: pwsh
+        continue-on-error: ${{ inputs.soft-fail }}
+        run: |
+          pwsh -NoProfile -File scripts/evals/Test-StimulusPresence.ps1 `
+            -ManifestPath logs/changed-ai-artifacts.json `
+            -EvalRoot evals/ `
+            -OutFile logs/stimulus-presence.json
+
+      - name: Upload presence artifacts on failure
+        if: failure()
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
+        with:
+          name: eval-presence-logs
+          path: |
+            logs/changed-ai-artifacts.json
+            logs/stimulus-presence.json
+          if-no-files-found: ignore
+          retention-days: 7
diff --git a/.github/workflows/eval-text-moderation.yml b/.github/workflows/eval-text-moderation.yml
new file mode 100644
index 000000000..572e679b9
--- /dev/null
+++ b/.github/workflows/eval-text-moderation.yml
@@ -0,0 +1,61 @@
+name: Evals - Text Moderation
+
+on:
+  workflow_call:
+    inputs:
+      soft-fail:
+        description: 'Whether to continue on text moderation failures'
+        required: false
+        type: boolean
+        default: false
+
+permissions:
+  contents: read
+
+jobs:
+  text-moderation:
+    name: Evals - Stimulus Text Moderation
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        with:
+          persist-credentials: false
+          fetch-depth: 0
+
+      - name: Setup Node.js
+        uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
+        with:
+          node-version: "24"
+          cache: "npm"
+
+      - name: Install npm dependencies
+        run: npm ci
+
+      - name: Install PowerShell-Yaml
+        shell: pwsh
+        run: |
+          Install-Module -Name PowerShell-Yaml -RequiredVersion 0.4.7 -Force -Scope CurrentUser
+
+      - name: Create logs directory
+        shell: pwsh
+        run: New-Item -ItemType Directory -Force -Path logs | Out-Null
+
+      - name: Moderate AI artifact corpus (alex.js + retext-profanities)
+        shell: pwsh
+        continue-on-error: ${{ inputs.soft-fail }}
+        run: |
+          pwsh -NoProfile -File scripts/evals/Test-EvalSpecText.ps1 `
+            -OutputPath logs/eval-spec-text-moderation.json
+
+      - name: Upload text moderation artifacts on failure
+        if: failure()
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
+        with:
+          name: eval-text-moderation-logs
+          path: |
+            logs/eval-spec-text-moderation.json
+          if-no-files-found: ignore
+          retention-days: 7
diff --git a/.github/workflows/eval-vally.yml b/.github/workflows/eval-vally.yml
new file mode 100644
index 000000000..15e835065
--- /dev/null
+++ b/.github/workflows/eval-vally.yml
@@ -0,0 +1,206 @@
+name: Evals - Execute Vally Suites
+
+on:
+  workflow_call:
+    inputs:
+      base-sha:
+        description: 'Base SHA for changed-artifact detection'
+        required: true
+        type: string
+      head-sha:
+        description: 'Head SHA for changed-artifact detection'
+        required: true
+        type: string
+    secrets:
+      copilot-github-token:
+        description: 'Token used to authenticate Copilot for eval execution'
+        required: true
+
+permissions:
+  contents: read
+
+jobs:
+  eval-execute:
+    name: Evals - Execute Vally Suites
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      pull-requests: write
+    env:
+      COPILOT_GITHUB_TOKEN: ${{ secrets.copilot-github-token }}
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        with:
+          persist-credentials: false
+          fetch-depth: 0
+
+      - name: Setup Node.js
+        uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
+        with:
+          node-version: "24"
+          cache: "npm"
+
+      - name: Install npm dependencies
+        run: npm ci
+
+      - name: Install PowerShell-Yaml
+        shell: pwsh
+        run: |
+          Install-Module -Name PowerShell-Yaml -RequiredVersion 0.4.7 -Force -Scope CurrentUser
+
+      - name: Set up Python
+        uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
+        with:
+          python-version: "3.11"
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b # v8.1.0
+        with:
+          version: "0.10.9"
+
+      - name: Install moderation dependencies
+        run: uv pip install --system -r scripts/evals/moderation/requirements.txt
+
+      - name: Cache Detoxify model # needs actions/cache >= v4.2.0; older pins target the retired legacy cache service
+        uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3
+        with:
+          path: ~/.cache/torch/hub/checkpoints
+          key: detoxify-unbiased-${{ hashFiles('scripts/evals/moderation/requirements.txt') }}
+
+      - name: Create logs directory
+        shell: pwsh
+        run: New-Item -ItemType Directory -Force -Path logs | Out-Null
+
+      - name: Configure Copilot home
+        shell: pwsh
+        run: |
+          $copilotHome = Join-Path $env:RUNNER_TEMP 'copilot-home'
+          New-Item -ItemType Directory -Force -Path $copilotHome | Out-Null
+          "COPILOT_HOME=$copilotHome" | Out-File -FilePath $env:GITHUB_ENV -Append
+
+      - name: Verify Copilot token
+        shell: pwsh
+        run: pwsh -NoProfile -File scripts/evals/Test-CopilotToken.ps1
+
+      - name: Detect changed AI artifacts
+        shell: pwsh
+        run: |
+          pwsh -NoProfile -File scripts/evals/Get-ChangedAIArtifact.ps1 `
+            -BaseRef "${{ inputs.base-sha }}" `
+            -HeadRef "${{ inputs.head-sha }}" `
+            -OutFile logs/changed-ai-artifacts.json
+
+      - name: Run vally evals for changed artifacts
+        shell: pwsh
+        run: |
+          pwsh -NoProfile -File scripts/evals/Invoke-VallyEvals.ps1 `
+            -ManifestPath logs/changed-ai-artifacts.json `
+            -LogsDir logs/
+
+      - name: Run per-agent agent-behavior matrix (changed)
+        shell: pwsh
+        continue-on-error: true # advisory step: a matrix failure does not fail the job
+        run: |
+          $manifestPath = 'logs/changed-ai-artifacts.json'
+          $changedPaths = @()
+          if (Test-Path -LiteralPath $manifestPath) {
+            $manifest = Get-Content -LiteralPath $manifestPath -Raw | ConvertFrom-Json
+            if ($manifest.artifacts ?? $manifest) { # accept a flat array or an object with an 'artifacts' list
+              $changedPaths = @(($manifest.artifacts ?? $manifest) | ForEach-Object { $_.path } | Where-Object { $_ })
+            }
+          }
+          if ($changedPaths.Count -eq 0) {
+            Write-Host 'No changed AI artifacts; skipping per-agent matrix.' -ForegroundColor Yellow
+            exit 0
+          }
+          pwsh -NoProfile -File scripts/evals/Invoke-AgentMatrix.ps1 `
+            -Changed $changedPaths `
+            -Tier pr
+
+      - name: Upload eval execution artifacts
+        if: always()
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
+        with:
+          name: eval-execute-logs
+          path: |
+            logs/eval-results-*.json
+            logs/eval-summary.json
+            logs/changed-ai-artifacts.json
+            logs/agent-matrix/**/*.log
+            evals/results/agent-matrix/**/*.json
+          if-no-files-found: ignore
+          retention-days: 14
+
+      - name: Post or update PR-comment summary
+        if: always() && github.event_name == 'pull_request'
+        uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
+        with:
+          script: |
+            const fs = require('fs');
+            const summaryPath = 'logs/eval-summary.json';
+            const marker = '<!-- eval-coverage-summary -->'; // hidden marker; an empty string would match every comment on the PR
+            const runUrl = `${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`;
+
+            if (!fs.existsSync(summaryPath)) {
+              core.info(`${summaryPath} not found; skipping eval summary comment.`);
+              return;
+            }
+
+            let summary;
+            try {
+              summary = JSON.parse(fs.readFileSync(summaryPath, 'utf8'));
+            } catch (err) {
+              core.warning(`Could not parse ${summaryPath}: ${err.message}`);
+              return;
+            }
+
+            const totals = summary.totals || {};
+            const artifacts = Number(totals.artifacts || 0);
+            const specs = Number(totals.specs || 0);
+            const assertionsPassed = Number(totals.assertionsPassed || 0);
+            const assertionsFailed = Number(totals.assertionsFailed || 0);
+            const failedSpecs = Number(totals.failedSpecs || 0);
+            const perArtifact = Array.isArray(summary.perArtifact) ? summary.perArtifact : [];
+            const lines = perArtifact.map(a => {
+              const passed = Number(a.assertionsPassed || 0);
+              const failed = Number(a.assertionsFailed || 0);
+              const indicator = a.status === 'fail' ? ':x:' : ':white_check_mark:';
+              const id = a.artifactId || a.path || '(unknown)';
+              return `- ${indicator} \`${id}\` — ${passed} passed, ${failed} failed`;
+            });
+
+            const headline = `**Artifacts:** ${artifacts} | **Specs:** ${specs} (${failedSpecs} failed) | **Assertions:** ${assertionsPassed} passed, ${assertionsFailed} failed`;
+
+            const body = [
+              marker,
+              '## Eval Coverage Summary',
+              '',
+              headline,
+              '',
+              lines.length ? lines.join('\n') : '_No artifact-scoped results in this run._',
+              '',
+              `[View workflow run](${runUrl})`
+            ].join('\n');
+
+            const comments = await github.paginate(github.rest.issues.listComments, { // every page, not just the first
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              issue_number: context.issue.number
+            });
+            const existing = comments.find(c => c.body && c.body.includes(marker));
+            if (existing) {
+              await github.rest.issues.updateComment({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                comment_id: existing.id,
+                body
+              });
+            } else {
+              await github.rest.issues.createComment({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                issue_number: context.issue.number,
+                body
+              });
+            }
diff --git a/.github/workflows/evals-agent-matrix.yml b/.github/workflows/evals-agent-matrix.yml
new file mode 100644
index 000000000..e16aedbae
--- /dev/null
+++ b/.github/workflows/evals-agent-matrix.yml
@@ -0,0 +1,106 @@
+name: Evals - Per-Agent Matrix (On-Demand)
+
+# Manual full-matrix dispatch for the agent-behavior suite. Honors the
+# 2026-05-24 cross-plan rule that prohibits scheduled eval jobs: this workflow
+# is invoked on demand via the GitHub UI / API and uses Tier nightly exit
+# policy (exit 1 on any per-agent overall: fail).
+
+on:
+  workflow_dispatch:
+    inputs:
+      tier:
+        description: "Exit policy tier (pr=advisory, nightly=strict)."
+        required: false
+        default: "nightly"
+        type: choice
+        options:
+          - nightly
+          - pr
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+permissions:
+  contents: read
+
+jobs:
+  agent-matrix:
+    name: Per-Agent Matrix
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+    env:
+      COPILOT_GITHUB_TOKEN: ${{ secrets.COPILOT_GITHUB_TOKEN }}
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        with:
+          persist-credentials: false
+          fetch-depth: 0
+
+      - name: Setup Node.js
+        uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
+        with:
+          node-version: "24"
+          cache: "npm"
+
+      - name: Install npm dependencies
+        run: npm ci
+
+      - name: Install PowerShell-Yaml
+        shell: pwsh
+        run: |
+          Install-Module -Name PowerShell-Yaml -RequiredVersion 0.4.7 -Force -Scope CurrentUser
+
+      - name: Set up Python
+        uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
+        with:
+          python-version: "3.11"
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b # v8.1.0
+        with:
+          version: "0.10.9"
+
+      - name: Create logs directory
+        shell: pwsh
+        run: New-Item -ItemType Directory -Force -Path logs | Out-Null
+
+      - name: Configure Copilot home
+        shell: pwsh
+        run: |
+          $copilotHome = Join-Path $env:RUNNER_TEMP 'copilot-home'
+          New-Item -ItemType Directory -Force -Path $copilotHome | Out-Null
+          "COPILOT_HOME=$copilotHome" | Out-File -FilePath $env:GITHUB_ENV -Append
+
+      - name: Verify Copilot token
+        shell: pwsh
+        run: pwsh -NoProfile -File scripts/evals/Test-CopilotToken.ps1
+
+      - name: Run per-agent agent-behavior matrix (all)
+        shell: pwsh
+        env:
+          MATRIX_TIER: ${{ inputs.tier }}
+        run: |
+          pwsh -NoProfile -File scripts/evals/Invoke-AgentMatrix.ps1 `
+            -All `
+            -Tier $env:MATRIX_TIER
+
+      - name: Render per-agent matrix dashboard
+        if: always()
+        shell: pwsh
+        run: |
+          pwsh -NoProfile -File scripts/evals/New-AgentMatrixDashboard.ps1
+
+      - name: Upload matrix artifacts
+        if: always()
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
+        with:
+          name: agent-matrix-results
+          path: |
+            logs/agent-matrix/**/*.log
+            logs/agent-matrix-dashboard.html
+            evals/results/agent-matrix/**/*.json
+          if-no-files-found: ignore
+          retention-days: 30
diff --git a/.github/workflows/pr-review.md b/.github/workflows/pr-review.md
index 06534a66c..826b31983 100644
--- a/.github/workflows/pr-review.md
+++ b/.github/workflows/pr-review.md
@@ -12,6 +12,7 @@ timeout-minutes: 15
 
 imports:
   - ../agents/hve-core/pr-review.agent.md
+  - ../agents/content-policy-citation.agent.md
 
 checkout:
   sparse-checkout: |
@@ -200,6 +201,10 @@ to submitting REQUEST_CHANGES and adding `needs-revision`.
 Add a comment explaining that the PR was converted to draft due to insufficient
 quality for review.
 
+## Output Style
+
+When any output emitted by this workflow (PR review comments, PR descriptions, or other public output) references or flags a suspected content-policy concern, follow the citation discretion rules from the imported Content Policy Citation agent as authoritative.
+
 ## Constraints
 
 * Do not approve PRs. Only use `COMMENT` or `REQUEST_CHANGES`.
diff --git a/.github/workflows/pr-validation.yml b/.github/workflows/pr-validation.yml
index 9bb949c12..24ce15f66 100644
--- a/.github/workflows/pr-validation.yml
+++ b/.github/workflows/pr-validation.yml
@@ -338,3 +338,50 @@ jobs:
       security-events: write # Required for SARIF upload to Security tab
       actions: read
 
+  eval-presence:
+    name: Evals - Stimulus Presence
+    permissions:
+      contents: read
+    uses: ./.github/workflows/eval-stimulus-presence.yml
+    with:
+      base-sha: ${{ github.event.pull_request.base.sha }}
+      head-sha: ${{ github.event.pull_request.head.sha }}
+
+  eval-lint:
+    name: Evals - Spec Lint and Skill Hygiene
+    permissions:
+      contents: read
+    uses: ./.github/workflows/eval-spec-lint.yml
+    with:
+      base-sha: ${{ github.event.pull_request.base.sha }}
+      head-sha: ${{ github.event.pull_request.head.sha }}
+
+  eval-text-moderation:
+    name: Evals - Stimulus Text Moderation
+    permissions:
+      contents: read
+    uses: ./.github/workflows/eval-text-moderation.yml
+
+  content-moderation:
+    name: Evals - Corpus Content Moderation
+    permissions:
+      contents: read
+    uses: ./.github/workflows/eval-corpus-moderation.yml
+    with:
+      base-sha: ${{ github.event.pull_request.base.sha }}
+      head-sha: ${{ github.event.pull_request.head.sha }}
+
+  eval-execute:
+    name: Evals - Execute Vally Suites
+    needs: [eval-presence, eval-lint, eval-text-moderation, content-moderation]
+    if: github.event.pull_request.head.repo.fork == false
+    permissions:
+      contents: read
+      pull-requests: write
+    uses: ./.github/workflows/eval-vally.yml
+    with:
+      base-sha: ${{ github.event.pull_request.base.sha }}
+      head-sha: ${{ github.event.pull_request.head.sha }}
+    secrets:
+      copilot-github-token: ${{ secrets.COPILOT_GITHUB_TOKEN }}
+
diff --git a/.vally.yaml b/.vally.yaml
index e0158de2b..19fd523af 100644
--- a/.vally.yaml
+++ b/.vally.yaml
@@ -5,32 +5,29 @@ paths:
 environments:
   security:
     skills:
-      - .github/skills/security/owasp-top-10
-      - .github/skills/security/owasp-cicd
+      - ../../.github/skills/security/owasp-top-10
+      - ../../.github/skills/security/owasp-cicd
 
   coding-standards:
     skills:
-      - .github/skills/coding-standards/python-foundational
+      - ../../.github/skills/coding-standards/python-foundational
 
   security-and-coding:
     skills:
-      - .github/skills/security/owasp-top-10
-      - .github/skills/coding-standards/python-foundational
+      - ../../.github/skills/security/owasp-top-10
+      - ../../.github/skills/coding-standards/python-foundational
 
 suites:
   skill-quality:
     description: Evaluate skill behavior via copilot-sdk agent conversations
     filter:
-      tags:
-        category: skill-quality
+      category: skill-quality
 
   agent-behavior:
     description: Evaluate agent routing and response quality
     filter:
-      tags:
-        category: agent-behavior
+      category: agent-behavior
   script-validation:
     description: Validate script correctness via copilot-sdk conversations
     filter:
-      tags:
-        category: script-validation
+      category: script-validation