diff --git a/core/capabilities/remote/executable/server_test.go b/core/capabilities/remote/executable/server_test.go index 0b89c722f23..ad557c4e061 100644 --- a/core/capabilities/remote/executable/server_test.go +++ b/core/capabilities/remote/executable/server_test.go @@ -211,7 +211,7 @@ func Test_Server_CapabilityError(t *testing.T) { numCapabilityPeers := 4 - callers, srvcs := testRemoteExecutableCapabilityServer(ctx, t, &commoncap.RemoteExecutableConfig{}, &TestErrorCapability{}, 10, 9, numCapabilityPeers, 3, 100*time.Millisecond, nil) + callers, srvcs := testRemoteExecutableCapabilityServer(ctx, t, &commoncap.RemoteExecutableConfig{}, &TestErrorCapability{}, 10, 9, numCapabilityPeers, 3, 10*time.Second, nil) for _, caller := range callers { _, err := caller.Execute(t.Context(), diff --git a/tools/test/.agents/skills/_shared-jira-flaky-ops/README.md b/tools/test/.agents/skills/_shared-jira-flaky-ops/README.md new file mode 100644 index 00000000000..ca496c089d1 --- /dev/null +++ b/tools/test/.agents/skills/_shared-jira-flaky-ops/README.md @@ -0,0 +1,83 @@ +--- +name: shared-jira-flaky-ops-readme +description: Index of shared JIRA operations for flaky-test skills. Defines the canonical slim-record schema, required env, and how to use these files from another skill. +--- + +# Shared JIRA Flaky-Test Operations + +This directory contains self-contained reference files for JIRA operations specific to flaky-test workflows. Any skill that needs to interact with JIRA around flaky tests should read the relevant file here instead of duplicating the logic. + +## When to use + +These files are **includable references**, not user-facing skills (no `SKILL.md`). Read the relevant file(s) when you need to perform one of the operations listed below. Each file is self-contained: it declares its inputs, steps, and outputs. You do not need to read files you are not using. 
+ +## Required environment + +All operations require: +- `cloudId` — the Atlassian cloud ID (from `mcp__atlassian__getAccessibleAtlassianResources` or cached from a prior call). +- `accountId` — the current user's Atlassian account ID (from `mcp__atlassian__atlassianUserInfo`, cached in `phase_outputs.phase0`). + +## Operations index + +| File | Purpose | Key inputs | +|------|---------|------------| +| `investigation-comment.md` | Comment format for Investigation Updates; parsing prior-attempt comments | jira_key, outcome, field values | +| `abandon-ticket.md` | Mid-flight abandonment: unassign → Open → Investigation Update comment (ABANDONED) | jira_key, reason | +| `transition-ticket.md` | Transition a ticket to a semantic target state | jira_key, target | +| `claim-ticket.md` | Assign to self and transition to "In Progress" | jira_key, accountId | +| `fetch-flaky-tickets.md` | JQL search loop: fetch N eligible flaky-test tickets for a project key | KEY, N, cloudId, current_repo, nav_tool, lsp_available, repo root | +| `validate-flaky-ticket.md` | Validate a single explicitly-provided ticket and build its slim record | jira_key, ci_run_url, cloudId, current_repo, nav_tool, lsp_available, repo root | +| `recheck-ownership.md` | Verify the ticket is still assigned to us before touching files or pushing | jira_key, accountId | + +## How to include from another skill + +1. Read the relevant file(s) from this directory. +2. Collect the declared inputs from your skill's context. +3. Follow the file's steps exactly. +4. Use the declared output schema to integrate the result back into your skill's state. + +Example: to claim a ticket, read `claim-ticket.md` and follow its steps with your `jira_key` and `accountId`. + +--- + +## Canonical slim-record schema + +Defined here once. Referenced by `fetch-flaky-tickets.md` and `validate-flaky-ticket.md`. Both files must produce records conforming to this schema. Do not redefine it in either file. 
+ +```json +{ + "jira_key": "KEY-NNN | null", + "local_id": "local-N | null", + "title": "string", + "description": "string", + "trunk_test_case_url": "https://app.trunk.io/.../test/{UUID} | null", + "test_case_id": "{UUID} | null", + "package": "github.com/owner/repo/path | null", + "test_name": "TestFoo | TestFoo/subtest_name", + "previous_attempts": [ + { + "outcome": "INCONCLUSIVE | PARTIAL_FIX | MISMATCH | SKIP_TOP_LEVEL | RETURNED_TO_QUEUE | ABANDONED | FIXED", + "date": "YYYY-MM-DD", + "summary": "string", + "excluded_approaches": ["string"], + "rejection_reasons": ["string"], + "recommended_next_step": "string | null", + "full_text": "string" + } + ], + "ci_run_url": "string | null", + "provided_log_path": "string | null", + "provided_log_text": "string | null" +} +``` + +**Field rules:** +- `jira_key`: non-null in JIRA modes; null in local mode. +- `local_id`: non-null in local mode (`local-1`, `local-2`, …); null in JIRA modes. +- `test_case_id`: `customfield_13010` (bare UUID). If absent, extract UUID from `https://app.trunk.io/*/test/{UUID}` in description. Null only if neither yields a value. Always null in local mode. +- `package`: `customfield_13009`. Null if absent. +- `test_name`: `customfield_13007` (full path including subtest). If absent, longest `TestXxx`/`testXxx` token from title. +- `trunk_test_case_url`: scan description for `https://app.trunk.io/*/test/{UUID}`; null if not found. Display only. +- `previous_attempts`: parsed per `investigation-comment.md` parsing rules. Empty array in local mode. +- `ci_run_url`: null in project mode (no upfront URL syntax). Populated from `KEY@URL` syntax in direct-ticket mode, or from a 3a fallback prompt. +- `provided_log_path`, `provided_log_text`: null in JIRA modes; populated from `--log ` in local mode. 
diff --git a/tools/test/.agents/skills/_shared-jira-flaky-ops/abandon-ticket.md b/tools/test/.agents/skills/_shared-jira-flaky-ops/abandon-ticket.md new file mode 100644 index 00000000000..aa2edcfac80 --- /dev/null +++ b/tools/test/.agents/skills/_shared-jira-flaky-ops/abandon-ticket.md @@ -0,0 +1,34 @@ +--- +name: abandon-ticket +description: Mid-flight abandonment procedure for a claimed flaky-test ticket. Unassigns, transitions back to Open, and writes an ABANDONED Investigation Update comment. Must never be skipped once a ticket has been claimed. +--- + +# Abandon Ticket + +Apply whenever a claimed ticket is stopped mid-flight — regardless of reason. This includes: user cancels, user skips, verdict is INCONCLUSIVE, PARTIAL_FIX is reverted, ownership conflict detected, SUT/AMBIGUOUS/INFRA auto-queue return, or session ends early. + +**Never leave a claimed ticket in "In Progress" with no assignee action.** + +## Inputs + +- `jira_key` — the JIRA ticket key +- `reason` — one sentence describing why work stopped (used in "What was investigated" section) +- `accountId` — the current user's Atlassian account ID (to confirm we own it before unassigning) + +## Steps + +Execute in order: + +1. Confirm ownership first: call `mcp__atlassian__getJiraIssue` with `jira_key` and `fields=["assignee"]`. If `assignee.accountId` equals `accountId`, call `mcp__atlassian__editJiraIssue` → unassign the issue (set `assignee` to null). If the ticket is now assigned to someone else, do not unassign it and skip step 2 — never unassign or transition another user's ticket. +2. Follow `transition-ticket.md` with `jira_key` and `target = "Open"`. +3. Follow `investigation-comment.md` to write an `addCommentToJiraIssue` call: + - **Outcome**: ABANDONED + - **What was investigated**: `reason` (the reason work stopped). + - **Hypothesis**: N/A + - **What was tried**: N/A + - **Why it didn't hold**: N/A + - **Recommended next step**: N/A + +## Output + +No structured output. Caller continues with other issues after this completes. 
diff --git a/tools/test/.agents/skills/_shared-jira-flaky-ops/claim-ticket.md b/tools/test/.agents/skills/_shared-jira-flaky-ops/claim-ticket.md new file mode 100644 index 00000000000..2ccd4a1a7cc --- /dev/null +++ b/tools/test/.agents/skills/_shared-jira-flaky-ops/claim-ticket.md @@ -0,0 +1,31 @@ +--- +name: claim-ticket +description: Assign a flaky-test ticket to the current user and transition it to In Progress. Used after a ticket is approved for investigation. +--- + +# Claim Ticket + +## Inputs + +- `jira_key` — the JIRA ticket key (e.g. `CRE-5719`) +- `accountId` — the current user's Atlassian account ID (from `phase_outputs.phase0`) + +## Steps + +Execute in order — wait for each step to succeed before proceeding: + +1. `mcp__atlassian__editJiraIssue` → assign the issue to `accountId` (set `assignee.accountId = accountId`). Wait for success. +2. Follow `transition-ticket.md` with `jira_key` and `target = "In Progress"`. + - If the transition fails: log available transitions and stop. Do not leave the ticket assigned without transitioning. + +## Output + +```json +{ "success": true, "jira_key": "KEY-NNN" } +``` + +or on failure: + +```json +{ "success": false, "jira_key": "KEY-NNN", "error": "" } +``` diff --git a/tools/test/.agents/skills/_shared-jira-flaky-ops/fetch-flaky-tickets.md b/tools/test/.agents/skills/_shared-jira-flaky-ops/fetch-flaky-tickets.md new file mode 100644 index 00000000000..c2da5d691a8 --- /dev/null +++ b/tools/test/.agents/skills/_shared-jira-flaky-ops/fetch-flaky-tickets.md @@ -0,0 +1,79 @@ +--- +name: fetch-flaky-tickets +description: JQL search loop that fetches N eligible flaky-test tickets for a project key, filters out cross-repo and system-test tickets, resolves each to a local test function, and returns slim records. Extracted from phase2a. +--- + +# Fetch Flaky Tickets + +## Inputs + +- `KEY` — JIRA project key (e.g. 
`CRE`) +- `N` — number of eligible records to return +- `cloudId` — Atlassian cloud ID +- `current_repo` — `{owner}/{repo}` extracted from `git remote get-url origin` +- `nav_tool` — `"lsp"` | `"crg"` (from `phase_outputs.phase0`) +- `lsp_available` — boolean (from `phase_outputs.phase0`) +- repo root path + +## Output + +```json +{ + "slim_records": [ /* see README.md slim-record schema */ ], + "skipped": { "cross_repo": 0, "system_tests": 0, "not_found": 0 } +} +``` + +Never return raw JIRA API objects — caller only receives slim records. + +## Slim-record schema + +See `README.md` in this directory for the canonical schema. In the records produced here: +- `jira_key` non-null, `local_id` null. +- `ci_run_url` always null (project mode has no upfront URL syntax; 3a fallback may prompt later). +- `provided_log_path`, `provided_log_text` null. + +## Loop + +``` +results = [] +cursor = null +while len(results) < N: + fetch N issues via mcp__atlassian__searchJiraIssuesUsingJql: + jql: project = {KEY} AND labels = "flaky-test" AND status = "Open" ORDER BY created DESC + fields: ["summary", "description", "comment", "status", "assignee", + "customfield_13010", "customfield_13009", "customfield_13007"] + maxResults: N + nextPageToken: cursor (omit on first call) + + for each issue (in order): + 1. Repo check (zero-cost): extract {owner}/{repo} from customfield_13009 + (1st + 2nd segments after github.com/). Mismatch → skip (cross_repo++). + If customfield_13009 absent, scan description for + https://github.com/{owner}/{repo} or a "Repo:" / "Repository:" field. + + 2. System-tests exclusion (zero-cost): if customfield_13009 starts with + github.com/smartcontractkit/chainlink/system-tests/ → skip (system_tests++). + + 3. Test function check: extract top-level function name from customfield_13007 + (part before first /), fall back to longest TestXxx token in title if absent. 
+ - nav_tool="lsp" or lsp_available=true: LSP definition lookup + - nav_tool="crg": mcp__code-review-graph__semantic_search_nodes_tool + - last resort only: grep -rl "func {TestName}" . + Not found → skip (not_found++). + + 4. Eligible: build slim record (see schema), append to results. + Stop once len(results) == N. + + cursor = nextPageToken from response + if no more pages: break +``` + +## Field extraction rules + +- `test_case_id`: `customfield_13010` (bare UUID). If absent, extract UUID from `https://app.trunk.io/*/test/{UUID}` in description. Null only if neither yields a value. +- `package`: `customfield_13009`. Null if absent. +- `test_name`: `customfield_13007` (full path including subtest, e.g. `TestFoo/subtest`). If absent, longest `TestXxx`/`testXxx` token from title. +- `trunk_test_case_url`: scan description for `https://app.trunk.io/*/test/{UUID}`; null if not found. Display only. +- `previous_attempts`: parse per `investigation-comment.md` parsing rules. +- If any custom field is absent from the search response, call `mcp__atlassian__getJiraIssue` with `fields=["summary","description","comment","status","assignee","customfield_13010","customfield_13009","customfield_13007"]` for that issue as a fallback. diff --git a/tools/test/.agents/skills/_shared-jira-flaky-ops/investigation-comment.md b/tools/test/.agents/skills/_shared-jira-flaky-ops/investigation-comment.md new file mode 100644 index 00000000000..5cf3b2c317b --- /dev/null +++ b/tools/test/.agents/skills/_shared-jira-flaky-ops/investigation-comment.md @@ -0,0 +1,59 @@ +--- +name: investigation-comment +description: Authoritative format for Investigation Update JIRA comments and rules for parsing previous-attempt comments. Read by any phase or skill that writes to or reads from JIRA investigation history. +--- + +# Investigation Update Comment Format + +## Writing a comment + +Every `mcp__atlassian__addCommentToJiraIssue` call for an investigation update uses the structure below. 
Always include all five sections — write `N/A` for any that don't apply. **Style: concise and matter-of-fact. 1–3 sentences per section. No narrative padding, no hedge words.** + +```markdown +## Investigation Update — {OUTCOME} · {YYYY-MM-DD} + +**Outcome**: {OUTCOME} +**Investigator**: {display name from atlassianUserInfo} +**Classification**: {TEST | SUT | AMBIGUOUS | INFRA} (confidence: {high | low | none}) | N/A + +### What was investigated +{The failure mode and where analysis focused.} + +### Hypothesis +{The proposed root cause, or N/A.} + +### What was tried +{The fix or approach applied or proposed, or N/A.} + +### Why it didn't hold +{Objections, test results, or reason the fix was rejected or reverted — or N/A.} + +### Recommended next step +{Concrete actionable direction for the next investigator, or N/A.} +``` + +**Outcome values:** +| Value | When to use | +|-------|------------| +| `FIXED` | Fix verified, PR created | +| `INCONCLUSIVE` | Debate unresolved — no fix applied | +| `PARTIAL_FIX` | Fix applied, tests still failed, reverted | +| `RETURNED_TO_QUEUE` | SUT / AMBIGUOUS / INFRA classification | +| `CLOSED_SUBTEST` | Failure originates in a `t.Run` subtest, not the top-level function | +| `ABANDONED` | Mid-flight stop for any reason | + +--- + +## Parsing previous-attempt comments + +Scan JIRA comments for `## Investigation Update — {OUTCOME}`. For each match extract: + +- `outcome` — the OUTCOME token from the heading +- `date` — the date after `·` +- `full_text` — the full comment text +- `excluded_approaches` — content of `### What was tried` (skip if "N/A") +- `rejection_reasons` — content of `### Why it didn't hold` (skip if "N/A") +- `recommended_next_step` — content of `### Recommended next step` (null if "N/A") +- `summary` — content of `### What was investigated` + +Fall back to keyword scanning for non-standard-format comments. 
diff --git a/tools/test/.agents/skills/_shared-jira-flaky-ops/recheck-ownership.md b/tools/test/.agents/skills/_shared-jira-flaky-ops/recheck-ownership.md new file mode 100644 index 00000000000..8354139823a --- /dev/null +++ b/tools/test/.agents/skills/_shared-jira-flaky-ops/recheck-ownership.md @@ -0,0 +1,35 @@ +--- +name: recheck-ownership +description: Verify that a claimed ticket is still assigned to the current user before touching files or pushing. Used in phase4 (before applying fixes) and phase5 (before pushing). +--- + +# Recheck Ownership + +## Inputs + +- `jira_key` — the JIRA ticket key +- `accountId` — the current user's Atlassian account ID (cached from `phase_outputs.phase0`) + +## Steps + +1. Call `mcp__atlassian__getJiraIssue` with `fields=["assignee"]`. +2. Compare `assignee.accountId` to `accountId`. + +## Output + +```json +{ "result": "ok" } +``` + +or if reassigned: + +```json +{ "result": "reassigned", "reassigned_to": "" } +``` + +## Caller responsibility + +If `result = "reassigned"`: +- Report: *"KEY-NNN is now assigned to {displayName} — reach out before proceeding."* +- Follow `abandon-ticket.md` for this ticket. +- Continue with remaining issues. diff --git a/tools/test/.agents/skills/_shared-jira-flaky-ops/transition-ticket.md b/tools/test/.agents/skills/_shared-jira-flaky-ops/transition-ticket.md new file mode 100644 index 00000000000..fcae85fb18a --- /dev/null +++ b/tools/test/.agents/skills/_shared-jira-flaky-ops/transition-ticket.md @@ -0,0 +1,33 @@ +--- +name: transition-ticket +description: Generic transition operation for flaky-test JIRA tickets. Takes a semantic target state, resolves it to the actual transition name, and applies it. Referenced by claim-ticket.md and abandon-ticket.md. +--- + +# Transition Ticket + +## Inputs + +- `jira_key` — the JIRA ticket key (e.g. `CRE-5719`) +- `target` — semantic state: `"In Progress"` | `"In Review"` | `"Open"` | `"Won't Do"` | `"Done"` + +## Steps + +1. 
Call `mcp__atlassian__getTransitionsForJiraIssue` with `jira_key`. +2. Match `target` to an available transition using the alias table below. Pick the **first alias in table order** that matches a transition name in the response. +3. Call `mcp__atlassian__transitionJiraIssue` with the matched transition ID. + - For closing targets (`"Won't Do"`, `"Done"`): if the transition supports a `resolution` field, set it to match the target — for `"Won't Do"` set `resolution = "Won't Do"` (fallback: `"Won't Fix"`); for `"Done"` set `resolution = "Done"` (fallback: `"Fixed"`). +4. Output: `{ "success": true, "transition_name_used": "<matched transition name>" }` on success, or `{ "success": false, "error": "<reason>", "available_transitions": ["..."] }` if no alias matched. + +## Target alias table + +| Semantic target | Try these names in order | +|----------------|--------------------------| +| `In Progress` | "In Progress", "In Development", "Active", "Start Progress" | +| `In Review` | "In Review", "In Code Review", "Code Review", "Review" | +| `Open` | "Open", "Reopen", "Backlog", "To Do", "Reopened" | +| `Won't Do` | "Won't Do", "Won't Fix", "Reject", "Close", "Done" | +| `Done` | "Done", "Closed", "Resolved", "Close", "Resolve" | + +## Error handling + +If no alias matches any available transition: log all available transition names and return `success: false`. The caller decides how to proceed (stop, skip, or pick manually). diff --git a/tools/test/.agents/skills/_shared-jira-flaky-ops/validate-flaky-ticket.md b/tools/test/.agents/skills/_shared-jira-flaky-ops/validate-flaky-ticket.md new file mode 100644 index 00000000000..df66a95800e --- /dev/null +++ b/tools/test/.agents/skills/_shared-jira-flaky-ops/validate-flaky-ticket.md @@ -0,0 +1,56 @@ +--- +name: validate-flaky-ticket +description: Validates a single explicitly-provided JIRA ticket key and builds its slim record. Returns structured status so the caller can handle ownership conflicts. Extracted from phase2b. +--- + +# Validate Flaky Ticket + +## Inputs + +- `jira_key` — the JIRA ticket key (e.g. 
`CRE-5719`) +- `ci_run_url` — GitHub Actions run URL from `KEY@URL` syntax, or null if not provided +- `cloudId` — Atlassian cloud ID +- `current_repo` — `{owner}/{repo}` extracted from `git remote get-url origin` +- `nav_tool` — `"lsp"` | `"crg"` (from `phase_outputs.phase0`) +- `lsp_available` — boolean (from `phase_outputs.phase0`) +- repo root path + +## Output + +```json +{ + "status": "ok" | "error" | "needs_assignment_check", + "message": "string (errors only)", + "slim_record": { /* see README.md slim-record schema — ok and needs_assignment_check only */ }, + "assignee_display_name": "string (needs_assignment_check only)", + "current_status": "string (needs_assignment_check only)" +} +``` + +Never return raw JIRA API objects. The slim record schema is defined in `README.md`. + +## Steps + +1. **Existence check**: Call `mcp__atlassian__getJiraIssue` with `fields=["summary","description","comment","status","assignee","customfield_13010","customfield_13009","customfield_13007"]` (native string array). Reuse this single response for all subsequent steps. + - Issue not found → `{ "status": "error", "message": "Issue KEY-NNN not found — the project may not exist or the ticket number is invalid." }` + +2. **Required data check**: The issue must contain both: + - A test function name: `customfield_13007` non-null, OR a `TestXxx`/`testXxx` token in title or description. + - A `testCaseId`: `customfield_13010` non-null, OR a Trunk URL matching `https://app.trunk.io/*/test/{UUID}` in description. + - Either missing → `{ "status": "error", "message": "KEY-NNN is missing required data: {test name | Trunk ID | both}. Cannot reliably investigate without it." }` + +3. **Repo compatibility check** (stop at first definitive result): + a. Read `customfield_13009`. If present, extract `{owner}/{repo}` from the 1st + 2nd path segments after `github.com/`. 
Mismatch → `{ "status": "error", "message": "KEY-NNN specifies repo '{owner}/{repo}' which does not match the current repository." }`. Only fall back to scanning description for `https://github.com/{owner}/{repo}` or a `Repo:`/`Repository:` field if `customfield_13009` is absent. + b. **Test function check**: use top-level function from `customfield_13007` (part before first `/`), falling back to longest `TestXxx`/`testXxx` token in title. Check locally: LSP → code-review-graph → grep (last resort only). + - Not found → `{ "status": "error", "message": "Test function not found in the current repository for KEY-NNN." }` + +4. **System-tests exclusion**: If `customfield_13009` starts with `github.com/smartcontractkit/chainlink/system-tests/` → `{ "status": "error", "message": "KEY-NNN is in the system-tests package ({package}), which is excluded from automated investigation." }` + +5. **Assignment check**: If the issue is assigned to another user AND status ≠ `Open` → return `{ "status": "needs_assignment_check", "assignee_display_name": "...", "current_status": "...", "slim_record": {...} }`. Include the slim record so the parent can proceed without re-fetching if the user confirms. + +6. **Build slim record** from the step 1 response — no additional API call. + - `jira_key` non-null, `local_id` null. + - `ci_run_url` from input parameter (null if not provided). + - `provided_log_path`, `provided_log_text` null. + - `previous_attempts`: parse per `investigation-comment.md` parsing rules. + - Return `{ "status": "ok", "slim_record": {...} }`. 
diff --git a/tools/test/.agents/skills/backlog-flaky-test-pipeline/SKILL.md b/tools/test/.agents/skills/backlog-flaky-test-pipeline/SKILL.md index 3406c7be6ff..55700089114 100644 --- a/tools/test/.agents/skills/backlog-flaky-test-pipeline/SKILL.md +++ b/tools/test/.agents/skills/backlog-flaky-test-pipeline/SKILL.md @@ -1,29 +1,33 @@ --- name: backlog-flaky-test-pipeline description: >- - A high-automation, multi-agent workflow specifically for resolving flaky tests - tracked in JIRA and analyzed by Trunk.io. + A high-automation, multi-agent workflow for resolving flaky tests. Supports + three modes: JIRA project backlog, direct JIRA ticket IDs, and local benchmark + mode (no JIRA or Trunk required). USE THIS WHEN: 1. You have specific JIRA ticket IDs (e.g., CRE-123) or want to pull N tickets from a project backlog. 2. The failure is identified as "flaky" by Trunk.io analysis. 3. You need an end-to-end pipeline (fetch -> debate -> fix -> PR -> JIRA update). + 4. You want a one-shot multi-agent fix attempt on named tests without JIRA (--local benchmark mode). DO NOT USE THIS WHEN: - 1. Investigating general local failures, race conditions, or timeouts not linked to a JIRA ticket. - 2. You are performing manual, exploratory debugging of a single test file (use 'debug-test-failure' instead). + 1. Iterative, interactive debugging of a single test file — use 'debug-test-failure' instead. + 2. You need an exploratory diagnostic loop rather than a structured fix attempt. --- -End-to-end workflow: fetch flaky-test JIRA tickets → run Trunk + code analysis in parallel → apply fixes → commit → update JIRA with the PR. +End-to-end workflow: fetch flaky-test JIRA tickets → run Trunk + code analysis in parallel → apply fixes → commit → update JIRA with the PR. In local mode: take test names directly, skip JIRA + Trunk, stop after fix verification. 
- `/backlog-flaky-test-pipeline [KEY [N]] [--auto]` — project + count mode - `/backlog-flaky-test-pipeline PROJ-NNN[@] [PROJ-NNN[@] ...] [--auto]` — direct-ticket mode (e.g. `CRE-5719 CCIP-42@https://github.com/.../actions/runs/123`). The optional `@` attaches a CI run URL for `investigate-ci-failure` analysis (direct-ticket mode only). +- `/backlog-flaky-test-pipeline --local .TestName [.TestName/subtest ...] [--log ] [--auto]` — local benchmark mode. Takes test specs directly; no JIRA or Trunk required. Stops after phase4 verification (no commit, no PR). Use for benchmarking this skill against other tools. - `KEY` — JIRA project key (e.g. `CRE`). Skips Phase 1 prompt if provided. - `N` — number of issues (default 3). Skips Phase 1 prompt if provided. - `--auto` — accept all defaults at every gate; only blocks on hard failures and ownership conflicts. +- `--log ` — (local mode only) path to a log file whose contents become seed evidence for analysis. @@ -51,8 +55,8 @@ This skill is a multi-phase workflow; later phases depend on outputs from earlie - - `mode`: `project` | `direct-ticket` - - `args`: original user input verbatim (project key, count, ticket list, flags) + - `mode`: `project` | `direct-ticket` | `local` + - `args`: original user input verbatim (project key, count, ticket list, flags). In local mode: `{ test_specs: [".", ...], log_path: " | null" }`. - `auto_mode`: boolean @@ -64,20 +68,23 @@ This skill is a multi-phase workflow; later phases depend on outputs from earlie One row per ticket. Maintain as a table, not a paragraph. Required fields: - - `jira_key` (e.g. `CRE-5719`) - - `test_case_id` (UUID from `customfield_13010`) + - `jira_key` (e.g. `CRE-5719`) — null in local mode + - `local_id` (e.g. 
`local-1`) — null in JIRA modes + - `test_case_id` (UUID from `customfield_13010`) — null in local mode - `test_name`, `package_path` - - `trunk_investigation_id` + - `trunk_investigation_id` — null in local mode - `actionable_facts` — `fix-flaky-test` facts with `Confidence ≥ 0.9` only - - `ci_run_url` — GitHub Actions run URL for `investigate-ci-failure` (null in project mode; populated from `KEY@URL` syntax or 3a fallback prompt) - - `ci_run_evidence` — structured failure data from `investigate-ci-failure` (null if no URL provided or call failed). Separate evidence track — exempt from the ≥ 0.9 rule. + - `ci_run_url` — GitHub Actions run URL for `investigate-ci-failure` (null in project mode and local mode; populated from `KEY@URL` syntax or 3a fallback prompt in direct-ticket mode) + - `ci_run_evidence` — structured failure data from `investigate-ci-failure` (null if no URL provided or call failed). Separate evidence track — exempt from the ≥ 0.9 rule. Always null in local mode. + - `provided_log_path` — path supplied via `--log` in local mode; null in JIRA modes + - `provided_log_text` — contents of the log file read once; null if no log or file missing - `shared_cause_group` — id linking tickets in the same package that share a root cause (see `shared-package-cause` tip) - `chosen_fix` — { approach, rationale } - `applied_fix` — { files_changed, diff_summary } - `lint_status` — `"ran"` | `"skipped"` | `"failed"` - `lint_scope` — package path used for linting (e.g. 
`./core/chains/evm/txmgr/...`) - `verification` — { local_10x_passed, ci_status } - - `branch`, `commit_sha`, `pr_url` + - `branch`, `commit_sha`, `pr_url` — null in local mode diff --git a/tools/test/.agents/skills/backlog-flaky-test-pipeline/references/phase-final-local.md b/tools/test/.agents/skills/backlog-flaky-test-pipeline/references/phase-final-local.md new file mode 100644 index 00000000000..ae37805a28f --- /dev/null +++ b/tools/test/.agents/skills/backlog-flaky-test-pipeline/references/phase-final-local.md @@ -0,0 +1,40 @@ +--- +phase: phase-final-local +model: haiku +--- + + + + +Print the local-mode session summary and end. No commits, no push, no PR, no JIRA writes. Reached only from local mode after phase4 completes. + + + +Print the following table, populated from `ticket_records`: + +``` +Session complete (local mode — no JIRA, no PR). + +Fix results: +| Test | Verdict | Local 10x | Notes | +|------|---------|-----------|-------| +| .TestFoo | FIXED | 10/10 | diff retained, uncommitted | +| .TestBar | PARTIAL_FIX | 4/10 | reverted | +| .TestBaz | SKIPPED | — | classified SUT | +| .TestQux | INCONCLUSIVE | — | debate did not converge | +``` + +Column rules: +- **Test**: `{package}.{test_name}` (use `local_id` as fallback if package is null). +- **Verdict**: FIXED | PARTIAL_FIX | SKIPPED | INCONCLUSIVE | MISMATCH. +- **Local 10x**: pass count out of 10 for FIXED/PARTIAL_FIX; `—` for others. +- **Notes**: one short phrase. For FIXED: "diff retained, uncommitted". For PARTIAL_FIX: "reverted". For SKIPPED: the classification reason (e.g. "classified SUT", "classified AMBIGUOUS", "SKIP_TOP_LEVEL"). For INCONCLUSIVE: "debate did not converge". For MISMATCH: "stack trace stale". + + +
+After the table, print: + +> FIXED diffs are uncommitted in your working tree. Review with `git diff` and commit manually if you want to keep them. +
+ +
diff --git a/tools/test/.agents/skills/backlog-flaky-test-pipeline/references/phase0-prerequisites.md b/tools/test/.agents/skills/backlog-flaky-test-pipeline/references/phase0-prerequisites.md index b4069bdf4b4..f0aa7254a3e 100644 --- a/tools/test/.agents/skills/backlog-flaky-test-pipeline/references/phase0-prerequisites.md +++ b/tools/test/.agents/skills/backlog-flaky-test-pipeline/references/phase0-prerequisites.md @@ -9,6 +9,15 @@ model: haiku Check each required tool before proceeding. Stop with setup instructions if any hard requirement fails. On success, write `phase_outputs.phase0` with `nav_tool`, `lsp_available`, and `golangci_lint_available`. + +Scan the raw invocation args for the literal token `--local`. If found: +- Set `invocation_mode_hint = "local"`. +- Skip the `trunk-mcp` and `atlassian-mcp` checks below entirely (JIRA and Trunk are not needed). +- Still run every other check below (`golangci-lint`, the `diagnose` tool, and `code-navigation`) — local mode runs through phase 4 and requires all of them. + +Phase 1 performs the authoritative mode parse; this pre-detect only prevents hard-stopping on missing MCP servers when they are irrelevant. + + @@ -59,6 +68,18 @@ If not found, determine the install path and stop: ``` + +Run `go -C tools/test run . diagnose -h` (exit code 0 expected; help text on stdout). + +If it fails → stop: +``` +The chainlink `diagnose` tool is not available at tools/test/. Phase 4 relies on it to +verify fixes (10x iteration runner with AI-readable output). Verify you are running +this skill from the chainlink repo root and that tools/test/ contains the diagnose +command. +``` + + At least one code navigation tool must work. 
diff --git a/tools/test/.agents/skills/backlog-flaky-test-pipeline/references/phase1-input.md b/tools/test/.agents/skills/backlog-flaky-test-pipeline/references/phase1-input.md index 302764a933c..80c120c2d64 100644 --- a/tools/test/.agents/skills/backlog-flaky-test-pipeline/references/phase1-input.md +++ b/tools/test/.agents/skills/backlog-flaky-test-pipeline/references/phase1-input.md @@ -12,9 +12,15 @@ Resolve invocation mode and arguments. Write `invocation.mode` and `invocation.a Check arguments in order — stop at the first match: -1. Any argument matches `PROJ-NNN` or `PROJ-NNN@` (e.g. `CRE-5719`, `CCIP-42@https://github.com/.../actions/runs/123`) → **direct-ticket mode**, args = list of `{ key, ci_run_url }` pairs. For each token: split on the first `@`. Left side is the JIRA key (must match `PROJ-NNN`); right side, if present, is the CI run URL — store as-is, do not validate the URL here. Skip prompt. -2. Both `KEY` and `N` were provided → **project mode**, args = `{ key, n }`. Skip prompt. -3. Neither matched → **ask the user**: +1. **`--local` flag present** → **local-test mode**. Parse remaining args as: + - Test specs: any token not starting with `--` and not immediately following `--log`. Format: `.TestName` (period separator) or bare `TestName`. Subtest segments use `/`. + - `--log `: the token immediately following `--log` is the log file path. + - Validate: at least one test spec is present. If none found, re-prompt: *"Local mode requires at least one test spec. Example: `--local core/services/llo.TestFoo`"* + - Set `args = { test_specs: ["..."], log_path: " | null" }`. Skip prompt. + +2. Any argument matches `PROJ-NNN` or `PROJ-NNN@` (e.g. `CRE-5719`, `CCIP-42@https://github.com/.../actions/runs/123`) → **direct-ticket mode**, args = list of `{ key, ci_run_url }` pairs. For each token: split on the first `@`. Left side is the JIRA key (must match `PROJ-NNN`); right side, if present, is the CI run URL — store as-is, do not validate the URL here. 
Skip prompt. +3. Both `KEY` and `N` were provided → **project mode**, args = `{ key, n }`. Skip prompt. +4. None of the above matched → **ask the user**: **Two modes available:** @@ -37,6 +43,7 @@ If `N > 5`: suggest a lower number and wait for confirmation before proceeding. Write to `invocation`: `{ mode, args, auto_mode }`. +- Local mode → Read [phase2d-local-mode.md](phase2d-local-mode.md) and follow its instructions. - Project mode → Read [phase2a-project-mode.md](phase2a-project-mode.md) and follow its instructions. - Direct-ticket mode → Read [phase2b-direct-mode.md](phase2b-direct-mode.md) and follow its instructions. diff --git a/tools/test/.agents/skills/backlog-flaky-test-pipeline/references/phase2a-project-mode.md b/tools/test/.agents/skills/backlog-flaky-test-pipeline/references/phase2a-project-mode.md index e7e3bce3a7c..4be11275a98 100644 --- a/tools/test/.agents/skills/backlog-flaky-test-pipeline/references/phase2a-project-mode.md +++ b/tools/test/.agents/skills/backlog-flaky-test-pipeline/references/phase2a-project-mode.md @@ -11,7 +11,7 @@ Fetch and filter JIRA issues for a project key, resolve each to a local test fun -Read [shared-jira-protocol.md](shared-jira-protocol.md) before parsing any previous-investigation comments. It defines the comment format, parsing rules, and abandonment procedure. +Read [../../_shared-jira-flaky-ops/investigation-comment.md](../../_shared-jira-flaky-ops/investigation-comment.md) before parsing any previous-investigation comments. It defines the comment format and parsing rules. @@ -20,88 +20,18 @@ Run `git remote get-url origin` once. Extract `{owner}/{repo}` from the URL. Cac - -`KEY`, `N`, `cloudId`, `current_repo`, `nav_tool`, `lsp_available` (from `phase_outputs.phase0`), repo root path. - +Spawn this subagent following `_shared-jira-flaky-ops/fetch-flaky-tickets.md`. Read that file for the full loop, filtering rules, and field extraction logic. 
- -```json -{ - "slim_records": [...], - "skipped": { "cross_repo": int, "system_tests": int, "not_found": int } -} -``` -Never return raw JIRA API objects — parent only receives slim records. - - - -``` -results = [] -cursor = null -while len(results) < N: - fetch N issues via mcp__atlassian__searchJiraIssuesUsingJql: - jql: project = {KEY} AND labels = "flaky-test" AND status = "Open" ORDER BY created DESC - fields: ["summary", "description", "comment", "status", "assignee", - "customfield_13010", "customfield_13009", "customfield_13007"] - maxResults: N - nextPageToken: cursor (omit on first call) - - for each issue (in order): - 1. Repo check (zero-cost): extract {owner}/{repo} from customfield_13009 - (2nd+3rd segments after github.com/). Mismatch → skip (cross_repo++). - If customfield_13009 absent, scan description for - https://github.com/{owner}/{repo} or a "Repo:" / "Repository:" field. +Inputs to pass: `KEY`, `N`, `cloudId`, `current_repo`, `nav_tool`, `lsp_available` (from `phase_outputs.phase0`), repo root path. - 2. System-tests exclusion (zero-cost): if customfield_13009 starts with - github.com/smartcontractkit/chainlink/system-tests/ → skip (system_tests++). - - 3. Test function check: extract top-level function name from customfield_13007 - (part before first /), fall back to longest TestXxx token in title if absent. - - nav_tool="lsp" or lsp_available=true: LSP definition lookup - - nav_tool="crg": mcp__code-review-graph__semantic_search_nodes_tool - - last resort only: grep -rl "func {TestName}" . - Not found → skip (not_found++). - - 4. Eligible: build slim record (see schema below), append to results. - Stop once len(results) == N. 
- - cursor = nextPageToken from response - if no more pages: break -``` - - - +Expected output: ```json { - "key": "KEY-NNN", - "title": "...", - "description": "...", - "trunk_test_case_url": "https://app.trunk.io/.../test/{UUID}", - "test_case_id": "{UUID}", - "package": "github.com/owner/repo/...", - "test_name": "TestFoo/subtest_name", - "previous_attempts": [{ - "outcome": "str", - "date": "str", - "summary": "str", - "excluded_approaches": ["str"], - "rejection_reasons": ["str"], - "recommended_next_step": "str | null", - "full_text": "str" - }] + "slim_records": [ /* see _shared-jira-flaky-ops/README.md for schema */ ], + "skipped": { "cross_repo": 0, "system_tests": 0, "not_found": 0 } } ``` -Field extraction rules: -- `test_case_id`: `customfield_13010` (bare UUID). If absent, extract UUID from `https://app.trunk.io/*/test/{UUID}` in description. Null only if neither yields a value. -- `package`: `customfield_13009`. Null if absent. -- `test_name`: `customfield_13007` (full path including subtest, e.g. `TestFoo/subtest`). If absent, longest `TestXxx`/`testXxx` token from title. -- `trunk_test_case_url`: scan description for `https://app.trunk.io/*/test/{UUID}`; null if not found. Display only. -- `previous_attempts`: parse per `shared-jira-protocol.md`. -- `ci_run_url`: always `null` in project mode (no upfront URL syntax). The 3a fallback may prompt for one later. -- If any custom field is absent from the search response, call `mcp__atlassian__getJiraIssue` with `fields=["summary","description","comment","status","assignee","customfield_13010","customfield_13009","customfield_13007"]` for that issue as a fallback. 
- - diff --git a/tools/test/.agents/skills/backlog-flaky-test-pipeline/references/phase2b-direct-mode.md b/tools/test/.agents/skills/backlog-flaky-test-pipeline/references/phase2b-direct-mode.md index ae47795cbf6..cd0e6e2abaa 100644 --- a/tools/test/.agents/skills/backlog-flaky-test-pipeline/references/phase2b-direct-mode.md +++ b/tools/test/.agents/skills/backlog-flaky-test-pipeline/references/phase2b-direct-mode.md @@ -11,7 +11,7 @@ Validate each explicitly-provided ticket, build slim records for eligible ones, -Read [shared-jira-protocol.md](shared-jira-protocol.md) before parsing any previous-investigation comments. It defines the comment format, parsing rules, and abandonment procedure. +Read [../../_shared-jira-flaky-ops/investigation-comment.md](../../_shared-jira-flaky-ops/investigation-comment.md) before parsing any previous-investigation comments. It defines the comment format and parsing rules. @@ -20,43 +20,20 @@ Run `git remote get-url origin` once. Extract `{owner}/{repo}`. Cache as `curren - -Ticket key, optional `ci_run_url` (from phase1 `KEY@URL` parsing — null if not provided), `cloudId`, `current_repo`, `nav_tool`, `lsp_available` (from `phase_outputs.phase0`), repo root path. - +Spawn one subagent per ticket, following `_shared-jira-flaky-ops/validate-flaky-ticket.md`. Read that file for the full validation steps and field extraction rules. - +Inputs to pass per ticket: ticket key, `ci_run_url` (from phase1 `KEY@URL` parsing — null if not provided), `cloudId`, `current_repo`, `nav_tool`, `lsp_available` (from `phase_outputs.phase0`), repo root path. + +Expected output per subagent: ```json { "status": "ok" | "error" | "needs_assignment_check", "message": "string (errors only)", - "slim_record": { ... }, + "slim_record": { /* see _shared-jira-flaky-ops/README.md for schema */ }, "assignee_display_name": "string (needs_assignment_check only)", "current_status": "string (needs_assignment_check only)" } ``` -Never return raw JIRA API objects. 
The slim record schema is identical to phase2a. - - - -1. **Existence check**: Call `mcp__atlassian__getJiraIssue` with `fields=["summary","description","comment","status","assignee","customfield_13010","customfield_13009","customfield_13007"]` (native string array). Reuse this single response for all subsequent steps. - - Issue not found → `{ "status": "error", "message": "Issue KEY-NNN not found — the project may not exist or the ticket number is invalid." }` - -2. **Required data check**: The issue must contain both: - - A test function name: `customfield_13007` non-null, OR a `TestXxx`/`testXxx` token in title or description. - - A `testCaseId`: `customfield_13010` non-null, OR a Trunk URL matching `https://app.trunk.io/*/test/{UUID}` in description. - - Either missing → `{ "status": "error", "message": "KEY-NNN is missing required data: {test name | Trunk ID | both}. Cannot reliably investigate without it." }` - -3. **Repo compatibility check** (stop at first definitive result): - a. Read `customfield_13009`. If present, extract `{owner}/{repo}` from 2nd + 3rd path segments after `github.com/`. Mismatch → `{ "status": "error", "message": "KEY-NNN specifies repo '{owner}/{repo}' which does not match the current repository." }`. Only fall back to scanning description for `https://github.com/{owner}/{repo}` or a `Repo:`/`Repository:` field if `customfield_13009` is absent. - b. **Test function check**: use top-level function from `customfield_13007` (part before first `/`), falling back to longest `TestXxx`/`testXxx` token in title. Check locally: LSP → code-review-graph → grep (last resort only). - - Not found → `{ "status": "error", "message": "Test function not found in the current repository for KEY-NNN." }` - -4. 
**System-tests exclusion**: If `customfield_13009` starts with `github.com/smartcontractkit/chainlink/system-tests/` → `{ "status": "error", "message": "KEY-NNN is in the system-tests package ({package}), which is excluded from automated investigation." }` - -5. **Assignment check**: If the issue is assigned to another user AND status ≠ `Open` → return `{ "status": "needs_assignment_check", "assignee_display_name": "...", "current_status": "...", "slim_record": {...} }`. Include the slim record so the parent can proceed without re-fetching if the user confirms. - -6. **Build slim record** from the step 1 response — no additional API call. Field extraction rules are identical to phase2a. Set `slim_record.ci_run_url` from the input parameter (null if not provided). Return `{ "status": "ok", "slim_record": {...} }`. - diff --git a/tools/test/.agents/skills/backlog-flaky-test-pipeline/references/phase2c-prior-gate.md b/tools/test/.agents/skills/backlog-flaky-test-pipeline/references/phase2c-prior-gate.md index b63d94f06b1..5ac7b2a7705 100644 --- a/tools/test/.agents/skills/backlog-flaky-test-pipeline/references/phase2c-prior-gate.md +++ b/tools/test/.agents/skills/backlog-flaky-test-pipeline/references/phase2c-prior-gate.md @@ -11,7 +11,7 @@ Gate on prior investigation history, collect user decisions for affected tickets -Read [shared-jira-protocol.md](shared-jira-protocol.md) before parsing any previous-investigation comments. It defines the comment format, parsing rules, and abandonment procedure. +Read [../../_shared-jira-flaky-ops/investigation-comment.md](../../_shared-jira-flaky-ops/investigation-comment.md) before parsing any previous-investigation comments. It defines the comment format and parsing rules. @@ -46,11 +46,7 @@ In `--auto` mode: automatically choose (a) for all and log: *"Prior investigatio -For each approved ticket — assign before transitioning (serialize in this order): - -1. `mcp__atlassian__editJiraIssue` → assign to cached `accountId`. 
Wait for success. -2. `mcp__atlassian__getTransitionsForJiraIssue` → find "In Progress" (aliases: "In Development", "Active"). If no match: log all available transitions and stop. -3. `mcp__atlassian__transitionJiraIssue` → transition to "In Progress". +For each approved ticket, follow `_shared-jira-flaky-ops/claim-ticket.md` with `jira_key` and `accountId`. Announce: "Claimed K issues: [KEY-1, KEY-2, ...]. Proceeding to investigation." diff --git a/tools/test/.agents/skills/backlog-flaky-test-pipeline/references/phase2d-local-mode.md b/tools/test/.agents/skills/backlog-flaky-test-pipeline/references/phase2d-local-mode.md new file mode 100644 index 00000000000..cf2a6925f54 --- /dev/null +++ b/tools/test/.agents/skills/backlog-flaky-test-pipeline/references/phase2d-local-mode.md @@ -0,0 +1,83 @@ +--- +phase: phase2d +model: haiku +--- + + + + +Build slim records for local mode (no JIRA, no Trunk). One record per test spec provided via `--local`. Replaces phases 2a/2b/2c for local invocations. Populates `ticket_records` in context. + + + +This phase only runs when `invocation.mode = "local"`. It never touches JIRA or Trunk. No tickets are claimed. No prior-attempt gating applies. + + + + + +Parse the test specs from `invocation.args.test_specs` (populated by phase1). Each spec has the form `<package>.<TestName>` (period separator) or bare `TestName` (no package prefix). Subtest segments use `/`. + +For each spec: +- If a package prefix is present: split at the period immediately preceding the test function name (the package path may itself contain periods, e.g. `github.com/...`); left side = package import path, right side = test function name (including any `/subtest` suffix). +- If no package prefix is present: treat the whole token as `TestName`; package will be resolved via code-nav in the next step. + + + +For each test spec, locate the test function in the repo: + +1. **LSP definition lookup** (if `nav_tool = "lsp"` or `lsp_available = true`): look up the definition of `func {TestName}`. +2. **code-review-graph** (if `nav_tool = "crg"`): `mcp__code-review-graph__semantic_search_nodes_tool` with the test name. +3. 
**Last resort**: `grep -r "func {TestName}" .` — parse first `filepath:line`, warn if multiple matches. + +If not found after all three: report *"Test function {TestName} not found in the repo — skipping."* and do not create a record for this spec. Continue with remaining specs. + +If the package was not provided in the spec, infer it from the found file path (convert file path to Go import path using the module path from `go.mod`). + + + +If `invocation.args.log_path` is non-null and has not yet been read: +- Read the file once into a single string `provided_log_text`. +- If the file is missing: warn user *"--log file not found at {path}; proceeding without log evidence."* and set `provided_log_text = null`. + +Apply to all records (one log file shared across all test specs). + + + +For each located test, build a slim record conforming to the schema in `_shared-jira-flaky-ops/README.md`: + +```json +{ + "jira_key": null, + "local_id": "local-{N}", + "title": "{TestName}", + "description": "", + "trunk_test_case_url": null, + "test_case_id": null, + "package": "<resolved Go import path>", + "test_name": "<TestName, including any /subtest suffix>", + "previous_attempts": [], + "ci_run_url": null, + "provided_log_path": "<log_path> | null", + "provided_log_text": "<log file contents> | null" +} +``` + +Assign `local_id` values sequentially starting at `local-1`. + + + +Write all built records to `ticket_records` in context. + +Announce: *"Local mode: {N} tests prepared. No JIRA tickets claimed. Proceeding to investigation."* + +If N = 0 (no spec could be located): stop. Nothing to investigate. + + + + + +Read [phase3-investigation.md](phase3-investigation.md) and follow its instructions. 
+ + + diff --git a/tools/test/.agents/skills/backlog-flaky-test-pipeline/references/phase3-investigation.md b/tools/test/.agents/skills/backlog-flaky-test-pipeline/references/phase3-investigation.md index e95498b1e5e..7d307a804ac 100644 --- a/tools/test/.agents/skills/backlog-flaky-test-pipeline/references/phase3-investigation.md +++ b/tools/test/.agents/skills/backlog-flaky-test-pipeline/references/phase3-investigation.md @@ -11,11 +11,15 @@ Run parallel per-ticket investigations: extract Trunk data, analyze test code, c -Read [shared-jira-protocol.md](shared-jira-protocol.md) if not already loaded in this session. Required for Investigation Update comment format and mid-flight abandonment procedure. +**If `invocation.mode = "local"`: skip this section entirely — no JIRA writes occur in local mode.** + +Otherwise (JIRA modes only): +- Read [../../_shared-jira-flaky-ops/investigation-comment.md](../../_shared-jira-flaky-ops/investigation-comment.md) if not already loaded — required for Investigation Update comment format and parsing rules. +- Read [../../_shared-jira-flaky-ops/abandon-ticket.md](../../_shared-jira-flaky-ops/abandon-ticket.md) if not already loaded — required for mid-flight abandonment procedure. -Spawn all N per-ticket investigation subagents in a **single message**. Each receives: the slim record (`key`, `title`, `description`, `trunk_test_case_url`, `test_name`, `package`, `previous_attempts`), `nav_tool`, `lsp_available`, `auto_mode`. Never pass raw JIRA API objects. +Spawn all N per-ticket investigation subagents in a **single message**. Each receives: the slim record (`jira_key`, `local_id`, `title`, `description`, `trunk_test_case_url`, `test_name`, `package`, `previous_attempts`, `provided_log_text`), `mode` (`project` | `direct-ticket` | `local`), `nav_tool`, `lsp_available`, `auto_mode`. Never pass raw JIRA API objects. --- @@ -23,6 +27,24 @@ Spawn all N per-ticket investigation subagents in a **single message**. 
Each rec Resolve `testCaseId`, retrieve `fix-flaky-test` historical data, optionally invoke `investigate-ci-failure` for single-run forensic data, and run the top-level subtest check. + +**If `mode = "local"`**: skip the `fix-flaky-test-call` and `investigate-ci-failure-call` blocks entirely. + +- If `slim_record.provided_log_text` is non-null → set `trunk_filtered_facts = [slim_record.provided_log_text]`, `trunk_investigation_status = "user_provided"`. +- If `provided_log_text` is null → **run the chainlink `diagnose` tool** to gather observational evidence locally before falling back to code-analysis-only: + ```bash + go -C tools/test run . diagnose --ai-output --iterations 3 --parallel-iterations 3 -- --run "^{TestName}$" --race --shuffle=on ./{package}/... + ``` + Parse the `--ai-output` summary: + - **At least one iteration failed** → extract the failure-specific portions (error messages, stack traces, race-detector output, timeout reports) into `trunk_filtered_facts` as one or more raw strings. Set `trunk_investigation_status = "diagnose_run"`. + - **All iterations passed** → set `trunk_filtered_facts = []`, `trunk_investigation_status = "uninvestigated"`. Inform user: *"diagnose ran 3 iterations without reproducing the failure for {test_name}; proceeding with code analysis only."* + - **Tool itself failed to run** (missing dependency, build error, etc.) → set `trunk_filtered_facts = []`, `trunk_investigation_status = "uninvestigated"`. Log the error to the user; do not retry. + + Drop `--parallel-iterations` to `1` if the test holds external resources (fixed port, exclusive temp file, etc.) that don't tolerate concurrent runs. Use `--iterations 3` here (not 10) — this is evidence-gathering, not fix verification. + +Proceed directly to the top-level subtest check below. + + - Use `slim_record.test_case_id` as `testCaseId`. If null: call `mcp__trunk__search-test` with `slim_record.test_name` (falling back to `slim_record.title`) as fuzzy fallback — flag if used, it may match the wrong test. 
- Call `mcp__trunk__fix-flaky-test` with `testCaseId` (no `createNewInvestigation`). **Immediately after each response** (including polls), apply the filter — never store or forward the raw blob: @@ -71,15 +93,20 @@ Runs only when `slim_record.ci_run_url` is non-null (after upfront input or fall -After Trunk investigation resolves, inspect `trunk_filtered_facts` (and any stack trace within them) for a `file:line`. If the failure line falls inside a `t.Run(...)` callback AND the outer function contains no assertions outside `t.Run` blocks → candidate for **SKIP_TOP_LEVEL**. +After Trunk investigation resolves (or after the local-mode branch sets `trunk_filtered_facts`), inspect `trunk_filtered_facts` (and any stack trace within them) for a `file:line`. If the failure line falls inside a `t.Run(...)` callback AND the outer function contains no assertions outside `t.Run` blocks → candidate for **SKIP_TOP_LEVEL**. **Exception**: if `slim_record.test_name` contains a `/` the ticket was already filed against a specific subtest — SKIP_TOP_LEVEL must not fire. If `test_name` is null, check the title for `/`. If all conditions are met: -1. `mcp__atlassian__getTransitionsForJiraIssue` → find a closing transition ("Won't Do", "Closed", "Done"). If it supports a resolution field, set `resolution = "Won't Do"` (fallback: "Won't Fix"). -2. `mcp__atlassian__transitionJiraIssue` → close with that transition + resolution. -3. `mcp__atlassian__addCommentToJiraIssue` → Investigation Update comment (OUTCOME = CLOSED_SUBTEST). "What was investigated": failure originates in a `t.Run` subtest, not the top-level function. "Recommended next step": file or locate a ticket for the specific subtest. All other sections: N/A. -4. Stop investigation for this issue. + +**JIRA mode** (mode ≠ "local"): +1. Follow `_shared-jira-flaky-ops/transition-ticket.md` with `jira_key` and `target = "Won't Do"`. +2. 
Follow `_shared-jira-flaky-ops/investigation-comment.md` to post an Investigation Update comment via `mcp__atlassian__addCommentToJiraIssue` (OUTCOME = CLOSED_SUBTEST). "What was investigated": failure originates in a `t.Run` subtest, not the top-level function. "Recommended next step": file or locate a ticket for the specific subtest. All other sections: N/A. +3. Stop investigation for this issue. + +**Local mode** (mode = "local"): +- Print: *"Skipping {test_name} — failure originates in a t.Run subtest, not the top-level function. File a ticket against the specific subtest if you want this fixed."* +- Return verdict `SKIP_TOP_LEVEL` for this record. No JIRA writes. @@ -90,11 +117,13 @@ If all conditions are met: -`trunk_filtered_facts` (already filtered to ≥ 0.9 in 3a), `trunk_investigation_status`. Do not call any Trunk MCP tools. +`trunk_filtered_facts` (already filtered to ≥ 0.9 in 3a, or the user-provided log / `diagnose` output in local mode), `trunk_investigation_status`, `mode`. Do not call any Trunk MCP tools. - If `trunk_investigation_status = "uninvestigated"` or `trunk_filtered_facts` is empty → return `confidence: "none"` with empty `facts`. -- Map confidence from pre-filtered facts: +- If `trunk_investigation_status = "user_provided"` (local mode, user-supplied log) → treat the log content as observational evidence with `confidence: "low"`. It is symptom data, not aggregated CI data. +- If `trunk_investigation_status = "diagnose_run"` (local mode, agent-collected from chainlink `diagnose` tool) → treat the diagnose output as observational evidence with `confidence: "low"`. It is a small local sample (3 iterations), not aggregated CI data. +- Map confidence from pre-filtered facts (JIRA modes): - `"high"`: at least one fact contains raw CI observational data — exact log lines, error messages, stack traces, or specific `file:line` from actual failing runs. - `"low"`: facts describe symptoms only (e.g. "failures in cluster 2 are regex mismatches") but contain no raw CI data. 
- `"none"`: `trunk_filtered_facts` empty or `trunk_investigation_status = "uninvestigated"`. @@ -152,26 +181,26 @@ Two failure classes — validate each subagent individually: Structural failures include: `facts` containing category labels or counts instead of raw text strings (`"CI_LOGS (1.0)"` is a label — invalid; `"Error: no contract code at given address"` is raw text — valid), `confidence` not one of three allowed values, `code_mismatch` not a boolean. -Allow up to **3 total attempts** per subagent. After 3 failures → hard stop for this issue. Apply mid-flight abandonment rule. Write Investigation Update comment (OUTCOME = ABANDONED): state which subagent failed and include the validation error; recommended next step: re-run and include last raw output verbatim. Continue with other issues. +Allow up to **3 total attempts** per subagent. After 3 failures → hard stop for this issue. In **JIRA mode**: follow `_shared-jira-flaky-ops/abandon-ticket.md` then write Investigation Update comment via `_shared-jira-flaky-ops/investigation-comment.md` (OUTCOME = ABANDONED): state which subagent failed and include the validation error; recommended next step: re-run and include last raw output verbatim. In **local mode**: no JIRA writes — just return verdict `ABANDONED` with the error. Continue with other issues. --- - + -Classify flakiness source as TEST / SUT / INFRA / AMBIGUOUS before entering the fix debate. Runs in the parent (not a subagent) because it may require a user gate. +Classify flakiness source as TEST / SUT / INFRA / AMBIGUOUS before entering the fix debate. Runs in the parent (not a subagent) because it may require a user gate. Single LLM call — no scoring, no signal enumeration, no tier ladder. The model examines the available evidence and chooses one classification, with every conclusion grounded in a verbatim quote. 
```json { "$schema": "phase_3bii_input_v1", - "ticket_key": "string", + "record_id": "string (jira_key or local_id)", "test_name": "string", "trunk_filtered_facts": ["string"], - "trunk_investigation_status": "existing | triggered | uninvestigated | ci_run_only", + "trunk_investigation_status": "existing | triggered | uninvestigated | ci_run_only | user_provided | diagnose_run", "subagent_b_output": { "file": "string", "line": "number", "analysis": "string", "suspected_cause": "string", "suspected_cause_location": "test_code | production_code | unknown", @@ -179,60 +208,32 @@ Classify flakiness source as TEST / SUT / INFRA / AMBIGUOUS before entering the } } ``` -**Matching scope rule**: all signal triggers match only against `trunk_filtered_facts` text or stack-trace excerpts — never against test source code, test names, or code comments. - -Deterministic — no LLM judgment. - -**SUT signals:** -| ID | Trigger | Weight | -|----|---------|--------| -| SUT_SERVICE_UNAVAILABLE | "connection refused", "service unavailable", "dial tcp", "reset by peer"; OR "EOF" co-occurring within 200 chars of any of {grpc, rpc, dial, net., tcp, http} | 2 | -| SUT_COMPONENT_NOT_INITIALIZED | "nil pointer dereference", "not initialized", "component not ready" — excerpt must be in a production code frame (not `_test.go`) | 1 | -| SUT_CONSISTENT_PROD_CODE_FAILURE | Same production-code `file:line` appears in stack traces from ≥ 2 distinct CI runs in `trunk_filtered_facts` | 1 | - -**TEST signals** (all weight 1): -| ID | Trigger | -|----|---------| -| TEST_SHARED_GLOBAL_STATE | Package-level var, global map, or singleton mutated without `t.Cleanup` restore | -| TEST_PARALLEL_UNSYNC | `t.Parallel()` present with shared resource used without sync primitive | -| TEST_TIMING_DEPENDENCY | `time.Sleep` or fixed-duration delay used to synchronize async behavior | -| TEST_MISSING_CLEANUP | Resource setup (server, DB, goroutine) without corresponding Cleanup/defer | -| TEST_RACE_DETECTOR_FIRED | 
`DATA RACE` or `RACE CONDITION DETECTED` in `trunk_filtered_facts` | - -**INFRA signals** (any match → INFRA, overrides scoring): -| ID | Trigger | -|----|---------| -| INFRA_OOM_KILLED | "signal: killed", "OOM", "out of memory", "exit status 137" | -| INFRA_DISK_FULL | "no space left on device", "disk full" | -| INFRA_REGISTRY_FAILURE | "pulling image", "registry", "manifest unknown", "pull access denied" | - - - -| ID | Trigger | Weight | -|----|---------|--------| -| SUT_PRECONDITION_NOT_MET | LLM determines the failure indicates a precondition was unsatisfied — a dependency wasn't available, a registration hadn't completed, a service hadn't started — rather than the SUT behaving incorrectly *during* the test scenario. LLM must quote a verbatim excerpt from `trunk_filtered_facts` and provide a one-sentence explanation. | 2 | - -Ask: *"Does this failure indicate that a precondition for the test wasn't met (something wasn't ready or available), or does it indicate the system-under-test behaved incorrectly while executing the test's actual scenario?"* If LLM answers yes with a verbatim excerpt, the signal fires. If no excerpt can be quoted from the inputs, the signal is dropped. - - - -Deterministic post-LLM — LLM never computes scores: -1. LLM outputs matched signal IDs with one verbatim excerpt each. -2. Post-processing: each excerpt must appear verbatim in `trunk_filtered_facts` joined text OR `subagent_b_output.analysis`. Unvalidated signals are dropped in-place; scores recomputed (no retry). -3. `sut_score` = sum of weights for validated SUT signals; `test_score` = sum of weights for validated TEST signals. -4. Classify: any validated INFRA signal → `INFRA`; `sut_score > test_score` → `SUT`; `test_score > sut_score` → `TEST`; equal → tiebreaker. - -**Tiebreaker** (only on tie): -0. `sut_score == 0 AND test_score == 0` → `AMBIGUOUS` immediately. -1. `subagent_b_output.code_mismatch == true` → `AMBIGUOUS` (stale data, cannot trust evidence). -2. 
`trunk_investigation_status == "uninvestigated"` → `AMBIGUOUS`. -3. Any SUT signal excerpt verbatim-matches a SUT trigger string → `SUT`. -4. Default → `AMBIGUOUS`. - -**Confidence rule**: `high` = score margin ≥ 2 OR at least one weight-2 signal on the winning side; `low` = margin = 1 with no weight-2 signal; `none` = no signals matched, or classification is AMBIGUOUS/INFRA. - + +A single Sonnet call. The prompt: + +> You are classifying a flaky test failure. Choose **one** of: +> - **TEST** — the test code introduces non-determinism (timing dependency, shared state, missing cleanup, parallelism without sync, non-deterministic data, ordering assumption, race in test code, hardcoded resources, etc.). +> - **SUT** — the production code under test is incorrect, racy, or not ready when the test exercises it. +> - **INFRA** — an environmental failure unrelated to either the test or the SUT (OOM, disk full, image pull failure, network outage at infra layer). +> - **AMBIGUOUS** — evidence is insufficient, contradictory, or absent. +> +> **Rules:** +> 1. Every classification must be backed by 1–3 verbatim quotes. A quote's `source` is one of: +> - `trunk_facts` — excerpt must appear verbatim in `trunk_filtered_facts`. +> - `code_analysis` — excerpt must appear verbatim in `subagent_b_output.analysis`. +> - `direct_field` — the literal value `"test_code"` or `"production_code"` from `subagent_b_output.suspected_cause_location` (counts as TEST or SUT evidence respectively). +> 2. **Never quote raw test source code, function names, or code comments.** Only the analyzer's *synthesized* `analysis` text counts as code-side evidence. +> 3. If you cannot produce at least one valid quote for the winning side → classify AMBIGUOUS with confidence `none`. +> 4. INFRA requires at least one `trunk_facts` quote. Code analysis alone cannot establish INFRA. +> 5. If `subagent_b_output.code_mismatch == true` → classify AMBIGUOUS (stale stack trace; attribution unsafe). +> 6. 
Confidence: `high` = 2+ corroborating quotes, no contradicting evidence; `low` = 1 quote OR thin/indirect quotes; `none` = no usable evidence (only on AMBIGUOUS). +> 7. For SUT: also produce `sut_description` (one sentence) and `sut_pivot` (file/component/hypothesis — fields may be null). +> 8. `pattern_category` is a short free-form label (≤ 5 words) for diagnostic display — e.g. "timing dependency", "OOM during test", "stale precondition", "production nil deref". Not load-bearing. + +Inputs to inject into the prompt: `trunk_filtered_facts`, `subagent_b_output` (all fields). Bias the model toward AMBIGUOUS when evidence is thin — false TEST classification leads to bogus fixes; AMBIGUOUS just surfaces the case to the user. + ```json @@ -240,37 +241,56 @@ Deterministic post-LLM — LLM never computes scores: "$schema": "phase_3bii_output_v1", "classification": "TEST | SUT | AMBIGUOUS | INFRA", "confidence": "high | low | none", - "sut_score": "number", "test_score": "number", - "sut_signals_matched": ["string"], "test_signals_matched": ["string"], "infra_signals_matched": ["string"], - "evidence": [{ "signal_id": "string", "source": "trunk_fact | code_analysis | stack_trace", "excerpt": "string" }], - "tiebreaker_applied": "boolean", "tiebreaker_step_fired": "number | null", - "rationale": "string", + "rationale": "string (one sentence)", + "pattern_category": "string | null (≤ 5 words; diagnostic label only)", + "evidence": [ + { + "source": "trunk_facts | code_analysis | direct_field", + "excerpt": "string", + "supports": "TEST | SUT | INFRA" + } + ], "sut_description": "string | null", - "sut_pivot": { "file": "string | null", "component": "string | null", "hypothesis": "string | null" }, - "smell_notes": ["string"] + "sut_pivot": { "file": "string | null", "component": "string | null", "hypothesis": "string | null" } } ``` -`sut_pivot`: required (fields may be null) when classification is SUT or AMBIGUOUS; null otherwise. 
+`sut_description` required (non-null) when classification == SUT. `sut_pivot` required (fields may be null) when classification is SUT or AMBIGUOUS; null otherwise. - -- **Transient**: retry immediately with original prompt. -- **Structural** (missing fields, wrong types, invalid enum, `sut_pivot` absent when classification is SUT/AMBIGUOUS): retry with error context. -- Excerpt validation is post-processing, not a retry trigger — drop unvalidated signals and recompute. -- Allow up to 3 total attempts. After 3 failures: set `classification = "AMBIGUOUS"`, `confidence = "none"`, `rationale = "Schema validation failed after 3 attempts"`, and continue to gate logic. - + +Post-call, deterministically: + +1. **Excerpt verification** — for each `evidence` entry: + - `trunk_facts` → search the joined `trunk_filtered_facts` text. Excerpt must appear verbatim. + - `code_analysis` → search `subagent_b_output.analysis`. Excerpt must appear verbatim. + - `direct_field` → excerpt must equal either `"test_code"` or `"production_code"` AND match `subagent_b_output.suspected_cause_location`. + Drop any entry that fails verification. +2. **Consistency** — apply in order: + - `subagent_b_output.code_mismatch == true` → force `classification = AMBIGUOUS`, `confidence = none`. + - After dropping invalid evidence, if no evidence remains supporting the chosen classification → force AMBIGUOUS. + - `classification == INFRA` but zero validated `trunk_facts` quotes → force AMBIGUOUS. + - `classification == SUT` but `sut_description` is null → force AMBIGUOUS. +3. **Schema validation** — transient (empty/null) → retry; structural (missing fields, wrong types, invalid enum) → retry with error context. Up to 3 total attempts. After 3 failures: `classification = AMBIGUOUS`, `confidence = none`, `rationale = "Schema validation failed after 3 attempts"`, `evidence = []`. Continue to gate logic. 
+ -| Classification | `--auto` mode | Interactive mode | -|---|---|---| -| TEST | Continue to 3c | Continue to 3c | -| SUT | Return to queue + JIRA comment | Prompt user (options a/b/c) | -| AMBIGUOUS | Return to queue + JIRA comment | Prompt user with both signal lists | -| INFRA | Return to queue + JIRA comment | Prompt user | +| Classification | `--auto` JIRA mode | Interactive JIRA mode | `--auto` local mode | Interactive local mode | +|---|---|---|---|---| +| TEST | Continue to 3c | Continue to 3c | Continue to 3c | Continue to 3c | +| SUT | Return to queue + JIRA comment | Prompt user (options a/b/c) | Skip test + report | Prompt: (a) skip (b) override to TEST — no JIRA option | +| AMBIGUOUS | Return to queue + JIRA comment | Prompt user with evidence | Skip test + report | Prompt: (a) skip (b) override to TEST | +| INFRA | Return to queue + JIRA comment | Prompt user | Skip test + report | Prompt: (a) skip (b) override to TEST | + +In local mode, "skip" means: no JIRA writes, no fix attempted; include in final summary as SKIPPED with the classification reason. Classification: **SUT** (confidence: {confidence}) -Signals: {sut_signals_matched} +Pattern: {pattern_category} +Rationale: {rationale} + +Evidence: +{for each evidence row: - "{excerpt}" (from {source}, supports {supports})} + {sut_description} This test appears to expose a SUT bug, not a test-code bug. Options: @@ -279,11 +299,39 @@ This test appears to expose a SUT bug, not a test-code bug. Options: (c) Fix the test code AND auto-file a SUT bug ticket (label: sut-bug) -*Option (b) audit trail*: add JIRA comment "Classification overridden to TEST by user. Original: SUT, confidence: {confidence}, signals: {list}." Add commit trailer: `Flakiness-classification: TEST (user override from SUT)`. 
+ +Classification: **AMBIGUOUS** (confidence: {confidence}) +Rationale: {rationale} + +Evidence: +{for each evidence row: - "{excerpt}" (from {source}, supports {supports})} +{if evidence is empty: (no quotable evidence — investigation cannot reliably attribute the failure)} + +Options: +(a) Return this ticket to the queue / skip +(b) Override — treat as TEST and proceed to fix debate (audited) + + + +Classification: **INFRA** (confidence: {confidence}) +Pattern: {pattern_category} +Rationale: {rationale} + +Evidence: +{for each evidence row: - "{excerpt}" (from {source})} + +This test failed for environmental reasons unrelated to the test or SUT. Options: +(a) Return to queue / skip (recommended — re-run when infra is healthy) +(b) Override — treat as TEST and proceed to fix debate (audited) + + +*Option (b) audit trail*: add JIRA comment *"Classification overridden to TEST by user. Original: {classification}, confidence: {confidence}, rationale: {rationale}."* Add commit trailer: `Flakiness-classification: TEST (user override from {classification})`. + +*Option (c)* (SUT only): create a JIRA issue in the same project: summary `SUT bug: {sut_description}`, description includes `sut_pivot` fields and the evidence list, label `sut-bug`. Return the new ticket key to the user. Proceed to 3c treating current ticket as TEST. -*Option (c)*: create a JIRA issue in the same project: summary `SUT bug: {sut_description}`, description includes `sut_pivot` fields, label `sut-bug`. Return the new ticket key to the user. Proceed to 3c treating current ticket as TEST. +For SUT/AMBIGUOUS/INFRA auto-queue returns in **JIRA mode**: follow `_shared-jira-flaky-ops/investigation-comment.md` to write `addCommentToJiraIssue` (OUTCOME = RETURNED_TO_QUEUE). "What was investigated": classification, confidence, pattern category, rationale, evidence quotes. "Hypothesis": `sut_description` if SUT, otherwise N/A. 
"Recommended next step": SUT → investigate `sut_pivot`; AMBIGUOUS → clarify before re-investigating; INFRA → re-run after infra is healthy. Then follow `_shared-jira-flaky-ops/abandon-ticket.md` (unassign + transition to Open). Continue with other issues. -For SUT/AMBIGUOUS/INFRA auto-queue returns: write Investigation Update comment (OUTCOME = RETURNED_TO_QUEUE). "What was investigated": classification, confidence, matched signal IDs, SUT score, TEST score, rationale. "Hypothesis": `sut_description` if SUT, otherwise N/A. "Recommended next step": SUT → investigate `sut_pivot`; AMBIGUOUS → clarify classification before re-investigating; INFRA → check infrastructure. Apply mid-flight abandonment rule (unassign + transition to Open). Continue with other issues. +For SUT/AMBIGUOUS/INFRA in **local mode**: no JIRA writes. Return verdict `SKIPPED` with the classification reason. Continue with other issues. @@ -311,7 +359,7 @@ Synthesizes Subagent A + B output (and `ci_run_evidence` if present); proposes t - **If `ci_run_evidence` is non-null**: include it in the prompt under a clearly labeled section: *"CI run forensic evidence (single run, not aggregated — weigh accordingly):"*. This data is unscored and reflects only one specific failure, so corroborating signals across `trunk_filtered_facts` and code analysis should outweigh isolated CI-run observations when they conflict. If `ci_run_evidence.status == "build_failure"`, note that test-level data is unavailable from this source. - Must explicitly state any approaches excluded due to previous failed attempts. - If any `previous_attempts` entry has a non-null `recommended_next_step`, prepend to prompt: *"Prior investigation ({date}, {outcome}) recommended: '{recommended_next_step}'. Approaches already tried and rejected: {excluded_approaches}. Rejected because: {rejection_reasons}. 
Start from this hypothesis — confirm or refute it with code evidence before proposing anything else."* -- If 3b-ii returned `classification = "SUT"` with user override (option b or c), prepend: *"Note: this was originally classified SUT (signals: {sut_signals_matched}). The SUT hypothesis: {sut_description}. The test fix should defensively address this."* +- If 3b-ii returned `classification = "SUT"` with user override (option b or c), prepend: *"Note: this was originally classified SUT (rationale: {rationale}; pattern: {pattern_category}). The SUT hypothesis: {sut_description}. The test fix should defensively address this."* ```json @@ -342,7 +390,7 @@ Receives both Proposer and Challenger outputs. Decides whether to stop (enough c -Same two classes as 3b (transient → immediate retry; structural → retry with error context). Allow up to **3 total attempts** per role. After 3 failures → hard stop for this issue. Apply mid-flight abandonment rule. Write Investigation Update comment (OUTCOME = ABANDONED): state which debate role failed and include the validation error; recommended next step: re-run and include last raw output verbatim. Continue with other issues. +Same two classes as 3b (transient → immediate retry; structural → retry with error context). Allow up to **3 total attempts** per role. After 3 failures → hard stop for this issue. In **JIRA mode**: follow `_shared-jira-flaky-ops/abandon-ticket.md` then write Investigation Update comment via `_shared-jira-flaky-ops/investigation-comment.md` (OUTCOME = ABANDONED): state which debate role failed and include the validation error; recommended next step: re-run and include last raw output verbatim. In **local mode**: no JIRA writes — return verdict `ABANDONED` with the error. Continue with other issues. @@ -354,9 +402,12 @@ Never surface raw Proposer/Challenger/Arbiter responses to the top-level parent. 
```json { - "key": "KEY-NNN", - "outcome": "PROCEED | INCONCLUSIVE | MISMATCH | SKIP_TOP_LEVEL | RETURNED_TO_QUEUE | ABANDONED", - "trunk_investigation_status": "existing | triggered | uninvestigated | ci_run_only", + "key": "KEY-NNN | null", + "local_id": "local-N | null", + "record_id": "KEY-NNN (jira_key if non-null, else local_id)", + "outcome": "PROCEED | INCONCLUSIVE | MISMATCH | SKIP_TOP_LEVEL | RETURNED_TO_QUEUE | ABANDONED | SKIPPED", + "subagent_calls_made": ["trunk_analyzer | code_analyzer | classifier | proposer | challenger | arbiter"], + "trunk_investigation_status": "existing | triggered | uninvestigated | ci_run_only | user_provided | diagnose_run", "trunk_fact_count": "integer", "trunk_analysis_url": "string | null", "trunk_test_case_url": "string | null", @@ -368,8 +419,10 @@ Never surface raw Proposer/Challenger/Arbiter responses to the top-level parent. "excluded_approaches": ["string (PROCEED only, null otherwise)"], "classifier": { "classification": "TEST | SUT | AMBIGUOUS | INFRA", - "sut_score": "number", "test_score": "number", - "sut_signals_matched": ["string"], "test_signals_matched": ["string"], + "confidence": "high | low | none", + "rationale": "string", + "pattern_category": "string | null", + "evidence": [{ "source": "trunk_facts | code_analysis | direct_field", "excerpt": "string", "supports": "TEST | SUT | INFRA" }], "sut_description": "string | null", "sut_pivot": { "file": "string | null", "component": "string | null", "hypothesis": "string | null" } }, @@ -382,15 +435,38 @@ Never surface raw Proposer/Challenger/Arbiter responses to the top-level parent. } ``` -Outcomes RETURNED_TO_QUEUE, ABANDONED, CLOSED_SUBTEST, and SKIP_TOP_LEVEL are **fully handled within the per-issue subagent** (JIRA comment written, abandonment rule applied) before returning. The parent only records the outcome. 
+Outcomes RETURNED_TO_QUEUE, ABANDONED, CLOSED_SUBTEST, and SKIP_TOP_LEVEL are **fully handled within the per-issue subagent** (JIRA comment written and abandonment rule applied in JIRA modes; no JIRA writes in local mode) before returning. The parent only records the outcome. + +`SKIPPED` is the local-mode equivalent of `RETURNED_TO_QUEUE` — no JIRA writes, included in the final summary with the classification reason. + --- + +**The parent MUST check `subagent_calls_made` on every per-issue return. This check is never skipped — not in `--auto` mode, not for tickets that look obviously simple, not when the verdict appears self-evidently correct. The check is the only thing standing between the multi-agent protocol and a single model rationalizing its way to a confidently-wrong conclusion.** + +Required entries by outcome: + +| Outcome | Required entries in `subagent_calls_made` | +|---------|-------------------------------------------| +| `PROCEED` / `INCONCLUSIVE` | all six: `trunk_analyzer`, `code_analyzer`, `classifier`, `proposer`, `challenger`, `arbiter` | +| `RETURNED_TO_QUEUE` / `SKIPPED` | `trunk_analyzer`, `code_analyzer`, `classifier` (classification ran; debate did not) | +| `MISMATCH` | `trunk_analyzer`, `code_analyzer` (short-circuits at 3c) | +| `SKIP_TOP_LEVEL` | `trunk_analyzer` (short-circuits in 3a) | +| `ABANDONED` | whatever ran before the abandonment trigger; document the trigger in the return | + +If any required entry is missing → **override the outcome to `ABANDONED`**, reason `"protocol skipped — missing: {list of roles}"`. Do not apply the fix. Do not negotiate. Do not accept the verdict on the grounds that the inline reasoning "looks right." + + +--- + Print summary and wait for user confirmation before any files are modified. Skip in `--auto` mode — proceed with all PROCEED verdicts automatically (MISMATCH issues were already resolved above). 
+**JIRA modes** — include Trunk columns: + | Issue | Trunk | Trunk link | Proposed fix location | Verdict | |-------|-------|------------|-----------------------|---------| | KEY-123 | existing (2 facts ≥0.9) / triggered (0 facts ≥0.9) / uninvestigated | [Analysis]({trunk_analysis_url}) or [Test case]({trunk_test_case_url}) | `pkg/foo/bar_test.go:447` | PROCEED / INCONCLUSIVE / SKIP_TOP_LEVEL / MISMATCH | @@ -399,6 +475,14 @@ Outcomes RETURNED_TO_QUEUE, ABANDONED, CLOSED_SUBTEST, and SKIP_TOP_LEVEL are ** - `uninvestigated` — Trunk returned no results within 5 minutes; fix based on code analysis only. - `0 facts ≥0.9` — investigation existed but all facts were below threshold; treated as code-analysis-only. - `MISMATCH` — the innermost failing function from the Trunk stack trace no longer exists in the codebase. + +**Local mode** — drop Trunk columns; use `local_id + test_name` in the Issue column: + +| Issue / Test | Evidence | Proposed fix location | Verdict | +|--------------|----------|----------------------|---------| +| local-1 · TestFoo | user log / none | `pkg/foo/bar_test.go:447` | PROCEED / INCONCLUSIVE / SKIPPED / MISMATCH | + +- `Evidence` = "user log" if `provided_log_text` non-null, else "none". @@ -418,14 +502,15 @@ Apply the user's choice: State explicitly: "Investigation is done. Here's the summary above." Then ask: "Proceed with fixes? Exclude specific issues by listing their keys." -If the user excludes or skips any ticket, apply the mid-flight abandonment rule to it immediately. +If the user excludes or skips any ticket: in JIRA mode, follow `_shared-jira-flaky-ops/abandon-ticket.md` for it immediately. In local mode, no JIRA writes — just exclude from fix list. Write investigation results into `ticket_records` (update `actionable_facts`, `chosen_fix`, outcome fields per ticket). - Any PROCEED verdicts exist → Read [phase4-apply-fix.md](phase4-apply-fix.md) and follow its instructions. 
-- All verdicts are INCONCLUSIVE / MISMATCH / SKIP_TOP_LEVEL / RETURNED_TO_QUEUE → Read [phase6-jira-update.md](phase6-jira-update.md) and follow its instructions. +- All verdicts are INCONCLUSIVE / MISMATCH / SKIP_TOP_LEVEL / RETURNED_TO_QUEUE / SKIPPED (no PROCEED) and `mode = "local"` → Read [phase-final-local.md](phase-final-local.md) and follow its instructions. +- All verdicts are INCONCLUSIVE / MISMATCH / SKIP_TOP_LEVEL / RETURNED_TO_QUEUE (no PROCEED) and `mode ≠ "local"` → Read [phase6-jira-update.md](phase6-jira-update.md) and follow its instructions. diff --git a/tools/test/.agents/skills/backlog-flaky-test-pipeline/references/phase4-apply-fix.md b/tools/test/.agents/skills/backlog-flaky-test-pipeline/references/phase4-apply-fix.md index cbb76120a09..77630a3708d 100644 --- a/tools/test/.agents/skills/backlog-flaky-test-pipeline/references/phase4-apply-fix.md +++ b/tools/test/.agents/skills/backlog-flaky-test-pipeline/references/phase4-apply-fix.md @@ -12,9 +12,9 @@ Apply and verify fixes for all PROCEED issues. Updates `ticket_records` with `ap Ownership re-check before touching any files. -For each PROCEED issue: call `mcp__atlassian__getJiraIssue`, verify assignee matches cached `accountId`. +**Skip entirely in local mode** (`invocation.mode = "local"`) — no JIRA tickets are owned. -If reassigned: report "KEY-NNN is now assigned to {displayName} — reach out before proceeding." Apply mid-flight abandonment rule (unassign + transition to Open + comment). Continue with remaining issues. +For each PROCEED issue (JIRA modes): follow `_shared-jira-flaky-ops/recheck-ownership.md` with `jira_key` and `accountId`. If result is `reassigned`: follow `_shared-jira-flaky-ops/abandon-ticket.md` for that ticket. Continue with remaining issues. 
@@ -50,19 +50,24 @@ If reassigned: report "KEY-NNN is now assigned to {displayName} — reach out be - **Lint finds violations that require changes outside the fix's scope** → set `lint_status = "failed"`, record violation summary in `lint_failure_detail`, return without fixing. - **Lint cannot execute** (binary missing, config error — not a lint violation) → set `lint_status = "skipped"`, record reason in `lint_failure_detail`, return without blocking. -6. **Rerun the test 10 times in independent processes**: +6. **Run the chainlink `diagnose` tool to verify the fix** (10 iterations, parallelized): ```bash - # Go (detected via go.mod presence): - for i in $(seq 10); do go test -race -shuffle=on -run "^{TestName}$" ./{package}/...; done - # Adjust for non-Go projects based on detected language/build tool. + go -C tools/test run . diagnose --ai-output --iterations 10 --parallel-iterations 5 -- --run "^{TestName}$" --race --shuffle=on ./{package}/... ``` + Rules: + - `--ai-output` is mandatory (machine-readable summary) and must appear **before** the `--` separator. + - Harness flags (`--iterations`, `--parallel-iterations`, `--fail-fast-on`) go before `--`. `go test` flags go after `--`. + - Reduce `--parallel-iterations` to `1` if the test holds external resources that don't tolerate concurrent runs (e.g. listens on a fixed port, opens a fixed temp file, or claims an exclusive lock). Otherwise default 5. + - For additional flags: `go -C tools/test run . diagnose -h`. + + Parse the `--ai-output` summary to determine pass count (N out of 10). The verdict logic in step 7 is unchanged. 7. **Record result and return**: - **10/10 pass** → return `{ "verdict": "FIXED", "diff": "" }`. - **< 10/10 pass** → verdict `PARTIAL_FIX`: - Revert: `git restore {file}`. - - Apply mid-flight abandonment rule: unassign, transition to "Open". - - Write Investigation Update comment (OUTCOME = PARTIAL_FIX). "What was investigated": the suspected cause. "Hypothesis": `proposer_root_cause`. 
"What was tried": `fix_description` + attempted diff. "Why it didn't hold": test passed {n}/10 runs + first failure output (truncated to ~500 chars). "Recommended next step": `recommended_next_step` adapted as next direction, or N/A. + - **JIRA mode**: follow `_shared-jira-flaky-ops/abandon-ticket.md` (unassign + transition to Open), then follow `_shared-jira-flaky-ops/investigation-comment.md` to write `addCommentToJiraIssue` (OUTCOME = PARTIAL_FIX). "What was investigated": the suspected cause. "Hypothesis": `proposer_root_cause`. "What was tried": `fix_description` + attempted diff. "Why it didn't hold": test passed {n}/10 runs + first failure output (truncated to ~500 chars). "Recommended next step": `recommended_next_step` adapted as next direction, or N/A. + - **Local mode**: no JIRA writes; include the issue in the final summary as `PARTIAL_FIX (reverted)`. - Return `{ "verdict": "PARTIAL_FIX", "pass_count": N }`. @@ -103,11 +108,10 @@ Record decision in `user_decisions`. In `--auto` mode: automatically choose (a) -Announce verdict for each issue: "Fix results: KEY-1 FIXED, KEY-2 PARTIAL_FIX (reverted and returned to queue)." - -State: "Moving to commit and PR. Please review the fix files before confirming." +Announce verdict for each issue: "Fix results: KEY-1 FIXED, KEY-2 PARTIAL_FIX (reverted and returned to queue)." In local mode, use `local_id` or test name in place of the JIRA key. -Read [phase5-commit-pr.md](phase5-commit-pr.md) and follow its instructions. +- **JIRA mode**: State "Moving to commit and PR. Please review the fix files before confirming." → Read [phase5-commit-pr.md](phase5-commit-pr.md) and follow its instructions. +- **Local mode**: → Read [phase-final-local.md](phase-final-local.md) and follow its instructions. 
diff --git a/tools/test/.agents/skills/backlog-flaky-test-pipeline/references/phase5-commit-pr.md b/tools/test/.agents/skills/backlog-flaky-test-pipeline/references/phase5-commit-pr.md index e92883431a6..b2d06e9eaeb 100644 --- a/tools/test/.agents/skills/backlog-flaky-test-pipeline/references/phase5-commit-pr.md +++ b/tools/test/.agents/skills/backlog-flaky-test-pipeline/references/phase5-commit-pr.md @@ -42,9 +42,9 @@ Commit following the repo's existing message style. -For each FIXED issue: call `mcp__atlassian__getJiraIssue`, confirm assignee still matches cached `accountId`. +For each FIXED issue: follow `_shared-jira-flaky-ops/recheck-ownership.md` with `jira_key` and `accountId`. -If reassigned: pause and report. Do not push until user explicitly confirms. **This gate is never skipped by `--auto`.** +If result is `reassigned`: pause and report. Do not push until user explicitly confirms. **This gate is never skipped by `--auto`.** diff --git a/tools/test/.agents/skills/backlog-flaky-test-pipeline/references/phase6-jira-update.md b/tools/test/.agents/skills/backlog-flaky-test-pipeline/references/phase6-jira-update.md index 81d4c99363e..a55ab847e48 100644 --- a/tools/test/.agents/skills/backlog-flaky-test-pipeline/references/phase6-jira-update.md +++ b/tools/test/.agents/skills/backlog-flaky-test-pipeline/references/phase6-jira-update.md @@ -11,18 +11,19 @@ Write final JIRA comments and transition tickets to their terminal state. Reads -Read [shared-jira-protocol.md](shared-jira-protocol.md) if not already loaded in this session. +Read [../../_shared-jira-flaky-ops/investigation-comment.md](../../_shared-jira-flaky-ops/investigation-comment.md) if not already loaded — needed for Investigation Update comment format. + +Read [../../_shared-jira-flaky-ops/abandon-ticket.md](../../_shared-jira-flaky-ops/abandon-ticket.md) if not already loaded — needed for INCONCLUSIVE abandonment procedure. For each FIXED issue: -1. 
`getTransitionsForJiraIssue` → find "In Review" (aliases: "In Code Review", "Review"). -2. `transitionJiraIssue` → "In Review". -3. `addCommentToJiraIssue` → Investigation Update comment (OUTCOME = FIXED): +1. Follow `_shared-jira-flaky-ops/transition-ticket.md` with `jira_key` and `target = "In Review"`. +2. Follow `_shared-jira-flaky-ops/investigation-comment.md` to write `addCommentToJiraIssue` (OUTCOME = FIXED): - **What was investigated**: the failure mode and root cause in one sentence. - **Hypothesis**: the Proposer's root cause. - - **What was tried**: fix description; PR: {PR URL}; signals matched: {sut_signals_matched + test_signals_matched}; SUT score: {sut_score}, TEST score: {test_score}. If classification was SUT with user override, note it here. + - **What was tried**: fix description; PR: {PR URL}; classification: {classification} ({confidence}); pattern: {pattern_category}; rationale: {rationale}. If classification was SUT with user override, note it here. - **Why it didn't hold**: N/A. - **Recommended next step**: N/A. @@ -30,13 +31,13 @@ For each FIXED issue: For each INCONCLUSIVE issue: -1. `addCommentToJiraIssue` → Investigation Update comment (OUTCOME = INCONCLUSIVE): +1. Follow `_shared-jira-flaky-ops/investigation-comment.md` to write `addCommentToJiraIssue` (OUTCOME = INCONCLUSIVE): - **What was investigated**: the failure mode and what code was analyzed. - **Hypothesis**: the Proposer's root cause. - **What was tried**: the proposed fix if any, otherwise "No fix applied." - **Why it didn't hold**: the Challenger's key objections and the Arbiter's rationale. - **Recommended next step**: a concrete actionable direction derived from the Arbiter's reasoning. -2. Apply the mid-flight abandonment rule (unassign + transition to "Open"). +2. Follow `_shared-jira-flaky-ops/abandon-ticket.md` (unassign + transition to "Open"). 
diff --git a/tools/test/.agents/skills/backlog-flaky-test-pipeline/references/shared-jira-protocol.md b/tools/test/.agents/skills/backlog-flaky-test-pipeline/references/shared-jira-protocol.md deleted file mode 100644 index 124d39623ce..00000000000 --- a/tools/test/.agents/skills/backlog-flaky-test-pipeline/references/shared-jira-protocol.md +++ /dev/null @@ -1,62 +0,0 @@ ---- -name: shared-jira-protocol -description: Investigation Update comment format, mid-flight abandonment rule, and previous-attempt parsing. Read by all phases that write JIRA comments. ---- - - - - -Every `addCommentToJiraIssue` call in this skill uses the structure below. Always include all five sections — write `N/A` for any that don't apply. **Style: concise and matter-of-fact. 1–3 sentences per section. No narrative padding, no hedge words.** - -```markdown -## Investigation Update — {OUTCOME} · {YYYY-MM-DD} - -**Outcome**: {OUTCOME} -**Investigator**: {display name from atlassianUserInfo} -**Classification**: {TEST | SUT | AMBIGUOUS | INFRA} (confidence: {high | low | none}) | N/A - -### What was investigated -{The failure mode and where analysis focused.} - -### Hypothesis -{The proposed root cause, or N/A.} - -### What was tried -{The fix or approach applied or proposed, or N/A.} - -### Why it didn't hold -{Objections, test results, or reason the fix was rejected or reverted — or N/A.} - -### Recommended next step -{Concrete actionable direction for the next investigator, or N/A.} -``` - -**Outcome values**: `INCONCLUSIVE` (debate unresolved), `PARTIAL_FIX` (fix applied, tests still failed, reverted), `FIXED` (fix verified, PR created), `RETURNED_TO_QUEUE` (SUT/AMBIGUOUS/INFRA classification), `CLOSED_SUBTEST` (failure in t.Run subtest, not top-level), `ABANDONED` (mid-flight stop for any reason). - - - -If the user cancels, skips, or stops working on a JIRA ticket at **any** point after it was claimed — regardless of reason — you **must** (never skip): - -1. 
`mcp__atlassian__editJiraIssue` → unassign the issue (set assignee to null). -2. `mcp__atlassian__getTransitionsForJiraIssue` → find "Open". -3. `mcp__atlassian__transitionJiraIssue` → transition back to "Open". -4. `mcp__atlassian__addCommentToJiraIssue` → write an Investigation Update comment (OUTCOME = ABANDONED). "What was investigated": state the reason work stopped. All other sections: N/A. - -Applies when: user says "skip this one", verdict is INCONCLUSIVE, PARTIAL_FIX is reverted, ownership conflict detected, or user ends the session early. Do **not** leave claimed tickets in "In Progress" with no assignee action. - - - -Scan JIRA comments for `## Investigation Update — {OUTCOME}`. For each match extract: - -- `outcome`: the OUTCOME token from the heading -- `date`: the date after `·` -- `full_text`: the full comment text -- `excluded_approaches`: content of `### What was tried` (skip if "N/A") -- `rejection_reasons`: content of `### Why it didn't hold` (skip if "N/A") -- `recommended_next_step`: content of `### Recommended next step` (null if "N/A") -- `summary`: content of `### What was investigated` - -Fall back to keyword scanning for non-standard-format comments. - - -