From b2a035ebc6fefe007a36dd9b2595dfb26178569c Mon Sep 17 00:00:00 2001
From: Seonmyeong Bak <sbak@nvidia.com>
Date: Wed, 22 Apr 2026 11:23:47 -0700
Subject: [PATCH 01/21] feat(skills): add nvrx-attr skill bundle

---
 src/nvidia_resiliency_ext/skills/__init__.py  |   1 +
 .../nvrx-attr/SESSION_REPORT_20260409_13.md   | 137 +++++++
 .../skills/nvrx-attr/SKILL.md                 |  52 +++
 .../nvrx-attr/fault-injection-loop/SKILL.md   | 336 ++++++++++++++++
 .../skills/nvrx-attr/fr-analysis/SKILL.md     | 113 ++++++
 .../fr-analysis/scripts/fr_attribution.py     |   1 +
 .../skills/nvrx-attr/l4_gb200_reduced.sh      | 363 +++++++++++++++++
 .../skills/nvrx-attr/log-analysis/SKILL.md    | 112 ++++++
 .../log-analysis/scripts/nvrx_logsage.py      |   1 +
 .../nvrx-attr/scripts/l4_gb200_reduced.sh     | 362 +++++++++++++++++
 .../nvrx-attr/scripts/n3_super_gb200.sh       | 166 ++++++++
 .../scripts/n3_super_gb200_shm_test.sh        | 369 ++++++++++++++++++
 .../scripts/pools/n3_super_8n_16n.pool        |  40 ++
 .../nvrx-attr/scripts/prepare_node_alloc.sh   | 209 ++++++++++
 .../skills/nvrx-attr/scripts/run_session.sh   |  39 ++
 .../nvrx-attr/scripts/score_attribution.py    | 237 +++++++++++
 .../nvrx-attr/scripts/watch_and_analyze.sh    | 202 ++++++++++
 .../skills/nvrx-attr/scripts/workloads.conf   |  17 +
 18 files changed, 2757 insertions(+)
 create mode 100644 src/nvidia_resiliency_ext/skills/__init__.py
 create mode 100644 src/nvidia_resiliency_ext/skills/nvrx-attr/SESSION_REPORT_20260409_13.md
 create mode 100644 src/nvidia_resiliency_ext/skills/nvrx-attr/SKILL.md
 create mode 100644 src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md
 create mode 100644 src/nvidia_resiliency_ext/skills/nvrx-attr/fr-analysis/SKILL.md
 create mode 120000 src/nvidia_resiliency_ext/skills/nvrx-attr/fr-analysis/scripts/fr_attribution.py
 create mode 100644 src/nvidia_resiliency_ext/skills/nvrx-attr/l4_gb200_reduced.sh
 create mode 100644 src/nvidia_resiliency_ext/skills/nvrx-attr/log-analysis/SKILL.md
 create mode 120000 src/nvidia_resiliency_ext/skills/nvrx-attr/log-analysis/scripts/nvrx_logsage.py
 create mode 100644 src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/l4_gb200_reduced.sh
 create mode 100644 src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/n3_super_gb200.sh
 create mode 100755 src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/n3_super_gb200_shm_test.sh
 create mode 100644 src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/pools/n3_super_8n_16n.pool
 create mode 100755 src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/prepare_node_alloc.sh
 create mode 100755 src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/run_session.sh
 create mode 100644 src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/score_attribution.py
 create mode 100755 src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/watch_and_analyze.sh
 create mode 100644 src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/workloads.conf

diff --git a/src/nvidia_resiliency_ext/skills/__init__.py b/src/nvidia_resiliency_ext/skills/__init__.py
new file mode 100644
index 00000000..1670aafe
--- /dev/null
+++ b/src/nvidia_resiliency_ext/skills/__init__.py
@@ -0,0 +1 @@
+"""Agent skills bundled with nvidia_resiliency_ext."""
diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/SESSION_REPORT_20260409_13.md b/src/nvidia_resiliency_ext/skills/nvrx-attr/SESSION_REPORT_20260409_13.md
new file mode 100644
index 00000000..657cbbd5
--- /dev/null
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/SESSION_REPORT_20260409_13.md
@@ -0,0 +1,137 @@
+# Fault Injection Session Report — April 9–13, 2026
+
+## Summary
+
+End-to-end validation of the fault-injection attribution pipeline across 48 experiments.
+Identified and fixed three pipeline bugs, confirmed FR analysis is solid, and isolated the
+remaining attribution gap to a single issue: **logsage returns RESTART IMMEDIATE for
+crash/exception-type faults that should be STOP**.
+
+---
+
+## Pipeline Fixes Applied
+
+| File | Fix |
+|---|---|
+| `trace_analyzer/capture.py` | `capture_logs()` now saves/restores logger level and lowers it to INFO — previously, root logger at WARNING silently dropped all `logger.info()` calls inside the capture block, producing empty `analysis_text` from `CollectiveAnalyzer` |
+| `trace_analyzer/fr_attribution.py` | `main()` now prints `analysis_text` + `hanging_ranks` to stdout (was discarding results) |
+| `scripts/watch_and_analyze.sh` | FR inline Python block: import from installed package (not local skill copy), correctly extract `analysis_text`/`hanging_ranks` from returned dict, redirect stderr to `/dev/null` instead of mixing into FR output |
+| `scripts/score_attribution.py` | **New file** — LLM judge (Claude Sonnet) that scores 5 attribution dimensions per experiment and returns structured JSON |
+
+---
+
+## Experiment Sessions
+
+### Session 1 — Mini-batch validation (Apr 9, `20260409_160245`)
+
+6 experiments: GPU_SLEEP×2, GPU_ERROR×2, SIGKILL×1, SIGTERM×1 — all 2-node.
+Purpose: confirm pipeline works end-to-end after fixes.
+
+| # | FAULT_TYPE | RANK | restart | rank_p | rank_a | fault_d | fr_rank |
+|---|---|---|---|---|---|---|---|
+| 1 | GPU_SLEEP | 1 | ✅ | ✅ | ✅ | ✅ | ✅ |
+| 2 | GPU_SLEEP | 0 | ✅ | ✅ | ✅ | partial | ✅ |
+| 3 | GPU_ERROR | 1 | ❌ | ❌ | ❌ | partial | ✅ |
+| 4 | GPU_ERROR | 0 | ❌ | ❌ | ❌ | partial | ✅ |
+| 5 | SIGKILL | 1 | ❌ | ✅ | ✅ | partial | ✅ |
+| 6 | SIGTERM | 1 | ✅ | ❌ | ❌ | partial | ✅ |
+
+FR analysis: 6/6 correct. Pipeline confirmed working.
+
+---
+
+### Session 2 — Full default pool (Apr 9, `20260409_170603`)
+
+34 experiments across all fault types and node counts (2/4/8 nodes).
+
+**Infrastructure issue:** 18/34 jobs failed at container startup due to a pyxis/enroot
+`nvidia-container-cli ldcache` error on certain compute nodes:
+
+```
+nvidia-container-cli: ldcache error: process /usr/sbin/ldconfig.real failed with error code: 1
+[ERROR] /etc/enroot/hooks.d/98-nvidia.sh exited with return code 1
+pyxis: couldn't start container
+rm: cannot remove '/usr/local/cuda/compat/lib': Read-only file system
+```
+
+The CUDA compat overlay was not being applied on those nodes — `ldconfig` could not write its
+cache inside the read-only squashfs container. These jobs produced no FR dumps and their logs
+contained only the container error, which logsage misattributed as a disk/storage fault.
+The issue was transient and node-specific; jobs submitted the next day ran cleanly.
+
+**Clean-run results (16/34):** see full table in
+`/home/sbak/experiments/llama4-scout-gb200/fault_injection/20260409_170603/experiments_report.md`
+
+Aggregate for clean-run jobs:
+
+| FAULT_TYPE | N (clean) | restart% | rank_primary% | fr_rank% |
+|---|---|---|---|---|
+| GPU_SLEEP | 5 | 80% | 40% | 60% |
+| GPU_ERROR | 4 | 0% | 25% | 75% |
+| SIGKILL | 3 | 33% | 33% | 100% |
+| OS_ABORT | 1 | 0% | 0% | 100% |
+
+---
+
+### Session 3 — SEGFAULT cluster health check (Apr 10, `20260410_135216`)
+
+2 experiments: SEGFAULT rank=0 and rank=1, 2-node. Purpose: confirm cluster healthy after
+the Apr 9 enroot issue.
+
+| # | FAULT_TYPE | RANK | restart | rank_p | rank_a | fault_d | fr_rank |
+|---|---|---|---|---|---|---|---|
+| 1 | SEGFAULT | 1 | ❌ | ✅ | ✅ | ✅ | ✅ |
+| 2 | SEGFAULT | 0 | ❌ | ✅ | ✅ | ✅ | ✅ |
+
+Cluster healthy (both COMPLETED, 7 FR dumps each). Rank and fault description correct;
+restart decision wrong (RESTART instead of STOP).
+
+---
+
+### Session 4 — Python fault types (Apr 10, `20260410_143501`)
+
+4 experiments: LOCK_GIL×2, WORKLOAD_EXC×1, ASYNC_EXC×1 — all 2-node.
+These were skipped in the full session due to the enroot issue.
+
+| # | FAULT_TYPE | RANK | restart | rank_p | rank_a | fault_d | fr_rank |
+|---|---|---|---|---|---|---|---|
+| 1 | LOCK_GIL | 1 | ✅ | ✅ | ✅ | partial | ✅ |
+| 2 | LOCK_GIL | 0 | ✅ | ✅ | ✅ | partial | ✅ |
+| 3 | WORKLOAD_EXC | 1 | ❌ | ✅ | ✅ | partial | ❌ (rank 7) |
+| 4 | ASYNC_EXC | 1 | ❌ | ❌ | ❌ | false | ✅ |
+
+Note on WORKLOAD_EXC FR result: FR flagged rank 7 instead of rank 1. When a rank throws an
+application exception and crashes, the last rank detected as missing by NCCL's collective
+timeout isn't necessarily the originating rank — FR is identifying the symptom rank.
+
+---
+
+## Attribution Quality Summary (clean runs only)
+
+| Dimension | Assessment |
+|---|---|
+| **FR rank identification** | Solid — correctly identified the hanging rank in all clean-run experiments where NCCL completed enough to produce dumps. The `capture_logs()` fix was the key enabler. |
+| **Log rank identification** | Good for hang types (GPU_SLEEP, LOCK_GIL); weaker for crash/signal types where all ranks see a simultaneous NCCL timeout masking the originator. FR compensates for this gap. |
+| **Restart decision** | ✅ Correct for hang/recoverable types: GPU_SLEEP, LOCK_GIL, SIGTERM. ❌ Wrong for crash/exception types: GPU_ERROR, SIGKILL, SEGFAULT, WORKLOAD_EXC, ASYNC_EXC — logsage consistently returns RESTART IMMEDIATE when the correct decision is STOP. |
+| **Fault description** | Consistently `partial` — logsage describes the observable NCCL collective timeout symptom, not the underlying injected fault (GPU hang, kill signal, exception). This is expected given the log contains only symptoms. |
+
+---
+
+## Open Gap
+
+**Single actionable fix:** logsage restart decision for crash/exception-type faults.
+
+Logsage sees the same NCCL collective timeout pattern whether the root cause is a recoverable
+GPU hang or a hard crash (SIGKILL, SEGFAULT, CUDA error, application exception). It needs
+keyword-based fast-path rules to detect crash signals before the LLM runs:
+
+| Fault type | Expected | Currently returns |
+|---|---|---|
+| GPU_ERROR | STOP | RESTART IMMEDIATE |
+| SIGKILL | STOP | RESTART IMMEDIATE |
+| SEGFAULT | STOP | RESTART IMMEDIATE |
+| WORKLOAD_EXC | STOP | RESTART IMMEDIATE |
+| ASYNC_EXC | STOP | RESTART IMMEDIATE |
+| OS_ABORT | STOP | RESTART IMMEDIATE |
+
+Target file: `attribution/log_analyzer/nvrx_logsage.py`
diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/SKILL.md b/src/nvidia_resiliency_ext/skills/nvrx-attr/SKILL.md
new file mode 100644
index 00000000..6884f96f
--- /dev/null
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/SKILL.md
@@ -0,0 +1,52 @@
+---
+name: nvrx-attr
+description: >
+  Orchestration layer over nvidia_resiliency_ext attribution modules. Provides
+  log-analysis, fr-analysis, and a Megatron-LM-oriented fault-injection feedback
+  loop for benchmarking attribution quality on SLURM workloads.
+compatibility: Requires Python 3.10+, nvidia-resiliency-ext installed, logsage, langchain-openai, and NVIDIA_API_KEY (env var, NVIDIA_API_KEY_FILE, or ~/.nvidia_api_key). The fault-injection loop has only been validated with Megatron-LM workloads.
+metadata:
+  author: nvidia
+---
+
+# Attribution Skills
+
+High-level orchestration layer over the `nvidia_resiliency_ext.attribution` modules.
+Each subdirectory is a self-contained skill with its own `SKILL.md` and helper scripts.
+
+## Skills
+
+| Directory | Purpose | Entry point |
+|-----------|---------|------------|
+| [`log-analysis/`](./log-analysis/SKILL.md) | Analyze SLURM job logs for failure root-cause and restart decisions | `NVRxLogAnalyzer` (`nvrx_logsage.py`) |
+| [`fr-analysis/`](./fr-analysis/SKILL.md) | Analyze NCCL flight-recorder dumps for collective-hang root-cause | `CollectiveAnalyzer` (`fr_attribution.py`) |
+| [`fault-injection-loop/`](./fault-injection-loop/SKILL.md) | Run a batched SLURM fault-injection feedback loop and score attribution accuracy | `prepare_node_alloc.sh` / `watch_and_analyze.sh` |
+
+## How skills relate to the library
+
+```
+src/nvidia_resiliency_ext/
+├── attribution/
+│   ├── log_analyzer/nvrx_logsage.py      ← log-analysis implementation
+│   ├── trace_analyzer/fr_attribution.py  ← fr-analysis implementation
+│   ├── analyzer/engine.py                ← combined orchestration entry point
+│   └── combined_log_fr/                  ← optional log + FR fusion
+└── skills/
+    └── nvrx-attr/                        ← this skill bundle
+        ├── log-analysis/
+        ├── fr-analysis/
+        └── fault-injection-loop/
+```
+
+The `Analyzer` (`analyzer/engine.py`) is the recommended entry point when you need
+request coalescing, result caching, or the combined `LOG_AND_TRACE` pipeline.
+Use the individual skills when you want to run one analysis type directly without the
+full coalescing stack.
+
+## Common prerequisites
+
+- `NVIDIA_API_KEY` environment variable, `NVIDIA_API_KEY_FILE`, or `~/.nvidia_api_key`
+- `langchain-openai` installed
+- `logsage` package installed (required by the `log-analysis` skill)
+- Package installed: `pip install nvidia-resiliency-ext` or `pip install -e .` from repo root
+- The fault-injection loop has only been validated with Megatron-LM training scripts
diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md b/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md
new file mode 100644
index 00000000..abec6a91
--- /dev/null
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md
@@ -0,0 +1,336 @@
+---
+name: fault-injection-loop
+description: >
+  Closed-loop fault injection and attribution accuracy benchmark. Draws from a
+  prioritized pool of (fault_type, rank, iter, nodes) experiments and submits them
+  2 at a time via sbatch — waiting for each pair to finish before submitting the
+  next — to bound filesystem load. GPU-related faults are front-loaded in the pool.
+  After all jobs complete, runs /log-analysis and /fr-analysis on every experiment,
+  scores attribution vs. ground truth, aggregates gaps, and iterates on attribution
+  modules to close them.
+compatibility: Requires SLURM cluster access, sbatch, NVIDIA_API_KEY, langchain-openai, logsage, and nvidia-resiliency-ext installed. This workflow has only been validated with Megatron-LM workloads.
+metadata:
+  author: nvidia
+  sub-skills: [log-analysis, fr-analysis]
+---
+
+# Skill: fault-injection-loop
+
+Iterative closed-loop skill that runs a prioritized fault-injection experiment pool
+2 jobs at a time, analyzes every artifact, scores attribution accuracy, aggregates
+gaps across the matrix, and proposes targeted improvements to attribution modules.
+
+---
+
+## Overview
+
+```
+┌───────────────────────────────────────────────────────────────────────┐
+│  0. POOL     → build ordered pool of (fault_type, rank, iter, nodes)  │
+│               GPU faults first, then crash, Python-hang, signal       │
+│                                                                        │
+│  repeat until pool exhausted:                                          │
+│  1. SUBMIT   → sbatch 2 jobs from pool head                            │
+│  2. WAIT     → poll until both jobs leave RUNNING/PENDING              │
+│                                                                        │
+│  after all jobs done:                                                  │
+│  3. ANALYZE  → watch_and_analyze.sh: /log-analysis + /fr-analysis     │
+│               per completed job, streaming as jobs finish              │
+│  4. SCORE    → compare attribution output vs injected ground truth     │
+│  5. AGGREGATE→ build results table; identify systematic failure modes  │
+│  6. IMPROVE  → patch log_analyzer/nvrx_logsage.py                     │
+│  7. LOOP     → re-run same pool with updated attribution code          │
+└───────────────────────────────────────────────────────────────────────┘
+```
+
+---
+
+## Step 0 — Fault Pool Design
+
+The pool is defined as an ordered list of `FAULT_TYPE:RANK:ITER:NODES` entries
+inside `scripts/prepare_node_alloc.sh`. Default pool (34 experiments, 17 batches):
+
+```
+# GPU hangs — highest priority; full rank sweep across all node counts
+GPU_SLEEP:1:5:2   GPU_SLEEP:0:5:2      # 2-node: rank-1, rank-0
+GPU_SLEEP:4:5:2   GPU_SLEEP:7:5:2      # 2-node: mid-rank, last-rank
+GPU_SLEEP:1:5:4   GPU_SLEEP:0:5:4      # 4-node: rank-1, rank-0
+GPU_SLEEP:8:5:4   GPU_SLEEP:15:5:4     # 4-node: mid, last
+GPU_SLEEP:1:5:8   GPU_SLEEP:0:5:8      # 8-node: rank-1, rank-0
+GPU_SLEEP:16:5:8  GPU_SLEEP:31:5:8     # 8-node: mid, last
+
+# GPU errors — high priority; rank-0 and rank-1 across all node counts
+GPU_ERROR:1:5:2   GPU_ERROR:0:5:2
+GPU_ERROR:1:5:4   GPU_ERROR:0:5:4
+GPU_ERROR:1:5:8   GPU_ERROR:0:5:8
+
+# Crash faults
+SIGKILL:1:5:2     SIGKILL:0:5:2
+SIGKILL:1:5:4     SIGKILL:1:5:8
+SEGFAULT:1:5:2    SEGFAULT:0:5:2
+SEGFAULT:1:5:4    OS_ABORT:1:5:2
+
+# Python-level faults — GIL hang and raised exceptions
+LOCK_GIL:1:5:2    LOCK_GIL:0:5:2
+WORKLOAD_EXC:1:5:2  ASYNC_EXC:1:5:2
+
+# Signals
+SIGTERM:1:5:2     SIGINT:1:5:2
+SIGSTOP:1:5:2     SIGNAL_EXC:1:5:2
+```
+
+Rank coverage per node count (4 GPUs/node):
+
+| Nodes | Total ranks | rank-0 | rank-1 | mid | last |
+|-------|-------------|--------|--------|-----|------|
+| 2     | 8           | 0      | 1      | 4   | 7    |
+| 4     | 16          | 0      | 1      | 8   | 15   |
+| 8     | 32          | 0      | 1      | 16  | 31   |
+
+To run a custom subset, override `POOL` before calling the script:
+```bash
+POOL="GPU_SLEEP:0:5:2 GPU_SLEEP:1:5:2" bash scripts/prepare_node_alloc.sh
+```
+
+Environment variables:
+
+| Variable | Default | Description |
+|---|---|---|
+| `WORKLOAD` | `llama4_scout` | Select a registered workload by name (see `scripts/workloads.conf`) |
+| `ACCOUNT` | `root` | SLURM account |
+| `PARTITION` | `gb-nvl-134-135` | SLURM partition |
+| `GPUS_PER_NODE` | `4` | GPUs per node |
+| `TIME` | `00:30:00` | Per-job wall-clock limit |
+| `BATCH_SIZE` | `2` | Jobs submitted per round |
+| `POLL_INTERVAL` | `30` | Seconds between queue polls |
+| `BASE_EXPERIMENTS_DIR` | _(from workloads.conf or `llama4-scout-gb200`)_ | Root for all output |
+| `SBATCH_SCRIPT` | `scripts/l4_gb200_reduced.sh` | Job script to submit |
+| `POOL` | _(default pool above)_ | Space-separated `FAULT_TYPE:RANK:ITER:NODES` entries |
+
+### Registered workloads (`scripts/workloads.conf`)
+
+| Name | Script | Base dir | Description |
+|---|---|---|---|
+| `llama4_scout` | `l4_gb200_reduced.sh` | `.../llama4-scout-gb200` | Llama4-Scout (reduced layers) on GB200 |
+
+```bash
+# Run the full pool against the validated example workload
+bash scripts/prepare_node_alloc.sh
+
+# Run a custom subset against llama4_scout
+POOL="GPU_SLEEP:1:5:2 SIGKILL:1:5:2" WORKLOAD=llama4_scout bash scripts/prepare_node_alloc.sh
+```
+
+---
+
+## Step 1 & 2 — Batched Submission + Wait (automated)
+
+```bash
+bash scripts/prepare_node_alloc.sh
+```
+
+The script loops: submit 2 jobs → poll `squeue` every 30 s until both finish →
+submit next 2. Progress is printed inline:
+
+```
+>>> Batch 1: experiments 1–2 of 34
+  submitted: GPU_SLEEP rank=1  iter=5 nodes=2 -> job=1850
+  submitted: GPU_SLEEP rank=0  iter=5 nodes=2 -> job=1851
+  waiting for GPU_SLEEP:1:5:2 GPU_SLEEP:0:5:2 (1850,1851) ... 30s 60s done.
+>>> Batch 2: experiments 3–4 of 34
+  ...
+```
+
+A session directory and TSV tracking file are created at launch time:
+```
+${BASE_EXPERIMENTS_DIR}/fault_injection/<YYYYMMDD_HHMMSS>/
+  experiments.tsv                              ← tracking file (all job IDs + paths)
+  n<N>_<FAULT>_r<R>_i<I>/                     ← one subdir per experiment
+    logs/slurm/<JOB_ID>.launch.out
+    logs/slurm/<JOB_ID>.*.1.main_workload.log  ← log-analysis input
+    checkpoints/                               ← fr-analysis input (FR dumps)
+    tensorboard/
+  experiments_report.md                        ← generated by watch_and_analyze.sh
+```
+
+Tracking file columns: `JOB_ID  FAULT_TYPE  RANK  ITER  NODES  EXPERIMENT_DIR`
+
+---
+
+## Step 3 — Analyze All Experiments
+
+Run the watcher/analyzer — it reads the tracking file and processes each experiment
+as its job state leaves RUNNING/PENDING (works whether jobs are still running or
+already done):
+
+```bash
+bash scripts/watch_and_analyze.sh \
+    ${BASE_EXPERIMENTS_DIR}/fault_injection/<YYYYMMDD_HHMMSS>/experiments.tsv
+```
+
+The watcher:
+1. Reads each row from the tracking TSV
+2. Calls `nvrx_logsage.py --exclude_nvrx_logs` and parses the text output to get
+   `restart_decision` and `attribution_text`
+3. Calls `CollectiveAnalyzer` from `fr_attribution.py` to get suspect ranks
+4. Scores 5 dimensions (restart correctness, rank primary, rank any, fault description, FR rank)
+5. Appends a scored row to `experiments_report.md` in the session directory
+6. Repeats until all experiments are analyzed
+
+To also run the sub-skills interactively for a single experiment:
+```bash
+/log-analysis --log-path "${EXPERIMENT_DIR}/logs/slurm/${JOB_ID}.*.1.main_workload.log"
+/fr-analysis  --fr-path  "${EXPERIMENT_DIR}/checkpoints/"
+```
+
+---
+
+## Step 4 — Score Each Experiment
+
+Scoring is performed by `scripts/score_attribution.py`, an LLM judge (Sonnet or Opus) that
+receives the ground truth, the filtered raw log, the logsage attribution output, and the FR
+analysis output, then returns structured JSON scores with a reasoning note.
+
+| Column | Values | Meaning |
+|---|---|---|
+| **restart_correct** | `true` / `false` / `N/A` | Restart decision matches expected for this fault type |
+| **rank_primary** | `true` / `false` / `partial` | Injected rank is the primary root-cause in attribution |
+| **rank_any** | `true` / `false` | Injected rank mentioned anywhere in attribution |
+| **fault_described** | `true` / `false` / `partial` | Fault nature (hang/crash/signal/exception) correctly described |
+| **fr_rank_correct** | `true` / `false` / `no_dumps` | FR analysis identifies injected rank as suspect |
+| **judge_notes** | string | One-sentence summary of the main gap or confirmation |
+
+The judge is given:
+1. Ground truth: `fault_type`, `rank`, `iter`, `nodes`
+2. Expected restart decision + rationale (derived from `score_attribution.py:_RESTART_TABLE`)
+3. Filtered raw log (last 400 lines, same `exclude_nvrx_logs` filtering as logsage)
+4. Raw logsage stdout (5-field text format)
+5. Raw CollectiveAnalyzer text output
+
+Default judge model: `azure/anthropic/claude-sonnet-4-6`. Override with `--model` in `score_attribution.py`.
+
+---
+
+## Step 5 — Aggregate Results
+
+The report markdown table from `watch_and_analyze.sh` gives a matrix view. Look for
+patterns across rows:
+
+```
+| FAULT_TYPE | NODES | RANK | restart_correct | rank_primary | rank_any | fault_described | fr_rank_correct | judge_notes |
+|------------|-------|------|-----------------|--------------|----------|-----------------|-----------------|-------------|
+| GPU_SLEEP  |   2   |  0   |      true       |    false     |   true   |      true       |      true       | rank-0 identified only in secondary issues |
+| GPU_SLEEP  |   2   |  1   |      true       |     true     |   true   |      true       |      true       | correct on all dimensions |
+| GPU_ERROR  |   2   |  1   |      false      |    false     |  false   |     partial     |      true       | LLM issued RESTART; rank not mentioned |
+| SIGKILL    |   2   |  0   |      true       |    false     |  false   |     false       |      true       | attribution describes timeout not kill signal |
+```
+
+Common failure mode patterns and their meaning:
+
+| Pattern | Interpretation |
+|---|---|
+| `rank_primary=false`, `rank_any=true` | Rank detected but treated as collateral; logsage putting it in secondary issues |
+| `rank_any=false` for rank-0 | Rank-0 hang silences watchdog on other ranks; logsage lacks rank-0 signal |
+| `fault_described=partial` for crash types | Crash keywords present but fault type not specifically named |
+| `restart_correct=false` for GPU_ERROR | LLM conflating hardware error with recoverable hang |
+| `fr_rank_correct=no_dumps` | NCCL watchdog did not fire before job ended — adjust `TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC` |
+
+---
+
+## Step 6 — Identify and Apply Improvements
+
+### FR analysis
+Deterministic graph algorithm — **do not modify automatically**.
+Note misidentifications and escalate to the team.
+
+### Log analysis (safe to modify)
+
+| Observation | Target location | Suggested fix |
+|---|---|---|
+| Wrong restart for hang | `nvrx_logsage.py` fast-path | Strengthen NCCL timeout → `RESTART IMMEDIATE` mapping |
+| Missing rank in attr text | `nvrx_logsage.py` prompt | Extract rank from NCCL watchdog lines; add regex |
+| Crash misclassified as hang | `nvrx_logsage.py` | Add SIGKILL/SEGFAULT/GPU_ERROR keyword patterns |
+| `ERRORS NOT FOUND` when errors exist | `return_application_errors` config | Loosen error extraction filter |
+| rank-0 not detected | prompt or fast-path | Add explicit rank-0 hang heuristic (other ranks silent) |
+| attr off by many iters | prompt context | Increase weight of iteration-stamped log lines |
+| LLM wrong on GPU_ERROR | prompt | Distinguish `cudaError` → crash from NCCL timeout → hang |
+
+Editable file: `attribution/log_analyzer/nvrx_logsage.py`
+
+After each patch, re-run the same pool subset that previously failed:
+```bash
+POOL="GPU_SLEEP:0:5:2 GPU_ERROR:1:5:2" bash scripts/prepare_node_alloc.sh
+```
+
+---
+
+## Step 7 — Loop
+
+Increment experiment counter. Suggested sweep order across code-change iterations:
+
+1. **Iteration 1**: full default pool (34 experiments)
+2. **Iteration 2**: targeted re-run of all failing cells from iteration 1
+3. **Iteration 3**: expand iter dimension (FAULT_AT_ITER=2 and 10) for remaining gaps
+4. **Iteration 4**: add SEGFAULT and LOCK_GIL 4-node/8-node coverage
+
+Stop condition: all cells pass all five scoring dimensions (Step 4) for two consecutive
+code-change iterations.
+
+---
+
+## Adapting A SLURM Script For The Feedback Loop
+
+The feedback loop is not tied to `l4_gb200_reduced.sh`, but your sbatch script must
+match a small contract so the loop can submit, analyze, and score each run.
+
+Required changes for a custom workload script:
+
+1. Accept these exported variables from `prepare_node_alloc.sh`:
+   `FAULT_TYPE`, `FAULT_RANK`, `FAULT_AT_ITER`, `EXPERIMENT_DIR`, `BASE_EXPERIMENTS_DIR`,
+   and `GPUS_PER_NODE`.
+2. Write the main training log to:
+   `${EXPERIMENT_DIR}/logs/slurm/${SLURM_JOB_ID}.*.1.main_workload.log`
+   so `watch_and_analyze.sh` can find it.
+3. Write NCCL flight-recorder dumps under `${EXPERIMENT_DIR}/checkpoints/`.
+4. Emit a `[MEGATRON_FAULT] ...` marker when the fault is injected.
+   `watch_and_analyze.sh` uses this to decide whether the run reached the injection point.
+5. Preserve the per-experiment directory layout:
+   `logs/slurm/`, `checkpoints/`, and `tensorboard/`.
+
+This has only been validated with Megatron-LM because the current run-valid check and
+fault markers depend on Megatron's `debug_fault_injection.py` behavior. If you adapt the
+loop to another framework, update both the sbatch script and `watch_and_analyze.sh`.
+
+## Appendix A: SBATCH_SCRIPT fault parameters
+
+The example `SBATCH_SCRIPT` reads these env vars from `prepare_node_alloc.sh` via `--export`:
+
+| Variable | Default | Description |
+|---|---|---|
+| `FAULT_AT_ITER` | `5` | Training iteration at which to inject |
+| `FAULT_RANK` | `1` | Global rank to inject the fault into; must be in `[0, total_ranks)` |
+| `FAULT_TYPE` | `GPU_SLEEP` | Megatron fault type enum name |
+| `GPUS_PER_NODE` | `4` | GPUs per node (used to compute `TOTAL_TASKS`) |
+| `EXPERIMENT_DIR` | `${BASE_EXPERIMENTS_DIR}/fault_injection/n${SLURM_NNODES}_${FAULT_TYPE}_r${FAULT_RANK}_i${FAULT_AT_ITER}` | Per-experiment output root |
+| `BASE_EXPERIMENTS_DIR` | `/home/sbak/experiments/llama4-scout-gb200` | Shared root (datacache, triton/inductor caches) |
+
+Valid `FAULT_TYPE` values:
+`GPU_ERROR`, `GPU_SLEEP`, `WORKLOAD_EXC`, `ASYNC_EXC`, `SIGNAL_EXC`, `OS_ABORT`,
+`LOCK_GIL`, `SEGFAULT`, `SIGINT`, `SIGKILL`, `SIGTERM`, `SIGSTOP`
+
+---
+
+## Appendix B: Single-experiment manual run
+
+```bash
+# Manual runs land under fault_injection/manual/ by default (no session dir needed)
+EXPERIMENT_DIR=/home/sbak/experiments/llama4-scout-gb200/fault_injection/manual/n2_GPU_SLEEP_r1_i5
+mkdir -p ${EXPERIMENT_DIR}/logs/slurm ${EXPERIMENT_DIR}/checkpoints ${EXPERIMENT_DIR}/tensorboard
+
+sbatch \
+    --nodes=2 \
+    --output=${EXPERIMENT_DIR}/logs/slurm/%j.launch.out \
+    --error=${EXPERIMENT_DIR}/logs/slurm/%j.launch.err \
+    --export=ALL,FAULT_TYPE=GPU_SLEEP,FAULT_RANK=1,FAULT_AT_ITER=5,GPUS_PER_NODE=4,EXPERIMENT_DIR=${EXPERIMENT_DIR} \
+    scripts/l4_gb200_reduced.sh
+```
diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/fr-analysis/SKILL.md b/src/nvidia_resiliency_ext/skills/nvrx-attr/fr-analysis/SKILL.md
new file mode 100644
index 00000000..df038451
--- /dev/null
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/fr-analysis/SKILL.md
@@ -0,0 +1,113 @@
+---
+name: fr-analysis
+description: >
+  Analyze PyTorch NCCL flight-recorder (FR) dumps to identify collective operation hangs and
+  isolate the responsible ranks using CollectiveAnalyzer. Use when a distributed training job
+  hangs due to an NCCL collective timeout and FR dump files are available. Detects the wavefront
+  process group where collectives diverge and returns the root-cause suspect ranks.
+compatibility: Requires PyTorch NCCL FR dumps (TORCH_NCCL_TRACE_BUFFER_SIZE > 0 must be set during training). NVIDIA_API_KEY and langchain-openai are required only when using --llm-analyze.
+metadata:
+  entry-point: CollectiveAnalyzer
+  script: scripts/fr_attribution.py
+---
+
+# Skill: fr-analysis
+
+Analyze PyTorch NCCL flight-recorder (FR) dumps to identify the collective operation hang
+and isolate the ranks responsible, using `CollectiveAnalyzer`.
+
+**Script:** [`scripts/fr_attribution.py`](./scripts/fr_attribution.py) → `attribution/trace_analyzer/fr_attribution.py`
+
+---
+
+## What it does
+
+1. Loads all FR dump files (JSON or binary pickle) matching a glob pattern under `--fr-path`.
+2. Parses each dump into `Collective` records (op type, ranks, process group, timing, state).
+3. Groups collectives by process group and sequence ID across ranks to detect mismatches.
+4. Identifies the **wavefront** — the process group boundary where collectives diverge — and
+   returns the missing ranks at that boundary as the root-cause suspects.
+5. Optionally runs an LLM pass (`--llm-analyze`) over the structured findings for a
+   human-readable summary.
+
+---
+
+## CLI
+
+```bash
+python scripts/fr_attribution.py \
+    --fr-path /path/to/fr_dumps/ \
+    [--pattern "*.json"] \
+    [--verbose] \
+    [--health-check] \
+    [--llm-analyze] \
+    [--model MODEL] \
+    [--debug]
+```
+
+| Flag | Default | Description |
+|------|---------|-------------|
+| `--fr-path` | required | Path to a directory (or single file) containing FR dump files |
+| `--pattern` | `*.json` | Glob pattern for dump files within `--fr-path` |
+| `--verbose`, `-v` | off | Print detailed per-rank collective tables |
+| `--health-check`, `-c` | off | Include node health check results in output |
+| `--llm-analyze`, `-l` | off | Pass structured findings to the LLM for a narrative summary |
+| `--model`, `-m` | `nvdev/nvidia/llama-3.3-nemotron-super-49b-v1` | LLM model (only used with `--llm-analyze`) |
+| `--debug` | off | Convert binary trace files to JSON for inspection |
+
+---
+
+## Programmatic API
+
+```python
+from nvidia_resiliency_ext.attribution.trace_analyzer.fr_attribution import CollectiveAnalyzer
+
+analyzer = CollectiveAnalyzer({
+    "fr_path": "/path/to/fr_dumps/",
+    "pattern": "*.json",
+    "verbose": False,
+    "health_check": False,
+    "llm_analyze": False,
+    "model": "nvdev/nvidia/llama-3.3-nemotron-super-49b-v1",
+})
+results = analyzer.run_sync({
+    "fr_path": "/path/to/fr_dumps/",
+})
+# results: list[tuple[str, AttributionState]]
+```
+
+---
+
+## Output
+
+Returns `(text, AttributionState)` pairs where `text` describes:
+
+- The **wavefront process group** where collectives diverged
+- **Missing ranks** at the wavefront (root-cause suspects)
+- Per-rank collective status tables (when `--verbose`)
+- Node health summary (when `--health-check`)
+- LLM narrative (when `--llm-analyze`)
+
+`AttributionState.STOP` indicates the hang is unrecoverable; `CONTINUE` indicates the job
+may be restartable after isolating the identified ranks.
+
+---
+
+## Dump file formats
+
+| Format | Notes |
+|--------|-------|
+| JSON (`.json`) | Standard PyTorch FR export; default glob pattern |
+| Binary pickle | Detected automatically; use `--debug` to convert to JSON |
+
+Each rank writes its FR dump to a file whose path prefix is set by `TORCH_NCCL_DEBUG_INFO_TEMP_FILE`
+(the rank number is appended); with `TORCH_NCCL_DUMP_ON_TIMEOUT=1`, dumps are written automatically on NCCL timeout.
+
+---
+
+## Prerequisites
+
+- FR dump files produced by PyTorch NCCL (set `TORCH_NCCL_TRACE_BUFFER_SIZE` > 0)
+- `NVIDIA_API_KEY` required only when using `--llm-analyze`
+- `langchain-openai` required only when using `--llm-analyze`
+- `FR_DEBUG=1` env var enables verbose debug logging in the script
diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/fr-analysis/scripts/fr_attribution.py b/src/nvidia_resiliency_ext/skills/nvrx-attr/fr-analysis/scripts/fr_attribution.py
new file mode 120000
index 00000000..cfac8e34
--- /dev/null
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/fr-analysis/scripts/fr_attribution.py
@@ -0,0 +1 @@
+../../../../attribution/trace_analyzer/fr_attribution.py
\ No newline at end of file
diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/l4_gb200_reduced.sh b/src/nvidia_resiliency_ext/skills/nvrx-attr/l4_gb200_reduced.sh
new file mode 100644
index 00000000..5c903e7a
--- /dev/null
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/l4_gb200_reduced.sh
@@ -0,0 +1,363 @@
+#!/bin/bash
+
+#SBATCH --account=root
+#SBATCH --partition=gb-nvl-134-135
+#SBATCH --time=00:30:00
+
+#SBATCH --job-name=llama4-scout-gb200
+#SBATCH --output=/tmp/slurm-%j.launch.out
+#SBATCH --error=/tmp/slurm-%j.launch.err
+
+#SBATCH --nodes=2
+#SBATCH --ntasks-per-node=4
+#SBATCH --gpus-per-node=4
+#SBATCH --exclusive
+#SBATCH --mem=0
+
+log_msg() {  # Print ">>> <msg> <epoch> (<local time with ms>)"; one `date` call so %3N is real, not always .000
+    local msg="$1" stamp
+    stamp=$(date '+%s %Y-%m-%d %H:%M:%S.%3N')  # epoch and human-readable time taken from the same instant
+    UNIX_DATETIME=${stamp%% *}; HUMAN_DATETIME=${stamp#* }  # split on the first space: epoch | date time
+    echo ">>> ${msg} ${UNIX_DATETIME} (${HUMAN_DATETIME})"
+}
+
+log_msg "START SBATCH"
+echo "Running on nodes: ${SLURM_NODELIST}"
+export RITS_PLATFORM_TYPE=gb200
+export RITS_GPUS_PER_NODE=4
+export RITS_NVL_DOMAIN_SIZE=72
+export NCCL_IB_DISABLE=0
+export NCCL_NET_GDR_LEVEL=3
+export RITS_CLUSTER_NAME=nvl72
+export PYXIS_LOG_LEVEL=debug
+export NCCL_IB_SL=1
+export NCCL_IB_TIMEOUT=19
+export UB_TIMEOUT=720
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+export NVTE_FWD_LAYERNORM_SM_MARGIN=16
+export NVTE_BWD_LAYERNORM_SM_MARGIN=16
+export NCCL_P2P_NET_CHUNKSIZE=2097152
+export NCCL_DEBUG=WARN
+export PYTHONUNBUFFERED=1
+export ONE_LOGGER_JOB_CATEGORY=test
+export LOGLEVEL=DEBUG
+export TORCHINDUCTOR_WORKER_START=fork
+export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
+export TORCH_CPP_LOG_LEVEL=INFO
+export TORCH_NCCL_TRACE_BUFFER_SIZE=2000
+export TORCH_NCCL_RETHROW_CUDA_ERRORS=0
+export TORCH_NCCL_ENABLE_MONITORING=1
+export TORCH_NCCL_DUMP_ON_TIMEOUT=1
+export TORCH_NCCL_LOG_CPP_STACK_ON_UNCLEAN_SHUTDOWN=0
+export TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC=20
+export TORCH_DIST_INIT_BARRIER=0
+export TORCH_INCLUDE_STACK_TRACE=0
+export TORCH_INCLUDE_ONLY_ACTIVE=1
+export TORCH_NCCL_EXTRA_DUMP_ON_EXEC=1
+
+# Checkpoint settings (overridable via sbatch --export)
+export DIST_TIMEOUT_AFTER_INIT="${DIST_TIMEOUT_AFTER_INIT:-1800}"
+# USE_ASYNC_CKPT=1: enable async checkpointing every CKPT_SAVE_INTERVAL iters
+export USE_ASYNC_CKPT="${USE_ASYNC_CKPT:-0}"
+export CKPT_SAVE_INTERVAL="${CKPT_SAVE_INTERVAL:-100}"
+export USE_CPU_SHM="${USE_CPU_SHM:-1}"
+
+# Quantization mode (overridable via sbatch --export)
+export USE_FP8="${USE_FP8:-1}"
+export USE_FP4="${USE_FP4:-0}"
+
+# Overlap comm (overridable via sbatch --export)
+export USE_OVERLAP_COMM="${USE_OVERLAP_COMM:-0}"
+
+# Node / task geometry (SLURM_NNODES is set by SLURM from --nodes override)
+export GPUS_PER_NODE="${GPUS_PER_NODE:-4}"
+TOTAL_TASKS=$((SLURM_NNODES * GPUS_PER_NODE))
+
+# Per-experiment output directory (overridable via sbatch --export)
+export BASE_EXPERIMENTS_DIR="${BASE_EXPERIMENTS_DIR:-/home/sbak/experiments/llama4-scout-gb200}"
+export EXPERIMENT_DIR="${EXPERIMENT_DIR:-${BASE_EXPERIMENTS_DIR}/ckpt_test/n${SLURM_NNODES}}"
+
+mkdir -p ${BASE_EXPERIMENTS_DIR}/datacache
+mkdir -p ${EXPERIMENT_DIR}/tensorboard
+
+: "${SLURM_RESTART_COUNT:=0}"
+
+LOG_DIR=${EXPERIMENT_DIR}/logs
+mkdir -p "${LOG_DIR}/slurm"  # srun does not create the parent directory of its --output/--error files
+echo "Writing logs to ${LOG_DIR}"
+LOG_FILE_BASE="${LOG_DIR}/slurm/${SLURM_JOB_ID}.${SLURM_RESTART_COUNT}"
+
+# ── Shared-tmp directory (NFS, for cross-srun-step communication) ─────────────
+# Mounted to /shared_tmp (NOT /tmp) so the container keeps its native fast /tmp.
+SHARED_TMP_HOST=/home/sbak/tmp/${SLURM_JOB_ID}
+mkdir -p ${SHARED_TMP_HOST}
+
+# ── Pre-populate .myenv with all variables that must reach the container ───────
+# Pyxis env forwarding is unreliable for vars set via sbatch --export; writing
+# them into .myenv guarantees the inner bash picks them up via `source`.
+MYENV_FILE=${SHARED_TMP_HOST}/.myenv_${SLURM_JOB_ID}.sh
+cat > ${MYENV_FILE} << MYENVEOF
+# Auto-generated by l4_gb200_reduced.sh — do not edit by hand.
+export DIST_TIMEOUT_AFTER_INIT=${DIST_TIMEOUT_AFTER_INIT}
+export USE_ASYNC_CKPT=${USE_ASYNC_CKPT}
+export CKPT_SAVE_INTERVAL=${CKPT_SAVE_INTERVAL}
+export USE_CPU_SHM=${USE_CPU_SHM}
+export USE_FP8=${USE_FP8}
+export USE_FP4=${USE_FP4}
+export USE_OVERLAP_COMM=${USE_OVERLAP_COMM}
+# Prepend local nvrx src so container picks up our changes without a pip install step.
+export PYTHONPATH=/home/sbak/nvidia-resiliency-ext/src:\${PYTHONPATH}
+MYENVEOF
+
+# Mounts
+LUSTRE=/home:/home
+SHARED_TMP=${SHARED_TMP_HOST}:/shared_tmp
+LOGS=${EXPERIMENT_DIR}/logs:/logs
+MEGATRON_REPO=/home/sbak/megatron-lm-main:/megatron-lm_repo
+DATACACHE=${BASE_EXPERIMENTS_DIR}/datacache:/datacache
+TENSORBOARD=${EXPERIMENT_DIR}/tensorboard:/tensorboard
+WORKSPACE=/home/sbak/tmp:/workspace
+FR_DUMP=${EXPERIMENT_DIR}/flight_recorder:/flight_recorder
+mkdir -p ${EXPERIMENT_DIR}/flight_recorder
+CONTAINER_MOUNTS=$LUSTRE,$SHARED_TMP,$LOGS,$MEGATRON_REPO,$DATACACHE,$TENSORBOARD,$WORKSPACE,$FR_DUMP
+
+# ── Disk cleanup: remove stale enroot containers from prior jobs ──────────────
+log_msg "START disk_cleanup"
+srun \
+    --label \
+    --ntasks-per-node=1 \
+    --ntasks=${SLURM_NNODES} \
+    --kill-on-bad-exit=0 \
+    --mpi=none \
+    bash -c '
+        ENROOT_DIR="/var/lib/enroot/data/$(id -u)"
+        rm -rf "${ENROOT_DIR:?}"/* 2>/dev/null || true
+        echo "$(hostname): / $(df -h / | tail -1 | awk "{print \$3\" used, \"\$4\" avail\"}")"
+    '
+log_msg "END disk_cleanup"
+
+# all node setup
+#--------------------------------
+log_msg "START all_node_setup"
+srun \
+    --label \
+    --container-mounts ${CONTAINER_MOUNTS} \
+    --container-image /home/sbak/mcore_ci_0415.sqsh \
+    --container-name ${SLURM_JOB_ID} \
+    --container-workdir / \
+    --exclusive \
+    --error=${LOG_FILE_BASE}.0.all_node_setup.log \
+    --output=${LOG_FILE_BASE}.0.all_node_setup.log \
+    --ntasks-per-node=1 \
+    --ntasks=${SLURM_NNODES} \
+    --kill-on-bad-exit=0 \
+    --mpi=none \
+    bash -c '
+        # Use a per-node NFS path so all ranks on each node find the right clone.
+        MEGATRON_PATH=/shared_tmp/megatron_$(hostname)_${SLURM_JOB_ID}
+        mkdir -p ${MEGATRON_PATH}
+        pushd $MEGATRON_PATH
+        CURRENT_BRANCH=$(git -C /megatron-lm_repo branch --show-current)
+        echo "Cloning Megatron branch $CURRENT_BRANCH to ${MEGATRON_PATH}"
+        git clone --single-branch --branch $CURRENT_BRANCH /megatron-lm_repo .
+        popd
+    '
+log_msg "END all_node_setup"
+
+# main workload
+#--------------------------------
+log_msg "START main_workload"
+srun \
+    --label \
+    --container-mounts ${CONTAINER_MOUNTS} \
+    --container-image /home/sbak/mcore_ci_0415.sqsh \
+    --container-name ${SLURM_JOB_ID} \
+    --container-workdir / \
+    --error=${LOG_FILE_BASE}.1.main_workload.log \
+    --output=${LOG_FILE_BASE}.1.main_workload.log \
+    --ntasks-per-node=${GPUS_PER_NODE} \
+    --ntasks=${TOTAL_TASKS} \
+    --kill-on-bad-exit=0 \
+    --mpi=none \
+    bash -c '
+        source /shared_tmp/.myenv_${SLURM_JOB_ID}.sh
+
+        # Match the per-node path used in all_node_setup.
+        MEGATRON_PATH=/shared_tmp/megatron_$(hostname)_${SLURM_JOB_ID}
+
+        NFS_TRITON_CACHE=/home/sbak/experiments/llama4-scout-gb200/triton_cache
+        NFS_INDUCTOR_CACHE=/home/sbak/experiments/llama4-scout-gb200/inductor_cache
+
+        # Per-rank Triton/inductor cache on the container native /tmp (local fast storage).
+        export TRITON_CACHE_DIR=/tmp/triton_${SLURM_LOCALID}
+        export TORCHINDUCTOR_CACHE_DIR=/tmp/inductor_${SLURM_LOCALID}
+        mkdir -p ${TRITON_CACHE_DIR} ${TORCHINDUCTOR_CACHE_DIR}
+
+        # Pre-stage: warm local cache from NFS (one rank per node)
+        if [[ "${SLURM_LOCALID}" == "0" ]]; then
+            if [[ -d "${NFS_TRITON_CACHE}" ]]; then
+                echo "Pre-staging triton cache from NFS..."
+                rsync -a --ignore-existing "${NFS_TRITON_CACHE}/" "${TRITON_CACHE_DIR}/" 2>/dev/null || true
+            fi
+            if [[ -d "${NFS_INDUCTOR_CACHE}" ]]; then
+                echo "Pre-staging inductor cache from NFS..."
+                rsync -a --ignore-existing "${NFS_INDUCTOR_CACHE}/" "${TORCHINDUCTOR_CACHE_DIR}/" 2>/dev/null || true
+            fi
+        fi
+
+        # Post-stage: write back to NFS on exit (one rank per node)
+        _stage_back() {
+            if [[ "${SLURM_LOCALID}" == "0" ]]; then
+                echo "Staging triton cache back to NFS..."
+                mkdir -p "${NFS_TRITON_CACHE}" "${NFS_INDUCTOR_CACHE}"
+                rsync -a --ignore-existing "${TRITON_CACHE_DIR}/" "${NFS_TRITON_CACHE}/" 2>/dev/null || true
+                rsync -a --ignore-existing "${TORCHINDUCTOR_CACHE_DIR}/" "${NFS_INDUCTOR_CACHE}/" 2>/dev/null || true
+                echo "Cache staged back."
+            fi
+        }
+        trap _stage_back EXIT
+
+        # Checkpoint directory — node-local /tmp (cleaned up by the cleanup job).
+        CKPT_DIR=/tmp/ckpt_${SLURM_JOB_ID}
+        mkdir -p ${CKPT_DIR}
+
+        if [[ "${USE_FP8:-1}" == "1" ]]; then
+            QUANT_ARGS="--fp8-format hybrid \
+            --fp8-recipe delayed \
+            --fp8-param-gather \
+            --fp8-amax-history-len 1024 \
+            --fp8-amax-compute-algo max \
+            --fp8-margin 0"
+        elif [[ "${USE_FP4:-0}" == "1" ]]; then
+            QUANT_ARGS="--fp4-format e2m1 \
+            --fp4-recipe nvfp4"
+        else
+            QUANT_ARGS=""
+        fi
+
+        if [[ "${USE_OVERLAP_COMM:-0}" == "1" ]]; then
+            OVERLAP_ARGS="--overlap-grad-reduce --overlap-param-gather"
+        else
+            OVERLAP_ARGS=""
+        fi
+
+        # Build checkpoint args (controlled by USE_ASYNC_CKPT from .myenv).
+        # No --load: we only want to test save here.
+        CKPT_SAVE_ARGS=""
+        if [[ "${USE_ASYNC_CKPT}" == "1" ]]; then
+            CKPT_SAVE_ARGS="--save ${CKPT_DIR} --save-interval ${CKPT_SAVE_INTERVAL} --async-save --use-persistent-ckpt-worker --use-dist-ckpt --ckpt-fully-parallel-save --ckpt-assume-constant-structure $([[ "${USE_CPU_SHM}" == "1" ]] && echo "--async-ckpt-use-cpu-shm")"
+        fi
+
+        pushd $MEGATRON_PATH
+        LAUNCHER_CMD="python3"
+        LAUNCHER_ARGS=" \
+        "
+        WORKLOAD_CMD=${MEGATRON_PATH}/pretrain_gpt.py
+        WORKLOAD_ARGS=" \
+            --exit-duration-in-mins 5750 \
+            --distributed-timeout-minutes 10 \
+            --disable-gloo-process-groups \
+            --mock-data \
+            --data-cache-path /datacache \
+            --no-create-attention-mask-in-dataloader \
+            --no-mmap-bin-files \
+            --tokenizer-type NullTokenizer \
+            --tiktoken-pattern v2 \
+            --tokenizer-model /lustre/fsw/portfolios/llmservice/projects/llmservice_nlp_fm/nemotron6/tokenizers/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json \
+            --micro-batch-size 1 \
+            --global-batch-size 64 \
+            --train-samples 10240000 \
+            --adam-beta1 0.9 \
+            --adam-beta2 0.95 \
+            --adam-eps 1e-05 \
+            --lr-decay-style cosine \
+            --lr-warmup-samples 1024000 \
+            --lr-decay-samples 20480000 \
+            --lr 0.0003 \
+            --min-lr 2.9999999999999997e-05 \
+            --weight-decay 0.1 \
+            --clip-grad 1.0 \
+            --loss-scale 1.0 \
+            --use-mcore-models \
+            --untie-embeddings-and-output-weights \
+            --disable-bias-linear \
+            --attention-backend flash \
+            --transformer-impl transformer_engine \
+            --position-embedding-type rope \
+            --rotary-base 500000 \
+            --rotary-interleaved \
+            --use-rope-scaling \
+            --rope-scaling-factor 8.0 \
+            --no-rope-fusion \
+            --no-rope-freq 4 \
+            --use-flash-attn \
+            --cross-entropy-fusion-impl te \
+            --cross-entropy-loss-fusion \
+            --seq-length 8192 \
+            --max-position-embeddings 8192 \
+            --num-layers 12 \
+            --swiglu \
+            --hidden-size 5120 \
+            --num-attention-heads 40 \
+            --group-query-attention \
+            --num-query-groups 8 \
+            --ffn-hidden-size 16384 \
+            --kv-channels 128 \
+            --normalization RMSNorm \
+            --attention-dropout 0.0 \
+            --hidden-dropout 0.0 \
+            --grad-reduce-in-bf16 \
+            --qk-l2-norm \
+            --num-experts 16 \
+            --moe-layer-freq 1 \
+            --moe-ffn-hidden-size 8192 \
+            --moe-shared-expert-intermediate-size 8192 \
+            --moe-router-topk 1 \
+            --moe-router-score-function sigmoid \
+            --moe-token-dispatcher-type alltoall \
+            --moe-grouped-gemm \
+            --moe-shared-expert-overlap \
+            --moe-router-bias-update-rate 0.001 \
+            --moe-router-load-balancing-type aux_loss \
+            --moe-aux-loss-coeff 0.01 \
+            --moe-router-enable-expert-bias \
+            --moe-apply-probs-on-input \
+            --moe-router-force-load-balancing \
+            --bf16 \
+            ${QUANT_ARGS} \
+            --te-rng-tracker \
+            --sequence-parallel \
+            --use-distributed-optimizer \
+            ${OVERLAP_ARGS} \
+            --ddp-num-buckets 5 \
+            --tensor-model-parallel-size 1 \
+            --pipeline-model-parallel-size 1 \
+            --expert-model-parallel-size 8 \
+            --expert-tensor-parallel-size 1 \
+            --ddp-average-in-collective \
+            --log-interval 1 \
+            --timing-log-option minmax \
+            --log-params-norm \
+            --log-num-zeros-in-grad \
+            --log-throughput \
+            --check-weight-hash-across-dp-replicas-interval 20000 \
+            --tensorboard-dir /tensorboard \
+            --logging-level 10 \
+            --eval-iters 14 \
+            --eval-interval 2000 \
+            --manual-gc \
+            --manual-gc-interval 100 \
+            --num-workers 1 \
+            --local-rank ${SLURM_LOCALID} \
+            --context-parallel-size 1 \
+            --vocab-size 238600 \
+            --distributed-timeout-seconds-after-init ${DIST_TIMEOUT_AFTER_INIT} \
+            --flight-recorder-dump-path /flight_recorder \
+        "
+        $LAUNCHER_CMD $LAUNCHER_ARGS $WORKLOAD_CMD $WORKLOAD_ARGS $CKPT_SAVE_ARGS
+    '
+log_msg "END main_workload"
+
+log_msg "END SBATCH"
+
+set +x
diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/log-analysis/SKILL.md b/src/nvidia_resiliency_ext/skills/nvrx-attr/log-analysis/SKILL.md
new file mode 100644
index 00000000..a86e2ff7
--- /dev/null
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/log-analysis/SKILL.md
@@ -0,0 +1,112 @@
+---
+name: log-analysis
+description: >
+  Analyze a SLURM job log file for failure root-cause attribution and restart decisions using
+  NVRxLogAnalyzer. Use when you have a SLURM training job log and need to determine why the
+  job failed and whether it should be restarted. Performs per-cycle chunking, fast-path pattern
+  matching, and LLM-based classification.
+compatibility: Requires NVIDIA_API_KEY, langchain-openai, and logsage packages installed. nvidia-resiliency-ext must be installed.
+metadata:
+  entry-point: NVRxLogAnalyzer
+  script: scripts/nvrx_logsage.py
+---
+
+# Skill: log_analysis
+
+Analyze a SLURM job log file for failure root-cause attribution and restart decisions using `NVRxLogAnalyzer`.
+
+**Script:** [`scripts/nvrx_logsage.py`](./scripts/nvrx_logsage.py) → `attribution/log_analyzer/nvrx_logsage.py`
+
+---
+
+## What it does
+
+1. Reads the log file (UTF-8, falls back to latin-1).
+2. Splits into per-cycle chunks using `chunk_logs_strict` (scans for `profiling.py:.*Cycle:\s*N` markers). Falls back to a single chunk when no markers are found.
+3. For each chunk, extracts application errors via `return_application_errors` (logsage).
+4. Classifies each chunk with fast-path pattern matching (training done, SLURM cancelled, preemption, time limit) or calls the LLM via `get_proposed_solution_cat`.
+5. Returns one result tuple per cycle.
+
+---
+
+## CLI
+
+```bash
+python scripts/nvrx_logsage.py \
+    --log-path /path/to/job.log \
+    [--model MODEL] \
+    [--temperature 0.2] \
+    [--top_p 0.7] \
+    [--max_tokens 8192] \
+    [--exclude_nvrx_logs | --no-exclude_nvrx_logs] \
+    [--is_per_cycle]
+```
+
+| Flag | Default | Description |
+|------|---------|-------------|
+| `--log-path` | required | Path to the job log file |
+| `--model` | `nvidia/qwen/qwen3.5-35b-a3b` | LLM model |
+| `--temperature` | `0.2` | Sampling temperature |
+| `--top_p` | `0.7` | Top-p nucleus sampling |
+| `--max_tokens` | `8192` | Max output tokens |
+| `--exclude_nvrx_logs` / `--no-exclude_nvrx_logs` | on | Strip `nvidia_resiliency_ext` / `[workload:]` lines before chunking (default on; use `--no-exclude_nvrx_logs` to disable) |
+| `--is_per_cycle` | off | Skip chunking — treat the whole file as a single pre-split cycle |
+
+---
+
+## Programmatic API
+
+```python
+from nvidia_resiliency_ext.attribution.log_analyzer.nvrx_logsage import NVRxLogAnalyzer
+
+analyzer = NVRxLogAnalyzer({
+    "log_path": "/path/to/job.log",
+    "model": "nvidia/qwen/qwen3.5-35b-a3b",
+    "temperature": 0.2,
+    "top_p": 0.7,
+    "max_tokens": 8192,
+    "exclude_nvrx_logs": False,
+    "is_per_cycle": False,
+})
+results = analyzer.run_sync({"log_path": "/path/to/job.log"})
+# results: list[tuple[str, AttributionState]]
+```
+
+Run-time overrides take precedence over constructor config (see `base.effective_run_or_init_config`).
+
+---
+
+## Output
+
+Each element of the returned list is a `(text, AttributionState)` pair where `text` is five
+fields joined by `\n`:
+
+```
+<restart_decision>      # "RESTART IMMEDIATE" | "STOP - DONT RESTART IMMEDIATE"
+<error_explanation>     # short string or ""
+<attribution_text>      # "Attribution: Primary issues: [...], Secondary issues: [...]"
+<additional_detail>     # extended text or ""
+<checkpoint_saved>      # "True" | "False"
+```
+
+`AttributionState.STOP` is set when `restart_decision` contains `"STOP"`; otherwise `CONTINUE`.
+
+### Fast-path decisions (no LLM call)
+
+| Detected condition | restart_decision | attribution_text |
+|--------------------|-----------------|-----------------|
+| Training complete | `STOP - DONT RESTART IMMEDIATE` | `TRAINING DONE` |
+| SLURM preemption | `RESTART IMMEDIATE` | `SLURM CANCELLED DUE TO PREEMPTION` |
+| SLURM step cancelled | `RESTART IMMEDIATE` | `SLURM STEP CANCELLED` |
+| SLURM job requeue | `RESTART IMMEDIATE` | `SLURM STEP CANCELLED JOB REQUEUE` |
+| Time-limit exceeded | `STOP - DONT RESTART IMMEDIATE` | status string |
+| Empty log | — | `NO LOGS` |
+| No errors found | — | `ERRORS NOT FOUND` |
+| LLM failure | — | `LLM FAILURE` |
+
+---
+
+## Prerequisites
+
+- `NVIDIA_API_KEY` set (env var, `NVIDIA_API_KEY_FILE`, or `~/.nvidia_api_key`)
+- `langchain-openai` and `logsage` packages installed
diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/log-analysis/scripts/nvrx_logsage.py b/src/nvidia_resiliency_ext/skills/nvrx-attr/log-analysis/scripts/nvrx_logsage.py
new file mode 120000
index 00000000..528751d1
--- /dev/null
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/log-analysis/scripts/nvrx_logsage.py
@@ -0,0 +1 @@
+../../../../attribution/log_analyzer/nvrx_logsage.py
\ No newline at end of file
diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/l4_gb200_reduced.sh b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/l4_gb200_reduced.sh
new file mode 100644
index 00000000..9fd39ab8
--- /dev/null
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/l4_gb200_reduced.sh
@@ -0,0 +1,362 @@
+#!/bin/bash
+
+# Validated only with Megatron-LM as the feedback-loop example workload.
+
+#SBATCH --account=root
+#SBATCH --partition=gb-nvl-134-135
+#SBATCH --time=00:30:00
+
+#SBATCH --job-name=llama4-scout-gb200
+#SBATCH --output=/tmp/slurm-%j.launch.out
+#SBATCH --error=/tmp/slurm-%j.launch.err
+
+#SBATCH --nodes=2
+#SBATCH --ntasks-per-node=4
+#SBATCH --gpus-per-node=4
+#SBATCH --exclusive
+#SBATCH --mem=0
+
+log_msg() {  # Print ">>> <msg> <epoch> (<local time with ms>)"; one `date` call so %3N is real, not always .000
+    local msg="$1" stamp
+    stamp=$(date '+%s %Y-%m-%d %H:%M:%S.%3N')  # epoch and human-readable time taken from the same instant
+    UNIX_DATETIME=${stamp%% *}; HUMAN_DATETIME=${stamp#* }  # split on the first space: epoch | date time
+    echo ">>> ${msg} ${UNIX_DATETIME} (${HUMAN_DATETIME})"
+}
+
+log_msg "START SBATCH"
+echo "Running on nodes: ${SLURM_NODELIST}"
+export RITS_PLATFORM_TYPE=gb200
+export RITS_GPUS_PER_NODE=4
+export RITS_NVL_DOMAIN_SIZE=72
+export NCCL_IB_DISABLE=0
+export NCCL_NET_GDR_LEVEL=3
+export RITS_CLUSTER_NAME=nvl72
+export PYXIS_LOG_LEVEL=debug
+export NCCL_IB_SL=1
+export NCCL_IB_TIMEOUT=19
+export UB_TIMEOUT=720
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+export NVTE_FWD_LAYERNORM_SM_MARGIN=16
+export NVTE_BWD_LAYERNORM_SM_MARGIN=16
+export NCCL_P2P_NET_CHUNKSIZE=2097152
+export NCCL_DEBUG=WARN
+export PYTHONUNBUFFERED=1
+export ONE_LOGGER_JOB_CATEGORY=test
+export LOGLEVEL=DEBUG
+export TORCHINDUCTOR_WORKER_START=fork
+export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
+export TORCH_CPP_LOG_LEVEL=INFO
+export TORCH_NCCL_TRACE_BUFFER_SIZE=2000
+export TORCH_NCCL_RETHROW_CUDA_ERRORS=0
+export TORCH_NCCL_ENABLE_MONITORING=1
+export TORCH_NCCL_DUMP_ON_TIMEOUT=1
+export TORCH_NCCL_LOG_CPP_STACK_ON_UNCLEAN_SHUTDOWN=0
+export TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC=20
+export TORCH_DIST_INIT_BARRIER=0
+export TORCH_INCLUDE_STACK_TRACE=0
+export TORCH_INCLUDE_ONLY_ACTIVE=1
+export TORCH_NCCL_EXTRA_DUMP_ON_EXEC=1
+
+# Fault injection parameters (overridable via sbatch --export or environment)
+export FAULT_AT_ITER="${FAULT_AT_ITER:-5}"
+export FAULT_RANK="${FAULT_RANK:-1}"
+export FAULT_TYPE="${FAULT_TYPE:-GPU_SLEEP}"
+
+# Checkpoint settings (overridable via sbatch --export)
+export NVRX_CKPT_USE_CPU_SHM="${NVRX_CKPT_USE_CPU_SHM:-0}"
+# Enable GPU-IPC cached-data-structure path without cpu-shm (for comparison baseline)
+export NVRX_CKPT_USE_CACHED_STRUCTURE="${NVRX_CKPT_USE_CACHED_STRUCTURE:-0}"
+export DIST_TIMEOUT_AFTER_INIT="${DIST_TIMEOUT_AFTER_INIT:-1}"
+# USE_ASYNC_CKPT=1: enable async checkpointing every CKPT_SAVE_INTERVAL iters
+export USE_ASYNC_CKPT="${USE_ASYNC_CKPT:-0}"
+export CKPT_SAVE_INTERVAL="${CKPT_SAVE_INTERVAL:-100}"
+
+# Node / task geometry (SLURM_NNODES is set by SLURM from --nodes override)
+export GPUS_PER_NODE="${GPUS_PER_NODE:-4}"
+TOTAL_TASKS=$((SLURM_NNODES * GPUS_PER_NODE))
+
+# Per-experiment output directory (overridable via sbatch --export)
+export BASE_EXPERIMENTS_DIR="${BASE_EXPERIMENTS_DIR:-/home/sbak/experiments/llama4-scout-gb200}"
+export EXPERIMENT_DIR="${EXPERIMENT_DIR:-${BASE_EXPERIMENTS_DIR}/fault_injection/manual/n${SLURM_NNODES}_${FAULT_TYPE}_r${FAULT_RANK}_i${FAULT_AT_ITER}}"
+
+mkdir -p ${BASE_EXPERIMENTS_DIR}/datacache
+mkdir -p ${EXPERIMENT_DIR}/tensorboard
+
+: "${SLURM_RESTART_COUNT:=0}"
+
+LOG_DIR=${EXPERIMENT_DIR}/logs
+mkdir -p "${LOG_DIR}/slurm"  # srun does not create the parent directory of its --output/--error files
+echo "Writing logs to ${LOG_DIR}"
+LOG_FILE_BASE="${LOG_DIR}/slurm/${SLURM_JOB_ID}.${SLURM_RESTART_COUNT}"
+
+# ── Shared-tmp directory (NFS, for cross-srun-step communication) ─────────────
+# Mounted to /shared_tmp (NOT /tmp) so the container keeps its native fast /tmp.
+SHARED_TMP_HOST=/home/sbak/tmp/${SLURM_JOB_ID}
+mkdir -p ${SHARED_TMP_HOST}
+
+# ── Pre-populate .myenv with all variables that must reach the container ───────
+# Pyxis env forwarding is unreliable for vars set via sbatch --export; writing
+# them into .myenv guarantees the inner bash picks them up via `source`.
+MYENV_FILE=${SHARED_TMP_HOST}/.myenv_${SLURM_JOB_ID}.sh
+cat > ${MYENV_FILE} << MYENVEOF
+# Auto-generated by l4_gb200_reduced.sh — do not edit by hand.
+export NVRX_CKPT_USE_CPU_SHM=${NVRX_CKPT_USE_CPU_SHM}
+export NVRX_CKPT_USE_CACHED_STRUCTURE=${NVRX_CKPT_USE_CACHED_STRUCTURE}
+export DIST_TIMEOUT_AFTER_INIT=${DIST_TIMEOUT_AFTER_INIT}
+export USE_ASYNC_CKPT=${USE_ASYNC_CKPT}
+export CKPT_SAVE_INTERVAL=${CKPT_SAVE_INTERVAL}
+export FAULT_AT_ITER=${FAULT_AT_ITER}
+export FAULT_RANK=${FAULT_RANK}
+export FAULT_TYPE=${FAULT_TYPE}
+# Prepend local nvrx src so container picks up our changes without a pip install step.
+export PYTHONPATH=/home/sbak/nvidia-resiliency-ext/src:\${PYTHONPATH}
+MYENVEOF
+
+# Mounts
+LUSTRE=/home:/home
+SHARED_TMP=${SHARED_TMP_HOST}:/shared_tmp
+LOGS=${EXPERIMENT_DIR}/logs:/logs
+MEGATRON_REPO=/home/sbak/megatron-lm-main:/megatron-lm_repo
+DATACACHE=${BASE_EXPERIMENTS_DIR}/datacache:/datacache
+TENSORBOARD=${EXPERIMENT_DIR}/tensorboard:/tensorboard
+WORKSPACE=/home/sbak/tmp:/workspace
+CHECKPOINTS=${EXPERIMENT_DIR}/checkpoints:/checkpoints
+mkdir -p ${EXPERIMENT_DIR}/checkpoints
+CONTAINER_MOUNTS=$LUSTRE,$SHARED_TMP,$LOGS,$MEGATRON_REPO,$DATACACHE,$TENSORBOARD,$WORKSPACE,$CHECKPOINTS
+
+# ── Disk cleanup: remove stale enroot containers from prior jobs ──────────────
+log_msg "START disk_cleanup"
+srun \
+    --label \
+    --ntasks-per-node=1 \
+    --ntasks=${SLURM_NNODES} \
+    --kill-on-bad-exit=0 \
+    --mpi=none \
+    bash -c '
+        ENROOT_DIR="/var/lib/enroot/data/$(id -u)"
+        rm -rf "${ENROOT_DIR:?}"/* 2>/dev/null || true
+        echo "$(hostname): / $(df -h / | tail -1 | awk "{print \$3\" used, \"\$4\" avail\"}")"
+    '
+log_msg "END disk_cleanup"
+
+# all node setup
+#--------------------------------
+log_msg "START all_node_setup"
+srun \
+    --label \
+    --container-mounts ${CONTAINER_MOUNTS} \
+    --container-image /home/sbak/mcore_ci_0415.sqsh \
+    --container-name ${SLURM_JOB_ID} \
+    --container-workdir / \
+    --exclusive \
+    --error=${LOG_FILE_BASE}.0.all_node_setup.log \
+    --output=${LOG_FILE_BASE}.0.all_node_setup.log \
+    --ntasks-per-node=1 \
+    --ntasks=${SLURM_NNODES} \
+    --kill-on-bad-exit=0 \
+    --mpi=none \
+    bash -c '
+        # Use a per-node NFS path so all ranks on each node find the right clone.
+        MEGATRON_PATH=/shared_tmp/megatron_$(hostname)_${SLURM_JOB_ID}
+        mkdir -p ${MEGATRON_PATH}
+        pushd $MEGATRON_PATH
+        CURRENT_BRANCH=$(git -C /megatron-lm_repo branch --show-current)
+        echo "Cloning Megatron branch $CURRENT_BRANCH to ${MEGATRON_PATH}"
+        git clone --single-branch --branch $CURRENT_BRANCH /megatron-lm_repo .
+        popd
+    '
+log_msg "END all_node_setup"
+
+# main workload
+#--------------------------------
+log_msg "START main_workload"
+srun \
+    --label \
+    --container-mounts ${CONTAINER_MOUNTS} \
+    --container-image /home/sbak/mcore_ci_0415.sqsh \
+    --container-name ${SLURM_JOB_ID} \
+    --container-workdir / \
+    --error=${LOG_FILE_BASE}.1.main_workload.log \
+    --output=${LOG_FILE_BASE}.1.main_workload.log \
+    --ntasks-per-node=${GPUS_PER_NODE} \
+    --ntasks=${TOTAL_TASKS} \
+    --kill-on-bad-exit=0 \
+    --mpi=none \
+    bash -c '
+        source /shared_tmp/.myenv_${SLURM_JOB_ID}.sh
+
+        # Match the per-node path used in all_node_setup.
+        MEGATRON_PATH=/shared_tmp/megatron_$(hostname)_${SLURM_JOB_ID}
+
+        NFS_TRITON_CACHE=/home/sbak/experiments/llama4-scout-gb200/triton_cache
+        NFS_INDUCTOR_CACHE=/home/sbak/experiments/llama4-scout-gb200/inductor_cache
+
+        # Per-rank Triton/inductor cache on the container native /tmp (local fast storage).
+        export TRITON_CACHE_DIR=/tmp/triton_${SLURM_LOCALID}
+        export TORCHINDUCTOR_CACHE_DIR=/tmp/inductor_${SLURM_LOCALID}
+        mkdir -p ${TRITON_CACHE_DIR} ${TORCHINDUCTOR_CACHE_DIR}
+
+        # Pre-stage: warm local cache from NFS (one rank per node)
+        if [[ "${SLURM_LOCALID}" == "0" ]]; then
+            if [[ -d "${NFS_TRITON_CACHE}" ]]; then
+                echo "Pre-staging triton cache from NFS..."
+                rsync -a --ignore-existing "${NFS_TRITON_CACHE}/" "${TRITON_CACHE_DIR}/" 2>/dev/null || true
+            fi
+            if [[ -d "${NFS_INDUCTOR_CACHE}" ]]; then
+                echo "Pre-staging inductor cache from NFS..."
+                rsync -a --ignore-existing "${NFS_INDUCTOR_CACHE}/" "${TORCHINDUCTOR_CACHE_DIR}/" 2>/dev/null || true
+            fi
+        fi
+
+        # Post-stage: write back to NFS on exit (one rank per node)
+        _stage_back() {
+            if [[ "${SLURM_LOCALID}" == "0" ]]; then
+                echo "Staging triton cache back to NFS..."
+                mkdir -p "${NFS_TRITON_CACHE}" "${NFS_INDUCTOR_CACHE}"
+                rsync -a --ignore-existing "${TRITON_CACHE_DIR}/" "${NFS_TRITON_CACHE}/" 2>/dev/null || true
+                rsync -a --ignore-existing "${TORCHINDUCTOR_CACHE_DIR}/" "${NFS_INDUCTOR_CACHE}/" 2>/dev/null || true
+                echo "Cache staged back."
+            fi
+        }
+        trap _stage_back EXIT
+
+        # Checkpoint directory — NFS path mounted to /checkpoints inside the container.
+        # /dev/shm is reserved for IPC shm tensors and the DataLoader.
+        # Note: --log-progress is NOT set. Megatron will not write/read progress.txt
+        # (which would be per-node and invisible across nodes).
+        CKPT_DIR=/checkpoints
+        mkdir -p ${CKPT_DIR}
+
+        # Build checkpoint args (controlled by USE_ASYNC_CKPT from .myenv).
+        # No --load: we only want to test save here.
+        CKPT_SAVE_ARGS=""
+        if [[ "${USE_ASYNC_CKPT}" == "1" ]]; then
+            CKPT_SAVE_ARGS="--save ${CKPT_DIR} --save-interval ${CKPT_SAVE_INTERVAL} --async-save --use-persistent-ckpt-worker --use-dist-ckpt --ckpt-fully-parallel-save --ckpt-assume-constant-structure"
+        fi
+
+        pushd $MEGATRON_PATH
+        LAUNCHER_CMD="python3"
+        LAUNCHER_ARGS=" \
+        "
+        WORKLOAD_CMD=${MEGATRON_PATH}/pretrain_gpt.py
+        WORKLOAD_ARGS=" \
+            --exit-duration-in-mins 5750 \
+            --distributed-timeout-minutes 10 \
+            --disable-gloo-process-groups \
+            --mock-data \
+            --data-cache-path /datacache \
+            --no-create-attention-mask-in-dataloader \
+            --no-mmap-bin-files \
+            --tokenizer-type NullTokenizer \
+            --tiktoken-pattern v2 \
+            --tokenizer-model /lustre/fsw/portfolios/llmservice/projects/llmservice_nlp_fm/nemotron6/tokenizers/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json \
+            --micro-batch-size 1 \
+            --global-batch-size 64 \
+            --train-samples 10240000 \
+            --adam-beta1 0.9 \
+            --adam-beta2 0.95 \
+            --adam-eps 1e-05 \
+            --lr-decay-style cosine \
+            --lr-warmup-samples 1024000 \
+            --lr-decay-samples 20480000 \
+            --lr 0.0003 \
+            --min-lr 2.9999999999999997e-05 \
+            --weight-decay 0.1 \
+            --clip-grad 1.0 \
+            --loss-scale 1.0 \
+            --use-mcore-models \
+            --untie-embeddings-and-output-weights \
+            --disable-bias-linear \
+            --attention-backend flash \
+            --transformer-impl transformer_engine \
+            --position-embedding-type rope \
+            --rotary-base 500000 \
+            --rotary-interleaved \
+            --use-rope-scaling \
+            --rope-scaling-factor 8.0 \
+            --no-rope-fusion \
+            --no-rope-freq 4 \
+            --use-flash-attn \
+            --cross-entropy-fusion-impl te \
+            --cross-entropy-loss-fusion \
+            --seq-length 8192 \
+            --max-position-embeddings 8192 \
+            --num-layers 12 \
+            --swiglu \
+            --hidden-size 5120 \
+            --num-attention-heads 40 \
+            --group-query-attention \
+            --num-query-groups 8 \
+            --ffn-hidden-size 16384 \
+            --kv-channels 128 \
+            --normalization RMSNorm \
+            --attention-dropout 0.0 \
+            --hidden-dropout 0.0 \
+            --grad-reduce-in-bf16 \
+            --qk-l2-norm \
+            --num-experts 16 \
+            --moe-layer-freq 1 \
+            --moe-ffn-hidden-size 8192 \
+            --moe-shared-expert-intermediate-size 8192 \
+            --moe-router-topk 1 \
+            --moe-router-score-function sigmoid \
+            --moe-token-dispatcher-type alltoall \
+            --moe-grouped-gemm \
+            --moe-shared-expert-overlap \
+            --moe-router-bias-update-rate 0.001 \
+            --moe-router-load-balancing-type aux_loss \
+            --moe-aux-loss-coeff 0.01 \
+            --moe-router-enable-expert-bias \
+            --moe-apply-probs-on-input \
+            --moe-router-force-load-balancing \
+            --bf16 \
+            --fp8-format hybrid \
+            --fp8-recipe delayed \
+            --fp8-param-gather \
+            --fp8-amax-history-len 1024 \
+            --fp8-amax-compute-algo max \
+            --fp8-margin 0 \
+            --te-rng-tracker \
+            --sequence-parallel \
+            --use-distributed-optimizer \
+            --overlap-grad-reduce \
+            --overlap-param-gather \
+            --ddp-num-buckets 5 \
+            --tensor-model-parallel-size 1 \
+            --pipeline-model-parallel-size 1 \
+            --expert-model-parallel-size 8 \
+            --expert-tensor-parallel-size 1 \
+            --ddp-average-in-collective \
+            --log-interval 1 \
+            --timing-log-option minmax \
+            --log-params-norm \
+            --log-num-zeros-in-grad \
+            --log-throughput \
+            --check-weight-hash-across-dp-replicas-interval 20000 \
+            --tensorboard-dir /tensorboard \
+            --logging-level 10 \
+            --eval-iters 14 \
+            --eval-interval 2000 \
+            --manual-gc \
+            --manual-gc-interval 100 \
+            --num-workers 1 \
+            --rerun-mode validate_results \
+            --log-straggler \
+            --disable-straggler-on-startup \
+            --straggler-minmax-count 16 \
+            --local-rank ${SLURM_LOCALID} \
+            --context-parallel-size 1 \
+            --vocab-size 238600 \
+            --megatron-fault-at-iter ${FAULT_AT_ITER} \
+            --megatron-fault-rank ${FAULT_RANK} \
+            --megatron-fault-type ${FAULT_TYPE} \
+            --distributed-timeout-seconds-after-init ${DIST_TIMEOUT_AFTER_INIT} \
+            --flight-recorder-dump-path ${CKPT_DIR} \
+        "
+        $LAUNCHER_CMD $LAUNCHER_ARGS $WORKLOAD_CMD $WORKLOAD_ARGS $CKPT_SAVE_ARGS
+    '
+log_msg "END main_workload"
+
+log_msg "END SBATCH"
+
+set +x
diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/n3_super_gb200.sh b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/n3_super_gb200.sh
new file mode 100644
index 00000000..1147dda6
--- /dev/null
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/n3_super_gb200.sh
@@ -0,0 +1,166 @@
+ENV_VARS:
+  NVTE_FWD_LAYERNORM_SM_MARGIN: 16
+  NVTE_BWD_LAYERNORM_SM_MARGIN: 16
+  TORCHINDUCTOR_WORKER_START: fork
+  QUANTIZATION_TYPE_DEBUG: 1
+  PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True
+  NUM_OF_HYBRID_EP_RANKS_PER_NVLINK_DOMAIN: 64
+  USE_MNNVL: 1
+TEST_TYPE: "release"
+MODEL_ARGS:
+  # Distributed args
+  --tensor-model-parallel-size: 2
+  --pipeline-model-parallel-size: 1
+  --expert-model-parallel-size: 64
+  --expert-tensor-parallel-size: 1
+  --use-distributed-optimizer: true
+  --overlap-grad-reduce: true
+  --overlap-param-gather: true
+  --sequence-parallel: true
+  --ddp-num-buckets: 10
+  --ddp-pad-buckets-for-high-nccl-busbw: true
+  --high-priority-stream-groups: ep
+  --distributed-timeout-minutes: 10
+  --disable-gloo-process-groups: true
+
+  # Training args
+  --micro-batch-size: 1
+  --global-batch-size: 3072
+  --train-samples: 12207031
+  --cross-entropy-loss-fusion: true
+  --cross-entropy-fusion-impl: native
+  --attention-backend: flash
+  --enable-cuda-graph: true
+  --cuda-graph-scope: mamba attn moe_router
+  --te-rng-tracker: true
+  --manual-gc: true
+  --manual-gc-interval: 10
+  --no-create-attention-mask-in-dataloader: true
+  --num-workers: 1
+  --exit-interval: 51000
+  --override-opt_param-scheduler: true
+
+  # Network size args
+  --use-mcore-models: true
+  --spec: megatron.core.models.mamba.mamba_layer_specs mamba_stack_spec
+  --is-hybrid-model: true
+  --mamba-num-heads: 128
+  --num-layers: 88
+  --hidden-size: 4096
+  --ffn-hidden-size: 2688
+  --num-attention-heads: 32
+  --group-query-attention: true
+  --num-query-groups: 2
+  --kv-channels: 128
+  --hybrid-override-pattern: MEMEMEM*EMEMEMEM*EMEMEMEM*EMEMEMEMEM*EMEMEMEMEM*EMEMEMEMEM*EMEMEMEMEM*EMEMEMEM*EMEMEMEME
+  --position-embedding-type: none
+  --normalization: RMSNorm
+  --untie-embeddings-and-output-weights: true
+  --init-method-std: 0.014
+  --disable-bias-linear: true
+  --squared-relu: true
+  --use-fused-weighted-squared-relu: true
+
+  # Data args
+  --seq-length: 8192
+  --max-position-embeddings: 8192
+  --data-path: ${DATA_BLEND}
+  --data-cache-path: ${DATA_CACHE_PATH}
+  --tiktoken-pattern: v2
+  --tokenizer-type: ${TOKENIZER_TYPE}
+  --tokenizer-model: ${TOKENIZER_MODEL_PATH}
+  --no-mmap-bin-files: true
+
+  # MoE args
+  --num-experts: 512
+  --moe-router-topk: 22
+  --moe-router-topk-scaling-factor: 5.0
+  --moe-router-score-function: sigmoid
+  --moe-router-enable-expert-bias: true
+  --moe-router-dtype: fp32
+  --moe-router-load-balancing-type: seq_aux_loss
+  --moe-aux-loss-coeff: 1e-4
+  --moe-token-dispatcher-type: flex
+  --moe-flex-dispatcher-backend: hybridep
+  --moe-hybridep-num-sms: 32
+  --moe-grouped-gemm: true
+  --moe-permute-fusion: true
+  --moe-latent-size: 1024
+  --moe-shared-expert-intermediate-size: 5376
+  --moe-shared-expert-compute-before-router: true
+
+  # MTP args
+  --mtp-spec: megatron.core.models.mamba.mamba_layer_specs mamba_stack_spec
+  --mtp-num-layers: 2
+  --mtp-hybrid-override-pattern: \"*E\"
+  --calculate-per-token-loss: true
+  --mtp-loss-scaling-factor: 0.3
+
+  # Mixed precision / quantization args
+  --bf16: true
+  --keep-mtp-spec-in-bf16: true
+  --keep-mamba-stack-attention-linear-in-bf16: true
+  --keep-mamba-out-proj-in-mxfp8: true
+  --keep-moe-latent-projections-in-bf16: true
+  --first-last-layers-bf16: true
+  --num-layers-at-start-in-bf16: 0
+  --num-layers-at-end-in-bf16: 14
+  --fp4-format: e2m1
+  --fp4-recipe: nvfp4
+
+  # Regularization args
+  --attention-dropout: 0.0
+  --hidden-dropout: 0.0
+  --clip-grad: 1.0
+  --weight-decay: 0.1
+
+  # Learning rate args
+  --lr: 4.5e-4
+  --min-lr: 4.5e-6
+  --lr-decay-style: WSD
+  --lr-warmup-samples: 24414063
+  --lr-decay-samples: 3048706055
+  --lr-wsd-decay-style: minus_sqrt
+  --lr-wsd-decay-samples: 610351563
+  --adam-beta1: 0.9
+  --adam-beta2: 0.95
+
+  # Checkpointing args
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
+  --ckpt-format: torch_dist
+  --ckpt-fully-parallel-save: true
+  --ckpt-fully-parallel-load: true
+  --ckpt-assume-constant-structure: true
+  --async-save: true
+  --use-persistent-ckpt-worker: true
+  --save-interval: 1000
+  --save-retain-interval: 5000
+
+  # Validation args
+  --eval-interval: 1000
+  --eval-iters: 14
+
+  # Logging args
+  --log-interval: 100
+  --log-params-norm: true
+  --log-num-zeros-in-grad: true
+  --log-timers-to-tensorboard: true
+  --log-memory-to-tensorboard: true
+  --log-throughput: true
+  --log-progress: true
+  --log-energy: true
+  --log-memory-interval: 500
+  --logging-level: 20
+  --timing-log-option: minmax
+  --check-weight-hash-across-dp-replicas-interval: 20000
+  --tensorboard-dir: ${TENSORBOARD_PATH}
+  --wandb-project: megatron-core-release-runs
+  --wandb-entity: adlr
+  --wandb-exp-name: ${WANDB_EXPERIMENT}
+  --wandb-save-dir: ${WANDB_SAVE_PATH}
+METRICS:
+  - "iteration-time"
+  - "lm loss"
+  - "mem-allocated-bytes"
+  - "mem-max-allocated-bytes"
diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/n3_super_gb200_shm_test.sh b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/n3_super_gb200_shm_test.sh
new file mode 100755
index 00000000..673965f2
--- /dev/null
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/n3_super_gb200_shm_test.sh
@@ -0,0 +1,369 @@
+#!/bin/bash
+# n3_super_gb200_shm_test.sh — one-time validation: Nemotron Super 8N with async cpu-shm ckpt.
+# Model/infra config mirrors n3_super_gb200_fi.sh. No fault injection.
+# Checkpoints to node-local /tmp (discardable — not cross-node accessible).
+
+#SBATCH --account=root
+#SBATCH --partition=gb-nvl-134-135
+#SBATCH --time=00:45:00
+
+#SBATCH --job-name=n3-super-shm-test
+#SBATCH --output=/tmp/slurm-%j.launch.out
+#SBATCH --error=/tmp/slurm-%j.launch.err
+
+#SBATCH --nodes=8
+#SBATCH --ntasks-per-node=4
+#SBATCH --gpus-per-node=4
+#SBATCH --exclusive
+#SBATCH --mem=0
+
+log_msg() {
+    local msg="$1"  # marker text to print, e.g. "START SBATCH"
+    UNIX_DATETIME=$(date +%s)  # epoch seconds; assigned as a global (not declared local)
+    HUMAN_DATETIME=$(date -d "@$UNIX_DATETIME" '+%Y-%m-%d %H:%M:%S.%3N')  # rendered from whole seconds, so %3N is always 000
+    echo ">>> ${msg} ${UNIX_DATETIME} (${HUMAN_DATETIME})"  # one marker line: text, epoch, human-readable time
+}
+
+log_msg "START SBATCH"
+echo "Running on nodes: ${SLURM_NODELIST}"
+
+# ── Platform / NCCL / RITS ────────────────────────────────────────────────────
+export RITS_PLATFORM_TYPE=gb200
+export RITS_GPUS_PER_NODE=4
+export RITS_NVL_DOMAIN_SIZE=72
+export NCCL_IB_DISABLE=0
+export NCCL_NET_GDR_LEVEL=3
+export RITS_CLUSTER_NAME=nvl72
+export PYXIS_LOG_LEVEL=debug
+export NCCL_IB_SL=1
+export NCCL_IB_TIMEOUT=19
+export UB_TIMEOUT=720
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+export NCCL_P2P_NET_CHUNKSIZE=2097152
+export NCCL_DEBUG=WARN
+
+# ── PyTorch / TE / inductor (from n3_super_gb200.sh ENV_VARS) ─────────────────
+export NVTE_FWD_LAYERNORM_SM_MARGIN=16
+export NVTE_BWD_LAYERNORM_SM_MARGIN=16
+export TORCHINDUCTOR_WORKER_START=fork
+export QUANTIZATION_TYPE_DEBUG=1
+export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
+export USE_MNNVL=1
+
+# ── DeepEP (hybridep MoE routing) — set USE_DEEPEP=0 to use alltoall instead ──
+USE_DEEPEP="${USE_DEEPEP:-1}"
+if [[ "${USE_DEEPEP}" == "1" ]]; then
+    export NUM_OF_HYBRID_EP_RANKS_PER_NVLINK_DOMAIN=32
+fi
+
+# ── Logging / debugging ───────────────────────────────────────────────────────
+export PYTHONUNBUFFERED=1
+export ONE_LOGGER_JOB_CATEGORY=test
+export LOGLEVEL=DEBUG
+export TORCH_CPP_LOG_LEVEL=INFO
+export TORCH_NCCL_TRACE_BUFFER_SIZE=2000
+export TORCH_NCCL_RETHROW_CUDA_ERRORS=0
+export TORCH_NCCL_ENABLE_MONITORING=1
+export TORCH_NCCL_DUMP_ON_TIMEOUT=1
+export TORCH_NCCL_LOG_CPP_STACK_ON_UNCLEAN_SHUTDOWN=0
+export TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC=30
+export TORCH_DIST_INIT_BARRIER=0
+export TORCH_INCLUDE_STACK_TRACE=0
+export TORCH_INCLUDE_ONLY_ACTIVE=1
+export TORCH_NCCL_EXTRA_DUMP_ON_EXEC=1
+
+# ── CUDA graph ────────────────────────────────────────────────────────────────
+export ENABLE_CUDA_GRAPH="${ENABLE_CUDA_GRAPH:-1}"
+
+# ── Quantization mode: FP8 is the default; for FP4 set USE_FP8=0 USE_FP4=1 ───
+# Only one is applied: if both are 1, FP8 takes precedence (see QUANT_ARGS below).
+export USE_FP4="${USE_FP4:-0}"
+export USE_FP8="${USE_FP8:-1}"
+
+# ── Async checkpoint shm mode (default on) ────────────────────────────────────
+export USE_CPU_SHM="${USE_CPU_SHM:-1}"
+
+# ── Overlap comm (default off) ────────────────────────────────────────────────
+export USE_OVERLAP_COMM="${USE_OVERLAP_COMM:-0}"
+
+# ── Node / task geometry ─────────────────────────────────────────────────────
+export GPUS_PER_NODE=4
+TOTAL_TASKS=$((SLURM_NNODES * GPUS_PER_NODE))
+
+# ── Per-experiment output directory ───────────────────────────────────────────
+export BASE_EXPERIMENTS_DIR="${BASE_EXPERIMENTS_DIR:-/home/sbak/experiments/n3-super-gb200}"
+export EXPERIMENT_DIR="${EXPERIMENT_DIR:-${BASE_EXPERIMENTS_DIR}/shm_test}"
+
+mkdir -p ${BASE_EXPERIMENTS_DIR}/datacache
+mkdir -p ${EXPERIMENT_DIR}/tensorboard
+
+: "${SLURM_RESTART_COUNT:=0}"
+
+LOG_DIR=${EXPERIMENT_DIR}/logs
+mkdir -p "${LOG_DIR}/slurm"   # srun --output/--error below write into ${LOG_DIR}/slurm, so it must exist
+echo "Writing logs to ${LOG_DIR}"
+LOG_FILE_BASE="${LOG_DIR}/slurm/${SLURM_JOB_ID}.${SLURM_RESTART_COUNT}"
+
+# ── Container mounts ──────────────────────────────────────────────────────────
+LUSTRE=/home:/home
+SHARED_TMP=/home/sbak/tmp/${SLURM_JOB_ID}:/shared_tmp
+LOGS=${EXPERIMENT_DIR}/logs:/logs
+MEGATRON_REPO=/home/sbak/megatron-lm-main:/megatron-lm_repo
+DATACACHE=${BASE_EXPERIMENTS_DIR}/datacache:/datacache
+TENSORBOARD=${EXPERIMENT_DIR}/tensorboard:/tensorboard
+WORKSPACE=/home/sbak/tmp:/workspace
+# No /checkpoints mount — saves go to node-local /tmp inside the container.
+CONTAINER_MOUNTS=$LUSTRE,$SHARED_TMP,$LOGS,$MEGATRON_REPO,$DATACACHE,$TENSORBOARD,$WORKSPACE
+mkdir -p /home/sbak/tmp/${SLURM_JOB_ID}
+
+# ── Disk cleanup: remove stale enroot containers from prior jobs ──────────────
+log_msg "START disk_cleanup"
+srun \
+    --label \
+    --ntasks-per-node=1 \
+    --ntasks=${SLURM_NNODES} \
+    --kill-on-bad-exit=0 \
+    --mpi=none \
+    bash -c '
+        ENROOT_DIR="/var/lib/enroot/data/$(id -u)"
+        rm -rf "${ENROOT_DIR:?}"/* 2>/dev/null || true
+        echo "$(hostname): / $(df -h / | tail -1 | awk "{print \$3\" used, \"\$4\" avail\"}")"
+    '
+log_msg "END disk_cleanup"
+
+# ── All-node setup: clone Megatron into a per-node tmpdir ─────────────────────
+log_msg "START all_node_setup"
+srun \
+    --label \
+    --container-mounts ${CONTAINER_MOUNTS} \
+    --container-image /home/sbak/mcore_ci_040825.sqsh \
+    --container-name ${SLURM_JOB_ID} \
+    --container-workdir / \
+    --error=${LOG_FILE_BASE}.0.all_node_setup.log \
+    --output=${LOG_FILE_BASE}.0.all_node_setup.log \
+    --ntasks-per-node=1 \
+    --ntasks=${SLURM_NNODES} \
+    --kill-on-bad-exit=0 \
+    --mpi=none \
+    bash -c '
+        MEGATRON_PATH=/shared_tmp/megatron_${SLURM_NODEID}
+        rm -rf "${MEGATRON_PATH}"
+        mkdir -p "${MEGATRON_PATH}"
+        pushd $MEGATRON_PATH
+        CURRENT_BRANCH=$(git -C /megatron-lm_repo branch --show-current)
+        echo "Cloning Megatron branch $CURRENT_BRANCH into $MEGATRON_PATH"
+        git clone --single-branch --branch $CURRENT_BRANCH /megatron-lm_repo .
+        popd
+
+        # Install local nvidia-resiliency-ext so container picks up src changes.
+        uv pip install -e /home/sbak/nvidia-resiliency-ext
+    '
+log_msg "END all_node_setup"
+
+# ── Main workload ─────────────────────────────────────────────────────────────
+log_msg "START main_workload"
+srun \
+    --label \
+    --container-mounts ${CONTAINER_MOUNTS} \
+    --container-image /home/sbak/mcore_ci_040825.sqsh \
+    --container-name ${SLURM_JOB_ID} \
+    --container-workdir / \
+    --error=${LOG_FILE_BASE}.1.main_workload.log \
+    --output=${LOG_FILE_BASE}.1.main_workload.log \
+    --ntasks-per-node=${GPUS_PER_NODE} \
+    --ntasks=${TOTAL_TASKS} \
+    --kill-on-bad-exit=0 \
+    --mpi=none \
+    bash -c '
+        MEGATRON_PATH=/shared_tmp/megatron_${SLURM_NODEID}
+
+        NFS_TRITON_CACHE=/home/sbak/experiments/n3-super-gb200/triton_cache
+        NFS_INDUCTOR_CACHE=/home/sbak/experiments/n3-super-gb200/inductor_cache
+        TRITON_READY=/tmp/.triton_ready_${SLURM_JOB_ID}
+
+        export TRITON_CACHE_DIR=/tmp/triton_${SLURM_LOCALID}
+        export TORCHINDUCTOR_CACHE_DIR=/tmp/inductor_${SLURM_LOCALID}
+
+        if [[ "${SLURM_LOCALID}" == "0" ]]; then
+            if [[ -d "${NFS_TRITON_CACHE}" ]] && [[ -n "$(ls -A ${NFS_TRITON_CACHE} 2>/dev/null)" ]]; then
+                TRITON_CACHE_WAS_WARM=1
+            else
+                TRITON_CACHE_WAS_WARM=0
+            fi
+            for r in $(seq 0 $((GPUS_PER_NODE - 1))); do
+                mkdir -p /tmp/triton_${r} /tmp/inductor_${r}
+                [[ -d "${NFS_TRITON_CACHE}" ]]   && rsync -a --ignore-existing "${NFS_TRITON_CACHE}/"   "/tmp/triton_${r}/"   2>/dev/null || true
+                [[ -d "${NFS_INDUCTOR_CACHE}" ]] && rsync -a --ignore-existing "${NFS_INDUCTOR_CACHE}/" "/tmp/inductor_${r}/" 2>/dev/null || true
+            done
+            touch "${TRITON_READY}"
+            echo "Pre-staged triton/inductor cache for all local ranks (was_warm=${TRITON_CACHE_WAS_WARM})."
+        else
+            until [[ -f "${TRITON_READY}" ]]; do sleep 1; done
+        fi
+
+        mkdir -p ${TRITON_CACHE_DIR} ${TORCHINDUCTOR_CACHE_DIR}
+
+        _stage_back() {
+            if [[ "${SLURM_LOCALID}" == "0" && "${SLURM_NODEID}" == "0" && "${TRITON_CACHE_WAS_WARM}" == "0" ]]; then
+                echo "Staging triton cache back to NFS (cold start)..."
+                mkdir -p "${NFS_TRITON_CACHE}" "${NFS_INDUCTOR_CACHE}"
+                rsync -a --ignore-existing "${TRITON_CACHE_DIR}/" "${NFS_TRITON_CACHE}/" 2>/dev/null || true
+                rsync -a --ignore-existing "${TORCHINDUCTOR_CACHE_DIR}/" "${NFS_INDUCTOR_CACHE}/" 2>/dev/null || true
+                echo "Cache staged back."
+            fi
+        }
+        trap _stage_back EXIT
+
+        if [[ "${ENABLE_CUDA_GRAPH}" == "1" ]]; then
+            CUDA_GRAPH_ARGS="--enable-cuda-graph --cuda-graph-scope mamba attn"
+        else
+            CUDA_GRAPH_ARGS=""
+        fi
+
+        if [[ "${USE_DEEPEP:-1}" == "1" ]]; then
+            MOE_DISPATCHER_ARGS="--moe-token-dispatcher-type flex --moe-flex-dispatcher-backend hybridep --moe-hybridep-num-sms 32"
+        else
+            MOE_DISPATCHER_ARGS="--moe-token-dispatcher-type alltoall"
+        fi
+
+        if [[ "${USE_FP8:-0}" == "1" ]]; then
+            QUANT_ARGS="--fp8-param-gather \
+            --reuse-grad-buf-for-mxfp8-param-ag \
+            --fp8-recipe mxfp8 \
+            --fp8-format hybrid \
+            --fp8-amax-history-len 1024 \
+            --fp8-amax-compute-algo max"
+        elif [[ "${USE_FP4:-1}" == "1" ]]; then
+            QUANT_ARGS="--first-last-layers-bf16 \
+            --num-layers-at-start-in-bf16 0 \
+            --num-layers-at-end-in-bf16 14 \
+            --fp4-format e2m1 \
+            --fp4-recipe nvfp4"
+        else
+            QUANT_ARGS=""
+        fi
+
+        # Checkpoint directory — node-local /tmp inside the container.
+        # Shards are not cross-node accessible; intentional for one-time shm validation.
+        CKPT_DIR=/tmp/ckpt_${SLURM_JOB_ID}
+        mkdir -p ${CKPT_DIR}
+
+        pushd $MEGATRON_PATH
+        LAUNCHER_CMD="python3"
+        WORKLOAD_CMD=${MEGATRON_PATH}/pretrain_mamba.py
+        WORKLOAD_ARGS=" \
+            --exit-duration-in-mins 40 \
+            --exit-interval 100 \
+            --distributed-timeout-minutes 30 \
+            --distributed-timeout-seconds-after-init 1800 \
+            --disable-gloo-process-groups \
+            --mock-data \
+            --data-cache-path /datacache \
+            --no-create-attention-mask-in-dataloader \
+            --no-mmap-bin-files \
+            --tokenizer-type NullTokenizer \
+            --tiktoken-pattern v2 \
+            --vocab-size 128000 \
+            --micro-batch-size 1 \
+            --global-batch-size 32 \
+            --train-samples 12207031 \
+            --adam-beta1 0.9 \
+            --adam-beta2 0.95 \
+            --lr 4.5e-4 \
+            --min-lr 4.5e-6 \
+            --lr-decay-style WSD \
+            --lr-warmup-samples 24414063 \
+            --lr-decay-samples 3048706055 \
+            --lr-wsd-decay-style minus_sqrt \
+            --lr-wsd-decay-samples 610351563 \
+            --weight-decay 0.1 \
+            --clip-grad 1.0 \
+            --override-opt_param-scheduler \
+            --use-mcore-models \
+            --spec megatron.core.models.mamba.mamba_layer_specs mamba_stack_spec \
+            --is-hybrid-model \
+            --mamba-num-heads 128 \
+            --num-layers 88 \
+            --hidden-size 4096 \
+            --ffn-hidden-size 2688 \
+            --num-attention-heads 32 \
+            --group-query-attention \
+            --num-query-groups 2 \
+            --kv-channels 128 \
+            --hybrid-override-pattern MEMEMEM*EMEMEMEM*EMEMEMEM*EMEMEMEMEM*EMEMEMEMEM*EMEMEMEMEM*EMEMEMEMEM*EMEMEMEM*EMEMEMEME \
+            --position-embedding-type none \
+            --normalization RMSNorm \
+            --untie-embeddings-and-output-weights \
+            --init-method-std 0.014 \
+            --disable-bias-linear \
+            --squared-relu \
+            --use-fused-weighted-squared-relu \
+            --seq-length 8192 \
+            --max-position-embeddings 8192 \
+            --num-experts 512 \
+            --moe-router-topk 22 \
+            --moe-router-topk-scaling-factor 5.0 \
+            --moe-router-score-function sigmoid \
+            --moe-router-enable-expert-bias \
+            --moe-router-dtype fp32 \
+            --moe-router-load-balancing-type seq_aux_loss \
+            --moe-aux-loss-coeff 1e-4 \
+            ${MOE_DISPATCHER_ARGS} \
+            --moe-grouped-gemm \
+            --moe-permute-fusion \
+            --moe-latent-size 1024 \
+            --moe-shared-expert-intermediate-size 5376 \
+            --calculate-per-token-loss \
+            --bf16 \
+            ${QUANT_ARGS} \
+            --attention-dropout 0.0 \
+            --hidden-dropout 0.0 \
+            --sequence-parallel \
+            --use-distributed-optimizer \
+            $([[ "${USE_OVERLAP_COMM}" == "1" ]] && echo "--overlap-grad-reduce --overlap-param-gather") \
+            --ddp-num-buckets 10 \
+            --ddp-pad-buckets-for-high-nccl-busbw \
+            --high-priority-stream-groups ep \
+            --tensor-model-parallel-size 4 \
+            --pipeline-model-parallel-size 1 \
+            --expert-model-parallel-size 32 \
+            --expert-tensor-parallel-size 1 \
+            --cross-entropy-loss-fusion \
+            --cross-entropy-fusion-impl native \
+            --attention-backend flash \
+            ${CUDA_GRAPH_ARGS} \
+            --te-rng-tracker \
+            --manual-gc \
+            --manual-gc-interval 10 \
+            --num-workers 1 \
+            --eval-interval 1000 \
+            --eval-iters 14 \
+            --log-interval 1 \
+            --log-params-norm \
+            --log-num-zeros-in-grad \
+            --log-timers-to-tensorboard \
+            --log-memory-to-tensorboard \
+            --log-throughput \
+            --log-energy \
+            --log-memory-interval 500 \
+            --logging-level 10 \
+            --timing-log-option minmax \
+            --check-weight-hash-across-dp-replicas-interval 20000 \
+            --tensorboard-dir /tensorboard \
+            --local-rank ${SLURM_LOCALID} \
+            --save ${CKPT_DIR} \
+            --save-interval 10 \
+            --ckpt-format torch_dist \
+            --ckpt-fully-parallel-save \
+            --ckpt-assume-constant-structure \
+            --async-save \
+            --use-persistent-ckpt-worker \
+            $([[ "${USE_CPU_SHM}" == "1" ]] && echo "--async-ckpt-use-cpu-shm") \
+        "
+        $LAUNCHER_CMD $WORKLOAD_CMD $WORKLOAD_ARGS
+    '
+log_msg "END main_workload"
+
+log_msg "END SBATCH"
+
+set +x
diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/pools/n3_super_8n_16n.pool b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/pools/n3_super_8n_16n.pool
new file mode 100644
index 00000000..1d700863
--- /dev/null
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/pools/n3_super_8n_16n.pool
@@ -0,0 +1,40 @@
+# n3_super_8n_16n.pool — fault-injection pool for Nemotron-3 Super (TP=4, EP=32)
+# Minimum scale: 8 nodes (32 ranks; EP=32 needs at least 32 ranks)
+# Maximum scale: 16 nodes (64 ranks)
+#
+# Rank coverage per node count (4 GPUs/node):
+#   8 nodes  → 32 ranks:  rank-0=0, rank-1=1, mid=16, last=31
+#   16 nodes → 64 ranks:  rank-0=0, rank-1=1, mid=32, last=63
+#
+# NOTE: 16-node jobs require ~20 min for NCCL init + CUDA graph capture before iter 1.
+# With 5-min watchdog timeout after fault + FR dumps, total is ~30+ min.
+# Use TIME=00:45:00 (set in workloads.conf) to avoid SLURM wall-time kills.
+#
+# Format: FAULT_TYPE:RANK:ITER:NODES  (one per line, # comments ignored)
+# GPU faults — highest priority; rank sweep across both node counts
+GPU_SLEEP:1:5:8
+GPU_SLEEP:0:5:8
+GPU_SLEEP:16:5:8
+GPU_SLEEP:31:5:8
+GPU_SLEEP:1:5:16
+GPU_SLEEP:32:5:16
+GPU_ERROR:1:5:8
+GPU_ERROR:0:5:8
+GPU_ERROR:16:5:8
+GPU_ERROR:1:5:16
+# Crash faults
+SIGKILL:1:5:8
+SIGKILL:0:5:8
+SIGKILL:1:5:16
+SEGFAULT:1:5:8
+OS_ABORT:1:5:8
+# Python-level hangs
+LOCK_GIL:1:5:8
+LOCK_GIL:0:5:8
+# Application exceptions
+WORKLOAD_EXC:1:5:8
+ASYNC_EXC:1:5:8
+# Signal-based
+SIGTERM:1:5:8
+SIGINT:1:5:8
+SIGNAL_EXC:1:5:8
diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/prepare_node_alloc.sh b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/prepare_node_alloc.sh
new file mode 100755
index 00000000..f2c90a64
--- /dev/null
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/prepare_node_alloc.sh
@@ -0,0 +1,209 @@
+#!/bin/bash
+# prepare_node_alloc.sh
+# Submit fault-injection experiments from a prioritized pool, 2 jobs at a time,
+# waiting for each pair to complete before submitting the next pair.
+# This limits peak filesystem stress to 2 concurrent jobs while still covering
+# the full experiment matrix end-to-end in one unattended run.
+#
+# Pool ordering: GPU-related faults first (higher attribution coverage priority),
+# then crash faults, Python-level hangs, and signal-based faults.
+# Each tier covers node counts 2→4→8 and sweeps rank-0, rank-1, mid, and last.
+#
+# Usage:
+#   bash scripts/prepare_node_alloc.sh
+#   WORKLOAD=llama4_scout TIME=00:45:00 bash scripts/prepare_node_alloc.sh
+#
+# WORKLOAD selects the job script and base experiments dir from scripts/workloads.conf.
+# Override POOL (space-separated FAULT_TYPE:RANK:ITER:NODES) to run a custom set.
+# Override SBATCH_SCRIPT or BASE_EXPERIMENTS_DIR directly to bypass workloads.conf.
+#
+# Validated only with Megatron-LM workloads that emit [MEGATRON_FAULT] markers
+# and write logs / FR dumps using the directory layout expected by
+# watch_and_analyze.sh.
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+WORKLOADS_CONF="${SCRIPT_DIR}/workloads.conf"
+
+# ── Workload resolution from workloads.conf ────────────────────────────────────
+# If WORKLOAD is set, look it up in workloads.conf and derive SBATCH_SCRIPT and
+# BASE_EXPERIMENTS_DIR from it (unless those are already set explicitly).
+if [[ -n "${WORKLOAD:-}" ]]; then
+    if [[ ! -f "${WORKLOADS_CONF}" ]]; then
+        echo "ERROR: workloads.conf not found at ${WORKLOADS_CONF}" >&2
+        exit 1
+    fi
+    _CONF_LINE=$(grep -E "^${WORKLOAD}\s" "${WORKLOADS_CONF}" | grep -v "^#" | head -1 || true)
+    if [[ -z "${_CONF_LINE}" ]]; then
+        echo "ERROR: workload '${WORKLOAD}' not found in ${WORKLOADS_CONF}" >&2
+        echo "Available workloads:" >&2
+        grep -v "^#\|^$" "${WORKLOADS_CONF}" | awk '{print "  " $1 "  —  " $4}' >&2
+        exit 1
+    fi
+    _CONF_SCRIPT=$(echo "${_CONF_LINE}" | awk '{print $2}')
+    _CONF_BASE=$(echo "${_CONF_LINE}"   | awk '{print $3}')
+    _CONF_DESC=$(echo "${_CONF_LINE}"   | awk '{print $4}')
+    _CONF_POOL=$(echo "${_CONF_LINE}"   | awk '{print $5}')
+    _CONF_TIME=$(echo "${_CONF_LINE}"   | awk '{print $6}')
+    # Only set if not already overridden in the environment
+    SBATCH_SCRIPT="${SBATCH_SCRIPT:-${SCRIPT_DIR}/${_CONF_SCRIPT}}"
+    BASE_EXPERIMENTS_DIR="${BASE_EXPERIMENTS_DIR:-${_CONF_BASE}}"
+    if [[ -n "${_CONF_TIME}" && "${_CONF_TIME}" != "-" ]]; then
+        TIME="${TIME:-${_CONF_TIME}}"
+    fi
+    # Load workload-specific pool file if POOL not already set and pool file is specified
+    if [[ -z "${POOL:-}" && -n "${_CONF_POOL}" && "${_CONF_POOL}" != "-" ]]; then
+        _POOL_FILE="${SCRIPT_DIR}/pools/${_CONF_POOL}"
+        if [[ -f "${_POOL_FILE}" ]]; then
+            POOL=$(grep -v "^#\|^$" "${_POOL_FILE}" | tr '\n' ' ')
+            echo ">>> Pool:     ${_POOL_FILE}"
+        else
+            echo "WARN: pool file ${_POOL_FILE} not found, using built-in default pool" >&2
+        fi
+    fi
+    echo ">>> Workload: ${WORKLOAD}  (${_CONF_DESC//_/ })"
+fi
+
+ACCOUNT="${ACCOUNT:-root}"
+PARTITION="${PARTITION:-gb-nvl-134-135}"
+GPUS_PER_NODE="${GPUS_PER_NODE:-4}"
+TIME="${TIME:-00:30:00}"
+BATCH_SIZE="${BATCH_SIZE:-2}"
+POLL_INTERVAL="${POLL_INTERVAL:-30}"
+BASE_EXPERIMENTS_DIR="${BASE_EXPERIMENTS_DIR:-/home/sbak/experiments/llama4-scout-gb200}"
+
+# ---------------------------------------------------------------------------
+# Fault pool — ordered by priority (GPU-related first, then crash, then other)
+# Format: FAULT_TYPE:RANK:ITER:NODES
+#
+# Rank coverage per node count (4 GPUs/node):
+#   2 nodes →  8 ranks:  rank-0, rank-1, mid=4,  last=7
+#   4 nodes → 16 ranks:  rank-0, rank-1, mid=8,  last=15
+#   8 nodes → 32 ranks:  rank-0, rank-1, mid=16, last=31
+# ---------------------------------------------------------------------------
+DEFAULT_POOL="
+GPU_SLEEP:1:5:2   GPU_SLEEP:0:5:2
+GPU_SLEEP:4:5:2   GPU_SLEEP:7:5:2
+GPU_SLEEP:1:5:4   GPU_SLEEP:0:5:4
+GPU_SLEEP:8:5:4   GPU_SLEEP:15:5:4
+GPU_SLEEP:1:5:8   GPU_SLEEP:0:5:8
+GPU_SLEEP:16:5:8  GPU_SLEEP:31:5:8
+GPU_ERROR:1:5:2   GPU_ERROR:0:5:2
+GPU_ERROR:1:5:4   GPU_ERROR:0:5:4
+GPU_ERROR:1:5:8   GPU_ERROR:0:5:8
+SIGKILL:1:5:2     SIGKILL:0:5:2
+SIGKILL:1:5:4     SIGKILL:1:5:8
+SEGFAULT:1:5:2    SEGFAULT:0:5:2
+SEGFAULT:1:5:4    OS_ABORT:1:5:2
+LOCK_GIL:1:5:2    LOCK_GIL:0:5:2
+WORKLOAD_EXC:1:5:2 ASYNC_EXC:1:5:2
+SIGTERM:1:5:2     SIGINT:1:5:2
+SIGSTOP:1:5:2     SIGNAL_EXC:1:5:2
+"
+
+# Flatten pool into an array by word splitting (entries are whitespace-separated)
+POOL=(${POOL:-$DEFAULT_POOL})
+
+SBATCH_SCRIPT="${SBATCH_SCRIPT:-${SCRIPT_DIR}/l4_gb200_reduced.sh}"
+SESSION_TAG="$(date +%Y%m%d_%H%M%S)"
+SESSION_DIR="${BASE_EXPERIMENTS_DIR}/fault_injection/${SESSION_TAG}"
+TRACKING_FILE="${SESSION_DIR}/experiments.tsv"
+
+mkdir -p "${SESSION_DIR}"
+printf "JOB_ID\tFAULT_TYPE\tRANK\tITER\tNODES\tEXPERIMENT_DIR\n" > "${TRACKING_FILE}"
+
+TOTAL=${#POOL[@]}
+echo ">>> Fault-injection pool: ${TOTAL} experiments, ${BATCH_SIZE} at a time"
+echo ">>> Script:    ${SBATCH_SCRIPT}"
+echo ">>> Partition: ${PARTITION}  GPUs/node: ${GPUS_PER_NODE}  Time: ${TIME}"
+echo ">>> Session:   ${SESSION_DIR}"
+echo ">>> Tracking:  ${TRACKING_FILE}"
+echo ""
+
+submit_one() {
+    # Submit one FAULT_TYPE:RANK:ITER:NODES entry, log it to TRACKING_FILE, echo the job ID.
+    local EXPERIMENT="$1" FAULT_TYPE RANK ITER NODES
+    IFS=':' read -r FAULT_TYPE RANK ITER NODES <<< "${EXPERIMENT}"
+
+    local EXPERIMENT_DIR="${SESSION_DIR}/n${NODES}_${FAULT_TYPE}_r${RANK}_i${ITER}"
+    mkdir -p "${EXPERIMENT_DIR}"/{logs/slurm,checkpoints,tensorboard}
+
+    local JOB_ID
+    JOB_ID=$(sbatch \
+        --account="${ACCOUNT}" \
+        --partition="${PARTITION}" \
+        --nodes="${NODES}" \
+        --ntasks-per-node="${GPUS_PER_NODE}" \
+        --gpus-per-node="${GPUS_PER_NODE}" \
+        --time="${TIME}" \
+        --exclusive \
+        --mem=0 \
+        --output="${EXPERIMENT_DIR}/logs/slurm/%j.launch.out" \
+        --error="${EXPERIMENT_DIR}/logs/slurm/%j.launch.err" \
+        --export=ALL,FAULT_TYPE="${FAULT_TYPE}",FAULT_RANK="${RANK}",FAULT_AT_ITER="${ITER}",GPUS_PER_NODE="${GPUS_PER_NODE}",EXPERIMENT_DIR="${EXPERIMENT_DIR}",BASE_EXPERIMENTS_DIR="${BASE_EXPERIMENTS_DIR}" \
+        --parsable \
+        "${SBATCH_SCRIPT}")
+    JOB_ID="${JOB_ID%%;*}"   # --parsable may print "jobid;cluster"; keep only the bare job ID
+
+    # Print to stderr so callers using $(...) capture only the job ID on stdout
+    printf "  submitted: %s rank=%-2s iter=%s nodes=%s -> job=%s\n" \
+        "${FAULT_TYPE}" "${RANK}" "${ITER}" "${NODES}" "${JOB_ID}" >&2
+    printf "%s\t%s\t%s\t%s\t%s\t%s\n" \
+        "${JOB_ID}" "${FAULT_TYPE}" "${RANK}" "${ITER}" "${NODES}" "${EXPERIMENT_DIR}" \
+        >> "${TRACKING_FILE}"
+    echo "${JOB_ID}"   # only the bare job ID goes to stdout
+}
+
+wait_for_jobs() {
+    local JOB_LIST="$1"   # comma-separated SLURM job IDs to poll
+    local LABEL="$2"      # human-readable batch label shown on the progress line
+    printf "  waiting for %s (%s) ..." "${LABEL}" "${JOB_LIST}"
+    while true; do
+        local REMAINING
+        # Count the jobs squeue still lists. If squeue fails (e.g. unknown job IDs), wc -l
+        # still prints 0, and || true keeps pipefail + set -e from aborting the script.
+        REMAINING=$(squeue -j "${JOB_LIST}" --noheader 2>/dev/null | wc -l || true)
+        if [[ "${REMAINING}" -eq 0 ]]; then
+            echo " done."
+            break
+        fi
+        printf " %ds" "${POLL_INTERVAL}"
+        sleep "${POLL_INTERVAL}"
+    done
+}
+
+ALL_SUBMITTED_JOBS=()
+BATCH_NUM=0
+i=0
+
+while [[ $i -lt ${TOTAL} ]]; do
+    BATCH_NUM=$((BATCH_NUM + 1))
+    BATCH_END=$((i + BATCH_SIZE))
+    [[ ${BATCH_END} -gt ${TOTAL} ]] && BATCH_END=${TOTAL}
+    BATCH_COUNT=$((BATCH_END - i))
+
+    echo ">>> Batch ${BATCH_NUM}: experiments $((i+1))–${BATCH_END} of ${TOTAL}"
+
+    BATCH_JOB_IDS=()
+    for ((b=i; b<BATCH_END; b++)); do
+        JID=$(submit_one "${POOL[$b]}")
+        BATCH_JOB_IDS+=("${JID}")
+        ALL_SUBMITTED_JOBS+=("${JID}")
+    done
+
+    BATCH_LIST=$(IFS=','; echo "${BATCH_JOB_IDS[*]}")
+    BATCH_LABEL=$(printf "%s " "${POOL[@]:$i:$BATCH_COUNT}")
+    wait_for_jobs "${BATCH_LIST}" "${BATCH_LABEL% }"
+
+    i=${BATCH_END}
+done
+
+echo ""
+ALL_JOB_LIST=$(IFS=','; echo "${ALL_SUBMITTED_JOBS[*]}")
+echo ">>> All ${TOTAL} experiments complete."
+echo ">>> Session:  ${SESSION_DIR}"
+echo ">>> Tracking: ${TRACKING_FILE}"
+echo ""
+echo ">>> Run analysis on all results:"
+echo "    bash scripts/watch_and_analyze.sh '${TRACKING_FILE}'"
diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/run_session.sh b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/run_session.sh
new file mode 100755
index 00000000..ca5251bc
--- /dev/null
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/run_session.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+# run_session.sh
+# End-to-end fault-injection session: submit all experiments from the pool
+# (2 at a time, waiting for each pair), then analyze every completed job and
+# produce a scored report.  Designed to be run unattended via nohup.
+#
+# Usage:
+#   nohup bash scripts/run_session.sh > /path/to/session.log 2>&1 &
+#   POOL="GPU_SLEEP:1:5:2 SIGKILL:1:5:4" nohup bash scripts/run_session.sh ...
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+WORKLOAD="${WORKLOAD:-llama4_scout}"
+
+# ---- Phase 1: submit and wait for all experiments ----
+echo "========================================"
+echo "PHASE 1: Fault injection"
+echo "========================================"
+WORKLOAD="${WORKLOAD}" bash "${SCRIPT_DIR}/prepare_node_alloc.sh"
+
+# prepare_node_alloc.sh prints the tracking file path; re-derive it the same way
+# (SESSION_TAG is the timestamp when prepare_node_alloc ran, which is a few seconds
+# before this line — find the newest session dir instead of recomputing the tag)
+BASE_EXPERIMENTS_DIR="${BASE_EXPERIMENTS_DIR:-/home/sbak/experiments/llama4-scout-gb200}"
+TRACKING_FILE=$(ls -td "${BASE_EXPERIMENTS_DIR}/fault_injection"/[0-9]* 2>/dev/null \
+    | head -1)/experiments.tsv
+
+if [[ ! -f "${TRACKING_FILE}" ]]; then
+    echo "ERROR: could not locate experiments.tsv in latest session dir" >&2
+    exit 1
+fi
+
+echo ""
+echo "========================================"
+echo "PHASE 2: Analysis"
+echo "Tracking: ${TRACKING_FILE}"
+echo "========================================"
+bash "${SCRIPT_DIR}/watch_and_analyze.sh" "${TRACKING_FILE}"
diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/score_attribution.py b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/score_attribution.py
new file mode 100644
index 00000000..15417a6c
--- /dev/null
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/score_attribution.py
@@ -0,0 +1,237 @@
+#!/usr/bin/env python3
+"""LLM-judge scorer for fault-injection attribution experiments.
+
+Uses the same ChatOpenAI / NVIDIA-inference-API setup as nvrx_logsage.py.
+Reads ground-truth fault parameters and the raw text outputs of nvrx_logsage
+and CollectiveAnalyzer, then asks a Sonnet/Opus judge to score each attribution
+dimension and return structured JSON.
+
+Usage (called by watch_and_analyze.sh):
+    python3 score_attribution.py \
+        --fault-type GPU_SLEEP --rank 0 --iter 5 --nodes 2 \
+        --log-output "$LOG_OUT" \
+        --fr-output  "$FR_OUT" \
+        [--model claude-sonnet-4-6] \
+        [--base-url https://inference-api.nvidia.com/v1]
+
+Stdout: one line of JSON with keys:
+    restart_correct, rank_primary, rank_any, fault_described, fr_rank_correct, notes
+"""
+
+import argparse
+import json
+import logging
+import sys
+from typing import Union
+
+from langchain_openai import ChatOpenAI
+
+sys.path.insert(0, str(__import__("pathlib").Path(__file__).resolve().parents[4]))
+from nvidia_resiliency_ext.attribution.api_keys import load_nvidia_api_key
+from nvidia_resiliency_ext.attribution.svc.config import DEFAULT_LLM_BASE_URL
+
+logger = logging.getLogger(__name__)
+
+# Default judge model — override with --model
+DEFAULT_JUDGE_MODEL = "azure/anthropic/claude-sonnet-4-6"
+
+# Expected restart decision and rationale per fault type
+_RESTART_TABLE = {
+    "GPU_SLEEP":    ("RESTART IMMEDIATE", "transient GPU hang, recoverable"),
+    "LOCK_GIL":     ("RESTART IMMEDIATE", "transient Python GIL hang, recoverable"),
+    "SIGTERM":      ("RESTART IMMEDIATE", "external termination signal, recoverable"),
+    "SIGINT":       ("RESTART IMMEDIATE", "external interrupt signal, recoverable"),
+    "SIGSTOP":      ("RESTART IMMEDIATE", "external stop signal, recoverable"),
+    "SIGNAL_EXC":   ("RESTART IMMEDIATE", "signal-based exception, typically recoverable"),
+    "GPU_ERROR":    ("STOP - DONT RESTART IMMEDIATE", "hardware GPU error, may be persistent"),
+    "SIGKILL":      ("STOP - DONT RESTART IMMEDIATE", "hard kill, possible external pressure or OOM"),
+    "SEGFAULT":     ("STOP - DONT RESTART IMMEDIATE", "segmentation fault, likely code or memory corruption"),
+    "OS_ABORT":     ("STOP - DONT RESTART IMMEDIATE", "OS abort, likely severe system or hardware fault"),
+    "WORKLOAD_EXC": ("STOP - DONT RESTART IMMEDIATE", "application exception, likely a code bug"),
+    "ASYNC_EXC":    ("STOP - DONT RESTART IMMEDIATE", "async exception in workload, likely a code bug"),
+}
+
+
+def load_log_excerpt(log_path, max_lines=400):
+    """Return the filtered tail (at most max_lines lines) of the log at log_path.
+
+    Mirrors nvrx_logsage.py:analyze_logs() exclude_nvrx_logs filtering, and also drops
+    [MEGATRON_FAULT] markers so the judge cannot read the injected rank/fault from the log.
+    Never raises: a falsy path or unreadable file yields a "(...)" placeholder string.
+    """
+    if not log_path:
+        return "(log file not provided)"
+    try:
+        try:
+            with open(log_path, "r", encoding="utf-8") as f:
+                lines = f.readlines()
+        except UnicodeDecodeError:
+            with open(log_path, "r", encoding="latin-1") as f:
+                lines = f.readlines()
+    except Exception as exc:
+        return f"(could not read log file: {exc})"
+    # Single pass over the log; "[workload:" lines survive only when they carry "Cycle:".
+    kept = [
+        line for line in lines
+        if "nvidia_resiliency_ext" not in line and "[MEGATRON_FAULT]" not in line
+        and ("[workload:" not in line or "Cycle:" in line)
+    ]
+    # Guard max_lines <= 0: kept[-0:] would be the whole list, not an empty tail.
+    return "".join(kept[-max_lines:] if max_lines > 0 else []).strip()
+
+
+def build_judge_prompt(fault_type, rank, iter_, nodes, run_valid, log_output, fr_output,
+                       log_excerpt, gpus_per_node=4):
+    """Build the judge prompt, or short-circuit with "N/A" scores for an invalid run.
+
+    Returns a dict when run_valid is false (the caller skips the LLM call), otherwise the
+    prompt string.  gpus_per_node defaults to 4, matching the example SBATCH_SCRIPT.
+    """
+    if not run_valid:
+        return {
+            "restart_correct": "N/A",
+            "rank_primary": "N/A",
+            "rank_any": "N/A",
+            "fault_described": "N/A",
+            "fr_rank_correct": "N/A",
+            "notes": "run_invalid: training did not reach the fault injection point; scores not meaningful",
+        }
+    total_ranks = nodes * gpus_per_node
+    expected_restart, restart_rationale = _RESTART_TABLE.get(
+        fault_type, ("unknown", "unknown fault type")
+    )
+    fr_section = "(no flight-recorder dumps available for this experiment)"
+    if fr_output and fr_output.strip() not in ("no_dumps", "no results", "run_invalid", ""):
+        fr_section = fr_output
+    log_section = log_excerpt.strip() or "(not provided)"
+
+    return f"""You are evaluating the accuracy of an AI-based fault attribution system for \
+distributed ML training.
+
+## Ground truth (injected fault)
+- Fault type : {fault_type}
+- Injected rank : {rank}  (global rank index, 0-based; total ranks = {total_ranks})
+- Injected at iteration : {iter_}
+- Cluster : {nodes} nodes × {gpus_per_node} GPUs = {total_ranks} total ranks
+
+## Expected correct behavior
+- restart_decision should be : {expected_restart}
+  Rationale: {restart_rationale}
+- Rank {rank} should appear in Primary issues as the root cause
+
+## Raw job log (filtered, last 400 lines)
+{log_section}
+
+## Log attribution output (from nvrx_logsage)
+{log_output if log_output.strip() else "(no log output — analyzer produced no output)"}
+
+## FR (flight recorder) analysis output (from CollectiveAnalyzer)
+{fr_section}
+
+## Scoring instructions
+Score each dimension below. Use only the values listed for each.
+
+1. **restart_correct** — Is the restart decision in the log output correct for {fault_type}?
+   Values: "true" | "false" | "N/A" (if log output is empty or unparseable)
+
+2. **rank_primary** — Is rank {rank} identified as the PRIMARY root cause (in Primary issues)?
+   Values: "true" | "false" | "partial" (rank mentioned but only as secondary/collateral)
+
+3. **rank_any** — Is rank {rank} mentioned anywhere in the log attribution output?
+   Values: "true" | "false"
+
+4. **fault_described** — Does the log output correctly describe the nature of the fault
+   (e.g., GPU hang, segfault, signal kill) appropriate for {fault_type}?
+   Values: "true" | "false" | "partial" (category right but specifics wrong)
+
+5. **fr_rank_correct** — Does the FR analysis output identify rank {rank} as a suspect?
+   Values: "true" | "false" | "no_dumps" (no FR dumps available)
+
+6. **notes** — One concise sentence summarizing the main gap or confirming correctness.
+
+Respond ONLY with a JSON object — no markdown, no explanation outside the JSON:
+{{
+  "restart_correct": "...",
+  "rank_primary": "...",
+  "rank_any": "...",
+  "fault_described": "...",
+  "fr_rank_correct": "...",
+  "notes": "..."
+}}"""
+
+
+def score(args):
+    """Score one experiment and return the verdict dict (keys listed in the module docstring).
+
+    The prompt is built first: for an invalid run build_judge_prompt() already returns the
+    "N/A" result, so it is returned before any API-key lookup or LLM client construction.
+    Raises ValueError if no API key is found; json.loads() raises if the judge's reply is
+    not valid JSON.
+    """
+    args.run_valid = str(args.run_valid).lower() == "true"  # str(): safe if already a bool
+    log_excerpt = load_log_excerpt(args.log_path) if args.log_path else ""
+    prompt_or_result = build_judge_prompt(
+        fault_type=args.fault_type,
+        rank=args.rank,
+        iter_=args.iter,
+        nodes=args.nodes,
+        run_valid=args.run_valid,
+        log_output=args.log_output,
+        fr_output=args.fr_output,
+        log_excerpt=log_excerpt,
+    )
+    # build_judge_prompt returns a dict directly for invalid runs (no LLM call needed)
+    if isinstance(prompt_or_result, dict):
+        return prompt_or_result
+
+    api_key = load_nvidia_api_key()
+    if not api_key:
+        raise ValueError(
+            "NVIDIA_API_KEY not found. Set NVIDIA_API_KEY env var, "
+            "NVIDIA_API_KEY_FILE, or create ~/.nvidia_api_key"
+        )
+    llm = ChatOpenAI(
+        model=args.model,
+        api_key=api_key,
+        base_url=args.base_url,
+        temperature=0.0,
+        max_completion_tokens=512,
+    )
+    text = llm.invoke(prompt_or_result).content.strip()
+
+    # Strip markdown code fences if present
+    if text.startswith("```"):
+        text = "\n".join(
+            line for line in text.splitlines() if not line.startswith("```")
+        ).strip()
+
+    return json.loads(text)
+
+
+def main():
+    """CLI entry point: parse arguments, run the judge, print one JSON line to stdout."""
+    cli = argparse.ArgumentParser(description="LLM judge for fault attribution scoring")
+    cli.add_argument("--fault-type", required=True, help="Injected fault type")
+    for flag, text in (("--rank", "Injected global rank"), ("--iter", "Injected iteration"),
+                       ("--nodes", "Node count")):
+        cli.add_argument(flag, type=int, required=True, help=text)
+    cli.add_argument("--run-valid", default="true",
+                     help="'true' if training reached the fault injection point, 'false' otherwise")
+    cli.add_argument("--log-path", default="", help="Path to the raw job log file")
+    cli.add_argument("--log-output", default="", help="Raw stdout from nvrx_logsage")
+    cli.add_argument("--fr-output", default="no_dumps", help="Raw text from CollectiveAnalyzer")
+    cli.add_argument("--model", default=DEFAULT_JUDGE_MODEL, help="Judge LLM model")
+    cli.add_argument("--base-url", default=DEFAULT_LLM_BASE_URL, help="API base URL")
+    opts = cli.parse_args()
+
+    try:
+        print(json.dumps(score(opts)))
+    except Exception as exc:
+        logger.warning("Judge failed: %s", exc)
+        print(json.dumps({"notes": f"judge_failed: {exc}"}))
+        sys.exit(0)  # non-fatal — caller handles missing keys gracefully
+
+
+if __name__ == "__main__":
+    if not logging.root.handlers:
+        logging.basicConfig(level=logging.WARNING)
+    main()
diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/watch_and_analyze.sh b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/watch_and_analyze.sh
new file mode 100755
index 00000000..8a5e3a4d
--- /dev/null
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/watch_and_analyze.sh
@@ -0,0 +1,202 @@
+#!/bin/bash
+# watch_and_analyze.sh
+# Poll SLURM for job completions from a fault-injection session tracking file,
+# run log-analysis and fr-analysis on each completed job, then call the LLM judge
+# (score_attribution.py) to score each attribution dimension.
+#
+# Usage:
+#   bash scripts/watch_and_analyze.sh <TRACKING_FILE>
+
+set -euo pipefail
+
+TRACKING_FILE="${1:?Usage: $0 <tracking_file.tsv>}"
+POLL_INTERVAL=30
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+SKILL_DIR="$(dirname "${SCRIPT_DIR}")"
+NVRX_SRC_DIR="$(cd "${SKILL_DIR}/../../.." && pwd)"
+
+LOGSAGE_PY="${SKILL_DIR}/log-analysis/scripts/nvrx_logsage.py"
+SCORE_PY="${SCRIPT_DIR}/score_attribution.py"
+
+# Ensure nvidia_resiliency_ext is importable from source tree
+export PYTHONPATH="${NVRX_SRC_DIR}${PYTHONPATH:+:$PYTHONPATH}"
+
+REPORT_FILE="${TRACKING_FILE%.tsv}_report.md"
+DONE_JOBS_FILE="${TRACKING_FILE%.tsv}_done.txt"
+
+touch "${DONE_JOBS_FILE}"
+# Resuming (done list and report both non-empty): keep the report, else its rows are lost.
+[[ -s "${DONE_JOBS_FILE}" && -s "${REPORT_FILE}" ]] || cat > "${REPORT_FILE}" <<'EOF'
+# Fault Injection Experiment Report
+
+| # | FAULT_TYPE | NODES | RANK | ITER | JOB_ID | STATE | run_valid | restart_correct | rank_primary | rank_any | fault_described | fr_rank_correct | judge_notes |
+|---|------------|-------|------|------|--------|-------|-----------|-----------------|--------------|----------|-----------------|-----------------|-------------|
+EOF
+
+echo ">>> Watching tracking file: ${TRACKING_FILE}"
+echo ">>> Report: ${REPORT_FILE}"
+echo ">>> Polling every ${POLL_INTERVAL}s ..."
+
+TOTAL=$(tail -n +2 "${TRACKING_FILE}" | wc -l)
+EXP_NUM=$(wc -l < "${DONE_JOBS_FILE}")  # 0 on a fresh run; continues numbering on resume
+
+while true; do
+    PENDING=0
+
+    while IFS=$'\t' read -r JOB_ID FAULT_TYPE RANK ITER NODES EXPERIMENT_DIR; do
+        # Skip already-analyzed jobs
+        if grep -q "^${JOB_ID}$" "${DONE_JOBS_FILE}" 2>/dev/null; then
+            continue
+        fi
+
+        # Check job state
+        STATE=$(scontrol show job "${JOB_ID}" 2>/dev/null \
+            | grep -oP 'JobState=\K\S+' || echo "UNKNOWN")
+
+        case "${STATE}" in
+            RUNNING|PENDING|COMPLETING)
+                PENDING=$((PENDING + 1))
+                continue
+                ;;
+            COMPLETED|FAILED|TIMEOUT|CANCELLED|NODE_FAIL)
+                ;;
+            *)
+                # Job left the queue — treat as done
+                ;;
+        esac
+
+        EXP_NUM=$((EXP_NUM + 1))
+        echo ""
+        echo ">>> [${EXP_NUM}/${TOTAL}] Analyzing: ${FAULT_TYPE} n=${NODES} rank=${RANK} iter=${ITER} job=${JOB_ID} state=${STATE}"
+
+        # ---- Log analysis ----
+        LOG_GLOB="${EXPERIMENT_DIR}/logs/slurm/${JOB_ID}.*.1.main_workload.log"
+        LOG_FILE=$(ls ${LOG_GLOB} 2>/dev/null | head -1 || true)
+        LOG_OUT=""
+
+        # ---- Check run validity: did the fault actually fire? ----
+        # The fault injection prints: [MEGATRON_FAULT] global_rank=RANK/...: injecting FAULT_TYPE at iteration ITER
+        RUN_VALID="false"
+        STRIPPED_LOG=""
+        if [[ -n "${LOG_FILE}" && -f "${LOG_FILE}" ]]; then
+            echo "    log: ${LOG_FILE}"
+            if grep -qF "[MEGATRON_FAULT]" "${LOG_FILE}" 2>/dev/null; then
+                RUN_VALID="true"
+            fi
+            echo "    run_valid: ${RUN_VALID}"
+
+            # Strip fault-injection markers so neither nvrx_logsage nor the judge
+            # can see which rank/fault was injected — evaluation must be fair.
+            # [MEGATRON_FAULT] lines are printed by Megatron's debug_fault_injection.py
+            # and are not covered by --exclude_nvrx_logs.
+            STRIPPED_LOG=$(mktemp /tmp/fi_log_stripped.XXXXXX)
+            grep -vF "[MEGATRON_FAULT]" "${LOG_FILE}" > "${STRIPPED_LOG}" 2>/dev/null || true
+
+            # nvrx_logsage.py prints 5 newline-joined fields to stdout:
+            #   line 1: restart_decision
+            #   line 2: error_explanation  (often empty)
+            #   line 3+: attribution_text  (multi-line, starts with "Attribution:")
+            #   then: additional_detail    (often empty)
+            #   last line: checkpoint_saved ("True" / "False")
+            LOG_OUT=$(python3 "${LOGSAGE_PY}" \
+                --log-path "${STRIPPED_LOG}" \
+                --exclude_nvrx_logs 2>/dev/null || echo "")
+            LOG_RESTART=$(echo "${LOG_OUT}" | head -1)
+            echo "    restart_decision: ${LOG_RESTART:-<empty>}"
+        else
+            echo "    WARN: no log file at ${LOG_GLOB}"
+            echo "    run_valid: false (no log)"
+        fi
+
+        # ---- FR analysis (only when run is valid) ----
+        FR_DIR="${EXPERIMENT_DIR}/checkpoints"
+        FR_OUT="no_dumps"
+        # Paths reach the inline Python via argv (not interpolation): a quote in a path cannot break it.
+        if [[ "${RUN_VALID}" == "true" ]] && ls "${FR_DIR}"/_dump_* 2>/dev/null | grep -q .; then
+            echo "    FR dumps: $(ls "${FR_DIR}"/_dump_* 2>/dev/null | wc -l) files"
+            FR_OUT=$(python3 -c "
+import sys, logging
+logging.basicConfig(level=logging.WARNING)
+sys.path.insert(0, sys.argv[1])
+from nvidia_resiliency_ext.attribution.trace_analyzer.fr_attribution import CollectiveAnalyzer
+try:
+    ca = CollectiveAnalyzer({'fr_path': sys.argv[2]})
+    results = ca.run_sync({'fr_path': sys.argv[2]})
+    if results:
+        result_data = results[0]
+        if isinstance(result_data, dict):
+            text = result_data.get('analysis_text', '')
+            ranks = result_data.get('hanging_ranks', '')
+            if text:
+                print(text)
+            if ranks:
+                print(ranks)
+        else:
+            print(str(result_data))
+    else:
+        print('no results')
+except Exception as e:
+    print('error: ' + str(e), file=sys.stderr)
+    print('no_dumps')
+" "${NVRX_SRC_DIR}" "${FR_DIR}" 2>/dev/null || echo "no_dumps")
+        elif [[ "${RUN_VALID}" == "false" ]]; then
+            FR_OUT="run_invalid"
+            echo "    FR analysis skipped (run did not reach fault injection point)"
+        fi
+
+        # ---- LLM judge scoring ----
+        echo "    scoring with judge..."
+        SCORE_JSON=$(python3 "${SCORE_PY}" \
+            --fault-type "${FAULT_TYPE}" \
+            --rank "${RANK}" \
+            --iter "${ITER}" \
+            --nodes "${NODES}" \
+            --run-valid "${RUN_VALID}" \
+            --log-path "${STRIPPED_LOG:-}" \
+            --log-output "${LOG_OUT}" \
+            --fr-output "${FR_OUT}" 2>/dev/null || echo '{"notes":"judge_failed"}')
+
+        # Clean up temp stripped log
+        [[ -n "${STRIPPED_LOG}" && -f "${STRIPPED_LOG}" ]] && rm -f "${STRIPPED_LOG}"
+
+        _get() { echo "${SCORE_JSON}" | python3 -c \
+            "import sys,json; d=json.load(sys.stdin); print(d.get('$1','N/A'))" 2>/dev/null || echo "N/A"; }
+
+        RESTART_CORRECT=$(_get restart_correct)
+        RANK_PRIMARY=$(_get rank_primary)
+        RANK_ANY=$(_get rank_any)
+        FAULT_DESC=$(_get fault_described)
+        FR_RANK=$(_get fr_rank_correct)
+        # Notes are free-form LLM text: flatten newlines and escape "|" so the Markdown row stays intact
+        JUDGE_NOTES=$(_get notes | tr '\n' ' ' | sed -e 's/|/\\|/g' -e 's/ *$//')
+        echo "    run_valid=${RUN_VALID}  restart_correct=${RESTART_CORRECT}  rank_primary=${RANK_PRIMARY}  rank_any=${RANK_ANY}  fault_described=${FAULT_DESC}  fr_rank=${FR_RANK}"
+        echo "    judge: ${JUDGE_NOTES}"
+
+        # Append to report
+        printf "| %d | %s | %s | %s | %s | %s | %s | %s | %s | %s | %s | %s | %s | %s |\n" \
+            "${EXP_NUM}" "${FAULT_TYPE}" "${NODES}" "${RANK}" "${ITER}" \
+            "${JOB_ID}" "${STATE}" "${RUN_VALID}" \
+            "${RESTART_CORRECT}" "${RANK_PRIMARY}" "${RANK_ANY}" \
+            "${FAULT_DESC}" "${FR_RANK}" \
+            "${JUDGE_NOTES}" >> "${REPORT_FILE}"
+
+        echo "${JOB_ID}" >> "${DONE_JOBS_FILE}"
+
+    done < <(tail -n +2 "${TRACKING_FILE}")
+
+    DONE_COUNT=$(wc -l < "${DONE_JOBS_FILE}")
+    echo "$(date '+%H:%M:%S') >>> ${DONE_COUNT}/${TOTAL} done, ${PENDING} still running"
+
+    if [[ ${DONE_COUNT} -ge ${TOTAL} ]]; then
+        break
+    fi
+
+    sleep "${POLL_INTERVAL}"
+done
+
+echo ""
+echo ">>> All ${TOTAL} experiments analyzed."
+echo ">>> Report: ${REPORT_FILE}"
+echo ""
+cat "${REPORT_FILE}"
diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/workloads.conf b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/workloads.conf
new file mode 100644
index 00000000..7cea1674
--- /dev/null
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/workloads.conf
@@ -0,0 +1,17 @@
+# workloads.conf — fault-injection workload registry
+#
+# Each non-comment line defines one workload:
+#   NAME  SCRIPT  BASE_EXPERIMENTS_DIR  DESCRIPTION  POOL_FILE  TIME
+#
+# NAME                 : identifier passed as WORKLOAD=<name> to prepare_node_alloc.sh
+# SCRIPT               : path to the sbatch job script (relative to the scripts/ dir)
+# BASE_EXPERIMENTS_DIR : root directory for all experiment output (logs, checkpoints, etc.)
+# DESCRIPTION          : free-form human-readable label (no spaces; use underscores)
+# POOL_FILE            : (optional) pool file under scripts/pools/ to use as default pool
+#                        when POOL env var is not set; "-" means use the built-in default pool
+# TIME                 : (optional) default wall-clock limit per job (HH:MM:SS);
+#                        "-" means use the TIME env var or prepare_node_alloc.sh default (00:30:00)
+#
+# Fields are whitespace-separated. Lines starting with # are ignored.
+
+llama4_scout  l4_gb200_reduced.sh   /home/sbak/experiments/llama4-scout-gb200  Llama4-Scout_(reduced_layers)_on_GB200     -                    -

From df5f5c740867c5f02745316cfb6c9d98d364ea33 Mon Sep 17 00:00:00 2001
From: Seonmyeong Bak <sbak@nvidia.com>
Date: Wed, 22 Apr 2026 11:24:12 -0700
Subject: [PATCH 02/21] chore(skills): remove extraneous nvrx-attr artifacts

---
 .../nvrx-attr/SESSION_REPORT_20260409_13.md   | 137 -------
 .../skills/nvrx-attr/l4_gb200_reduced.sh      | 363 -----------------
 .../nvrx-attr/scripts/n3_super_gb200.sh       | 166 --------
 .../scripts/n3_super_gb200_shm_test.sh        | 369 ------------------
 .../scripts/pools/n3_super_8n_16n.pool        |  40 --
 5 files changed, 1075 deletions(-)
 delete mode 100644 src/nvidia_resiliency_ext/skills/nvrx-attr/SESSION_REPORT_20260409_13.md
 delete mode 100644 src/nvidia_resiliency_ext/skills/nvrx-attr/l4_gb200_reduced.sh
 delete mode 100644 src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/n3_super_gb200.sh
 delete mode 100755 src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/n3_super_gb200_shm_test.sh
 delete mode 100644 src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/pools/n3_super_8n_16n.pool

diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/SESSION_REPORT_20260409_13.md b/src/nvidia_resiliency_ext/skills/nvrx-attr/SESSION_REPORT_20260409_13.md
deleted file mode 100644
index 657cbbd5..00000000
--- a/src/nvidia_resiliency_ext/skills/nvrx-attr/SESSION_REPORT_20260409_13.md
+++ /dev/null
@@ -1,137 +0,0 @@
-# Fault Injection Session Report — April 9–13, 2026
-
-## Summary
-
-End-to-end validation of the fault-injection attribution pipeline across 48 experiments.
-Identified and fixed three pipeline bugs, confirmed FR analysis is solid, and isolated the
-remaining attribution gap to a single issue: **logsage returns RESTART IMMEDIATE for
-crash/exception-type faults that should be STOP**.
-
----
-
-## Pipeline Fixes Applied
-
-| File | Fix |
-|---|---|
-| `trace_analyzer/capture.py` | `capture_logs()` now saves/restores logger level and lowers it to INFO — previously, root logger at WARNING silently dropped all `logger.info()` calls inside the capture block, producing empty `analysis_text` from `CollectiveAnalyzer` |
-| `trace_analyzer/fr_attribution.py` | `main()` now prints `analysis_text` + `hanging_ranks` to stdout (was discarding results) |
-| `scripts/watch_and_analyze.sh` | FR inline Python block: import from installed package (not local skill copy), correctly extract `analysis_text`/`hanging_ranks` from returned dict, redirect stderr to `/dev/null` instead of mixing into FR output |
-| `scripts/score_attribution.py` | **New file** — LLM judge (Claude Sonnet) that scores 5 attribution dimensions per experiment and returns structured JSON |
-
----
-
-## Experiment Sessions
-
-### Session 1 — Mini-batch validation (Apr 9, `20260409_160245`)
-
-6 experiments: GPU_SLEEP×2, GPU_ERROR×2, SIGKILL×1, SIGTERM×1 — all 2-node.
-Purpose: confirm pipeline works end-to-end after fixes.
-
-| # | FAULT_TYPE | RANK | restart | rank_p | rank_a | fault_d | fr_rank |
-|---|---|---|---|---|---|---|---|
-| 1 | GPU_SLEEP | 1 | ✅ | ✅ | ✅ | ✅ | ✅ |
-| 2 | GPU_SLEEP | 0 | ✅ | ✅ | ✅ | partial | ✅ |
-| 3 | GPU_ERROR | 1 | ❌ | ❌ | ❌ | partial | ✅ |
-| 4 | GPU_ERROR | 0 | ❌ | ❌ | ❌ | partial | ✅ |
-| 5 | SIGKILL | 1 | ❌ | ✅ | ✅ | partial | ✅ |
-| 6 | SIGTERM | 1 | ✅ | ❌ | ❌ | partial | ✅ |
-
-FR analysis: 6/6 correct. Pipeline confirmed working.
-
----
-
-### Session 2 — Full default pool (Apr 9, `20260409_170603`)
-
-34 experiments across all fault types and node counts (2/4/8 nodes).
-
-**Infrastructure issue:** 18/34 jobs failed at container startup due to a pyxis/enroot
-`nvidia-container-cli ldcache` error on certain compute nodes:
-
-```
-nvidia-container-cli: ldcache error: process /usr/sbin/ldconfig.real failed with error code: 1
-[ERROR] /etc/enroot/hooks.d/98-nvidia.sh exited with return code 1
-pyxis: couldn't start container
-rm: cannot remove '/usr/local/cuda/compat/lib': Read-only file system
-```
-
-The CUDA compat overlay was not being applied on those nodes — `ldconfig` could not write its
-cache inside the read-only squashfs container. These jobs produced no FR dumps and their logs
-contained only the container error, which logsage misattributed as a disk/storage fault.
-The issue was transient and node-specific; jobs submitted the next day ran cleanly.
-
-**Clean-run results (16/34):** see full table in
-`/home/sbak/experiments/llama4-scout-gb200/fault_injection/20260409_170603/experiments_report.md`
-
-Aggregate for clean-run jobs:
-
-| FAULT_TYPE | N (clean) | restart% | rank_primary% | fr_rank% |
-|---|---|---|---|---|
-| GPU_SLEEP | 5 | 80% | 40% | 60% |
-| GPU_ERROR | 4 | 0% | 25% | 75% |
-| SIGKILL | 3 | 33% | 33% | 100% |
-| OS_ABORT | 1 | 0% | 0% | 100% |
-
----
-
-### Session 3 — SEGFAULT cluster health check (Apr 10, `20260410_135216`)
-
-2 experiments: SEGFAULT rank=0 and rank=1, 2-node. Purpose: confirm cluster healthy after
-the Apr 9 enroot issue.
-
-| # | FAULT_TYPE | RANK | restart | rank_p | rank_a | fault_d | fr_rank |
-|---|---|---|---|---|---|---|---|
-| 1 | SEGFAULT | 1 | ❌ | ✅ | ✅ | ✅ | ✅ |
-| 2 | SEGFAULT | 0 | ❌ | ✅ | ✅ | ✅ | ✅ |
-
-Cluster healthy (both COMPLETED, 7 FR dumps each). Rank and fault description correct;
-restart decision wrong (RESTART instead of STOP).
-
----
-
-### Session 4 — Python fault types (Apr 10, `20260410_143501`)
-
-4 experiments: LOCK_GIL×2, WORKLOAD_EXC×1, ASYNC_EXC×1 — all 2-node.
-These were skipped in the full session due to the enroot issue.
-
-| # | FAULT_TYPE | RANK | restart | rank_p | rank_a | fault_d | fr_rank |
-|---|---|---|---|---|---|---|---|
-| 1 | LOCK_GIL | 1 | ✅ | ✅ | ✅ | partial | ✅ |
-| 2 | LOCK_GIL | 0 | ✅ | ✅ | ✅ | partial | ✅ |
-| 3 | WORKLOAD_EXC | 1 | ❌ | ✅ | ✅ | partial | ❌ (rank 7) |
-| 4 | ASYNC_EXC | 1 | ❌ | ❌ | ❌ | false | ✅ |
-
-Note on WORKLOAD_EXC FR result: FR flagged rank 7 instead of rank 1. When a rank throws an
-application exception and crashes, the last rank detected as missing by NCCL's collective
-timeout isn't necessarily the originating rank — FR is identifying the symptom rank.
-
----
-
-## Attribution Quality Summary (clean runs only)
-
-| Dimension | Assessment |
-|---|---|
-| **FR rank identification** | Solid — correctly identified the hanging rank in all clean-run experiments where NCCL completed enough to produce dumps. The `capture_logs()` fix was the key enabler. |
-| **Log rank identification** | Good for hang types (GPU_SLEEP, LOCK_GIL); weaker for crash/signal types where all ranks see a simultaneous NCCL timeout masking the originator. FR compensates for this gap. |
-| **Restart decision** | ✅ Correct for hang/recoverable types: GPU_SLEEP, LOCK_GIL, SIGTERM. ❌ Wrong for crash/exception types: GPU_ERROR, SIGKILL, SEGFAULT, WORKLOAD_EXC, ASYNC_EXC — logsage consistently returns RESTART IMMEDIATE when the correct decision is STOP. |
-| **Fault description** | Consistently `partial` — logsage describes the observable NCCL collective timeout symptom, not the underlying injected fault (GPU hang, kill signal, exception). This is expected given the log contains only symptoms. |
-
----
-
-## Open Gap
-
-**Single actionable fix:** logsage restart decision for crash/exception-type faults.
-
-Logsage sees the same NCCL collective timeout pattern whether the root cause is a recoverable
-GPU hang or a hard crash (SIGKILL, SEGFAULT, CUDA error, application exception). It needs
-keyword-based fast-path rules to detect crash signals before the LLM runs:
-
-| Fault type | Expected | Currently returns |
-|---|---|---|
-| GPU_ERROR | STOP | RESTART IMMEDIATE |
-| SIGKILL | STOP | RESTART IMMEDIATE |
-| SEGFAULT | STOP | RESTART IMMEDIATE |
-| WORKLOAD_EXC | STOP | RESTART IMMEDIATE |
-| ASYNC_EXC | STOP | RESTART IMMEDIATE |
-| OS_ABORT | STOP | RESTART IMMEDIATE |
-
-Target file: `attribution/log_analyzer/nvrx_logsage.py`
diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/l4_gb200_reduced.sh b/src/nvidia_resiliency_ext/skills/nvrx-attr/l4_gb200_reduced.sh
deleted file mode 100644
index 5c903e7a..00000000
--- a/src/nvidia_resiliency_ext/skills/nvrx-attr/l4_gb200_reduced.sh
+++ /dev/null
@@ -1,363 +0,0 @@
-#!/bin/bash
-
-#SBATCH --account=root
-#SBATCH --partition=gb-nvl-134-135
-#SBATCH --time=00:30:00
-
-#SBATCH --job-name=llama4-scout-gb200
-#SBATCH --output=/tmp/slurm-%j.launch.out
-#SBATCH --error=/tmp/slurm-%j.launch.err
-
-#SBATCH --nodes=2
-#SBATCH --ntasks-per-node=4
-#SBATCH --gpus-per-node=4
-#SBATCH --exclusive
-#SBATCH --mem=0
-
-log_msg() {
-    local msg="$1"
-    UNIX_DATETIME=$(date +%s)
-    HUMAN_DATETIME=$(date -d "@$UNIX_DATETIME" '+%Y-%m-%d %H:%M:%S.%3N')
-    echo ">>> ${msg} ${UNIX_DATETIME} (${HUMAN_DATETIME})"
-}
-
-log_msg "START SBATCH"
-echo "Running on nodes: ${SLURM_NODELIST}"
-export RITS_PLATFORM_TYPE=gb200
-export RITS_GPUS_PER_NODE=4
-export RITS_NVL_DOMAIN_SIZE=72
-export NCCL_IB_DISABLE=0
-export NCCL_NET_GDR_LEVEL=3
-export RITS_CLUSTER_NAME=nvl72
-export PYXIS_LOG_LEVEL=debug
-export NCCL_IB_SL=1
-export NCCL_IB_TIMEOUT=19
-export UB_TIMEOUT=720
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-export NVTE_FWD_LAYERNORM_SM_MARGIN=16
-export NVTE_BWD_LAYERNORM_SM_MARGIN=16
-export NCCL_P2P_NET_CHUNKSIZE=2097152
-export NCCL_DEBUG=WARN
-export PYTHONUNBUFFERED=1
-export ONE_LOGGER_JOB_CATEGORY=test
-export LOGLEVEL=DEBUG
-export TORCHINDUCTOR_WORKER_START=fork
-export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
-export TORCH_CPP_LOG_LEVEL=INFO
-export TORCH_NCCL_TRACE_BUFFER_SIZE=2000
-export TORCH_NCCL_RETHROW_CUDA_ERRORS=0
-export TORCH_NCCL_ENABLE_MONITORING=1
-export TORCH_NCCL_DUMP_ON_TIMEOUT=1
-export TORCH_NCCL_LOG_CPP_STACK_ON_UNCLEAN_SHUTDOWN=0
-export TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC=20
-export TORCH_DIST_INIT_BARRIER=0
-export TORCH_INCLUDE_STACK_TRACE=0
-export TORCH_INCLUDE_ONLY_ACTIVE=1
-export TORCH_NCCL_EXTRA_DUMP_ON_EXEC=1
-
-# Checkpoint settings (overridable via sbatch --export)
-export DIST_TIMEOUT_AFTER_INIT="${DIST_TIMEOUT_AFTER_INIT:-1800}"
-# USE_ASYNC_CKPT=1: enable async checkpointing every CKPT_SAVE_INTERVAL iters
-export USE_ASYNC_CKPT="${USE_ASYNC_CKPT:-0}"
-export CKPT_SAVE_INTERVAL="${CKPT_SAVE_INTERVAL:-100}"
-export USE_CPU_SHM="${USE_CPU_SHM:-1}"
-
-# Quantization mode (overridable via sbatch --export)
-export USE_FP8="${USE_FP8:-1}"
-export USE_FP4="${USE_FP4:-0}"
-
-# Overlap comm (overridable via sbatch --export)
-export USE_OVERLAP_COMM="${USE_OVERLAP_COMM:-0}"
-
-# Node / task geometry (SLURM_NNODES is set by SLURM from --nodes override)
-export GPUS_PER_NODE="${GPUS_PER_NODE:-4}"
-TOTAL_TASKS=$((SLURM_NNODES * GPUS_PER_NODE))
-
-# Per-experiment output directory (overridable via sbatch --export)
-export BASE_EXPERIMENTS_DIR="${BASE_EXPERIMENTS_DIR:-/home/sbak/experiments/llama4-scout-gb200}"
-export EXPERIMENT_DIR="${EXPERIMENT_DIR:-${BASE_EXPERIMENTS_DIR}/ckpt_test/n${SLURM_NNODES}}"
-
-mkdir -p ${BASE_EXPERIMENTS_DIR}/datacache
-mkdir -p ${EXPERIMENT_DIR}/tensorboard
-
-: "${SLURM_RESTART_COUNT:=0}"
-
-LOG_DIR=${EXPERIMENT_DIR}/logs
-mkdir -p ${LOG_DIR}
-echo "Writing logs to ${LOG_DIR}"
-LOG_FILE_BASE="${LOG_DIR}/slurm/${SLURM_JOB_ID}.${SLURM_RESTART_COUNT}"
-
-# ── Shared-tmp directory (NFS, for cross-srun-step communication) ─────────────
-# Mounted to /shared_tmp (NOT /tmp) so the container keeps its native fast /tmp.
-SHARED_TMP_HOST=/home/sbak/tmp/${SLURM_JOB_ID}
-mkdir -p ${SHARED_TMP_HOST}
-
-# ── Pre-populate .myenv with all variables that must reach the container ───────
-# Pyxis env forwarding is unreliable for vars set via sbatch --export; writing
-# them into .myenv guarantees the inner bash picks them up via `source`.
-MYENV_FILE=${SHARED_TMP_HOST}/.myenv_${SLURM_JOB_ID}.sh
-cat > ${MYENV_FILE} << MYENVEOF
-# Auto-generated by l4_gb200_reduced.sh — do not edit by hand.
-export DIST_TIMEOUT_AFTER_INIT=${DIST_TIMEOUT_AFTER_INIT}
-export USE_ASYNC_CKPT=${USE_ASYNC_CKPT}
-export CKPT_SAVE_INTERVAL=${CKPT_SAVE_INTERVAL}
-export USE_CPU_SHM=${USE_CPU_SHM}
-export USE_FP8=${USE_FP8}
-export USE_FP4=${USE_FP4}
-export USE_OVERLAP_COMM=${USE_OVERLAP_COMM}
-# Prepend local nvrx src so container picks up our changes without a pip install step.
-export PYTHONPATH=/home/sbak/nvidia-resiliency-ext/src:\${PYTHONPATH}
-MYENVEOF
-
-# Mounts
-LUSTRE=/home:/home
-SHARED_TMP=${SHARED_TMP_HOST}:/shared_tmp
-LOGS=${EXPERIMENT_DIR}/logs:/logs
-MEGATRON_REPO=/home/sbak/megatron-lm-main:/megatron-lm_repo
-DATACACHE=${BASE_EXPERIMENTS_DIR}/datacache:/datacache
-TENSORBOARD=${EXPERIMENT_DIR}/tensorboard:/tensorboard
-WORKSPACE=/home/sbak/tmp:/workspace
-FR_DUMP=${EXPERIMENT_DIR}/flight_recorder:/flight_recorder
-mkdir -p ${EXPERIMENT_DIR}/flight_recorder
-CONTAINER_MOUNTS=$LUSTRE,$SHARED_TMP,$LOGS,$MEGATRON_REPO,$DATACACHE,$TENSORBOARD,$WORKSPACE,$FR_DUMP
-
-# ── Disk cleanup: remove stale enroot containers from prior jobs ──────────────
-log_msg "START disk_cleanup"
-srun \
-    --label \
-    --ntasks-per-node=1 \
-    --ntasks=${SLURM_NNODES} \
-    --kill-on-bad-exit=0 \
-    --mpi=none \
-    bash -c '
-        ENROOT_DIR="/var/lib/enroot/data/$(id -u)"
-        rm -rf "${ENROOT_DIR:?}"/* 2>/dev/null || true
-        echo "$(hostname): / $(df -h / | tail -1 | awk "{print \$3\" used, \"\$4\" avail\"}")"
-    '
-log_msg "END disk_cleanup"
-
-# all node setup
-#--------------------------------
-log_msg "START all_node_setup"
-srun \
-    --label \
-    --container-mounts ${CONTAINER_MOUNTS} \
-    --container-image /home/sbak/mcore_ci_0415.sqsh \
-    --container-name ${SLURM_JOB_ID} \
-    --container-workdir / \
-    --exclusive \
-    --error=${LOG_FILE_BASE}.0.all_node_setup.log \
-    --output=${LOG_FILE_BASE}.0.all_node_setup.log \
-    --ntasks-per-node=1 \
-    --ntasks=${SLURM_NNODES} \
-    --kill-on-bad-exit=0 \
-    --mpi=none \
-    bash -c '
-        # Use a per-node NFS path so all ranks on each node find the right clone.
-        MEGATRON_PATH=/shared_tmp/megatron_$(hostname)_${SLURM_JOB_ID}
-        mkdir -p ${MEGATRON_PATH}
-        pushd $MEGATRON_PATH
-        CURRENT_BRANCH=$(git -C /megatron-lm_repo branch --show-current)
-        echo "Cloning Megatron branch $CURRENT_BRANCH to ${MEGATRON_PATH}"
-        git clone --single-branch --branch $CURRENT_BRANCH /megatron-lm_repo .
-        popd
-    '
-log_msg "END all_node_setup"
-
-# main workload
-#--------------------------------
-log_msg "START main_workload"
-srun \
-    --label \
-    --container-mounts ${CONTAINER_MOUNTS} \
-    --container-image /home/sbak/mcore_ci_0415.sqsh \
-    --container-name ${SLURM_JOB_ID} \
-    --container-workdir / \
-    --error=${LOG_FILE_BASE}.1.main_workload.log \
-    --output=${LOG_FILE_BASE}.1.main_workload.log \
-    --ntasks-per-node=${GPUS_PER_NODE} \
-    --ntasks=${TOTAL_TASKS} \
-    --kill-on-bad-exit=0 \
-    --mpi=none \
-    bash -c '
-        source /shared_tmp/.myenv_${SLURM_JOB_ID}.sh
-
-        # Match the per-node path used in all_node_setup.
-        MEGATRON_PATH=/shared_tmp/megatron_$(hostname)_${SLURM_JOB_ID}
-
-        NFS_TRITON_CACHE=/home/sbak/experiments/llama4-scout-gb200/triton_cache
-        NFS_INDUCTOR_CACHE=/home/sbak/experiments/llama4-scout-gb200/inductor_cache
-
-        # Per-rank Triton/inductor cache on the container native /tmp (local fast storage).
-        export TRITON_CACHE_DIR=/tmp/triton_${SLURM_LOCALID}
-        export TORCHINDUCTOR_CACHE_DIR=/tmp/inductor_${SLURM_LOCALID}
-        mkdir -p ${TRITON_CACHE_DIR} ${TORCHINDUCTOR_CACHE_DIR}
-
-        # Pre-stage: warm local cache from NFS (one rank per node)
-        if [[ "${SLURM_LOCALID}" == "0" ]]; then
-            if [[ -d "${NFS_TRITON_CACHE}" ]]; then
-                echo "Pre-staging triton cache from NFS..."
-                rsync -a --ignore-existing "${NFS_TRITON_CACHE}/" "${TRITON_CACHE_DIR}/" 2>/dev/null || true
-            fi
-            if [[ -d "${NFS_INDUCTOR_CACHE}" ]]; then
-                echo "Pre-staging inductor cache from NFS..."
-                rsync -a --ignore-existing "${NFS_INDUCTOR_CACHE}/" "${TORCHINDUCTOR_CACHE_DIR}/" 2>/dev/null || true
-            fi
-        fi
-
-        # Post-stage: write back to NFS on exit (one rank per node)
-        _stage_back() {
-            if [[ "${SLURM_LOCALID}" == "0" ]]; then
-                echo "Staging triton cache back to NFS..."
-                mkdir -p "${NFS_TRITON_CACHE}" "${NFS_INDUCTOR_CACHE}"
-                rsync -a --ignore-existing "${TRITON_CACHE_DIR}/" "${NFS_TRITON_CACHE}/" 2>/dev/null || true
-                rsync -a --ignore-existing "${TORCHINDUCTOR_CACHE_DIR}/" "${NFS_INDUCTOR_CACHE}/" 2>/dev/null || true
-                echo "Cache staged back."
-            fi
-        }
-        trap _stage_back EXIT
-
-        # Checkpoint directory — node-local /tmp (cleaned up by the cleanup job).
-        CKPT_DIR=/tmp/ckpt_${SLURM_JOB_ID}
-        mkdir -p ${CKPT_DIR}
-
-        if [[ "${USE_FP8:-1}" == "1" ]]; then
-            QUANT_ARGS="--fp8-format hybrid \
-            --fp8-recipe delayed \
-            --fp8-param-gather \
-            --fp8-amax-history-len 1024 \
-            --fp8-amax-compute-algo max \
-            --fp8-margin 0"
-        elif [[ "${USE_FP4:-0}" == "1" ]]; then
-            QUANT_ARGS="--fp4-format e2m1 \
-            --fp4-recipe nvfp4"
-        else
-            QUANT_ARGS=""
-        fi
-
-        if [[ "${USE_OVERLAP_COMM:-0}" == "1" ]]; then
-            OVERLAP_ARGS="--overlap-grad-reduce --overlap-param-gather"
-        else
-            OVERLAP_ARGS=""
-        fi
-
-        # Build checkpoint args (controlled by USE_ASYNC_CKPT from .myenv).
-        # No --load: we only want to test save here.
-        CKPT_SAVE_ARGS=""
-        if [[ "${USE_ASYNC_CKPT}" == "1" ]]; then
-            CKPT_SAVE_ARGS="--save ${CKPT_DIR} --save-interval ${CKPT_SAVE_INTERVAL} --async-save --use-persistent-ckpt-worker --use-dist-ckpt --ckpt-fully-parallel-save --ckpt-assume-constant-structure $([[ "${USE_CPU_SHM}" == "1" ]] && echo "--async-ckpt-use-cpu-shm")"
-        fi
-
-        pushd $MEGATRON_PATH
-        LAUNCHER_CMD="python3"
-        LAUNCHER_ARGS=" \
-        "
-        WORKLOAD_CMD=${MEGATRON_PATH}/pretrain_gpt.py
-        WORKLOAD_ARGS=" \
-            --exit-duration-in-mins 5750 \
-            --distributed-timeout-minutes 10 \
-            --disable-gloo-process-groups \
-            --mock-data \
-            --data-cache-path /datacache \
-            --no-create-attention-mask-in-dataloader \
-            --no-mmap-bin-files \
-            --tokenizer-type NullTokenizer \
-            --tiktoken-pattern v2 \
-            --tokenizer-model /lustre/fsw/portfolios/llmservice/projects/llmservice_nlp_fm/nemotron6/tokenizers/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json \
-            --micro-batch-size 1 \
-            --global-batch-size 64 \
-            --train-samples 10240000 \
-            --adam-beta1 0.9 \
-            --adam-beta2 0.95 \
-            --adam-eps 1e-05 \
-            --lr-decay-style cosine \
-            --lr-warmup-samples 1024000 \
-            --lr-decay-samples 20480000 \
-            --lr 0.0003 \
-            --min-lr 2.9999999999999997e-05 \
-            --weight-decay 0.1 \
-            --clip-grad 1.0 \
-            --loss-scale 1.0 \
-            --use-mcore-models \
-            --untie-embeddings-and-output-weights \
-            --disable-bias-linear \
-            --attention-backend flash \
-            --transformer-impl transformer_engine \
-            --position-embedding-type rope \
-            --rotary-base 500000 \
-            --rotary-interleaved \
-            --use-rope-scaling \
-            --rope-scaling-factor 8.0 \
-            --no-rope-fusion \
-            --no-rope-freq 4 \
-            --use-flash-attn \
-            --cross-entropy-fusion-impl te \
-            --cross-entropy-loss-fusion \
-            --seq-length 8192 \
-            --max-position-embeddings 8192 \
-            --num-layers 12 \
-            --swiglu \
-            --hidden-size 5120 \
-            --num-attention-heads 40 \
-            --group-query-attention \
-            --num-query-groups 8 \
-            --ffn-hidden-size 16384 \
-            --kv-channels 128 \
-            --normalization RMSNorm \
-            --attention-dropout 0.0 \
-            --hidden-dropout 0.0 \
-            --grad-reduce-in-bf16 \
-            --qk-l2-norm \
-            --num-experts 16 \
-            --moe-layer-freq 1 \
-            --moe-ffn-hidden-size 8192 \
-            --moe-shared-expert-intermediate-size 8192 \
-            --moe-router-topk 1 \
-            --moe-router-score-function sigmoid \
-            --moe-token-dispatcher-type alltoall \
-            --moe-grouped-gemm \
-            --moe-shared-expert-overlap \
-            --moe-router-bias-update-rate 0.001 \
-            --moe-router-load-balancing-type aux_loss \
-            --moe-aux-loss-coeff 0.01 \
-            --moe-router-enable-expert-bias \
-            --moe-apply-probs-on-input \
-            --moe-router-force-load-balancing \
-            --bf16 \
-            ${QUANT_ARGS} \
-            --te-rng-tracker \
-            --sequence-parallel \
-            --use-distributed-optimizer \
-            ${OVERLAP_ARGS} \
-            --ddp-num-buckets 5 \
-            --tensor-model-parallel-size 1 \
-            --pipeline-model-parallel-size 1 \
-            --expert-model-parallel-size 8 \
-            --expert-tensor-parallel-size 1 \
-            --ddp-average-in-collective \
-            --log-interval 1 \
-            --timing-log-option minmax \
-            --log-params-norm \
-            --log-num-zeros-in-grad \
-            --log-throughput \
-            --check-weight-hash-across-dp-replicas-interval 20000 \
-            --tensorboard-dir /tensorboard \
-            --logging-level 10 \
-            --eval-iters 14 \
-            --eval-interval 2000 \
-            --manual-gc \
-            --manual-gc-interval 100 \
-            --num-workers 1 \
-            --local-rank ${SLURM_LOCALID} \
-            --context-parallel-size 1 \
-            --vocab-size 238600 \
-            --distributed-timeout-seconds-after-init ${DIST_TIMEOUT_AFTER_INIT} \
-            --flight-recorder-dump-path /flight_recorder \
-        "
-        $LAUNCHER_CMD $LAUNCHER_ARGS $WORKLOAD_CMD $WORKLOAD_ARGS $CKPT_SAVE_ARGS
-    '
-log_msg "END main_workload"
-
-log_msg "END SBATCH"
-
-set +x
diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/n3_super_gb200.sh b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/n3_super_gb200.sh
deleted file mode 100644
index 1147dda6..00000000
--- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/n3_super_gb200.sh
+++ /dev/null
@@ -1,166 +0,0 @@
-ENV_VARS:
-  NVTE_FWD_LAYERNORM_SM_MARGIN: 16
-  NVTE_BWD_LAYERNORM_SM_MARGIN: 16
-  TORCHINDUCTOR_WORKER_START: fork
-  QUANTIZATION_TYPE_DEBUG: 1
-  PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True
-  NUM_OF_HYBRID_EP_RANKS_PER_NVLINK_DOMAIN: 64
-  USE_MNNVL: 1
-TEST_TYPE: "release"
-MODEL_ARGS:
-  # Distributed args
-  --tensor-model-parallel-size: 2
-  --pipeline-model-parallel-size: 1
-  --expert-model-parallel-size: 64
-  --expert-tensor-parallel-size: 1
-  --use-distributed-optimizer: true
-  --overlap-grad-reduce: true
-  --overlap-param-gather: true
-  --sequence-parallel: true
-  --ddp-num-buckets: 10
-  --ddp-pad-buckets-for-high-nccl-busbw: true
-  --high-priority-stream-groups: ep
-  --distributed-timeout-minutes: 10
-  --disable-gloo-process-groups: true
-
-  # Training args
-  --micro-batch-size: 1
-  --global-batch-size: 3072
-  --train-samples: 12207031
-  --cross-entropy-loss-fusion: true
-  --cross-entropy-fusion-impl: native
-  --attention-backend: flash
-  --enable-cuda-graph: true
-  --cuda-graph-scope: mamba attn moe_router
-  --te-rng-tracker: true
-  --manual-gc: true
-  --manual-gc-interval: 10
-  --no-create-attention-mask-in-dataloader: true
-  --num-workers: 1
-  --exit-interval: 51000
-  --override-opt_param-scheduler: true
-
-  # Network size args
-  --use-mcore-models: true
-  --spec: megatron.core.models.mamba.mamba_layer_specs mamba_stack_spec
-  --is-hybrid-model: true
-  --mamba-num-heads: 128
-  --num-layers: 88
-  --hidden-size: 4096
-  --ffn-hidden-size: 2688
-  --num-attention-heads: 32
-  --group-query-attention: true
-  --num-query-groups: 2
-  --kv-channels: 128
-  --hybrid-override-pattern: MEMEMEM*EMEMEMEM*EMEMEMEM*EMEMEMEMEM*EMEMEMEMEM*EMEMEMEMEM*EMEMEMEMEM*EMEMEMEM*EMEMEMEME
-  --position-embedding-type: none
-  --normalization: RMSNorm
-  --untie-embeddings-and-output-weights: true
-  --init-method-std: 0.014
-  --disable-bias-linear: true
-  --squared-relu: true
-  --use-fused-weighted-squared-relu: true
-
-  # Data args
-  --seq-length: 8192
-  --max-position-embeddings: 8192
-  --data-path: ${DATA_BLEND}
-  --data-cache-path: ${DATA_CACHE_PATH}
-  --tiktoken-pattern: v2
-  --tokenizer-type: ${TOKENIZER_TYPE}
-  --tokenizer-model: ${TOKENIZER_MODEL_PATH}
-  --no-mmap-bin-files: true
-
-  # MoE args
-  --num-experts: 512
-  --moe-router-topk: 22
-  --moe-router-topk-scaling-factor: 5.0
-  --moe-router-score-function: sigmoid
-  --moe-router-enable-expert-bias: true
-  --moe-router-dtype: fp32
-  --moe-router-load-balancing-type: seq_aux_loss
-  --moe-aux-loss-coeff: 1e-4
-  --moe-token-dispatcher-type: flex
-  --moe-flex-dispatcher-backend: hybridep
-  --moe-hybridep-num-sms: 32
-  --moe-grouped-gemm: true
-  --moe-permute-fusion: true
-  --moe-latent-size: 1024
-  --moe-shared-expert-intermediate-size: 5376
-  --moe-shared-expert-compute-before-router: true
-
-  # MTP args
-  --mtp-spec: megatron.core.models.mamba.mamba_layer_specs mamba_stack_spec
-  --mtp-num-layers: 2
-  --mtp-hybrid-override-pattern: \"*E\"
-  --calculate-per-token-loss: true
-  --mtp-loss-scaling-factor: 0.3
-
-  # Mixed precision / quantization args
-  --bf16: true
-  --keep-mtp-spec-in-bf16: true
-  --keep-mamba-stack-attention-linear-in-bf16: true
-  --keep-mamba-out-proj-in-mxfp8: true
-  --keep-moe-latent-projections-in-bf16: true
-  --first-last-layers-bf16: true
-  --num-layers-at-start-in-bf16: 0
-  --num-layers-at-end-in-bf16: 14
-  --fp4-format: e2m1
-  --fp4-recipe: nvfp4
-
-  # Regularization args
-  --attention-dropout: 0.0
-  --hidden-dropout: 0.0
-  --clip-grad: 1.0
-  --weight-decay: 0.1
-
-  # Learning rate args
-  --lr: 4.5e-4
-  --min-lr: 4.5e-6
-  --lr-decay-style: WSD
-  --lr-warmup-samples: 24414063
-  --lr-decay-samples: 3048706055
-  --lr-wsd-decay-style: minus_sqrt
-  --lr-wsd-decay-samples: 610351563
-  --adam-beta1: 0.9
-  --adam-beta2: 0.95
-
-  # Checkpointing args
-  --save: ${CHECKPOINT_SAVE_PATH}
-  --load: ${CHECKPOINT_LOAD_PATH}
-  --ckpt-format: torch_dist
-  --ckpt-fully-parallel-save: true
-  --ckpt-fully-parallel-load: true
-  --ckpt-assume-constant-structure: true
-  --async-save: true
-  --use-persistent-ckpt-worker: true
-  --save-interval: 1000
-  --save-retain-interval: 5000
-
-  # Validation args
-  --eval-interval: 1000
-  --eval-iters: 14
-
-  # Logging args
-  --log-interval: 100
-  --log-params-norm: true
-  --log-num-zeros-in-grad: true
-  --log-timers-to-tensorboard: true
-  --log-memory-to-tensorboard: true
-  --log-throughput: true
-  --log-progress: true
-  --log-energy: true
-  --log-memory-interval: 500
-  --logging-level: 20
-  --timing-log-option: minmax
-  --check-weight-hash-across-dp-replicas-interval: 20000
-  --tensorboard-dir: ${TENSORBOARD_PATH}
-  --wandb-project: megatron-core-release-runs
-  --wandb-entity: adlr
-  --wandb-exp-name: ${WANDB_EXPERIMENT}
-  --wandb-save-dir: ${WANDB_SAVE_PATH}
-METRICS:
-  - "iteration-time"
-  - "lm loss"
-  - "mem-allocated-bytes"
-  - "mem-max-allocated-bytes"
diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/n3_super_gb200_shm_test.sh b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/n3_super_gb200_shm_test.sh
deleted file mode 100755
index 673965f2..00000000
--- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/n3_super_gb200_shm_test.sh
+++ /dev/null
@@ -1,369 +0,0 @@
-#!/bin/bash
-# n3_super_gb200_shm_test.sh — one-time validation: Nemotron Super 8N with async cpu-shm ckpt.
-# Model/infra config mirrors n3_super_gb200_fi.sh. No fault injection.
-# Checkpoints to node-local /tmp (discardable — not cross-node accessible).
-
-#SBATCH --account=root
-#SBATCH --partition=gb-nvl-134-135
-#SBATCH --time=00:45:00
-
-#SBATCH --job-name=n3-super-shm-test
-#SBATCH --output=/tmp/slurm-%j.launch.out
-#SBATCH --error=/tmp/slurm-%j.launch.err
-
-#SBATCH --nodes=8
-#SBATCH --ntasks-per-node=4
-#SBATCH --gpus-per-node=4
-#SBATCH --exclusive
-#SBATCH --mem=0
-
-log_msg() {
-    local msg="$1"
-    UNIX_DATETIME=$(date +%s)
-    HUMAN_DATETIME=$(date -d "@$UNIX_DATETIME" '+%Y-%m-%d %H:%M:%S.%3N')
-    echo ">>> ${msg} ${UNIX_DATETIME} (${HUMAN_DATETIME})"
-}
-
-log_msg "START SBATCH"
-echo "Running on nodes: ${SLURM_NODELIST}"
-
-# ── Platform / NCCL / RITS ────────────────────────────────────────────────────
-export RITS_PLATFORM_TYPE=gb200
-export RITS_GPUS_PER_NODE=4
-export RITS_NVL_DOMAIN_SIZE=72
-export NCCL_IB_DISABLE=0
-export NCCL_NET_GDR_LEVEL=3
-export RITS_CLUSTER_NAME=nvl72
-export PYXIS_LOG_LEVEL=debug
-export NCCL_IB_SL=1
-export NCCL_IB_TIMEOUT=19
-export UB_TIMEOUT=720
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-export NCCL_P2P_NET_CHUNKSIZE=2097152
-export NCCL_DEBUG=WARN
-
-# ── PyTorch / TE / inductor (from n3_super_gb200.sh ENV_VARS) ─────────────────
-export NVTE_FWD_LAYERNORM_SM_MARGIN=16
-export NVTE_BWD_LAYERNORM_SM_MARGIN=16
-export TORCHINDUCTOR_WORKER_START=fork
-export QUANTIZATION_TYPE_DEBUG=1
-export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
-export USE_MNNVL=1
-
-# ── DeepEP (hybridep MoE routing) — set USE_DEEPEP=0 to use alltoall instead ──
-USE_DEEPEP="${USE_DEEPEP:-1}"
-if [[ "${USE_DEEPEP}" == "1" ]]; then
-    export NUM_OF_HYBRID_EP_RANKS_PER_NVLINK_DOMAIN=32
-fi
-
-# ── Logging / debugging ───────────────────────────────────────────────────────
-export PYTHONUNBUFFERED=1
-export ONE_LOGGER_JOB_CATEGORY=test
-export LOGLEVEL=DEBUG
-export TORCH_CPP_LOG_LEVEL=INFO
-export TORCH_NCCL_TRACE_BUFFER_SIZE=2000
-export TORCH_NCCL_RETHROW_CUDA_ERRORS=0
-export TORCH_NCCL_ENABLE_MONITORING=1
-export TORCH_NCCL_DUMP_ON_TIMEOUT=1
-export TORCH_NCCL_LOG_CPP_STACK_ON_UNCLEAN_SHUTDOWN=0
-export TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC=30
-export TORCH_DIST_INIT_BARRIER=0
-export TORCH_INCLUDE_STACK_TRACE=0
-export TORCH_INCLUDE_ONLY_ACTIVE=1
-export TORCH_NCCL_EXTRA_DUMP_ON_EXEC=1
-
-# ── CUDA graph ────────────────────────────────────────────────────────────────
-export ENABLE_CUDA_GRAPH="${ENABLE_CUDA_GRAPH:-1}"
-
-# ── Quantization mode: set USE_FP8=1 to use FP8, USE_FP4=1 for FP4 (default) ─
-# Only one may be active at a time.
-export USE_FP4="${USE_FP4:-0}"
-export USE_FP8="${USE_FP8:-1}"
-
-# ── Async checkpoint shm mode (default on) ────────────────────────────────────
-export USE_CPU_SHM="${USE_CPU_SHM:-1}"
-
-# ── Overlap comm (default off) ────────────────────────────────────────────────
-export USE_OVERLAP_COMM="${USE_OVERLAP_COMM:-0}"
-
-# ── Node / task geometry ─────────────────────────────────────────────────────
-export GPUS_PER_NODE=4
-TOTAL_TASKS=$((SLURM_NNODES * GPUS_PER_NODE))
-
-# ── Per-experiment output directory ───────────────────────────────────────────
-export BASE_EXPERIMENTS_DIR="${BASE_EXPERIMENTS_DIR:-/home/sbak/experiments/n3-super-gb200}"
-export EXPERIMENT_DIR="${EXPERIMENT_DIR:-${BASE_EXPERIMENTS_DIR}/shm_test}"
-
-mkdir -p ${BASE_EXPERIMENTS_DIR}/datacache
-mkdir -p ${EXPERIMENT_DIR}/tensorboard
-
-: "${SLURM_RESTART_COUNT:=0}"
-
-LOG_DIR=${EXPERIMENT_DIR}/logs
-mkdir -p ${LOG_DIR}
-echo "Writing logs to ${LOG_DIR}"
-LOG_FILE_BASE="${LOG_DIR}/slurm/${SLURM_JOB_ID}.${SLURM_RESTART_COUNT}"
-
-# ── Container mounts ──────────────────────────────────────────────────────────
-LUSTRE=/home:/home
-SHARED_TMP=/home/sbak/tmp/${SLURM_JOB_ID}:/shared_tmp
-LOGS=${EXPERIMENT_DIR}/logs:/logs
-MEGATRON_REPO=/home/sbak/megatron-lm-main:/megatron-lm_repo
-DATACACHE=${BASE_EXPERIMENTS_DIR}/datacache:/datacache
-TENSORBOARD=${EXPERIMENT_DIR}/tensorboard:/tensorboard
-WORKSPACE=/home/sbak/tmp:/workspace
-# No /checkpoints mount — saves go to node-local /tmp inside the container.
-CONTAINER_MOUNTS=$LUSTRE,$SHARED_TMP,$LOGS,$MEGATRON_REPO,$DATACACHE,$TENSORBOARD,$WORKSPACE
-mkdir -p /home/sbak/tmp/${SLURM_JOB_ID}
-
-# ── Disk cleanup: remove stale enroot containers from prior jobs ──────────────
-log_msg "START disk_cleanup"
-srun \
-    --label \
-    --ntasks-per-node=1 \
-    --ntasks=${SLURM_NNODES} \
-    --kill-on-bad-exit=0 \
-    --mpi=none \
-    bash -c '
-        ENROOT_DIR="/var/lib/enroot/data/$(id -u)"
-        rm -rf "${ENROOT_DIR:?}"/* 2>/dev/null || true
-        echo "$(hostname): / $(df -h / | tail -1 | awk "{print \$3\" used, \"\$4\" avail\"}")"
-    '
-log_msg "END disk_cleanup"
-
-# ── All-node setup: clone Megatron into a per-node tmpdir ─────────────────────
-log_msg "START all_node_setup"
-srun \
-    --label \
-    --container-mounts ${CONTAINER_MOUNTS} \
-    --container-image /home/sbak/mcore_ci_040825.sqsh \
-    --container-name ${SLURM_JOB_ID} \
-    --container-workdir / \
-    --error=${LOG_FILE_BASE}.0.all_node_setup.log \
-    --output=${LOG_FILE_BASE}.0.all_node_setup.log \
-    --ntasks-per-node=1 \
-    --ntasks=${SLURM_NNODES} \
-    --kill-on-bad-exit=0 \
-    --mpi=none \
-    bash -c '
-        MEGATRON_PATH=/shared_tmp/megatron_${SLURM_NODEID}
-        rm -rf "${MEGATRON_PATH}"
-        mkdir -p "${MEGATRON_PATH}"
-        pushd $MEGATRON_PATH
-        CURRENT_BRANCH=$(git -C /megatron-lm_repo branch --show-current)
-        echo "Cloning Megatron branch $CURRENT_BRANCH into $MEGATRON_PATH"
-        git clone --single-branch --branch $CURRENT_BRANCH /megatron-lm_repo .
-        popd
-
-        # Install local nvidia-resiliency-ext so container picks up src changes.
-        uv pip install -e /home/sbak/nvidia-resiliency-ext
-    '
-log_msg "END all_node_setup"
-
-# ── Main workload ─────────────────────────────────────────────────────────────
-log_msg "START main_workload"
-srun \
-    --label \
-    --container-mounts ${CONTAINER_MOUNTS} \
-    --container-image /home/sbak/mcore_ci_040825.sqsh \
-    --container-name ${SLURM_JOB_ID} \
-    --container-workdir / \
-    --error=${LOG_FILE_BASE}.1.main_workload.log \
-    --output=${LOG_FILE_BASE}.1.main_workload.log \
-    --ntasks-per-node=${GPUS_PER_NODE} \
-    --ntasks=${TOTAL_TASKS} \
-    --kill-on-bad-exit=0 \
-    --mpi=none \
-    bash -c '
-        MEGATRON_PATH=/shared_tmp/megatron_${SLURM_NODEID}
-
-        NFS_TRITON_CACHE=/home/sbak/experiments/n3-super-gb200/triton_cache
-        NFS_INDUCTOR_CACHE=/home/sbak/experiments/n3-super-gb200/inductor_cache
-        TRITON_READY=/tmp/.triton_ready_${SLURM_JOB_ID}
-
-        export TRITON_CACHE_DIR=/tmp/triton_${SLURM_LOCALID}
-        export TORCHINDUCTOR_CACHE_DIR=/tmp/inductor_${SLURM_LOCALID}
-
-        if [[ "${SLURM_LOCALID}" == "0" ]]; then
-            if [[ -d "${NFS_TRITON_CACHE}" ]] && [[ -n "$(ls -A ${NFS_TRITON_CACHE} 2>/dev/null)" ]]; then
-                TRITON_CACHE_WAS_WARM=1
-            else
-                TRITON_CACHE_WAS_WARM=0
-            fi
-            for r in $(seq 0 $((GPUS_PER_NODE - 1))); do
-                mkdir -p /tmp/triton_${r} /tmp/inductor_${r}
-                [[ -d "${NFS_TRITON_CACHE}" ]]   && rsync -a --ignore-existing "${NFS_TRITON_CACHE}/"   "/tmp/triton_${r}/"   2>/dev/null || true
-                [[ -d "${NFS_INDUCTOR_CACHE}" ]] && rsync -a --ignore-existing "${NFS_INDUCTOR_CACHE}/" "/tmp/inductor_${r}/" 2>/dev/null || true
-            done
-            touch "${TRITON_READY}"
-            echo "Pre-staged triton/inductor cache for all local ranks (was_warm=${TRITON_CACHE_WAS_WARM})."
-        else
-            until [[ -f "${TRITON_READY}" ]]; do sleep 1; done
-        fi
-
-        mkdir -p ${TRITON_CACHE_DIR} ${TORCHINDUCTOR_CACHE_DIR}
-
-        _stage_back() {
-            if [[ "${SLURM_LOCALID}" == "0" && "${SLURM_NODEID}" == "0" && "${TRITON_CACHE_WAS_WARM}" == "0" ]]; then
-                echo "Staging triton cache back to NFS (cold start)..."
-                mkdir -p "${NFS_TRITON_CACHE}" "${NFS_INDUCTOR_CACHE}"
-                rsync -a --ignore-existing "${TRITON_CACHE_DIR}/" "${NFS_TRITON_CACHE}/" 2>/dev/null || true
-                rsync -a --ignore-existing "${TORCHINDUCTOR_CACHE_DIR}/" "${NFS_INDUCTOR_CACHE}/" 2>/dev/null || true
-                echo "Cache staged back."
-            fi
-        }
-        trap _stage_back EXIT
-
-        if [[ "${ENABLE_CUDA_GRAPH}" == "1" ]]; then
-            CUDA_GRAPH_ARGS="--enable-cuda-graph --cuda-graph-scope mamba attn"
-        else
-            CUDA_GRAPH_ARGS=""
-        fi
-
-        if [[ "${USE_DEEPEP:-1}" == "1" ]]; then
-            MOE_DISPATCHER_ARGS="--moe-token-dispatcher-type flex --moe-flex-dispatcher-backend hybridep --moe-hybridep-num-sms 32"
-        else
-            MOE_DISPATCHER_ARGS="--moe-token-dispatcher-type alltoall"
-        fi
-
-        if [[ "${USE_FP8:-0}" == "1" ]]; then
-            QUANT_ARGS="--fp8-param-gather \
-            --reuse-grad-buf-for-mxfp8-param-ag \
-            --fp8-recipe mxfp8 \
-            --fp8-format hybrid \
-            --fp8-amax-history-len 1024 \
-            --fp8-amax-compute-algo max"
-        elif [[ "${USE_FP4:-1}" == "1" ]]; then
-            QUANT_ARGS="--first-last-layers-bf16 \
-            --num-layers-at-start-in-bf16 0 \
-            --num-layers-at-end-in-bf16 14 \
-            --fp4-format e2m1 \
-            --fp4-recipe nvfp4"
-        else
-            QUANT_ARGS=""
-        fi
-
-        # Checkpoint directory — node-local /tmp inside the container.
-        # Shards are not cross-node accessible; intentional for one-time shm validation.
-        CKPT_DIR=/tmp/ckpt_${SLURM_JOB_ID}
-        mkdir -p ${CKPT_DIR}
-
-        pushd $MEGATRON_PATH
-        LAUNCHER_CMD="python3"
-        WORKLOAD_CMD=${MEGATRON_PATH}/pretrain_mamba.py
-        WORKLOAD_ARGS=" \
-            --exit-duration-in-mins 40 \
-            --exit-interval 100 \
-            --distributed-timeout-minutes 30 \
-            --distributed-timeout-seconds-after-init 1800 \
-            --disable-gloo-process-groups \
-            --mock-data \
-            --data-cache-path /datacache \
-            --no-create-attention-mask-in-dataloader \
-            --no-mmap-bin-files \
-            --tokenizer-type NullTokenizer \
-            --tiktoken-pattern v2 \
-            --vocab-size 128000 \
-            --micro-batch-size 1 \
-            --global-batch-size 32 \
-            --train-samples 12207031 \
-            --adam-beta1 0.9 \
-            --adam-beta2 0.95 \
-            --lr 4.5e-4 \
-            --min-lr 4.5e-6 \
-            --lr-decay-style WSD \
-            --lr-warmup-samples 24414063 \
-            --lr-decay-samples 3048706055 \
-            --lr-wsd-decay-style minus_sqrt \
-            --lr-wsd-decay-samples 610351563 \
-            --weight-decay 0.1 \
-            --clip-grad 1.0 \
-            --override-opt_param-scheduler \
-            --use-mcore-models \
-            --spec megatron.core.models.mamba.mamba_layer_specs mamba_stack_spec \
-            --is-hybrid-model \
-            --mamba-num-heads 128 \
-            --num-layers 88 \
-            --hidden-size 4096 \
-            --ffn-hidden-size 2688 \
-            --num-attention-heads 32 \
-            --group-query-attention \
-            --num-query-groups 2 \
-            --kv-channels 128 \
-            --hybrid-override-pattern MEMEMEM*EMEMEMEM*EMEMEMEM*EMEMEMEMEM*EMEMEMEMEM*EMEMEMEMEM*EMEMEMEMEM*EMEMEMEM*EMEMEMEME \
-            --position-embedding-type none \
-            --normalization RMSNorm \
-            --untie-embeddings-and-output-weights \
-            --init-method-std 0.014 \
-            --disable-bias-linear \
-            --squared-relu \
-            --use-fused-weighted-squared-relu \
-            --seq-length 8192 \
-            --max-position-embeddings 8192 \
-            --num-experts 512 \
-            --moe-router-topk 22 \
-            --moe-router-topk-scaling-factor 5.0 \
-            --moe-router-score-function sigmoid \
-            --moe-router-enable-expert-bias \
-            --moe-router-dtype fp32 \
-            --moe-router-load-balancing-type seq_aux_loss \
-            --moe-aux-loss-coeff 1e-4 \
-            ${MOE_DISPATCHER_ARGS} \
-            --moe-grouped-gemm \
-            --moe-permute-fusion \
-            --moe-latent-size 1024 \
-            --moe-shared-expert-intermediate-size 5376 \
-            --calculate-per-token-loss \
-            --bf16 \
-            ${QUANT_ARGS} \
-            --attention-dropout 0.0 \
-            --hidden-dropout 0.0 \
-            --sequence-parallel \
-            --use-distributed-optimizer \
-            $([[ "${USE_OVERLAP_COMM}" == "1" ]] && echo "--overlap-grad-reduce --overlap-param-gather") \
-            --ddp-num-buckets 10 \
-            --ddp-pad-buckets-for-high-nccl-busbw \
-            --high-priority-stream-groups ep \
-            --tensor-model-parallel-size 4 \
-            --pipeline-model-parallel-size 1 \
-            --expert-model-parallel-size 32 \
-            --expert-tensor-parallel-size 1 \
-            --cross-entropy-loss-fusion \
-            --cross-entropy-fusion-impl native \
-            --attention-backend flash \
-            ${CUDA_GRAPH_ARGS} \
-            --te-rng-tracker \
-            --manual-gc \
-            --manual-gc-interval 10 \
-            --num-workers 1 \
-            --eval-interval 1000 \
-            --eval-iters 14 \
-            --log-interval 1 \
-            --log-params-norm \
-            --log-num-zeros-in-grad \
-            --log-timers-to-tensorboard \
-            --log-memory-to-tensorboard \
-            --log-throughput \
-            --log-energy \
-            --log-memory-interval 500 \
-            --logging-level 10 \
-            --timing-log-option minmax \
-            --check-weight-hash-across-dp-replicas-interval 20000 \
-            --tensorboard-dir /tensorboard \
-            --local-rank ${SLURM_LOCALID} \
-            --save ${CKPT_DIR} \
-            --save-interval 10 \
-            --ckpt-format torch_dist \
-            --ckpt-fully-parallel-save \
-            --ckpt-assume-constant-structure \
-            --async-save \
-            --use-persistent-ckpt-worker \
-            $([[ "${USE_CPU_SHM}" == "1" ]] && echo "--async-ckpt-use-cpu-shm") \
-        "
-        $LAUNCHER_CMD $WORKLOAD_CMD $WORKLOAD_ARGS
-    '
-log_msg "END main_workload"
-
-log_msg "END SBATCH"
-
-set +x
diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/pools/n3_super_8n_16n.pool b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/pools/n3_super_8n_16n.pool
deleted file mode 100644
index 1d700863..00000000
--- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/pools/n3_super_8n_16n.pool
+++ /dev/null
@@ -1,40 +0,0 @@
-# n3_super_8n_16n.pool — fault-injection pool for Nemotron-3 Super (TP=4, EP=32)
-# Minimum scale: 8 nodes (32 ranks, EP=32 requires exactly 32 ranks)
-# Maximum scale: 16 nodes (64 ranks)
-#
-# Rank coverage per node count (4 GPUs/node):
-#   8 nodes  → 32 ranks:  rank-0=0, rank-1=1, mid=16, last=31
-#   16 nodes → 64 ranks:  rank-0=0, rank-1=1, mid=32, last=63
-#
-# NOTE: 16-node jobs require ~20 min for NCCL init + CUDA graph capture before iter 1.
-# With 5-min watchdog timeout after fault + FR dumps, total is ~30+ min.
-# Use TIME=00:45:00 (set in workloads.conf) to avoid SLURM wall-time kills.
-#
-# Format: FAULT_TYPE:RANK:ITER:NODES  (one per line, # comments ignored)
-# GPU faults — highest priority; rank sweep across both node counts
-GPU_SLEEP:1:5:8
-GPU_SLEEP:0:5:8
-GPU_SLEEP:16:5:8
-GPU_SLEEP:31:5:8
-GPU_SLEEP:1:5:16
-GPU_SLEEP:32:5:16
-GPU_ERROR:1:5:8
-GPU_ERROR:0:5:8
-GPU_ERROR:16:5:8
-GPU_ERROR:1:5:16
-# Crash faults
-SIGKILL:1:5:8
-SIGKILL:0:5:8
-SIGKILL:1:5:16
-SEGFAULT:1:5:8
-OS_ABORT:1:5:8
-# Python-level hangs
-LOCK_GIL:1:5:8
-LOCK_GIL:0:5:8
-# Application exceptions
-WORKLOAD_EXC:1:5:8
-ASYNC_EXC:1:5:8
-# Signal-based
-SIGTERM:1:5:8
-SIGINT:1:5:8
-SIGNAL_EXC:1:5:8

From a146d54f293118824c30718aefb0af18d50f1cdd Mon Sep 17 00:00:00 2001
From: Seonmyeong Bak <sbak@nvidia.com>
Date: Thu, 23 Apr 2026 16:01:14 -0700
Subject: [PATCH 03/21] feat(skills): harden nvrx-attr fault injection workflow

---
 .../attribution/log_analyzer/nvrx_logsage.py  | 104 ++++++++++-
 .../nvrx-attr/fault-injection-loop/SKILL.md   |   2 +-
 .../skills/nvrx-attr/fr-analysis/SKILL.md     |   4 +-
 .../skills/nvrx-attr/log-analysis/SKILL.md    |   4 +-
 .../nvrx-attr/scripts/l4_gb200_reduced.sh     | 173 ++++++++++++------
 .../nvrx-attr/scripts/prepare_node_alloc.sh   |  31 +++-
 .../nvrx-attr/scripts/score_attribution.py    |  38 +++-
 .../nvrx-attr/scripts/watch_and_analyze.sh    |  21 ++-
 .../skills/nvrx-attr/scripts/workloads.conf   |   6 +-
 9 files changed, 294 insertions(+), 89 deletions(-)

diff --git a/src/nvidia_resiliency_ext/attribution/log_analyzer/nvrx_logsage.py b/src/nvidia_resiliency_ext/attribution/log_analyzer/nvrx_logsage.py
index c72aaae2..4ae1653b 100644
--- a/src/nvidia_resiliency_ext/attribution/log_analyzer/nvrx_logsage.py
+++ b/src/nvidia_resiliency_ext/attribution/log_analyzer/nvrx_logsage.py
@@ -1,7 +1,9 @@
 import argparse
 import logging
 import os
+import random
 import re
+import time
 from typing import Any, Dict, Mapping, Union
 
 from langchain_openai import ChatOpenAI
@@ -37,6 +39,7 @@
 ATTR_ERRORS_NOT_FOUND = "ERRORS NOT FOUND"
 ATTR_NO_LOGS = "NO LOGS"
 ATTR_SLURM_CANCELLED_DUE_TO_PREEMPTION = "SLURM CANCELLED DUE TO PREEMPTION"
+LOGSAGE_LLM_ENDPOINT_FAILED = "LLM ENDPOINT FAILED"
 
 
 MARKER_NEW_RUN_DIR_ADDED = "[sbatch_script]: New run dir added:"
@@ -108,6 +111,98 @@ def chunk_logs_strict(lines):
     return final_chunks
 
 
+def _log_analysis_retry_config() -> tuple[int, float, float, float]:  # (attempts, initial backoff s, max backoff s, jitter s) from env
+    retries = max(1, int(os.getenv("NVRX_LOG_ANALYSIS_LLM_RETRIES", "3")))  # callers loop range(1, retries + 1): need >= 1 attempt
+    initial_backoff = max(0.0, float(os.getenv("NVRX_LOG_ANALYSIS_LLM_INITIAL_BACKOFF_SEC", "1.0")))
+    max_backoff = max(0.0, float(os.getenv("NVRX_LOG_ANALYSIS_LLM_MAX_BACKOFF_SEC", "8.0")))
+    jitter = max(0.0, float(os.getenv("NVRX_LOG_ANALYSIS_LLM_JITTER_SEC", "0.25")))  # keep sleep durations non-negative
+    return retries, initial_backoff, max_backoff, jitter
+
+
+def _finished_status_name(status: Any) -> str:
+    return getattr(status, "name", status)  # use .name when the object has one (e.g. an enum member); otherwise return status as-is
+
+
+def _sleep_with_backoff(attempt: int, retries: int, backoff: float, max_backoff: float, jitter: float) -> float:
+    sleep_for = min(backoff, max_backoff) + random.uniform(0.0, jitter)  # capped backoff plus random jitter
+    logger.info(
+        "Retrying log-analysis LLM in %.2fs after attempt %d/%d",
+        sleep_for,
+        attempt,
+        retries,
+    )
+    time.sleep(sleep_for)  # synchronous sleep: suspends the calling thread
+    return min(backoff * 2, max_backoff)  # next backoff: doubled, never above max_backoff
+
+
+def _retry_return_application_errors(
+    llm: ChatOpenAI, lines: list[str], cache_dict: LRUCache
+) -> ApplicationData:  # re-runs return_application_errors while it reports FINISHED_STATUS_LLM_FAILURE
+    retries, initial_backoff, max_backoff, jitter = _log_analysis_retry_config()
+    backoff = initial_backoff
+    last_status = None
+
+    for attempt in range(1, retries + 1):  # 1-based so the attempt number matches the log messages
+        app_data = return_application_errors(llm, lines, cache_dict)
+        status_name = _finished_status_name(app_data.finished)
+        if status_name != FINISHED_STATUS_LLM_FAILURE:
+            return app_data  # any status other than LLM failure is final; no retry
+
+        last_status = status_name
+        if attempt == retries:  # attempts exhausted: hand the failed result back to the caller
+            logger.error(
+                "Log-analysis extraction failed after %d attempts; last status: %s",
+                retries,
+                last_status,
+            )
+            return app_data
+
+        backoff = _sleep_with_backoff(attempt, retries, backoff, max_backoff, jitter)
+
+    return app_data  # reached only if the loop never ran (retries < 1), where app_data is unbound; requires retries >= 1
+
+
+def _with_exponential_backoff(llm_call, checkpoint_saved: bool) -> tuple[str, str, str, str, str]:
+    retries, initial_backoff, max_backoff, jitter = _log_analysis_retry_config()
+    backoff = initial_backoff
+
+    for attempt in range(1, retries + 1):
+        try:
+            result = llm_call()  # llm_call is invoked with no arguments
+            if result and not any(  # accept only a non-empty result whose first four fields are not the endpoint-failure sentinel
+                field == LOGSAGE_LLM_ENDPOINT_FAILED for field in result[:4]
+            ):
+                return result
+            last_error = LOGSAGE_LLM_ENDPOINT_FAILED
+        except Exception as exc:  # any exception counts as a failed attempt and falls through to the retry logic
+            last_error = str(exc)
+            logger.warning("Log-analysis LLM attempt %d/%d failed: %s", attempt, retries, exc)
+
+        if attempt == retries:  # attempts exhausted: report ATTR_LLM_FAILURE in the first four fields
+            logger.error(
+                "Log-analysis LLM failed after %d attempts; last error: %s",
+                retries,
+                last_error,
+            )
+            return (
+                ATTR_LLM_FAILURE,
+                ATTR_LLM_FAILURE,
+                ATTR_LLM_FAILURE,
+                ATTR_LLM_FAILURE,
+                str(checkpoint_saved),  # fifth field carries the checkpoint flag as text
+            )
+
+        backoff = _sleep_with_backoff(attempt, retries, backoff, max_backoff, jitter)
+
+    return (  # reached only if the loop never ran (retries < 1); llm_call is not invoked in that case
+        ATTR_LLM_FAILURE,
+        ATTR_LLM_FAILURE,
+        ATTR_LLM_FAILURE,
+        ATTR_LLM_FAILURE,
+        str(checkpoint_saved),
+    )
+
+
 class NVRxLogAnalyzer(NVRxAttribution):
     def __init__(self, args: Union[argparse.Namespace, Mapping[str, Any]]):
         from nvidia_resiliency_ext.attribution.api_keys import load_llm_api_key
@@ -213,7 +308,7 @@ async def analyze_logs(self) -> list[ApplicationData]:
                     current_chunk.append(line)
 
         output_list = [
-            return_application_errors(self.llm, lines, self.lru_cache)
+            _retry_return_application_errors(self.llm, lines, self.lru_cache)
             for cycle, lines in chunks.items()
         ]
         return output_list
@@ -248,7 +343,12 @@ async def llm_analyze(self, output_list: list[ApplicationData]) -> list[str]:
                 )
             else:
                 if len(output.application_errors_list_full):
-                    result.append(get_proposed_solution_cat(self.llm, output))
+                    result.append(
+                        _with_exponential_backoff(
+                            lambda: get_proposed_solution_cat(self.llm, output),
+                            checkpoint_saved=output.checkpoint_saved,
+                        )
+                    )
                 else:
                     if output.finished == FINISHED_STATUS_LLM_FAILURE:
                         result.append(
diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md b/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md
index abec6a91..228d7ce1 100644
--- a/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md
@@ -207,7 +207,7 @@ The judge is given:
 4. Raw logsage stdout (5-field text format)
 5. Raw CollectiveAnalyzer text output
 
-Default judge model: `azure/anthropic/claude-sonnet-4-6`. Override with `--model` in `score_attribution.py`.
+Default judge model: `qwen/qwen3.5-397b-a17b`. Override with `--model` in `score_attribution.py`.
 
 ---
 
diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/fr-analysis/SKILL.md b/src/nvidia_resiliency_ext/skills/nvrx-attr/fr-analysis/SKILL.md
index df038451..d07911ec 100644
--- a/src/nvidia_resiliency_ext/skills/nvrx-attr/fr-analysis/SKILL.md
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/fr-analysis/SKILL.md
@@ -52,7 +52,7 @@ python scripts/fr_attribution.py \
 | `--verbose`, `-v` | off | Print detailed per-rank collective tables |
 | `--health-check`, `-c` | off | Include node health check results in output |
 | `--llm-analyze`, `-l` | off | Pass structured findings to the LLM for a narrative summary |
-| `--model`, `-m` | `nvdev/nvidia/llama-3.3-nemotron-super-49b-v1` | LLM model (only used with `--llm-analyze`) |
+| `--model`, `-m` | `nvidia/nemotron-3-super-120b-a12b` | LLM model (only used with `--llm-analyze`) |
 | `--debug` | off | Convert binary trace files to JSON for inspection |
 
 ---
@@ -68,7 +68,7 @@ analyzer = CollectiveAnalyzer({
     "verbose": False,
     "health_check": False,
     "llm_analyze": False,
-    "model": "nvdev/nvidia/llama-3.3-nemotron-super-49b-v1",
+    "model": "nvidia/nemotron-3-super-120b-a12b",
 })
 results = analyzer.run_sync({
     "fr_path": "/path/to/fr_dumps/",
diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/log-analysis/SKILL.md b/src/nvidia_resiliency_ext/skills/nvrx-attr/log-analysis/SKILL.md
index a86e2ff7..e793d5de 100644
--- a/src/nvidia_resiliency_ext/skills/nvrx-attr/log-analysis/SKILL.md
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/log-analysis/SKILL.md
@@ -45,7 +45,7 @@ python scripts/nvrx_logsage.py \
 | Flag | Default | Description |
 |------|---------|-------------|
 | `--log-path` | required | Path to the job log file |
-| `--model` | `nvidia/qwen/qwen3.5-35b-a3b` | LLM model |
+| `--model` | `nvidia/nemotron-3-super-120b-a12b` | LLM model |
 | `--temperature` | `0.2` | Sampling temperature |
 | `--top_p` | `0.7` | Top-p nucleus sampling |
 | `--max_tokens` | `8192` | Max output tokens |
@@ -61,7 +61,7 @@ from nvidia_resiliency_ext.attribution.log_analyzer.nvrx_logsage import NVRxLogA
 
 analyzer = NVRxLogAnalyzer({
     "log_path": "/path/to/job.log",
-    "model": "nvidia/qwen/qwen3.5-35b-a3b",
+    "model": "nvidia/nemotron-3-super-120b-a12b",
     "temperature": 0.2,
     "top_p": 0.7,
     "max_tokens": 8192,
diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/l4_gb200_reduced.sh b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/l4_gb200_reduced.sh
index 9fd39ab8..0c87a30d 100644
--- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/l4_gb200_reduced.sh
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/l4_gb200_reduced.sh
@@ -1,9 +1,10 @@
 #!/bin/bash
 
 # Validated only with Megatron-LM as the feedback-loop example workload.
+# Direct sbatch usage:
+#   sbatch --account=<account> --partition=<partition> scripts/l4_gb200_reduced.sh
+# If your cluster has defaults for those, the extra flags are not required.
 
-#SBATCH --account=root
-#SBATCH --partition=gb-nvl-134-135
 #SBATCH --time=00:30:00
 
 #SBATCH --job-name=llama4-scout-gb200
@@ -16,6 +17,10 @@
 #SBATCH --exclusive
 #SBATCH --mem=0
 
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+NVRX_SRC_ROOT_DEFAULT="$(cd "${SCRIPT_DIR}/../../../.." && pwd)"
+NVRX_REPO_ROOT_DEFAULT="$(cd "${SCRIPT_DIR}/../../../../.." && pwd)"
+
 log_msg() {
     local msg="$1"
     UNIX_DATETIME=$(date +%s)
@@ -25,12 +30,8 @@ log_msg() {
 
 log_msg "START SBATCH"
 echo "Running on nodes: ${SLURM_NODELIST}"
-export RITS_PLATFORM_TYPE=gb200
-export RITS_GPUS_PER_NODE=4
-export RITS_NVL_DOMAIN_SIZE=72
 export NCCL_IB_DISABLE=0
 export NCCL_NET_GDR_LEVEL=3
-export RITS_CLUSTER_NAME=nvl72
 export PYXIS_LOG_LEVEL=debug
 export NCCL_IB_SL=1
 export NCCL_IB_TIMEOUT=19
@@ -58,26 +59,49 @@ export TORCH_INCLUDE_ONLY_ACTIVE=1
 export TORCH_NCCL_EXTRA_DUMP_ON_EXEC=1
 
 # Fault injection parameters (overridable via sbatch --export or environment)
+# Current Megatron behavior:
+# - FAULT_AT_ITER anchors the fault-delay timer after iteration N completes
+# - FAULT_DELAY is the delay in seconds from that anchor (or from training start if unset)
 export FAULT_AT_ITER="${FAULT_AT_ITER:-5}"
+export FAULT_DELAY="${FAULT_DELAY:-}"
 export FAULT_RANK="${FAULT_RANK:-1}"
 export FAULT_TYPE="${FAULT_TYPE:-GPU_SLEEP}"
+export ENABLE_FAULT_INJECTION="${ENABLE_FAULT_INJECTION:-1}"
 
 # Checkpoint settings (overridable via sbatch --export)
 export NVRX_CKPT_USE_CPU_SHM="${NVRX_CKPT_USE_CPU_SHM:-0}"
 # Enable GPU-IPC cached-data-structure path without cpu-shm (for comparison baseline)
 export NVRX_CKPT_USE_CACHED_STRUCTURE="${NVRX_CKPT_USE_CACHED_STRUCTURE:-0}"
 export DIST_TIMEOUT_AFTER_INIT="${DIST_TIMEOUT_AFTER_INIT:-1}"
+export ENABLE_NFS_CACHE_STAGING="${ENABLE_NFS_CACHE_STAGING:-0}"
+export NFS_TRITON_CACHE="${NFS_TRITON_CACHE:-}"
+export NFS_INDUCTOR_CACHE="${NFS_INDUCTOR_CACHE:-}"
 # USE_ASYNC_CKPT=1: enable async checkpointing every CKPT_SAVE_INTERVAL iters
 export USE_ASYNC_CKPT="${USE_ASYNC_CKPT:-0}"
 export CKPT_SAVE_INTERVAL="${CKPT_SAVE_INTERVAL:-100}"
+export ENABLE_ENROOT_CLEANUP="${ENABLE_ENROOT_CLEANUP:-0}"
 
 # Node / task geometry (SLURM_NNODES is set by SLURM from --nodes override)
 export GPUS_PER_NODE="${GPUS_PER_NODE:-4}"
 TOTAL_TASKS=$((SLURM_NNODES * GPUS_PER_NODE))
 
 # Per-experiment output directory (overridable via sbatch --export)
-export BASE_EXPERIMENTS_DIR="${BASE_EXPERIMENTS_DIR:-/home/sbak/experiments/llama4-scout-gb200}"
-export EXPERIMENT_DIR="${EXPERIMENT_DIR:-${BASE_EXPERIMENTS_DIR}/fault_injection/manual/n${SLURM_NNODES}_${FAULT_TYPE}_r${FAULT_RANK}_i${FAULT_AT_ITER}}"
+export BASE_EXPERIMENTS_DIR="${BASE_EXPERIMENTS_DIR:-${HOME}/nvrx-attr-experiments}"
+FAULT_LABEL="i${FAULT_AT_ITER}"
+if [[ -n "${FAULT_DELAY}" ]]; then
+    FAULT_LABEL="d${FAULT_DELAY}"
+fi
+export EXPERIMENT_DIR="${EXPERIMENT_DIR:-${BASE_EXPERIMENTS_DIR}/fault_injection/manual/n${SLURM_NNODES}_${FAULT_TYPE}_r${FAULT_RANK}_${FAULT_LABEL}}"
+export NVRX_REPO_ROOT="${NVRX_REPO_ROOT:-${NVRX_REPO_ROOT_DEFAULT}}"
+export NVRX_SRC_ROOT="${NVRX_SRC_ROOT:-${NVRX_SRC_ROOT_DEFAULT}}"
+export NVRX_CONTAINER_REPO_PATH="${NVRX_CONTAINER_REPO_PATH:-${HOME}/nvidia-resiliency-ext}"
+export NVRX_CONTAINER_SRC_PATH="${NVRX_CONTAINER_SRC_PATH:-${NVRX_CONTAINER_REPO_PATH}/src}"
+export SHARED_TMP_BASE_DIR="${SHARED_TMP_BASE_DIR:-${HOME}/tmp}"
+export MEGATRON_REPO_HOST_PATH="${MEGATRON_REPO_HOST_PATH:-${HOME}/megatron-lm}"
+export WORKSPACE_HOST_PATH="${WORKSPACE_HOST_PATH:-${HOME}/tmp}"
+export CONTAINER_IMAGE="${CONTAINER_IMAGE:-nvcr.io/nvidia/nemo:26.04}"
+export CONTAINER_NAME="${CONTAINER_NAME:-}"
+export CONTAINER_WORKDIR="${CONTAINER_WORKDIR:-/}"
 
 mkdir -p ${BASE_EXPERIMENTS_DIR}/datacache
 mkdir -p ${EXPERIMENT_DIR}/tensorboard
@@ -91,7 +115,7 @@ LOG_FILE_BASE="${LOG_DIR}/slurm/${SLURM_JOB_ID}.${SLURM_RESTART_COUNT}"
 
 # ── Shared-tmp directory (NFS, for cross-srun-step communication) ─────────────
 # Mounted to /shared_tmp (NOT /tmp) so the container keeps its native fast /tmp.
-SHARED_TMP_HOST=/home/sbak/tmp/${SLURM_JOB_ID}
+SHARED_TMP_HOST=${SHARED_TMP_BASE_DIR}/${SLURM_JOB_ID}
 mkdir -p ${SHARED_TMP_HOST}
 
 # ── Pre-populate .myenv with all variables that must reach the container ───────
@@ -106,48 +130,64 @@ export DIST_TIMEOUT_AFTER_INIT=${DIST_TIMEOUT_AFTER_INIT}
 export USE_ASYNC_CKPT=${USE_ASYNC_CKPT}
 export CKPT_SAVE_INTERVAL=${CKPT_SAVE_INTERVAL}
 export FAULT_AT_ITER=${FAULT_AT_ITER}
+export FAULT_DELAY=${FAULT_DELAY}
 export FAULT_RANK=${FAULT_RANK}
 export FAULT_TYPE=${FAULT_TYPE}
-# Prepend local nvrx src so container picks up our changes without a pip install step.
-export PYTHONPATH=/home/sbak/nvidia-resiliency-ext/src:\${PYTHONPATH}
+export ENABLE_FAULT_INJECTION=${ENABLE_FAULT_INJECTION}
+export ENABLE_NFS_CACHE_STAGING=${ENABLE_NFS_CACHE_STAGING}
+export NFS_TRITON_CACHE=${NFS_TRITON_CACHE}
+export NFS_INDUCTOR_CACHE=${NFS_INDUCTOR_CACHE}
+# Prepend local nvrx checkout so container picks up our changes without a pip install step.
+export NVRX_REPO_ROOT=${NVRX_CONTAINER_REPO_PATH}
+export NVRX_SRC_ROOT=${NVRX_CONTAINER_SRC_PATH}
+export PYTHONPATH=\${NVRX_REPO_ROOT}:\${NVRX_SRC_ROOT}:\${PYTHONPATH}
 MYENVEOF
 
 # Mounts
 LUSTRE=/home:/home
 SHARED_TMP=${SHARED_TMP_HOST}:/shared_tmp
 LOGS=${EXPERIMENT_DIR}/logs:/logs
-MEGATRON_REPO=/home/sbak/megatron-lm-main:/megatron-lm_repo
+MEGATRON_REPO=${MEGATRON_REPO_HOST_PATH}:/megatron-lm_repo
 DATACACHE=${BASE_EXPERIMENTS_DIR}/datacache:/datacache
 TENSORBOARD=${EXPERIMENT_DIR}/tensorboard:/tensorboard
-WORKSPACE=/home/sbak/tmp:/workspace
+WORKSPACE=${WORKSPACE_HOST_PATH}:/workspace
 CHECKPOINTS=${EXPERIMENT_DIR}/checkpoints:/checkpoints
 mkdir -p ${EXPERIMENT_DIR}/checkpoints
 CONTAINER_MOUNTS=$LUSTRE,$SHARED_TMP,$LOGS,$MEGATRON_REPO,$DATACACHE,$TENSORBOARD,$WORKSPACE,$CHECKPOINTS
+CONTAINER_ARGS=(
+    --container-mounts "${CONTAINER_MOUNTS}"
+    --container-image "${CONTAINER_IMAGE}"
+    --container-workdir "${CONTAINER_WORKDIR}"
+)
+if [[ -n "${CONTAINER_NAME}" ]]; then
+    CONTAINER_ARGS+=(--container-name "${CONTAINER_NAME}")
+fi
 
 # ── Disk cleanup: remove stale enroot containers from prior jobs ──────────────
-log_msg "START disk_cleanup"
-srun \
-    --label \
-    --ntasks-per-node=1 \
-    --ntasks=${SLURM_NNODES} \
-    --kill-on-bad-exit=0 \
-    --mpi=none \
-    bash -c '
-        ENROOT_DIR="/var/lib/enroot/data/$(id -u)"
-        rm -rf "${ENROOT_DIR:?}"/* 2>/dev/null || true
-        echo "$(hostname): / $(df -h / | tail -1 | awk "{print \$3\" used, \"\$4\" avail\"}")"
-    '
-log_msg "END disk_cleanup"
+if [[ "${ENABLE_ENROOT_CLEANUP}" == "1" ]]; then
+    log_msg "START disk_cleanup"
+    srun \
+        --label \
+        --ntasks-per-node=1 \
+        --ntasks=${SLURM_NNODES} \
+        --kill-on-bad-exit=0 \
+        --mpi=none \
+        bash -c '
+            ENROOT_DIR="/var/lib/enroot/data/$(id -u)"
+            rm -rf "${ENROOT_DIR:?}"/* 2>/dev/null || true
+            echo "$(hostname): / $(df -h / | tail -1 | awk "{print \$3\" used, \"\$4\" avail\"}")"
+        '
+    log_msg "END disk_cleanup"
+else
+    log_msg "SKIP disk_cleanup"
+fi
 
 # all node setup
 #--------------------------------
 log_msg "START all_node_setup"
 srun \
     --label \
-    --container-mounts ${CONTAINER_MOUNTS} \
-    --container-image /home/sbak/mcore_ci_0415.sqsh \
-    --container-name ${SLURM_JOB_ID} \
-    --container-workdir / \
+    "${CONTAINER_ARGS[@]}" \
     --exclusive \
     --error=${LOG_FILE_BASE}.0.all_node_setup.log \
     --output=${LOG_FILE_BASE}.0.all_node_setup.log \
@@ -163,6 +203,8 @@ srun \
         CURRENT_BRANCH=$(git -C /megatron-lm_repo branch --show-current)
         echo "Cloning Megatron branch $CURRENT_BRANCH to ${MEGATRON_PATH}"
         git clone --single-branch --branch $CURRENT_BRANCH /megatron-lm_repo .
+        rm -rf ${MEGATRON_PATH}/nvidia_resiliency_ext
+        rsync -a ${NVRX_CONTAINER_SRC_PATH}/nvidia_resiliency_ext/ ${MEGATRON_PATH}/nvidia_resiliency_ext/
         popd
     '
 log_msg "END all_node_setup"
@@ -172,10 +214,7 @@ log_msg "END all_node_setup"
 log_msg "START main_workload"
 srun \
     --label \
-    --container-mounts ${CONTAINER_MOUNTS} \
-    --container-image /home/sbak/mcore_ci_0415.sqsh \
-    --container-name ${SLURM_JOB_ID} \
-    --container-workdir / \
+    "${CONTAINER_ARGS[@]}" \
     --error=${LOG_FILE_BASE}.1.main_workload.log \
     --output=${LOG_FILE_BASE}.1.main_workload.log \
     --ntasks-per-node=${GPUS_PER_NODE} \
@@ -184,38 +223,48 @@ srun \
     --mpi=none \
     bash -c '
         source /shared_tmp/.myenv_${SLURM_JOB_ID}.sh
-
-        # Match the per-node path used in all_node_setup.
         MEGATRON_PATH=/shared_tmp/megatron_$(hostname)_${SLURM_JOB_ID}
-
-        NFS_TRITON_CACHE=/home/sbak/experiments/llama4-scout-gb200/triton_cache
-        NFS_INDUCTOR_CACHE=/home/sbak/experiments/llama4-scout-gb200/inductor_cache
+        export PYTHONPATH=${MEGATRON_PATH}:${NVRX_REPO_ROOT}:${NVRX_SRC_ROOT}:${PYTHONPATH}
+        echo "NVRX_REPO_ROOT=${NVRX_REPO_ROOT}"
+        echo "NVRX_SRC_ROOT=${NVRX_SRC_ROOT}"
+        echo "PYTHONPATH=${PYTHONPATH}"
+        python3 - <<'"'"'PY'"'"'
+import sys
+print(f"sys.path[:8]={sys.path[:8]}")
+import nvidia_resiliency_ext
+from nvidia_resiliency_ext.shared_utils.inject_fault import Fault
+print(f"nvidia_resiliency_ext={nvidia_resiliency_ext.__file__}")
+print(f"fault_enum={Fault}")
+PY
 
         # Per-rank Triton/inductor cache on the container native /tmp (local fast storage).
         export TRITON_CACHE_DIR=/tmp/triton_${SLURM_LOCALID}
         export TORCHINDUCTOR_CACHE_DIR=/tmp/inductor_${SLURM_LOCALID}
         mkdir -p ${TRITON_CACHE_DIR} ${TORCHINDUCTOR_CACHE_DIR}
 
-        # Pre-stage: warm local cache from NFS (one rank per node)
-        if [[ "${SLURM_LOCALID}" == "0" ]]; then
-            if [[ -d "${NFS_TRITON_CACHE}" ]]; then
-                echo "Pre-staging triton cache from NFS..."
+        # Optional pre/post-stage between a shared cache and the node-local /tmp cache.
+        if [[ "${ENABLE_NFS_CACHE_STAGING}" == "1" && "${SLURM_LOCALID}" == "0" ]]; then
+            if [[ -n "${NFS_TRITON_CACHE}" && -d "${NFS_TRITON_CACHE}" ]]; then
+                echo "Pre-staging triton cache from ${NFS_TRITON_CACHE}..."
                 rsync -a --ignore-existing "${NFS_TRITON_CACHE}/" "${TRITON_CACHE_DIR}/" 2>/dev/null || true
             fi
-            if [[ -d "${NFS_INDUCTOR_CACHE}" ]]; then
-                echo "Pre-staging inductor cache from NFS..."
+            if [[ -n "${NFS_INDUCTOR_CACHE}" && -d "${NFS_INDUCTOR_CACHE}" ]]; then
+                echo "Pre-staging inductor cache from ${NFS_INDUCTOR_CACHE}..."
                 rsync -a --ignore-existing "${NFS_INDUCTOR_CACHE}/" "${TORCHINDUCTOR_CACHE_DIR}/" 2>/dev/null || true
             fi
         fi
 
         # Post-stage: write back to NFS on exit (one rank per node)
         _stage_back() {
-            if [[ "${SLURM_LOCALID}" == "0" ]]; then
-                echo "Staging triton cache back to NFS..."
-                mkdir -p "${NFS_TRITON_CACHE}" "${NFS_INDUCTOR_CACHE}"
-                rsync -a --ignore-existing "${TRITON_CACHE_DIR}/" "${NFS_TRITON_CACHE}/" 2>/dev/null || true
-                rsync -a --ignore-existing "${TORCHINDUCTOR_CACHE_DIR}/" "${NFS_INDUCTOR_CACHE}/" 2>/dev/null || true
-                echo "Cache staged back."
+            if [[ "${ENABLE_NFS_CACHE_STAGING}" == "1" && "${SLURM_LOCALID}" == "0" ]]; then
+                if [[ -n "${NFS_TRITON_CACHE}" ]]; then
+                    mkdir -p "${NFS_TRITON_CACHE}"
+                    rsync -a --ignore-existing "${TRITON_CACHE_DIR}/" "${NFS_TRITON_CACHE}/" 2>/dev/null || true
+                fi
+                if [[ -n "${NFS_INDUCTOR_CACHE}" ]]; then
+                    mkdir -p "${NFS_INDUCTOR_CACHE}"
+                    rsync -a --ignore-existing "${TORCHINDUCTOR_CACHE_DIR}/" "${NFS_INDUCTOR_CACHE}/" 2>/dev/null || true
+                fi
             fi
         }
         trap _stage_back EXIT
@@ -239,6 +288,21 @@ srun \
         LAUNCHER_ARGS=" \
         "
         WORKLOAD_CMD=${MEGATRON_PATH}/pretrain_gpt.py
+        FAULT_INJECTOR_ARGS=""
+        if [[ "${ENABLE_FAULT_INJECTION}" == "1" ]]; then
+            FAULT_INJECTOR_ARGS=" \
+                --fault-injector-ranks ${FAULT_RANK} \
+                --fault-injector-fault-types ${FAULT_TYPE} \
+            "
+            if [[ -n "${FAULT_DELAY}" ]]; then
+                FAULT_INJECTOR_ARGS="${FAULT_INJECTOR_ARGS} --fault-injector-fault-delay ${FAULT_DELAY}"
+                if [[ -n "${FAULT_AT_ITER}" ]]; then
+                    FAULT_INJECTOR_ARGS="${FAULT_INJECTOR_ARGS} --fault-injector-delay-start-iteration ${FAULT_AT_ITER}"
+                fi
+            elif [[ -n "${FAULT_AT_ITER}" ]]; then
+                FAULT_INJECTOR_ARGS="${FAULT_INJECTOR_ARGS} --fault-injector-fault-delay 0 --fault-injector-delay-start-iteration ${FAULT_AT_ITER}"
+            fi
+        fi
         WORKLOAD_ARGS=" \
             --exit-duration-in-mins 5750 \
             --distributed-timeout-minutes 10 \
@@ -347,13 +411,12 @@ srun \
             --local-rank ${SLURM_LOCALID} \
             --context-parallel-size 1 \
             --vocab-size 238600 \
-            --megatron-fault-at-iter ${FAULT_AT_ITER} \
-            --megatron-fault-rank ${FAULT_RANK} \
-            --megatron-fault-type ${FAULT_TYPE} \
+            ${FAULT_INJECTOR_ARGS} \
             --distributed-timeout-seconds-after-init ${DIST_TIMEOUT_AFTER_INIT} \
             --flight-recorder-dump-path ${CKPT_DIR} \
         "
-        $LAUNCHER_CMD $LAUNCHER_ARGS $WORKLOAD_CMD $WORKLOAD_ARGS $CKPT_SAVE_ARGS
+        PYTHONPATH=${MEGATRON_PATH}:${NVRX_REPO_ROOT}:${NVRX_SRC_ROOT}:${PYTHONPATH} \
+            $LAUNCHER_CMD $LAUNCHER_ARGS $WORKLOAD_CMD $WORKLOAD_ARGS $CKPT_SAVE_ARGS
     '
 log_msg "END main_workload"
 
diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/prepare_node_alloc.sh b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/prepare_node_alloc.sh
index f2c90a64..8b8d7b01 100755
--- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/prepare_node_alloc.sh
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/prepare_node_alloc.sh
@@ -25,6 +25,12 @@ set -euo pipefail
 
 SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
 WORKLOADS_CONF="${SCRIPT_DIR}/workloads.conf"
+SLURM_DEFAULTS_CONF="${SCRIPT_DIR}/slurm.conf"
+
+if [[ -f "${SLURM_DEFAULTS_CONF}" ]]; then
+    # shellcheck disable=SC1090
+    source "${SLURM_DEFAULTS_CONF}"
+fi
 
 # ── Workload resolution from workloads.conf ────────────────────────────────────
 # If WORKLOAD is set, look it up in workloads.conf and derive SBATCH_SCRIPT and
@@ -48,7 +54,9 @@ if [[ -n "${WORKLOAD:-}" ]]; then
     _CONF_TIME=$(echo "${_CONF_LINE}"   | awk '{print $6}')
     # Only set if not already overridden in the environment
     SBATCH_SCRIPT="${SBATCH_SCRIPT:-${SCRIPT_DIR}/${_CONF_SCRIPT}}"
-    BASE_EXPERIMENTS_DIR="${BASE_EXPERIMENTS_DIR:-${_CONF_BASE}}"
+    if [[ -n "${_CONF_BASE}" && "${_CONF_BASE}" != "-" ]]; then
+        BASE_EXPERIMENTS_DIR="${BASE_EXPERIMENTS_DIR:-${_CONF_BASE}}"
+    fi
     if [[ -n "${_CONF_TIME}" && "${_CONF_TIME}" != "-" ]]; then
         TIME="${TIME:-${_CONF_TIME}}"
     fi
@@ -65,13 +73,13 @@ if [[ -n "${WORKLOAD:-}" ]]; then
     echo ">>> Workload: ${WORKLOAD}  (${_CONF_DESC//_/ })"
 fi
 
-ACCOUNT="${ACCOUNT:-root}"
-PARTITION="${PARTITION:-gb-nvl-134-135}"
+ACCOUNT="${ACCOUNT:-}"
+PARTITION="${PARTITION:-}"
 GPUS_PER_NODE="${GPUS_PER_NODE:-4}"
 TIME="${TIME:-00:30:00}"
 BATCH_SIZE="${BATCH_SIZE:-2}"
 POLL_INTERVAL="${POLL_INTERVAL:-30}"
-BASE_EXPERIMENTS_DIR="${BASE_EXPERIMENTS_DIR:-/home/sbak/experiments/llama4-scout-gb200}"
+BASE_EXPERIMENTS_DIR="${BASE_EXPERIMENTS_DIR:-${HOME}/nvrx-attr-experiments}"
 
 # ---------------------------------------------------------------------------
 # Fault pool — ordered by priority (GPU-related first, then crash, then other)
@@ -131,9 +139,7 @@ submit_one() {
     mkdir -p "${EXPERIMENT_DIR}/tensorboard"
 
     local JOB_ID
-    JOB_ID=$(sbatch \
-        --account="${ACCOUNT}" \
-        --partition="${PARTITION}" \
+    local SBATCH_ARGS=(
         --nodes="${NODES}" \
         --ntasks-per-node="${GPUS_PER_NODE}" \
         --gpus-per-node="${GPUS_PER_NODE}" \
@@ -143,8 +149,15 @@ submit_one() {
         --output="${EXPERIMENT_DIR}/logs/slurm/%j.launch.out" \
         --error="${EXPERIMENT_DIR}/logs/slurm/%j.launch.err" \
         --export=ALL,FAULT_TYPE="${FAULT_TYPE}",FAULT_RANK="${RANK}",FAULT_AT_ITER="${ITER}",GPUS_PER_NODE="${GPUS_PER_NODE}",EXPERIMENT_DIR="${EXPERIMENT_DIR}",BASE_EXPERIMENTS_DIR="${BASE_EXPERIMENTS_DIR}" \
-        --parsable \
-        "${SBATCH_SCRIPT}")
+        --parsable
+    )
+    if [[ -n "${ACCOUNT}" ]]; then
+        SBATCH_ARGS+=(--account="${ACCOUNT}")
+    fi
+    if [[ -n "${PARTITION}" ]]; then
+        SBATCH_ARGS+=(--partition="${PARTITION}")
+    fi
+    JOB_ID=$(sbatch "${SBATCH_ARGS[@]}" "${SBATCH_SCRIPT}")
 
     # Print to stderr so callers using $(...) capture only the job ID on stdout
     printf "  submitted: %s rank=%-2s iter=%s nodes=%s -> job=%s\n" \
diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/score_attribution.py b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/score_attribution.py
index 15417a6c..4d36ad66 100644
--- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/score_attribution.py
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/score_attribution.py
@@ -1,9 +1,9 @@
 #!/usr/bin/env python3
 """LLM-judge scorer for fault-injection attribution experiments.
 
-Uses the same ChatOpenAI / NVIDIA-inference-API setup as nvrx_logsage.py.
+Uses the same ChatOpenAI / NVIDIA inference API setup as nvrx_logsage.py.
 Reads ground-truth fault parameters and the raw text outputs of nvrx_logsage
-and CollectiveAnalyzer, then asks a Sonnet/Opus judge to score each attribution
+and CollectiveAnalyzer, then asks a judge model to score each attribution
 dimension and return structured JSON.
 
 Usage (called by watch_and_analyze.sh):
@@ -11,8 +11,8 @@
         --fault-type GPU_SLEEP --rank 0 --iter 5 --nodes 2 \
         --log-output "$LOG_OUT" \
         --fr-output  "$FR_OUT" \
-        [--model claude-sonnet-4-6] \
-        [--base-url https://inference-api.nvidia.com/v1]
+        [--model qwen/qwen3.5-397b-a17b] \
+        [--base-url https://inference.api.nvidia.com/v1]
 
 Stdout: one line of JSON with keys:
     restart_correct, rank_primary, rank_any, fault_described, fr_rank_correct, notes
@@ -21,6 +21,7 @@
 import argparse
 import json
 import logging
+import os
 import sys
 from typing import Union
 
@@ -32,8 +33,13 @@
 
 logger = logging.getLogger(__name__)
 
+INJECTION_MARKERS = (
+    "FAULT INJECTION",
+    "nvidia_resiliency_ext.shared_utils.inject_fault",
+)
+
 # Default judge model — override with --model
-DEFAULT_JUDGE_MODEL = "azure/anthropic/claude-sonnet-4-6"
+DEFAULT_JUDGE_MODEL = "qwen/qwen3.5-397b-a17b"
 
 # Expected restart decision and rationale per fault type
 _RESTART_TABLE = {
@@ -71,7 +77,7 @@ def load_log_excerpt(log_path, max_lines=400):
         lines = [line for line in lines if "[workload:" not in line or 'Cycle:' in line]
         # Strip fault-injection markers — the judge must not see which rank/fault was
         # injected in the raw log; it knows the ground truth from the structured args.
-        lines = [line for line in lines if "[MEGATRON_FAULT]" not in line]
+        lines = [line for line in lines if not any(marker in line for marker in INJECTION_MARKERS)]
         if len(lines) > max_lines:
             lines = lines[-max_lines:]
         return "".join(lines).strip()
@@ -161,17 +167,29 @@ def build_judge_prompt(fault_type, rank, iter_, nodes, run_valid, log_output, fr
 
 def score(args):
     args.run_valid = args.run_valid.lower() == "true"
-    api_key = load_nvidia_api_key()
+    api_key = os.getenv("JUDGE_API_KEY", "").strip()
+    if not api_key:
+        judge_key_file = os.getenv("JUDGE_API_KEY_FILE", "").strip()
+        if judge_key_file:
+            try:
+                with open(judge_key_file, encoding="utf-8") as f:
+                    api_key = f.read().strip()
+            except OSError:
+                api_key = ""
+    if not api_key:
+        api_key = load_nvidia_api_key()
     if not api_key:
         raise ValueError(
-            "NVIDIA_API_KEY not found. Set NVIDIA_API_KEY env var, "
-            "NVIDIA_API_KEY_FILE, or create ~/.nvidia_api_key"
+            "Judge API key not found. Set JUDGE_API_KEY/JUDGE_API_KEY_FILE, "
+            "or NVIDIA_API_KEY/NVIDIA_API_KEY_FILE, or create ~/.nvidia_api_key"
         )
 
+    base_url = os.getenv("JUDGE_BASE_URL", "").strip() or args.base_url
+
     llm = ChatOpenAI(
         model=args.model,
         api_key=api_key,
-        base_url=args.base_url,
+        base_url=base_url,
         temperature=0.0,
         max_completion_tokens=512,
     )
diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/watch_and_analyze.sh b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/watch_and_analyze.sh
index 8a5e3a4d..6b2a13c9 100755
--- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/watch_and_analyze.sh
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/watch_and_analyze.sh
@@ -22,6 +22,13 @@ SCORE_PY="${SCRIPT_DIR}/score_attribution.py"
 # Ensure nvidia_resiliency_ext is importable from source tree
 export PYTHONPATH="${NVRX_SRC_DIR}${PYTHONPATH:+:$PYTHONPATH}"
 
+strip_injection_markers() {  # usage: strip_injection_markers <input_log> <output_log>
+    local input_log="$1"
+    local output_log="$2"
+    grep -v -E 'FAULT INJECTION|nvidia_resiliency_ext\.shared_utils\.inject_fault' \
+        "${input_log}" > "${output_log}" 2>/dev/null || true  # ignore grep's non-zero exit
+}
+
 REPORT_FILE="${TRACKING_FILE%.tsv}_report.md"
 DONE_JOBS_FILE="${TRACKING_FILE%.tsv}_done.txt"
 
@@ -75,23 +82,25 @@ while true; do
         LOG_FILE=$(ls ${LOG_GLOB} 2>/dev/null | head -1 || true)
         LOG_OUT=""
 
-        # ---- Check run validity: did the fault actually fire? ----
-        # The fault injection prints: [MEGATRON_FAULT] global_rank=RANK/...: injecting FAULT_TYPE at iteration ITER
+        # ---- Check run validity: did the fault actually arm/fire? ----
+        # The fault injector prints:
+        #   [timestamp] FAULT INJECTION: Rank R will inject fault TYPE at timestamp
         RUN_VALID="false"
         STRIPPED_LOG=""
         if [[ -n "${LOG_FILE}" && -f "${LOG_FILE}" ]]; then
             echo "    log: ${LOG_FILE}"
-            if grep -qF "[MEGATRON_FAULT]" "${LOG_FILE}" 2>/dev/null; then
+            if grep -q "FAULT INJECTION" "${LOG_FILE}" 2>/dev/null; then
                 RUN_VALID="true"
             fi
             echo "    run_valid: ${RUN_VALID}"
 
             # Strip fault-injection markers so neither nvrx_logsage nor the judge
             # can see which rank/fault was injected — evaluation must be fair.
-            # [MEGATRON_FAULT] lines are printed by Megatron's debug_fault_injection.py
-            # and are not covered by --exclude_nvrx_logs.
+            # This removes:
+            # - scheduler lines from megatron.core.fault_injector ("FAULT INJECTION")
+            # - direct fault-tool log lines from nvidia_resiliency_ext.shared_utils.inject_fault
             STRIPPED_LOG=$(mktemp /tmp/fi_log_stripped.XXXXXX)
-            grep -vF "[MEGATRON_FAULT]" "${LOG_FILE}" > "${STRIPPED_LOG}" 2>/dev/null || true
+            strip_injection_markers "${LOG_FILE}" "${STRIPPED_LOG}"
 
             # nvrx_logsage.py prints 5 newline-joined fields to stdout:
             #   line 1: restart_decision
diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/workloads.conf b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/workloads.conf
index 7cea1674..dcc1dc62 100644
--- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/workloads.conf
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/workloads.conf
@@ -5,7 +5,9 @@
 #
 # NAME                 : identifier passed as WORKLOAD=<name> to prepare_node_alloc.sh
 # SCRIPT               : path to the sbatch job script (relative to the scripts/ dir)
-# BASE_EXPERIMENTS_DIR : root directory for all experiment output (logs, checkpoints, etc.)
+# BASE_EXPERIMENTS_DIR : root directory for all experiment output (logs, checkpoints, etc.);
+#                        "-" means use BASE_EXPERIMENTS_DIR from the environment or
+#                        the prepare_node_alloc.sh default
 # DESCRIPTION          : free-form human-readable label (no spaces; use underscores)
 # POOL_FILE            : (optional) pool file under scripts/pools/ to use as default pool
 #                        when POOL env var is not set; "-" means use the built-in default pool
@@ -14,4 +16,4 @@
 #
 # Fields are whitespace-separated. Lines starting with # are ignored.
 
-llama4_scout  l4_gb200_reduced.sh   /home/sbak/experiments/llama4-scout-gb200  Llama4-Scout_(reduced_layers)_on_GB200     -                    -
+llama4_scout  l4_gb200_reduced.sh   -                                           Llama4-Scout_(reduced_layers)_on_GB200     -                    -

From f73ff8b16bc6eecad5766f70e8c1fdc2041aa668 Mon Sep 17 00:00:00 2001
From: Seonmyeong Bak <sbak@nvidia.com>
Date: Thu, 23 Apr 2026 16:04:32 -0700
Subject: [PATCH 04/21] chore(skills): add slurm defaults template

---
 .../skills/nvrx-attr/scripts/slurm.conf               | 11 +++++++++++
 1 file changed, 11 insertions(+)
 create mode 100644 src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/slurm.conf

diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/slurm.conf b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/slurm.conf
new file mode 100644
index 00000000..764003dc
--- /dev/null
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/slurm.conf
@@ -0,0 +1,11 @@
+# Optional site-specific Slurm defaults for nvrx-attr scripts.
+#
+# This file is sourced by prepare_node_alloc.sh. Environment variables still
+# take precedence, so you can override these per invocation:
+#
+#   ACCOUNT=myacct PARTITION=gpu bash scripts/prepare_node_alloc.sh
+#
+# Leave values empty to rely on the cluster's default account / partition.
+
+ACCOUNT=""
+PARTITION=""

From 2275c6a6b33f04b832614a3be3a24bef915ebd57 Mon Sep 17 00:00:00 2001
From: Seonmyeong Bak <sbak@nvidia.com>
Date: Thu, 23 Apr 2026 21:56:08 -0700
Subject: [PATCH 05/21] feat(skills): add local env support for fault loop

---
 .gitignore                                    |  1 +
 .../nvrx-attr/fault-injection-loop/SKILL.md   | 55 ++++++++++++++++---
 .../nvrx-attr/scripts/l4_gb200_reduced.sh     | 20 +++----
 .../nvrx-attr/scripts/prepare_node_alloc.sh   | 35 +++++++++++-
 .../skills/nvrx-attr/scripts/run_session.sh   |  7 ++-
 5 files changed, 97 insertions(+), 21 deletions(-)

diff --git a/.gitignore b/.gitignore
index a24dba38..28d90ebe 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,3 +11,4 @@ ft_state.json
 *_pb2.pyi
 *_pb2_grpc.py
 .idea/
+src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/user.env
diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md b/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md
index 228d7ce1..0358f7ab 100644
--- a/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md
@@ -92,18 +92,44 @@ To run a custom subset, override `POOL` before calling the script:
 POOL="GPU_SLEEP:0:5:2 GPU_SLEEP:1:5:2" bash scripts/prepare_node_alloc.sh
 ```
 
+## Local User Config
+
+Put cluster-specific settings in `scripts/user.env`. This file is sourced by
+`run_session.sh`, `prepare_node_alloc.sh`, and `l4_gb200_reduced.sh`, and it is
+intended to stay local and untracked.
+
+Recommended contents:
+
+```bash
+PARTITION=gb-nvl-134-135
+BASE_EXPERIMENTS_DIR="${HOME}/nvrx-attr-experiments"
+MEGATRON_REPO_HOST_PATH="${HOME}/megatron-lm-main"
+SHARED_TMP_BASE_DIR="${HOME}/tmp"
+WORKSPACE_HOST_PATH="${HOME}/tmp"
+CONTAINER_IMAGE="nvcr.io/nvidia/nemo:26.04"
+```
+
+Use `user.env` for stable site defaults such as partition, container image, and
+host paths. Use per-run environment overrides for experiment-specific controls
+such as `POOL`, `WORKLOAD`, `BATCH_SIZE`, `FAULT_TYPE`, `FAULT_AT_ITER`, or
+`FAULT_DELAY`.
+
 Environment variables:
 
 | Variable | Default | Description |
 |---|---|---|
 | `WORKLOAD` | `llama4_scout` | Select a registered workload by name (see `scripts/workloads.conf`) |
-| `ACCOUNT` | `root` | SLURM account |
-| `PARTITION` | `gb-nvl-134-135` | SLURM partition |
+| `ACCOUNT` | _(cluster default or `scripts/slurm.conf`)_ | SLURM account |
+| `PARTITION` | _(cluster default or `scripts/slurm.conf`)_ | SLURM partition |
 | `GPUS_PER_NODE` | `4` | GPUs per node |
 | `TIME` | `00:30:00` | Per-job wall-clock limit |
 | `BATCH_SIZE` | `2` | Jobs submitted per round |
 | `POLL_INTERVAL` | `30` | Seconds between queue polls |
-| `BASE_EXPERIMENTS_DIR` | _(from workloads.conf or `llama4-scout-gb200`)_ | Root for all output |
+| `BASE_EXPERIMENTS_DIR` | `${HOME}/nvrx-attr-experiments` | Root for all output |
+| `MEGATRON_REPO_HOST_PATH` | `${HOME}/megatron-lm-main` | Host path to the Megatron checkout mounted into the container |
+| `SHARED_TMP_BASE_DIR` | `${HOME}/tmp` | Shared filesystem path used for cross-step coordination |
+| `WORKSPACE_HOST_PATH` | `${HOME}/tmp` | Host path mounted at `/workspace` inside the container |
+| `CONTAINER_IMAGE` | `nvcr.io/nvidia/nemo:26.04` | Container image used by the workload script |
 | `SBATCH_SCRIPT` | `scripts/l4_gb200_reduced.sh` | Job script to submit |
 | `POOL` | _(default pool above)_ | Space-separated experiment triplets |
 
@@ -111,7 +137,7 @@ Environment variables:
 
 | Name | Script | Base dir | Description |
 |---|---|---|---|
-| `llama4_scout` | `l4_gb200_reduced.sh` | `.../llama4-scout-gb200` | Llama4-Scout (reduced layers) on GB200 |
+| `llama4_scout` | `l4_gb200_reduced.sh` | `${HOME}/nvrx-attr-experiments` | Llama4-Scout (reduced layers) on GB200 |
 
 ```bash
 # Run the full pool against the validated example workload
@@ -187,7 +213,7 @@ To also run the sub-skills interactively for a single experiment:
 
 ## Step 4 — Score Each Experiment
 
-Scoring is performed by `scripts/score_attribution.py`, an LLM judge (Sonnet or Opus) that
+Scoring is performed by `scripts/score_attribution.py`, an LLM judge that
 receives the ground truth, the filtered raw log, the logsage attribution output, and the FR
 analysis output, then returns structured JSON scores with a reasoning note.
 
@@ -292,7 +318,7 @@ Required changes for a custom workload script:
    `${EXPERIMENT_DIR}/logs/slurm/${SLURM_JOB_ID}.*.1.main_workload.log`
    so `watch_and_analyze.sh` can find it.
 3. Write NCCL flight-recorder dumps under `${EXPERIMENT_DIR}/checkpoints/`.
-4. Emit a `[MEGATRON_FAULT] ...` marker when the fault is injected.
+4. Emit a log line containing `FAULT INJECTION` when the fault is armed or injected.
    `watch_and_analyze.sh` uses this to decide whether the run reached the injection point.
 5. Preserve the per-experiment directory layout:
    `logs/slurm/`, `checkpoints/`, and `tensorboard/`.
@@ -308,11 +334,12 @@ The example `SBATCH_SCRIPT` reads these env vars from `prepare_node_alloc.sh` vi
 | Variable | Default | Description |
 |---|---|---|
 | `FAULT_AT_ITER` | `5` | Training iteration at which to inject |
+| `FAULT_DELAY` | `15` | Seconds to wait after iteration `FAULT_AT_ITER` completes before the fault is injected |
 | `FAULT_RANK` | `1` | Global rank to inject `[0, total_ranks)` |
 | `FAULT_TYPE` | `GPU_SLEEP` | Megatron fault type enum name |
 | `GPUS_PER_NODE` | `4` | GPUs per node (used to compute `TOTAL_TASKS`) |
 | `EXPERIMENT_DIR` | `${BASE_EXPERIMENTS_DIR}/fault_injection/n${SLURM_NNODES}_${FAULT_TYPE}_r${FAULT_RANK}_i${FAULT_AT_ITER}` | Per-experiment output root |
-| `BASE_EXPERIMENTS_DIR` | `/home/sbak/experiments/llama4-scout-gb200` | Shared root (datacache, triton/inductor caches) |
+| `BASE_EXPERIMENTS_DIR` | `${HOME}/nvrx-attr-experiments` | Shared root (datacache, triton/inductor caches) |
 
 Valid `FAULT_TYPE` values:
 `GPU_ERROR`, `GPU_SLEEP`, `WORKLOAD_EXC`, `ASYNC_EXC`, `SIGNAL_EXC`, `OS_ABORT`,
@@ -324,13 +351,23 @@ Valid `FAULT_TYPE` values:
 
 ```bash
 # Manual runs land under fault_injection/manual/ by default (no session dir needed)
-EXPERIMENT_DIR=/home/sbak/experiments/llama4-scout-gb200/fault_injection/manual/n2_GPU_SLEEP_r1_i5
+EXPERIMENT_DIR=${HOME}/nvrx-attr-experiments/fault_injection/manual/n2_GPU_SLEEP_r1_i5
 mkdir -p ${EXPERIMENT_DIR}/logs/slurm ${EXPERIMENT_DIR}/checkpoints ${EXPERIMENT_DIR}/tensorboard
 
 sbatch \
     --nodes=2 \
     --output=${EXPERIMENT_DIR}/logs/slurm/%j.launch.out \
     --error=${EXPERIMENT_DIR}/logs/slurm/%j.launch.err \
-    --export=ALL,FAULT_TYPE=GPU_SLEEP,FAULT_RANK=1,FAULT_AT_ITER=5,GPUS_PER_NODE=4,EXPERIMENT_DIR=${EXPERIMENT_DIR} \
+    --export=ALL,FAULT_TYPE=GPU_SLEEP,FAULT_RANK=1,FAULT_AT_ITER=5,FAULT_DELAY=15,GPUS_PER_NODE=4,EXPERIMENT_DIR=${EXPERIMENT_DIR} \
     scripts/l4_gb200_reduced.sh
 ```
+
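+Optional site-specific cleanup (when `CONTAINER_CLEANUP_CMD` is non-empty, `l4_gb200_reduced.sh` runs it through `srun ... bash -lc`):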
+
+```bash
+export CONTAINER_CLEANUP_CMD='
+ENROOT_DIR="/var/lib/enroot/data/$(id -u)"
+rm -rf "${ENROOT_DIR:?}"/* 2>/dev/null || true
+echo "$(hostname): / $(df -h / | tail -1 | awk "{print \$3\" used, \"\$4\" avail\"}")"
+'
+```
diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/l4_gb200_reduced.sh b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/l4_gb200_reduced.sh
index 0c87a30d..a6e99b47 100644
--- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/l4_gb200_reduced.sh
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/l4_gb200_reduced.sh
@@ -18,8 +18,13 @@
 #SBATCH --mem=0
 
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+USER_ENV_FILE="${SCRIPT_DIR}/user.env"
 NVRX_SRC_ROOT_DEFAULT="$(cd "${SCRIPT_DIR}/../../../.." && pwd)"
 NVRX_REPO_ROOT_DEFAULT="$(cd "${SCRIPT_DIR}/../../../../.." && pwd)"
+if [[ -f "${USER_ENV_FILE}" ]]; then
+    # shellcheck disable=SC1090
+    source "${USER_ENV_FILE}"
+fi
 
 log_msg() {
     local msg="$1"
@@ -63,7 +68,7 @@ export TORCH_NCCL_EXTRA_DUMP_ON_EXEC=1
 # - FAULT_AT_ITER anchors the fault-delay timer after iteration N completes
 # - FAULT_DELAY is the delay in seconds from that anchor (or from training start if unset)
 export FAULT_AT_ITER="${FAULT_AT_ITER:-5}"
-export FAULT_DELAY="${FAULT_DELAY:-}"
+export FAULT_DELAY="${FAULT_DELAY:-15}"
 export FAULT_RANK="${FAULT_RANK:-1}"
 export FAULT_TYPE="${FAULT_TYPE:-GPU_SLEEP}"
 export ENABLE_FAULT_INJECTION="${ENABLE_FAULT_INJECTION:-1}"
@@ -79,7 +84,7 @@ export NFS_INDUCTOR_CACHE="${NFS_INDUCTOR_CACHE:-}"
 # USE_ASYNC_CKPT=1: enable async checkpointing every CKPT_SAVE_INTERVAL iters
 export USE_ASYNC_CKPT="${USE_ASYNC_CKPT:-0}"
 export CKPT_SAVE_INTERVAL="${CKPT_SAVE_INTERVAL:-100}"
-export ENABLE_ENROOT_CLEANUP="${ENABLE_ENROOT_CLEANUP:-0}"
+export CONTAINER_CLEANUP_CMD="${CONTAINER_CLEANUP_CMD:-}"
 
 # Node / task geometry (SLURM_NNODES is set by SLURM from --nodes override)
 export GPUS_PER_NODE="${GPUS_PER_NODE:-4}"
@@ -163,8 +168,8 @@ if [[ -n "${CONTAINER_NAME}" ]]; then
     CONTAINER_ARGS+=(--container-name "${CONTAINER_NAME}")
 fi
 
-# ── Disk cleanup: remove stale enroot containers from prior jobs ──────────────
-if [[ "${ENABLE_ENROOT_CLEANUP}" == "1" ]]; then
+# ── Optional site-specific container cleanup hook ──────────────────────────────
+if [[ -n "${CONTAINER_CLEANUP_CMD}" ]]; then
     log_msg "START disk_cleanup"
     srun \
         --label \
@@ -172,11 +177,7 @@ if [[ "${ENABLE_ENROOT_CLEANUP}" == "1" ]]; then
         --ntasks=${SLURM_NNODES} \
         --kill-on-bad-exit=0 \
         --mpi=none \
-        bash -c '
-            ENROOT_DIR="/var/lib/enroot/data/$(id -u)"
-            rm -rf "${ENROOT_DIR:?}"/* 2>/dev/null || true
-            echo "$(hostname): / $(df -h / | tail -1 | awk "{print \$3\" used, \"\$4\" avail\"}")"
-        '
+        bash -lc "${CONTAINER_CLEANUP_CMD}"
     log_msg "END disk_cleanup"
 else
     log_msg "SKIP disk_cleanup"
@@ -313,7 +314,6 @@ PY
             --no-mmap-bin-files \
             --tokenizer-type NullTokenizer \
             --tiktoken-pattern v2 \
-            --tokenizer-model /lustre/fsw/portfolios/llmservice/projects/llmservice_nlp_fm/nemotron6/tokenizers/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json \
             --micro-batch-size 1 \
             --global-batch-size 64 \
             --train-samples 10240000 \
diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/prepare_node_alloc.sh b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/prepare_node_alloc.sh
index 8b8d7b01..67d80be1 100755
--- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/prepare_node_alloc.sh
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/prepare_node_alloc.sh
@@ -26,11 +26,44 @@ set -euo pipefail
 SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
 WORKLOADS_CONF="${SCRIPT_DIR}/workloads.conf"
 SLURM_DEFAULTS_CONF="${SCRIPT_DIR}/slurm.conf"
+USER_ENV_FILE="${SCRIPT_DIR}/user.env"
+ACCOUNT_FROM_ENV="${ACCOUNT-}"
+PARTITION_FROM_ENV="${PARTITION-}"
+BASE_EXPERIMENTS_DIR_FROM_ENV="${BASE_EXPERIMENTS_DIR-}"
+MEGATRON_REPO_HOST_PATH_FROM_ENV="${MEGATRON_REPO_HOST_PATH-}"
+CONTAINER_IMAGE_FROM_ENV="${CONTAINER_IMAGE-}"
+SHARED_TMP_BASE_DIR_FROM_ENV="${SHARED_TMP_BASE_DIR-}"
+WORKSPACE_HOST_PATH_FROM_ENV="${WORKSPACE_HOST_PATH-}"
 
 if [[ -f "${SLURM_DEFAULTS_CONF}" ]]; then
     # shellcheck disable=SC1090
     source "${SLURM_DEFAULTS_CONF}"
 fi
+if [[ -f "${USER_ENV_FILE}" ]]; then
+    # shellcheck disable=SC1090
+    source "${USER_ENV_FILE}"
+fi
+if [[ -n "${ACCOUNT_FROM_ENV}" ]]; then
+    ACCOUNT="${ACCOUNT_FROM_ENV}"
+fi
+if [[ -n "${PARTITION_FROM_ENV}" ]]; then
+    PARTITION="${PARTITION_FROM_ENV}"
+fi
+if [[ -n "${BASE_EXPERIMENTS_DIR_FROM_ENV}" ]]; then
+    BASE_EXPERIMENTS_DIR="${BASE_EXPERIMENTS_DIR_FROM_ENV}"
+fi
+if [[ -n "${MEGATRON_REPO_HOST_PATH_FROM_ENV}" ]]; then
+    MEGATRON_REPO_HOST_PATH="${MEGATRON_REPO_HOST_PATH_FROM_ENV}"
+fi
+if [[ -n "${CONTAINER_IMAGE_FROM_ENV}" ]]; then
+    CONTAINER_IMAGE="${CONTAINER_IMAGE_FROM_ENV}"
+fi
+if [[ -n "${SHARED_TMP_BASE_DIR_FROM_ENV}" ]]; then
+    SHARED_TMP_BASE_DIR="${SHARED_TMP_BASE_DIR_FROM_ENV}"
+fi
+if [[ -n "${WORKSPACE_HOST_PATH_FROM_ENV}" ]]; then
+    WORKSPACE_HOST_PATH="${WORKSPACE_HOST_PATH_FROM_ENV}"
+fi
 
 # ── Workload resolution from workloads.conf ────────────────────────────────────
 # If WORKLOAD is set, look it up in workloads.conf and derive SBATCH_SCRIPT and
@@ -148,7 +181,7 @@ submit_one() {
         --mem=0 \
         --output="${EXPERIMENT_DIR}/logs/slurm/%j.launch.out" \
         --error="${EXPERIMENT_DIR}/logs/slurm/%j.launch.err" \
-        --export=ALL,FAULT_TYPE="${FAULT_TYPE}",FAULT_RANK="${RANK}",FAULT_AT_ITER="${ITER}",GPUS_PER_NODE="${GPUS_PER_NODE}",EXPERIMENT_DIR="${EXPERIMENT_DIR}",BASE_EXPERIMENTS_DIR="${BASE_EXPERIMENTS_DIR}" \
+        --export=ALL,FAULT_TYPE="${FAULT_TYPE}",FAULT_RANK="${RANK}",FAULT_AT_ITER="${ITER}",GPUS_PER_NODE="${GPUS_PER_NODE}",EXPERIMENT_DIR="${EXPERIMENT_DIR}",BASE_EXPERIMENTS_DIR="${BASE_EXPERIMENTS_DIR}",MEGATRON_REPO_HOST_PATH="${MEGATRON_REPO_HOST_PATH:-}",CONTAINER_IMAGE="${CONTAINER_IMAGE:-}",SHARED_TMP_BASE_DIR="${SHARED_TMP_BASE_DIR:-}",WORKSPACE_HOST_PATH="${WORKSPACE_HOST_PATH:-}" \
         --parsable
     )
     if [[ -n "${ACCOUNT}" ]]; then
diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/run_session.sh b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/run_session.sh
index ca5251bc..a8145d6c 100755
--- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/run_session.sh
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/run_session.sh
@@ -11,6 +11,11 @@
 set -euo pipefail
 
 SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+USER_ENV_FILE="${SCRIPT_DIR}/user.env"
+if [[ -f "${USER_ENV_FILE}" ]]; then
+    # shellcheck disable=SC1090
+    source "${USER_ENV_FILE}"
+fi
 WORKLOAD="${WORKLOAD:-llama4_scout}"
 
 # ---- Phase 1: submit and wait for all experiments ----
@@ -22,7 +27,7 @@ WORKLOAD="${WORKLOAD}" bash "${SCRIPT_DIR}/prepare_node_alloc.sh"
 # prepare_node_alloc.sh prints the tracking file path; re-derive it the same way
 # (SESSION_TAG is the timestamp when prepare_node_alloc ran, which is a few seconds
 # before this line — find the newest session dir instead of recomputing the tag)
-BASE_EXPERIMENTS_DIR="${BASE_EXPERIMENTS_DIR:-/home/sbak/experiments/llama4-scout-gb200}"
+BASE_EXPERIMENTS_DIR="${BASE_EXPERIMENTS_DIR:-${HOME}/nvrx-attr-experiments}"
 TRACKING_FILE=$(ls -td "${BASE_EXPERIMENTS_DIR}/fault_injection"/[0-9]* 2>/dev/null \
     | head -1)/experiments.tsv
 

From 3f2016d3385ee116b808285ba6e6542d25466c18 Mon Sep 17 00:00:00 2001
From: Seonmyeong Bak <sbak@nvidia.com>
Date: Thu, 23 Apr 2026 22:04:47 -0700
Subject: [PATCH 06/21] chore(skills): reduce torch cpp log verbosity

---
 .../skills/nvrx-attr/scripts/l4_gb200_reduced.sh                | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/l4_gb200_reduced.sh b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/l4_gb200_reduced.sh
index a6e99b47..f91f99ce 100644
--- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/l4_gb200_reduced.sh
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/l4_gb200_reduced.sh
@@ -51,7 +51,7 @@ export ONE_LOGGER_JOB_CATEGORY=test
 export LOGLEVEL=DEBUG
 export TORCHINDUCTOR_WORKER_START=fork
 export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
-export TORCH_CPP_LOG_LEVEL=INFO
+export TORCH_CPP_LOG_LEVEL=WARNING
 export TORCH_NCCL_TRACE_BUFFER_SIZE=2000
 export TORCH_NCCL_RETHROW_CUDA_ERRORS=0
 export TORCH_NCCL_ENABLE_MONITORING=1

From 43e39f74f6a835edc9a663ffd569dda6bfa090fc Mon Sep 17 00:00:00 2001
From: Seonmyeong Bak <sbak@nvidia.com>
Date: Thu, 23 Apr 2026 22:20:51 -0700
Subject: [PATCH 07/21] style(skills): format changed python files

---
 .../attribution/log_analyzer/nvrx_logsage.py  |  8 ++--
 .../nvrx-attr/scripts/score_attribution.py    | 47 ++++++++++++-------
 2 files changed, 33 insertions(+), 22 deletions(-)

diff --git a/src/nvidia_resiliency_ext/attribution/log_analyzer/nvrx_logsage.py b/src/nvidia_resiliency_ext/attribution/log_analyzer/nvrx_logsage.py
index 4ae1653b..be5aa7f1 100644
--- a/src/nvidia_resiliency_ext/attribution/log_analyzer/nvrx_logsage.py
+++ b/src/nvidia_resiliency_ext/attribution/log_analyzer/nvrx_logsage.py
@@ -123,7 +123,9 @@ def _finished_status_name(status: Any) -> str:
     return getattr(status, "name", status)
 
 
-def _sleep_with_backoff(attempt: int, retries: int, backoff: float, max_backoff: float, jitter: float) -> float:
+def _sleep_with_backoff(
+    attempt: int, retries: int, backoff: float, max_backoff: float, jitter: float
+) -> float:
     sleep_for = min(backoff, max_backoff) + random.uniform(0.0, jitter)
     logger.info(
         "Retrying log-analysis LLM in %.2fs after attempt %d/%d",
@@ -169,9 +171,7 @@ def _with_exponential_backoff(llm_call, checkpoint_saved: bool) -> tuple[str, st
     for attempt in range(1, retries + 1):
         try:
             result = llm_call()
-            if result and not any(
-                field == LOGSAGE_LLM_ENDPOINT_FAILED for field in result[:4]
-            ):
+            if result and not any(field == LOGSAGE_LLM_ENDPOINT_FAILED for field in result[:4]):
                 return result
             last_error = LOGSAGE_LLM_ENDPOINT_FAILED
         except Exception as exc:
diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/score_attribution.py b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/score_attribution.py
index 4d36ad66..8588f018 100644
--- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/score_attribution.py
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/score_attribution.py
@@ -23,7 +23,6 @@
 import logging
 import os
 import sys
-from typing import Union
 
 from langchain_openai import ChatOpenAI
 
@@ -43,18 +42,27 @@
 
 # Expected restart decision and rationale per fault type
 _RESTART_TABLE = {
-    "GPU_SLEEP":    ("RESTART IMMEDIATE", "transient GPU hang, recoverable"),
-    "LOCK_GIL":     ("RESTART IMMEDIATE", "transient Python GIL hang, recoverable"),
-    "SIGTERM":      ("RESTART IMMEDIATE", "external termination signal, recoverable"),
-    "SIGINT":       ("RESTART IMMEDIATE", "external interrupt signal, recoverable"),
-    "SIGSTOP":      ("RESTART IMMEDIATE", "external stop signal, recoverable"),
-    "SIGNAL_EXC":   ("RESTART IMMEDIATE", "signal-based exception, typically recoverable"),
-    "GPU_ERROR":    ("STOP - DONT RESTART IMMEDIATE", "hardware GPU error, may be persistent"),
-    "SIGKILL":      ("STOP - DONT RESTART IMMEDIATE", "hard kill, possible external pressure or OOM"),
-    "SEGFAULT":     ("STOP - DONT RESTART IMMEDIATE", "segmentation fault, likely code or memory corruption"),
-    "OS_ABORT":     ("STOP - DONT RESTART IMMEDIATE", "OS abort, likely severe system or hardware fault"),
+    "GPU_SLEEP": ("RESTART IMMEDIATE", "transient GPU hang, recoverable"),
+    "LOCK_GIL": ("RESTART IMMEDIATE", "transient Python GIL hang, recoverable"),
+    "SIGTERM": ("RESTART IMMEDIATE", "external termination signal, recoverable"),
+    "SIGINT": ("RESTART IMMEDIATE", "external interrupt signal, recoverable"),
+    "SIGSTOP": ("RESTART IMMEDIATE", "external stop signal, recoverable"),
+    "SIGNAL_EXC": ("RESTART IMMEDIATE", "signal-based exception, typically recoverable"),
+    "GPU_ERROR": ("STOP - DONT RESTART IMMEDIATE", "hardware GPU error, may be persistent"),
+    "SIGKILL": ("STOP - DONT RESTART IMMEDIATE", "hard kill, possible external pressure or OOM"),
+    "SEGFAULT": (
+        "STOP - DONT RESTART IMMEDIATE",
+        "segmentation fault, likely code or memory corruption",
+    ),
+    "OS_ABORT": (
+        "STOP - DONT RESTART IMMEDIATE",
+        "OS abort, likely severe system or hardware fault",
+    ),
     "WORKLOAD_EXC": ("STOP - DONT RESTART IMMEDIATE", "application exception, likely a code bug"),
-    "ASYNC_EXC":    ("STOP - DONT RESTART IMMEDIATE", "async exception in workload, likely a code bug"),
+    "ASYNC_EXC": (
+        "STOP - DONT RESTART IMMEDIATE",
+        "async exception in workload, likely a code bug",
+    ),
 }
 
 
@@ -85,7 +93,9 @@ def load_log_excerpt(log_path, max_lines=400):
         return f"(could not read log file: {exc})"
 
 
-def build_judge_prompt(fault_type, rank, iter_, nodes, run_valid, log_output, fr_output, log_excerpt):
+def build_judge_prompt(
+    fault_type, rank, iter_, nodes, run_valid, log_output, fr_output, log_excerpt
+):
     total_ranks = nodes * 4  # GPUS_PER_NODE=4 in the example SBATCH_SCRIPT
     expected_restart, restart_rationale = _RESTART_TABLE.get(
         fault_type, ("unknown", "unknown fault type")
@@ -217,9 +227,7 @@ def score(args):
     # Strip markdown code fences if present
     if text.startswith("```"):
         lines = text.splitlines()
-        text = "\n".join(
-            line for line in lines if not line.startswith("```")
-        ).strip()
+        text = "\n".join(line for line in lines if not line.startswith("```")).strip()
 
     result = json.loads(text)
     return result
@@ -231,8 +239,11 @@ def main():
     parser.add_argument("--rank", type=int, required=True, help="Injected global rank")
     parser.add_argument("--iter", type=int, required=True, help="Injected iteration")
     parser.add_argument("--nodes", type=int, required=True, help="Node count")
-    parser.add_argument("--run-valid", default="true",
-                        help="'true' if training reached the fault injection point, 'false' otherwise")
+    parser.add_argument(
+        "--run-valid",
+        default="true",
+        help="'true' if training reached the fault injection point, 'false' otherwise",
+    )
     parser.add_argument("--log-path", default="", help="Path to the raw job log file")
     parser.add_argument("--log-output", default="", help="Raw stdout from nvrx_logsage")
     parser.add_argument("--fr-output", default="no_dumps", help="Raw text from CollectiveAnalyzer")

From 251df4eb7039011d9951876716ea6934dbb4c5b3 Mon Sep 17 00:00:00 2001
From: Seonmyeong Bak <sbak@nvidia.com>
Date: Fri, 24 Apr 2026 11:54:21 -0700
Subject: [PATCH 08/21] fix(skills): wire feedback-loop analysis outputs

---
 .../attribution/log_analyzer/nvrx_logsage.py  | 15 +++-
 .../trace_analyzer/fr_attribution.py          | 87 ++++++++++++++-----
 .../nvrx-attr/fault-injection-loop/SKILL.md   | 18 ++--
 .../skills/nvrx-attr/fr-analysis/SKILL.md     | 18 ++--
 .../skills/nvrx-attr/scripts/user.env.example | 25 ++++++
 .../nvrx-attr/scripts/watch_and_analyze.sh    | 51 +++++------
 6 files changed, 147 insertions(+), 67 deletions(-)
 create mode 100644 src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/user.env.example

diff --git a/src/nvidia_resiliency_ext/attribution/log_analyzer/nvrx_logsage.py b/src/nvidia_resiliency_ext/attribution/log_analyzer/nvrx_logsage.py
index be5aa7f1..0340a6b3 100644
--- a/src/nvidia_resiliency_ext/attribution/log_analyzer/nvrx_logsage.py
+++ b/src/nvidia_resiliency_ext/attribution/log_analyzer/nvrx_logsage.py
@@ -461,11 +461,24 @@ def main():
         action='store_true',
         help='Input is already per-cycle data (skip filtering and chunking)',
     )
+    parser.add_argument(
+        '--emit-stdout',
+        action='store_true',
+        help='Print final attribution payload to stdout for machine consumers',
+    )
 
     args = parser.parse_args()
 
     analyzer = NVRxLogAnalyzer(args)
-    analyzer.run_sync(args)
+    results = analyzer.run_sync(args)
+
+    if args.emit_stdout:
+        for result in results:
+            if not result:
+                continue
+            payload = result[0] if isinstance(result, tuple) else result
+            if payload:
+                print(payload)
 
 
 if __name__ == "__main__":
diff --git a/src/nvidia_resiliency_ext/attribution/trace_analyzer/fr_attribution.py b/src/nvidia_resiliency_ext/attribution/trace_analyzer/fr_attribution.py
index a5d06560..f4584cb4 100644
--- a/src/nvidia_resiliency_ext/attribution/trace_analyzer/fr_attribution.py
+++ b/src/nvidia_resiliency_ext/attribution/trace_analyzer/fr_attribution.py
@@ -34,6 +34,42 @@ def eprint(*args, **kwargs):
     print(*args, file=sys.stderr, **kwargs)
 
 
+def _parse_rank_list(rank_text: str) -> List[int]:  # e.g. "3, 7,x" -> [3, 7]
+    ranks = []
+    for token in rank_text.split(','):
+        token = token.strip()
+        if not token:
+            continue  # empty field, e.g. from a trailing comma
+        try:
+            ranks.append(int(token))
+        except ValueError:
+            continue  # non-integer token: skip it instead of failing the whole parse
+    return ranks
+
+
+def _extract_missing_ranks_from_table(text: str) -> List[int]:
+    hanging_ranks = set()  # a set, so ranks repeated across rows are kept once
+    capture = False  # becomes True once the table header row has been seen
+
+    for line in text.splitlines():
+        stripped = line.strip()
+        if not stripped:
+            continue
+        if stripped.startswith("PGID") and "Missing Ranks" in stripped:
+            capture = True  # header row: later "|"-separated lines are treated as data rows
+            continue
+        if not capture or "|" not in stripped:
+            continue  # ignore text before the header and lines that are not table rows
+
+        columns = [col.strip() for col in stripped.split("|")]
+        if len(columns) < 6:
+            continue  # fewer than six fields: not parsed as a data row
+        for rank in _parse_rank_list(columns[-1]):  # last column is read as the rank list
+            hanging_ranks.add(rank)
+
+    return sorted(hanging_ranks)
+
+
 @dataclass
 class Collective:
     """
@@ -134,12 +170,7 @@ async def print_output(self, attribution_result: Optional[str]):
                 hanging_ranks_str = hanging_ranks.group(1).strip()
                 hanging_ranks_list = list(map(int, hanging_ranks_str.split(',')))
         else:
-            for idx, line in enumerate(text.split('\n')):
-                line_list = line.split('|')
-                if len(line_list) >= 5:
-                    logger.info(line)
-                    if idx >= 1:
-                        hanging_ranks_list.append(line_list[5])
+            hanging_ranks_list = _extract_missing_ranks_from_table(text)
         hanging_ranks = f"hanging ranks: {hanging_ranks_list}"
         # Dict form preserves collective table text for MCP clients and FRAnalysisResult parity.
         return (
@@ -218,20 +249,18 @@ def build_collectives_to_order():
         # analyze collectives to find process groups with missing and completed ranks
         completed_pg, missing_pg = self.analyze_matches(verbose=bool(cfg.get("verbose")))
         grouped_missing_pgs = {}
-        grouped_completed_pgs = {}
 
         # if the dump file contains health check results, parse the health check results
         # and print them in a format
         if cfg.get("health_check"):
             self.print_node_health_status(verbose=bool(cfg.get("verbose")))
 
-        # group the process groups with missing and completed ranks
-        # by finding longest paths in the graph
+        # Group only process groups with missing ranks.
+        # Completed-rank summaries are not actionable for attribution and create
+        # misleading output in the feedback loop.
         grouped_missing_pgs = self.group_pgs(missing_pg)
-        if len(grouped_missing_pgs) == 0:
-            grouped_completed_pgs = self.group_pgs(completed_pg)
 
-        # gather the head node of each group with missing and completed ranks
+        # gather the head node of each group with missing ranks
         # the head node is the first node in the group
         # the missing ranks in the head node of the missing process groups
         # are considered to cause the other nodes in the group to hang
@@ -242,16 +271,16 @@ def gather_head_nodes(grouped_pgs):
             return head_nodes
 
         head_nodes_missing = None
-        head_nodes_completed = None
-        # Gather the head node of each group
+        # Gather the head node of each missing-rank group.
         if len(grouped_missing_pgs) > 0:
             head_nodes_missing = gather_head_nodes(grouped_missing_pgs)
             logger.debug(f"head_nodes of missing_pg: {head_nodes_missing}")
-        else:
-            head_nodes_completed = gather_head_nodes(grouped_completed_pgs)
-            logger.debug(f"head_nodes of completed_pg: {head_nodes_completed}")
         # Print the analysis output
-        with capture_logs() as output:
+        original_level = logger.level
+        if logger.getEffectiveLevel() > logging.INFO:
+            logger.setLevel(logging.INFO)
+
+        with capture_logs(logger.name) as output:
 
             def print_ranks_in_pgs(head_nodes, pg_dict, missing_or_completed="Missing"):
                 logger.info(
@@ -273,10 +302,8 @@ def print_ranks_in_pgs(head_nodes, pg_dict, missing_or_completed="Missing"):
             if head_nodes_missing:
                 logger.debug(f"head_nodes_missing: {head_nodes_missing}")
                 print_ranks_in_pgs(head_nodes_missing, missing_pg, "Missing")
-            # TODO: using this completed pg needs to be updated with new algorithm for isolation
-            if head_nodes_completed:
-                print_ranks_in_pgs(head_nodes_completed, completed_pg, "Completed")
         analysis_output = output.getvalue()
+        logger.setLevel(original_level)
         return analysis_output
 
     async def collective_analysis(self, analysis_output: str) -> Optional[str]:
@@ -1117,7 +1144,7 @@ def main():
         '--fr-path', type=str, help='Path to JSON files or directories containing JSON files'
     )
     parser.add_argument(
-        '-p', '--pattern', default="*.json", help='File pattern to match (default: *.json)'
+        '-p', '--pattern', default="_dump_*", help='File pattern to match (default: _dump_*)'
     )
     parser.add_argument('-v', '--verbose', action='store_true', help='verbose output')
     parser.add_argument(
@@ -1143,11 +1170,25 @@ def main():
         action='store_true',
         help='Convert the trace file to json file, if the trace is binary, for debugging',
     )
+    parser.add_argument(
+        '--emit-stdout',
+        action='store_true',
+        help='Print final FR summary table to stdout for machine consumers',
+    )
 
     args = parser.parse_args()
 
     analyzer = CollectiveAnalyzer(args)
-    analyzer.run_sync(args)
+    result = analyzer.run_sync(args)
+
+    if args.emit_stdout and isinstance(result, tuple) and result:
+        payload = result[0]
+        if isinstance(payload, dict):
+            text = payload.get("analysis_text", "")
+            if text:
+                print(text)
+        elif payload:
+            print(payload)
 
 
 if __name__ == "__main__":
diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md b/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md
index 0358f7ab..879cbe31 100644
--- a/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md
@@ -94,9 +94,15 @@ POOL="GPU_SLEEP:0:5:2 GPU_SLEEP:1:5:2" bash scripts/prepare_node_alloc.sh
 
 ## Local User Config
 
-Put cluster-specific settings in `scripts/user.env`. This file is sourced by
-`run_session.sh`, `prepare_node_alloc.sh`, and `l4_gb200_reduced.sh`, and it is
-intended to stay local and untracked.
+Start from the tracked template:
+
+```bash
+cp scripts/user.env.example scripts/user.env
+```
+
+Then edit `scripts/user.env` with cluster-specific settings. This file is
+sourced by `run_session.sh`, `prepare_node_alloc.sh`, and
+`l4_gb200_reduced.sh`, and it is intended to stay local and untracked.
 
 Recommended contents:
 
@@ -198,7 +204,7 @@ The watcher:
 1. Reads each row from the tracking TSV
 2. Calls `nvrx_logsage.py --exclude_nvrx_logs` and parses the text output to get
    `restart_decision` and `attribution_text`
-3. Calls `CollectiveAnalyzer` from `fr_attribution.py` to get suspect ranks
+3. Calls FR analysis as `python -m nvidia_resiliency_ext.attribution.trace_analyzer.fr_attribution --fr-path "${EXPERIMENT_DIR}/checkpoints" -p "_dump_*"` and passes the raw table output to the judge
 4. Scores 7 dimensions (restart correctness, rank primary, rank any, category, type, FR rank)
 5. Appends a scored row to `<session>_report.md`
 6. Repeats until all experiments are analyzed
@@ -206,7 +212,7 @@ The watcher:
 To also run the sub-skills interactively for a single experiment:
 ```bash
 /log-analysis --log-path "${EXPERIMENT_DIR}/logs/slurm/${JOB_ID}.*.1.main_workload.log"
-/fr-analysis  --fr-path  "${EXPERIMENT_DIR}/checkpoints/"
+/fr-analysis  --fr-path "${EXPERIMENT_DIR}/checkpoints" -p "_dump_*"
 ```
 
 ---
@@ -231,7 +237,7 @@ The judge is given:
 2. Expected restart decision + rationale (derived from `score_attribution.py:_RESTART_TABLE`)
 3. Filtered raw log (last 400 lines, same `exclude_nvrx_logs` filtering as logsage)
 4. Raw logsage stdout (5-field text format)
-5. Raw CollectiveAnalyzer text output
+5. Raw FR analysis table output from `fr_attribution.py --fr-path ... -p "_dump_*"`
 
 Default judge model: `qwen/qwen3.5-397b-a17b`. Override with `--model` in `score_attribution.py`.
 
diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/fr-analysis/SKILL.md b/src/nvidia_resiliency_ext/skills/nvrx-attr/fr-analysis/SKILL.md
index d07911ec..17cc7de5 100644
--- a/src/nvidia_resiliency_ext/skills/nvrx-attr/fr-analysis/SKILL.md
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/fr-analysis/SKILL.md
@@ -22,7 +22,7 @@ and isolate the ranks responsible, using `CollectiveAnalyzer`.
 
 ## What it does
 
-1. Loads all FR dump files (JSON or binary pickle) matching a glob pattern under `--fr-path`.
+1. Loads all FR dump files matching a glob pattern under `--fr-path`.
 2. Parses each dump into `Collective` records (op type, ranks, process group, timing, state).
 3. Groups collectives by process group and sequence ID across ranks to detect mismatches.
 4. Identifies the **wavefront** — the process group boundary where collectives diverge — and
@@ -37,7 +37,7 @@ and isolate the ranks responsible, using `CollectiveAnalyzer`.
 ```bash
 python scripts/fr_attribution.py \
     --fr-path /path/to/fr_dumps/ \
-    [--pattern "*.json"] \
+    [-p "_dump_*"] \
     [--verbose] \
     [--health-check] \
     [--llm-analyze] \
@@ -48,7 +48,7 @@ python scripts/fr_attribution.py \
 | Flag | Default | Description |
 |------|---------|-------------|
 | `--fr-path` | required | Path to a directory (or single file) containing FR dump files |
-| `--pattern` | `*.json` | Glob pattern for dump files within `--fr-path` |
+| `--pattern`, `-p` | `_dump_*` | Glob pattern for dump files within `--fr-path` |
 | `--verbose`, `-v` | off | Print detailed per-rank collective tables |
 | `--health-check`, `-c` | off | Include node health check results in output |
 | `--llm-analyze`, `-l` | off | Pass structured findings to the LLM for a narrative summary |
@@ -64,7 +64,7 @@ from nvidia_resiliency_ext.attribution.trace_analyzer.fr_attribution import Coll
 
 analyzer = CollectiveAnalyzer({
     "fr_path": "/path/to/fr_dumps/",
-    "pattern": "*.json",
+    "pattern": "_dump_*",
     "verbose": False,
     "health_check": False,
     "llm_analyze": False,
@@ -80,10 +80,10 @@ results = analyzer.run_sync({
 
 ## Output
 
-Returns `(text, AttributionState)` pairs where `text` describes:
+Returns `(text, AttributionState)` pairs where `text` is the FR analysis table and describes:
 
-- The **wavefront process group** where collectives diverged
-- **Missing ranks** at the wavefront (root-cause suspects)
+- The selected wavefront/front process group
+- **Missing ranks** at that process group (root-cause suspects)
 - Per-rank collective status tables (when `--verbose`)
 - Node health summary (when `--health-check`)
 - LLM narrative (when `--llm-analyze`)
@@ -97,8 +97,8 @@ may be restartable after isolating the identified ranks.
 
 | Format | Notes |
 |--------|-------|
-| JSON (`.json`) | Standard PyTorch FR export; default glob pattern |
-| Binary pickle | Detected automatically; use `--debug` to convert to JSON |
+| `_dump_*` files | PyTorch FR dump prefix pattern used by the feedback loop |
+| Binary pickle / JSON payloads | Detected automatically; use `--debug` to convert binary traces to JSON |
 
 FR dumps are typically written to the directory specified by `TORCH_NCCL_DEBUG_INFO_TEMP_FILE`
 or triggered automatically on NCCL timeout.
diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/user.env.example b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/user.env.example
new file mode 100644
index 00000000..273a488d
--- /dev/null
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/user.env.example
@@ -0,0 +1,25 @@
+# Local site settings for the nvrx-attr fault-injection scripts.
+# Copy to `user.env` and adjust for your cluster and host paths.
+#
+# This file is sourced by:
+# - run_session.sh
+# - prepare_node_alloc.sh
+# - l4_gb200_reduced.sh
+#
+# Per-run overrides can still be provided as environment variables when invoking
+# the scripts.
+
+# SLURM defaults
+# ACCOUNT=myacct
+# PARTITION=my-partition
+
+# Host paths
+BASE_EXPERIMENTS_DIR="${HOME}/nvrx-attr-experiments"
+MEGATRON_REPO_HOST_PATH="${HOME}/megatron-lm-main"
+SHARED_TMP_BASE_DIR="${HOME}/tmp"
+WORKSPACE_HOST_PATH="${HOME}/tmp"
+
+# Container settings
+CONTAINER_IMAGE="nvcr.io/nvidia/nemo:26.04"
+# CONTAINER_NAME=
+# CONTAINER_WORKDIR=/
diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/watch_and_analyze.sh b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/watch_and_analyze.sh
index 6b2a13c9..591af5ea 100755
--- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/watch_and_analyze.sh
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/watch_and_analyze.sh
@@ -17,7 +17,13 @@ SKILL_DIR="$(dirname "${SCRIPT_DIR}")"
 NVRX_SRC_DIR="$(cd "${SKILL_DIR}/../../.." && pwd)"
 
 LOGSAGE_PY="${SKILL_DIR}/log-analysis/scripts/nvrx_logsage.py"
+FR_ANALYSIS_MODULE="nvidia_resiliency_ext.attribution.trace_analyzer.fr_attribution"
 SCORE_PY="${SCRIPT_DIR}/score_attribution.py"
+LOG_ANALYSIS_MODEL="${LOG_ANALYSIS_MODEL:-${NVRX_LLM_MODEL:-nvidia/nemotron-3-super-120b-a12b}}"
+LOG_ANALYSIS_BASE_URL="${LOG_ANALYSIS_BASE_URL:-${NVRX_LLM_BASE_URL:-https://inference-api.nvidia.com}}"
+JUDGE_MODEL="${JUDGE_MODEL:-qwen/qwen3.5-397b-a17b}"
+JUDGE_BASE_URL="${JUDGE_BASE_URL:-https://inference-api.nvidia.com}"
+FR_PATTERN="${FR_PATTERN:-_dump_*}"
 
 # Ensure nvidia_resiliency_ext is importable from source tree
 export PYTHONPATH="${NVRX_SRC_DIR}${PYTHONPATH:+:$PYTHONPATH}"
@@ -110,6 +116,9 @@ while true; do
             #   last line: checkpoint_saved ("True" / "False")
             LOG_OUT=$(python3 "${LOGSAGE_PY}" \
                 --log-path "${STRIPPED_LOG}" \
+                --model "${LOG_ANALYSIS_MODEL}" \
+                --base_url "${LOG_ANALYSIS_BASE_URL}" \
+                --emit-stdout \
                 --exclude_nvrx_logs 2>/dev/null || echo "")
             LOG_RESTART=$(echo "${LOG_OUT}" | head -1)
             echo "    restart_decision: ${LOG_RESTART:-<empty>}"
@@ -122,33 +131,17 @@ while true; do
         FR_DIR="${EXPERIMENT_DIR}/checkpoints"
         FR_OUT="no_dumps"
 
-        if [[ "${RUN_VALID}" == "true" ]] && ls "${FR_DIR}"/_dump_* 2>/dev/null | grep -q .; then
-            echo "    FR dumps: $(ls "${FR_DIR}"/_dump_* 2>/dev/null | wc -l) files"
-            FR_OUT=$(python3 -c "
-import sys, logging
-logging.basicConfig(level=logging.WARNING)
-sys.path.insert(0, '${NVRX_SRC_DIR}')
-from nvidia_resiliency_ext.attribution.trace_analyzer.fr_attribution import CollectiveAnalyzer
-try:
-    ca = CollectiveAnalyzer({'fr_path': '${FR_DIR}'})
-    results = ca.run_sync({'fr_path': '${FR_DIR}'})
-    if results:
-        result_data = results[0]
-        if isinstance(result_data, dict):
-            text = result_data.get('analysis_text', '')
-            ranks = result_data.get('hanging_ranks', '')
-            if text:
-                print(text)
-            if ranks:
-                print(ranks)
-        else:
-            print(str(result_data))
-    else:
-        print('no results')
-except Exception as e:
-    print('error: ' + str(e), file=sys.stderr)
-    print('no_dumps')
-" 2>/dev/null || echo "no_dumps")
+        if [[ "${RUN_VALID}" == "true" ]] && ls "${FR_DIR}"/${FR_PATTERN} 2>/dev/null | grep -q .; then
+            echo "    FR dumps: $(ls "${FR_DIR}"/${FR_PATTERN} 2>/dev/null | wc -l) files"
+            # Use the FR CLI contract directly:
+            #   --fr-path <directory containing dumps> --emit-stdout -p "${FR_PATTERN}"  (default '_dump_*')
+            FR_OUT=$(python3 -m "${FR_ANALYSIS_MODULE}" \
+                --fr-path "${FR_DIR}" \
+                --emit-stdout \
+                -p "${FR_PATTERN}" 2>/dev/null || echo "no_dumps")
+            if [[ -z "${FR_OUT}" ]]; then
+                FR_OUT="no_dumps"
+            fi
         elif [[ "${RUN_VALID}" == "false" ]]; then
             FR_OUT="run_invalid"
             echo "    FR analysis skipped (run did not reach fault injection point)"
@@ -164,7 +157,9 @@ except Exception as e:
             --run-valid "${RUN_VALID}" \
             --log-path "${STRIPPED_LOG:-}" \
             --log-output "${LOG_OUT}" \
-            --fr-output "${FR_OUT}" 2>/dev/null || echo '{"notes":"judge_failed"}')
+            --fr-output "${FR_OUT}" \
+            --model "${JUDGE_MODEL}" \
+            --base-url "${JUDGE_BASE_URL}" 2>/dev/null || echo '{"notes":"judge_failed"}')
 
         # Clean up temp stripped log
         [[ -n "${STRIPPED_LOG}" && -f "${STRIPPED_LOG}" ]] && rm -f "${STRIPPED_LOG}"

From 3b791bc3f5d5622b3f946cdede3931c8b3cdf5f9 Mon Sep 17 00:00:00 2001
From: Seonmyeong Bak <sbak@nvidia.com>
Date: Fri, 24 Apr 2026 12:39:24 -0700
Subject: [PATCH 09/21] fix(skills): refine feedback-loop scoring config

---
 .../nvrx-attr/fault-injection-loop/SKILL.md   | 35 ++++++++++---
 .../nvrx-attr/scripts/score_attribution.py    | 51 +++++++++++++++++--
 .../skills/nvrx-attr/scripts/user.env.example | 10 ++++
 .../nvrx-attr/scripts/watch_and_analyze.sh    |  9 +++-
 4 files changed, 92 insertions(+), 13 deletions(-)

diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md b/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md
index 879cbe31..857ba4f3 100644
--- a/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md
@@ -101,7 +101,7 @@ cp scripts/user.env.example scripts/user.env
 ```
 
 Then edit `scripts/user.env` with cluster-specific settings. This file is
-sourced by `run_session.sh`, `prepare_node_alloc.sh`, and
+sourced by `run_session.sh`, `prepare_node_alloc.sh`, `watch_and_analyze.sh`, and
 `l4_gb200_reduced.sh`, and it is intended to stay local and untracked.
 
 Recommended contents:
@@ -113,12 +113,20 @@ MEGATRON_REPO_HOST_PATH="${HOME}/megatron-lm-main"
 SHARED_TMP_BASE_DIR="${HOME}/tmp"
 WORKSPACE_HOST_PATH="${HOME}/tmp"
 CONTAINER_IMAGE="nvcr.io/nvidia/nemo:26.04"
+NVIDIA_API_KEY_FILE="${HOME}/.nvidia_api_key"
+JUDGE_API_KEY_FILE="${HOME}/.nvidia_api_key"
+NVRX_LLM_MODEL="nvidia/nemotron-3-super-120b-a12b"
+NVRX_LLM_BASE_URL="https://integrate.api.nvidia.com/v1"
+JUDGE_MODEL="qwen/qwen3.5-397b-a17b"
+JUDGE_BASE_URL="https://integrate.api.nvidia.com/v1"
+FR_RACK_SIZE=32
 ```
 
 Use `user.env` for stable site defaults such as partition, container image, and
-host paths. Use per-run environment overrides for experiment-specific controls
-such as `POOL`, `WORKLOAD`, `BATCH_SIZE`, `FAULT_TYPE`, `FAULT_AT_ITER`, or
-`FAULT_DELAY`.
+host paths, plus local LLM credentials and endpoint settings for log-analysis
+and the judge. Use per-run environment overrides for experiment-specific
+controls such as `POOL`, `WORKLOAD`, `BATCH_SIZE`, `FAULT_TYPE`,
+`FAULT_AT_ITER`, or `FAULT_DELAY`.
 
 Environment variables:
 
@@ -136,6 +144,13 @@ Environment variables:
 | `SHARED_TMP_BASE_DIR` | `${HOME}/tmp` | Shared filesystem path used for cross-step coordination |
 | `WORKSPACE_HOST_PATH` | `${HOME}/tmp` | Host path mounted at `/workspace` inside the container |
 | `CONTAINER_IMAGE` | `nvcr.io/nvidia/nemo:26.04` | Container image used by the workload script |
+| `NVIDIA_API_KEY_FILE` | _unset_ | File containing the log-analysis API key |
+| `JUDGE_API_KEY_FILE` | _unset_ | File containing the judge API key |
+| `NVRX_LLM_MODEL` | `nvidia/nemotron-3-super-120b-a12b` | Model for log-analysis |
+| `NVRX_LLM_BASE_URL` | `https://integrate.api.nvidia.com/v1` | Base URL for log-analysis |
+| `JUDGE_MODEL` | `qwen/qwen3.5-397b-a17b` | Model for judge scoring |
+| `JUDGE_BASE_URL` | `https://integrate.api.nvidia.com/v1` | Base URL for judge scoring |
+| `FR_RACK_SIZE` | `32` | Ranks per rack for coarse FR scoring |
 | `SBATCH_SCRIPT` | `scripts/l4_gb200_reduced.sh` | Job script to submit |
 | `POOL` | _(default pool above)_ | Space-separated experiment triplets |
 
@@ -206,7 +221,7 @@ The watcher:
    `restart_decision` and `attribution_text`
 3. Calls FR analysis as `python -m nvidia_resiliency_ext.attribution.trace_analyzer.fr_attribution --fr-path "${EXPERIMENT_DIR}/checkpoints" -p "_dump_*"` and passes the raw table output to the judge
 4. Scores 7 dimensions (restart correctness, rank primary, rank any, category, type, FR rank)
-5. Appends a scored row to `<session>_report.md`
+5. Appends a scored row to `<session>_report.md` as a markdown table row
 6. Repeats until all experiments are analyzed
 
 To also run the sub-skills interactively for a single experiment:
@@ -229,7 +244,7 @@ analysis output, then returns structured JSON scores with a reasoning note.
 | **rank_primary** | `true` / `false` / `partial` | Injected rank is the primary root-cause in attribution |
 | **rank_any** | `true` / `false` | Injected rank mentioned anywhere in attribution |
 | **fault_described** | `true` / `false` / `partial` | Fault nature (hang/crash/signal/exception) correctly described |
-| **fr_rank_correct** | `true` / `false` / `no_dumps` | FR analysis identifies injected rank as suspect |
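+| **fr_rank_correct** | `rank` / `node` / `rack` / `false` / `no_dumps` | Finest scope at which FR analysis correctly isolates the fault (the injected rank, its node, or its rack); `false` if FR points elsewhere or does not narrow usefully; `no_dumps` if there is no actionable FR output |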
 | **judge_notes** | string | One-sentence summary of the main gap or confirmation |
 
 The judge is given:
@@ -238,13 +253,19 @@ The judge is given:
 3. Filtered raw log (last 400 lines, same `exclude_nvrx_logs` filtering as logsage)
 4. Raw logsage stdout (5-field text format)
 5. Raw FR analysis table output from `fr_attribution.py --fr-path ... -p "_dump_*"`
+6. `GPUS_PER_NODE` and `FR_RACK_SIZE` to map the injected rank to node and rack scopes for FR scoring
 
 Default judge model: `qwen/qwen3.5-397b-a17b`. Override with `--model` in `score_attribution.py`.
+Default rack size for FR scope scoring: `32` ranks. Override with `FR_RACK_SIZE`.
 
 ---
 
 ## Step 5 — Aggregate Results
 
+The canonical output of the loop is the markdown table in `<session>_report.md`.
+When summarizing results for users, prefer linking to that file and reproducing the
+same table shape rather than flattening the results into plain prose.
+
 The report markdown table from `watch_and_analyze.sh` gives a matrix view. Look for
 patterns across rows:
 
@@ -266,6 +287,8 @@ Common failure mode patterns and their meaning:
 | `fault_described=partial` for crash types | Crash keywords present but fault type not specifically named |
 | `restart_correct=false` for GPU_ERROR | LLM conflating hardware error with recoverable hang |
 | `fr_rank_correct=no_dumps` | NCCL watchdog did not fire before job ended — adjust `TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC` |
+| `fr_rank_correct=node` | FR isolated the correct node but not the exact rank |
+| `fr_rank_correct=rack` | FR isolated the correct rack-sized rank group but not the exact node/rank |
 
 ---
 
diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/score_attribution.py b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/score_attribution.py
index 8588f018..b699096f 100644
--- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/score_attribution.py
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/score_attribution.py
@@ -39,6 +39,8 @@
 
 # Default judge model — override with --model
 DEFAULT_JUDGE_MODEL = "qwen/qwen3.5-397b-a17b"
+DEFAULT_GPUS_PER_NODE = int(os.getenv("GPUS_PER_NODE", "4"))
+DEFAULT_FR_RACK_SIZE = int(os.getenv("FR_RACK_SIZE", "32"))
 
 # Expected restart decision and rationale per fault type
 _RESTART_TABLE = {
@@ -94,9 +96,21 @@ def load_log_excerpt(log_path, max_lines=400):
 
 
 def build_judge_prompt(
-    fault_type, rank, iter_, nodes, run_valid, log_output, fr_output, log_excerpt
+    fault_type,
+    rank,
+    iter_,
+    nodes,
+    run_valid,
+    log_output,
+    fr_output,
+    log_excerpt,
+    gpus_per_node,
+    rack_size,
 ):
-    total_ranks = nodes * 4  # GPUS_PER_NODE=4 in the example SBATCH_SCRIPT
+    total_ranks = nodes * gpus_per_node
+    node_index = rank // gpus_per_node
+    rack_start = (rank // rack_size) * rack_size
+    rack_end = min(rack_start + rack_size - 1, total_ranks - 1)
     expected_restart, restart_rationale = _RESTART_TABLE.get(
         fault_type, ("unknown", "unknown fault type")
     )
@@ -126,13 +140,21 @@ def build_judge_prompt(
 ## Ground truth (injected fault)
 - Fault type : {fault_type}
 - Injected rank : {rank}  (global rank index, 0-based; total ranks = {total_ranks})
+- Injected node : {node_index}  (using {gpus_per_node} GPUs per node)
+- Injected rack : ranks {rack_start}-{rack_end}  (using rack size {rack_size})
 - Injected at iteration : {iter_}
-- Cluster : {nodes} nodes × 4 GPUs = {total_ranks} total ranks
+- Cluster : {nodes} nodes × {gpus_per_node} GPUs = {total_ranks} total ranks
 
 ## Expected correct behavior
 - restart_decision should be : {expected_restart}
   Rationale: {restart_rationale}
 - Rank {rank} should appear in Primary issues as the root cause
+- FR scope scoring:
+  - "rank" if FR points directly to rank {rank}
+  - "node" if FR does not isolate rank {rank} but correctly narrows to node {node_index}
+  - "rack" if FR does not isolate rank {rank} or node {node_index} but correctly narrows to rack ranks {rack_start}-{rack_end}
+  - "false" if FR points elsewhere or is not useful
+  - "no_dumps" if there is no actionable FR output
 
 ## Raw job log (filtered, last 400 lines)
 {log_section}
@@ -159,8 +181,13 @@ def build_judge_prompt(
    (e.g., GPU hang, segfault, signal kill) appropriate for {fault_type}?
    Values: "true" | "false" | "partial" (category right but specifics wrong)
 
-5. **fr_rank_correct** — Does the FR analysis output identify rank {rank} as a suspect?
-   Values: "true" | "false" | "no_dumps" (no FR dumps available)
+5. **fr_rank_correct** — How precise is the FR analysis output?
+   Values: "rank" | "node" | "rack" | "false" | "no_dumps"
+   Use "rank" only if rank {rank} is explicitly implicated.
+   Use "node" only if the FR output narrows correctly to node {node_index} but not the exact rank.
+   Use "rack" only if the FR output narrows correctly to rack ranks {rack_start}-{rack_end} but not the exact node or rank.
+   Use "false" if the FR output points somewhere else, is misleading, or does not narrow correctly.
+   Use "no_dumps" if there is no actionable FR output.
 
 6. **notes** — One concise sentence summarizing the main gap or confirming correctness.
 
@@ -215,6 +242,8 @@ def score(args):
         log_output=args.log_output,
         fr_output=args.fr_output,
         log_excerpt=log_excerpt,
+        gpus_per_node=args.gpus_per_node,
+        rack_size=args.rack_size,
     )
 
     # build_judge_prompt returns a dict directly for invalid runs (no LLM call needed)
@@ -249,6 +278,18 @@ def main():
     parser.add_argument("--fr-output", default="no_dumps", help="Raw text from CollectiveAnalyzer")
     parser.add_argument("--model", default=DEFAULT_JUDGE_MODEL, help="Judge LLM model")
     parser.add_argument("--base-url", default=DEFAULT_LLM_BASE_URL, help="API base URL")
+    parser.add_argument(
+        "--gpus-per-node",
+        type=int,
+        default=DEFAULT_GPUS_PER_NODE,
+        help="GPUs per node for rank-to-node mapping",
+    )
+    parser.add_argument(
+        "--rack-size",
+        type=int,
+        default=DEFAULT_FR_RACK_SIZE,
+        help="Ranks per rack for coarse FR scope scoring",
+    )
     args = parser.parse_args()
 
     try:
diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/user.env.example b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/user.env.example
index 273a488d..a00999fb 100644
--- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/user.env.example
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/user.env.example
@@ -23,3 +23,13 @@ WORKSPACE_HOST_PATH="${HOME}/tmp"
 CONTAINER_IMAGE="nvcr.io/nvidia/nemo:26.04"
 # CONTAINER_NAME=
 # CONTAINER_WORKDIR=/
+
+# Log-analysis / judge LLM settings
+# Keep these local. Prefer *_API_KEY_FILE over inline secrets.
+# NVIDIA_API_KEY_FILE="${HOME}/.nvidia_api_key"
+# JUDGE_API_KEY_FILE="${HOME}/.nvidia_api_key"
+# NVRX_LLM_MODEL="nvidia/nemotron-3-super-120b-a12b"
+# NVRX_LLM_BASE_URL="https://integrate.api.nvidia.com/v1"
+# JUDGE_MODEL="qwen/qwen3.5-397b-a17b"
+# JUDGE_BASE_URL="https://integrate.api.nvidia.com/v1"
+# FR_RACK_SIZE=32
diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/watch_and_analyze.sh b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/watch_and_analyze.sh
index 591af5ea..27685ee7 100755
--- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/watch_and_analyze.sh
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/watch_and_analyze.sh
@@ -13,6 +13,11 @@ TRACKING_FILE="${1:?Usage: $0 <tracking_file.tsv>}"
 POLL_INTERVAL=30
 
 SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+USER_ENV_FILE="${SCRIPT_DIR}/user.env"
+if [[ -f "${USER_ENV_FILE}" ]]; then
+    # shellcheck disable=SC1090
+    source "${USER_ENV_FILE}"
+fi
 SKILL_DIR="$(dirname "${SCRIPT_DIR}")"
 NVRX_SRC_DIR="$(cd "${SKILL_DIR}/../../.." && pwd)"
 
@@ -20,9 +25,9 @@ LOGSAGE_PY="${SKILL_DIR}/log-analysis/scripts/nvrx_logsage.py"
 FR_ANALYSIS_MODULE="nvidia_resiliency_ext.attribution.trace_analyzer.fr_attribution"
 SCORE_PY="${SCRIPT_DIR}/score_attribution.py"
 LOG_ANALYSIS_MODEL="${LOG_ANALYSIS_MODEL:-${NVRX_LLM_MODEL:-nvidia/nemotron-3-super-120b-a12b}}"
-LOG_ANALYSIS_BASE_URL="${LOG_ANALYSIS_BASE_URL:-${NVRX_LLM_BASE_URL:-https://inference-api.nvidia.com}}"
+LOG_ANALYSIS_BASE_URL="${LOG_ANALYSIS_BASE_URL:-${NVRX_LLM_BASE_URL:-https://integrate.api.nvidia.com/v1}}"
 JUDGE_MODEL="${JUDGE_MODEL:-qwen/qwen3.5-397b-a17b}"
-JUDGE_BASE_URL="${JUDGE_BASE_URL:-https://inference-api.nvidia.com}"
+JUDGE_BASE_URL="${JUDGE_BASE_URL:-https://integrate.api.nvidia.com/v1}"
 FR_PATTERN="${FR_PATTERN:-_dump_*}"
 
 # Ensure nvidia_resiliency_ext is importable from source tree

From af9c3259b5fd02bcd2aad3bf1da60515e022b6f3 Mon Sep 17 00:00:00 2001
From: Seonmyeong Bak <sbak@nvidia.com>
Date: Fri, 24 Apr 2026 14:32:39 -0700
Subject: [PATCH 10/21] feat(skills): add n3 super fault-loop workload

---
 .../nvrx-attr/fault-injection-loop/SKILL.md   |   7 +-
 .../nvrx-attr/scripts/n3_super_gb200_fi.sh    | 408 ++++++++++++++++++
 .../scripts/pools/n3_super_8node.txt          |  10 +
 .../skills/nvrx-attr/scripts/workloads.conf   |   5 +
 4 files changed, 429 insertions(+), 1 deletion(-)
 create mode 100644 src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/n3_super_gb200_fi.sh
 create mode 100644 src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/pools/n3_super_8node.txt

diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md b/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md
index 857ba4f3..ce7736f6 100644
--- a/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md
@@ -158,7 +158,12 @@ Environment variables:
 
 | Name | Script | Base dir | Description |
 |---|---|---|---|
-| `llama4_scout` | `l4_gb200_reduced.sh` | `${HOME}/nvrx-attr-experiments` | Llama4-Scout (reduced layers) on GB200 |
+| `llama4_scout` | `l4_gb200_reduced.sh` | `${HOME}/nvrx-attr-experiments` | Llama4-Scout (reduced layers) on GB200; minimum supported size is 2 nodes |
+| `n3_super` | `n3_super_gb200_fi.sh` | `${HOME}/nvrx-attr-experiments` | Nemotron3-Super on GB200; minimum supported size is 8 nodes |
+
+Workload note:
+- `llama4_scout` requires at least 2 nodes.
+- `n3_super` requires at least 8 nodes. Its default registered pool contains only 8-node experiments.
 
 ```bash
 # Run the full pool against the validated example workload
diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/n3_super_gb200_fi.sh b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/n3_super_gb200_fi.sh
new file mode 100644
index 00000000..799e8c98
--- /dev/null
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/n3_super_gb200_fi.sh
@@ -0,0 +1,408 @@
+#!/bin/bash
+# n3_super_gb200_fi.sh — fault-injection job script for the n3_super workload (Nemotron3-Super on GB200).
+# Production model args are kept aligned with the previously working nemotron config.
+# Only path/container plumbing is adapted for the nvrx-attr feedback-loop workflow.
+
+#SBATCH --time=00:30:00
+
+#SBATCH --job-name=n3-super-gb200-fi
+#SBATCH --output=/tmp/slurm-%j.launch.out
+#SBATCH --error=/tmp/slurm-%j.launch.err
+
+#SBATCH --nodes=8
+#SBATCH --ntasks-per-node=4
+#SBATCH --gpus-per-node=4
+#SBATCH --exclusive
+#SBATCH --mem=0
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+USER_ENV_FILE="${SCRIPT_DIR}/user.env"
+NVRX_SRC_ROOT_DEFAULT="$(cd "${SCRIPT_DIR}/../../../.." && pwd)"
+NVRX_REPO_ROOT_DEFAULT="$(cd "${SCRIPT_DIR}/../../../../.." && pwd)"
+if [[ -f "${USER_ENV_FILE}" ]]; then
+    # shellcheck disable=SC1090
+    source "${USER_ENV_FILE}"
+fi
+
+log_msg() {
+    local msg="$1"
+    # Sample the clock once: formatting whole epoch seconds afterwards always rendered %3N as ".000".
+    read -r UNIX_DATETIME HUMAN_DATETIME < <(date '+%s %Y-%m-%d %H:%M:%S.%3N')
+    echo ">>> ${msg} ${UNIX_DATETIME} (${HUMAN_DATETIME})"
+}
+
+log_msg "START SBATCH"
+echo "Running on nodes: ${SLURM_NODELIST}"
+
+# ── Platform / NCCL ───────────────────────────────────────────────────────────
+export NCCL_IB_DISABLE=0
+export NCCL_NET_GDR_LEVEL=3
+export PYXIS_LOG_LEVEL=debug
+export NCCL_IB_SL=1
+export NCCL_IB_TIMEOUT=19
+export UB_TIMEOUT=720
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+export NCCL_P2P_NET_CHUNKSIZE=2097152
+export NCCL_DEBUG=WARN
+
+# ── PyTorch / TE / inductor (from n3_super_gb200.sh ENV_VARS) ─────────────────
+export NVTE_FWD_LAYERNORM_SM_MARGIN=16
+export NVTE_BWD_LAYERNORM_SM_MARGIN=16
+export TORCHINDUCTOR_WORKER_START=fork
+export QUANTIZATION_TYPE_DEBUG=1
+export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
+export USE_MNNVL=1
+
+# ── DeepEP (hybridep MoE routing) — set USE_DEEPEP=0 to use alltoall instead ──
+export USE_DEEPEP="${USE_DEEPEP:-1}"
+if [[ "${USE_DEEPEP}" == "1" ]]; then
+    export NUM_OF_HYBRID_EP_RANKS_PER_NVLINK_DOMAIN=32
+fi
+
+# ── Logging / debugging ───────────────────────────────────────────────────────
+export PYTHONUNBUFFERED=1
+export ONE_LOGGER_JOB_CATEGORY=test
+export LOGLEVEL=DEBUG
+export TORCH_CPP_LOG_LEVEL=WARNING
+export TORCH_NCCL_TRACE_BUFFER_SIZE=2000
+export TORCH_NCCL_RETHROW_CUDA_ERRORS=0
+export TORCH_NCCL_ENABLE_MONITORING=1
+export TORCH_NCCL_DUMP_ON_TIMEOUT=1
+export TORCH_NCCL_LOG_CPP_STACK_ON_UNCLEAN_SHUTDOWN=0
+export TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC=30
+export TORCH_DIST_INIT_BARRIER=0
+export TORCH_INCLUDE_STACK_TRACE=0
+export TORCH_INCLUDE_ONLY_ACTIVE=1
+export TORCH_NCCL_EXTRA_DUMP_ON_EXEC=1
+
+# ── Fault injection parameters (overridable via sbatch --export) ──────────────
+export FAULT_AT_ITER="${FAULT_AT_ITER:-5}"
+export FAULT_DELAY="${FAULT_DELAY:-15}"
+export FAULT_RANK="${FAULT_RANK:-1}"
+export FAULT_TYPE="${FAULT_TYPE:-GPU_SLEEP}"
+export ENABLE_FAULT_INJECTION="${ENABLE_FAULT_INJECTION:-1}"
+
+# ── CUDA graph (set ENABLE_CUDA_GRAPH=0 to disable) ───────────────────────────
+export ENABLE_CUDA_GRAPH="${ENABLE_CUDA_GRAPH:-1}"
+
+# ── Node / task geometry (SLURM_NNODES is set by SLURM from --nodes override) ─
+export GPUS_PER_NODE="${GPUS_PER_NODE:-4}"
+TOTAL_TASKS=$((SLURM_NNODES * GPUS_PER_NODE))
+
+# ── Per-experiment output directory (overridable via sbatch --export) ─────────
+export BASE_EXPERIMENTS_DIR="${BASE_EXPERIMENTS_DIR:-${HOME}/nvrx-attr-experiments}"
+export EXPERIMENT_DIR="${EXPERIMENT_DIR:-${BASE_EXPERIMENTS_DIR}/fault_injection/manual/n${SLURM_NNODES}_${FAULT_TYPE}_r${FAULT_RANK}_i${FAULT_AT_ITER}}"
+export NVRX_REPO_ROOT="${NVRX_REPO_ROOT:-${NVRX_REPO_ROOT_DEFAULT}}"
+export NVRX_SRC_ROOT="${NVRX_SRC_ROOT:-${NVRX_SRC_ROOT_DEFAULT}}"
+export NVRX_CONTAINER_REPO_PATH="${NVRX_CONTAINER_REPO_PATH:-${HOME}/nvidia-resiliency-ext}"
+export NVRX_CONTAINER_SRC_PATH="${NVRX_CONTAINER_SRC_PATH:-${NVRX_CONTAINER_REPO_PATH}/src}"
+export SHARED_TMP_BASE_DIR="${SHARED_TMP_BASE_DIR:-${HOME}/tmp}"
+export MEGATRON_REPO_HOST_PATH="${MEGATRON_REPO_HOST_PATH:-${HOME}/megatron-lm-main}"
+export WORKSPACE_HOST_PATH="${WORKSPACE_HOST_PATH:-${HOME}/tmp}"
+export CONTAINER_IMAGE="${CONTAINER_IMAGE:-nvcr.io/nvidia/nemo:26.04}"
+export CONTAINER_NAME="${CONTAINER_NAME:-}"
+export CONTAINER_WORKDIR="${CONTAINER_WORKDIR:-/}"
+export CONTAINER_CLEANUP_CMD="${CONTAINER_CLEANUP_CMD:-}"
+export ENABLE_NFS_CACHE_STAGING="${ENABLE_NFS_CACHE_STAGING:-0}"
+export NFS_TRITON_CACHE="${NFS_TRITON_CACHE:-}"
+export NFS_INDUCTOR_CACHE="${NFS_INDUCTOR_CACHE:-}"
+
+mkdir -p "${BASE_EXPERIMENTS_DIR}/datacache"
+mkdir -p "${EXPERIMENT_DIR}/checkpoints"
+mkdir -p "${EXPERIMENT_DIR}/tensorboard"
+
+: "${SLURM_RESTART_COUNT:=0}"
+
+LOG_DIR="${EXPERIMENT_DIR}/logs"
+mkdir -p "${LOG_DIR}/slurm"  # srun --output/--error below write into logs/slurm; Slurm does not create it
+echo "Writing logs to ${LOG_DIR}"
+LOG_FILE_BASE="${LOG_DIR}/slurm/${SLURM_JOB_ID}.${SLURM_RESTART_COUNT}"
+
+# ── Container mounts ──────────────────────────────────────────────────────────
+LUSTRE=/home:/home
+SHARED_TMP_HOST=${SHARED_TMP_BASE_DIR}/${SLURM_JOB_ID}
+mkdir -p ${SHARED_TMP_HOST}
+SHARED_TMP=${SHARED_TMP_HOST}:/shared_tmp
+LOGS=${EXPERIMENT_DIR}/logs:/logs
+MEGATRON_REPO=${MEGATRON_REPO_HOST_PATH}:/megatron-lm_repo
+DATACACHE=${BASE_EXPERIMENTS_DIR}/datacache:/datacache
+CHECKPOINT_LOAD=${EXPERIMENT_DIR}/checkpoints:/checkpoint-load
+CHECKPOINT_SAVE=${EXPERIMENT_DIR}/checkpoints:/checkpoint-save
+TENSORBOARD=${EXPERIMENT_DIR}/tensorboard:/tensorboard
+WORKSPACE=${WORKSPACE_HOST_PATH}:/workspace
+CONTAINER_MOUNTS=$LUSTRE,$SHARED_TMP,$LOGS,$MEGATRON_REPO,$DATACACHE,$CHECKPOINT_LOAD,$CHECKPOINT_SAVE,$TENSORBOARD,$WORKSPACE
+CONTAINER_ARGS=(
+    --container-mounts "${CONTAINER_MOUNTS}"
+    --container-image "${CONTAINER_IMAGE}"
+    --container-workdir "${CONTAINER_WORKDIR}"
+)
+if [[ -n "${CONTAINER_NAME}" ]]; then
+    CONTAINER_ARGS+=(--container-name "${CONTAINER_NAME}")
+fi
+
+MYENV_FILE=${SHARED_TMP_HOST}/.myenv_${SLURM_JOB_ID}.sh
+cat > ${MYENV_FILE} << MYENVEOF
+export FAULT_AT_ITER=${FAULT_AT_ITER}
+export FAULT_DELAY=${FAULT_DELAY}
+export FAULT_RANK=${FAULT_RANK}
+export FAULT_TYPE=${FAULT_TYPE}
+export ENABLE_FAULT_INJECTION=${ENABLE_FAULT_INJECTION}
+export ENABLE_CUDA_GRAPH=${ENABLE_CUDA_GRAPH}
+export USE_DEEPEP=${USE_DEEPEP}
+export ENABLE_NFS_CACHE_STAGING=${ENABLE_NFS_CACHE_STAGING}
+export NFS_TRITON_CACHE=${NFS_TRITON_CACHE}
+export NFS_INDUCTOR_CACHE=${NFS_INDUCTOR_CACHE}
+export NVRX_REPO_ROOT=${NVRX_CONTAINER_REPO_PATH}
+export NVRX_SRC_ROOT=${NVRX_CONTAINER_SRC_PATH}
+export PYTHONPATH=\${NVRX_REPO_ROOT}:\${NVRX_SRC_ROOT}:\${PYTHONPATH}
+MYENVEOF
+
+# ── Optional site-specific cleanup hook ───────────────────────────────────────
+if [[ -n "${CONTAINER_CLEANUP_CMD}" ]]; then
+    log_msg "START disk_cleanup"
+    srun \
+        --label \
+        --ntasks-per-node=1 \
+        --ntasks=${SLURM_NNODES} \
+        --kill-on-bad-exit=0 \
+        --mpi=none \
+        bash -lc "${CONTAINER_CLEANUP_CMD}"
+    log_msg "END disk_cleanup"
+else
+    log_msg "SKIP disk_cleanup"
+fi
+
+# ── All-node setup: clone Megatron into a per-job tmpdir ─────────────────────
+log_msg "START all_node_setup"
+srun \
+    --label \
+    "${CONTAINER_ARGS[@]}" \
+    --exclusive \
+    --error=${LOG_FILE_BASE}.0.all_node_setup.log \
+    --output=${LOG_FILE_BASE}.0.all_node_setup.log \
+    --ntasks-per-node=1 \
+    --ntasks=${SLURM_NNODES} \
+    --kill-on-bad-exit=0 \
+    --mpi=none \
+    bash -c '
+        MEGATRON_PATH=/shared_tmp/megatron_${SLURM_NODEID}
+        rm -rf "${MEGATRON_PATH}"
+        mkdir -p "${MEGATRON_PATH}"
+        pushd $MEGATRON_PATH
+        CURRENT_BRANCH=$(git -C /megatron-lm_repo branch --show-current)
+        echo "Cloning Megatron branch $CURRENT_BRANCH into $MEGATRON_PATH"
+        git clone --single-branch --branch $CURRENT_BRANCH /megatron-lm_repo .
+        rm -rf "${MEGATRON_PATH}/nvidia_resiliency_ext"
+        if command -v rsync >/dev/null 2>&1; then
+            rsync -a "${NVRX_CONTAINER_SRC_PATH}/nvidia_resiliency_ext/" "${MEGATRON_PATH}/nvidia_resiliency_ext/"
+        else
+            cp -a "${NVRX_CONTAINER_SRC_PATH}/nvidia_resiliency_ext" "${MEGATRON_PATH}/"
+        fi
+        popd
+    '
+log_msg "END all_node_setup"
+
+# ── Main workload ─────────────────────────────────────────────────────────────
+log_msg "START main_workload"
+srun \
+    --label \
+    "${CONTAINER_ARGS[@]}" \
+    --error=${LOG_FILE_BASE}.1.main_workload.log \
+    --output=${LOG_FILE_BASE}.1.main_workload.log \
+    --ntasks-per-node=${GPUS_PER_NODE} \
+    --ntasks=${TOTAL_TASKS} \
+    --kill-on-bad-exit=0 \
+    --mpi=none \
+    bash -c '
+        source /shared_tmp/.myenv_${SLURM_JOB_ID}.sh
+        MEGATRON_PATH=/shared_tmp/megatron_${SLURM_NODEID}
+        export PYTHONPATH=${MEGATRON_PATH}:${NVRX_REPO_ROOT}:${NVRX_SRC_ROOT}:${PYTHONPATH}
+
+        # Triton/inductor cache strategy:
+        #   - /tmp inside the container is the node-local in-memory tmpfs (not NFS-backed)
+        #   - Optional pre-stage from a persistent cache to each local rank /tmp dir
+        #   - Marker file in /tmp is the barrier for that pre-stage; with staging disabled nobody creates it, so ranks must not wait
+        #   - On exit: global rank 0 stages back to NFS only on cold start
+        TRITON_READY=/tmp/.triton_ready_${SLURM_JOB_ID}
+
+        export TRITON_CACHE_DIR=/tmp/triton_${SLURM_LOCALID}
+        export TORCHINDUCTOR_CACHE_DIR=/tmp/inductor_${SLURM_LOCALID}
+
+        if [[ "${ENABLE_NFS_CACHE_STAGING}" == "1" && "${SLURM_LOCALID}" == "0" ]]; then
+            if [[ -d "${NFS_TRITON_CACHE}" ]] && [[ -n "$(ls -A ${NFS_TRITON_CACHE} 2>/dev/null)" ]]; then
+                TRITON_CACHE_WAS_WARM=1
+            else
+                TRITON_CACHE_WAS_WARM=0
+            fi
+            for r in 0 1 2 3; do
+                mkdir -p /tmp/triton_${r} /tmp/inductor_${r}
+                [[ -d "${NFS_TRITON_CACHE}" ]] && rsync -a --ignore-existing "${NFS_TRITON_CACHE}/" "/tmp/triton_${r}/" 2>/dev/null || true
+                [[ -d "${NFS_INDUCTOR_CACHE}" ]] && rsync -a --ignore-existing "${NFS_INDUCTOR_CACHE}/" "/tmp/inductor_${r}/" 2>/dev/null || true
+            done
+            touch "${TRITON_READY}"
+            echo "Pre-staged triton/inductor cache for all local ranks (was_warm=${TRITON_CACHE_WAS_WARM})."
+        elif [[ "${ENABLE_NFS_CACHE_STAGING}" == "1" && "${SLURM_LOCALID}" != "0" ]]; then
+            until [[ -f "${TRITON_READY}" ]]; do sleep 1; done
+        fi
+
+        mkdir -p ${TRITON_CACHE_DIR} ${TORCHINDUCTOR_CACHE_DIR}
+
+        _stage_back() {
+            # EXIT hook: local rank 0 of node 0 copies its caches to NFS, and only when staging is on and the cache started cold.
+            [[ "${ENABLE_NFS_CACHE_STAGING}" == "1" && "${SLURM_LOCALID}" == "0" && "${SLURM_NODEID}" == "0" && "${TRITON_CACHE_WAS_WARM:-0}" == "0" ]] || return 0
+            echo "Staging triton cache back to NFS (cold start)..."
+            mkdir -p "${NFS_TRITON_CACHE}" "${NFS_INDUCTOR_CACHE}"
+            rsync -a --ignore-existing "${TRITON_CACHE_DIR}/" "${NFS_TRITON_CACHE}/" 2>/dev/null || true
+            rsync -a --ignore-existing "${TORCHINDUCTOR_CACHE_DIR}/" "${NFS_INDUCTOR_CACHE}/" 2>/dev/null || true
+            echo "Cache staged back."
+        }
+        trap _stage_back EXIT
+
+        if [[ "${ENABLE_CUDA_GRAPH}" == "1" ]]; then
+            CUDA_GRAPH_ARGS="--enable-cuda-graph --cuda-graph-scope mamba attn"
+        else
+            CUDA_GRAPH_ARGS=""
+        fi
+
+        if [[ "${USE_DEEPEP:-1}" == "1" ]]; then
+            MOE_DISPATCHER_ARGS="--moe-token-dispatcher-type flex --moe-flex-dispatcher-backend hybridep --moe-hybridep-num-sms 32"
+        else
+            MOE_DISPATCHER_ARGS="--moe-token-dispatcher-type alltoall"
+        fi
+
+        pushd $MEGATRON_PATH
+        LAUNCHER_CMD="python3"
+        LAUNCHER_ARGS=" \
+        "
+        WORKLOAD_CMD=${MEGATRON_PATH}/pretrain_mamba.py
+        FAULT_INJECTOR_ARGS=""
+        if [[ "${ENABLE_FAULT_INJECTION}" == "1" ]]; then
+            FAULT_INJECTOR_ARGS=" \
+                --fault-injector-ranks ${FAULT_RANK} \
+                --fault-injector-fault-types ${FAULT_TYPE} \
+            "
+            if [[ -n "${FAULT_DELAY:-}" ]]; then
+                FAULT_INJECTOR_ARGS="${FAULT_INJECTOR_ARGS} --fault-injector-fault-delay ${FAULT_DELAY}"
+                if [[ -n "${FAULT_AT_ITER:-}" ]]; then
+                    FAULT_INJECTOR_ARGS="${FAULT_INJECTOR_ARGS} --fault-injector-delay-start-iteration ${FAULT_AT_ITER}"
+                fi
+            elif [[ -n "${FAULT_AT_ITER:-}" ]]; then
+                FAULT_INJECTOR_ARGS="${FAULT_INJECTOR_ARGS} --fault-injector-fault-delay 0 --fault-injector-delay-start-iteration ${FAULT_AT_ITER}"
+            fi
+        fi
+        WORKLOAD_ARGS=" \
+            --exit-duration-in-mins 5750 \
+            --exit-interval 100 \
+            --distributed-timeout-minutes 10 \
+            --disable-gloo-process-groups \
+            --mock-data \
+            --data-cache-path /datacache \
+            --no-create-attention-mask-in-dataloader \
+            --no-mmap-bin-files \
+            --tokenizer-type NullTokenizer \
+            --tiktoken-pattern v2 \
+            --vocab-size 128000 \
+            --micro-batch-size 1 \
+            --global-batch-size 32 \
+            --train-samples 12207031 \
+            --adam-beta1 0.9 \
+            --adam-beta2 0.95 \
+            --lr 4.5e-4 \
+            --min-lr 4.5e-6 \
+            --lr-decay-style WSD \
+            --lr-warmup-samples 24414063 \
+            --lr-decay-samples 3048706055 \
+            --lr-wsd-decay-style minus_sqrt \
+            --lr-wsd-decay-samples 610351563 \
+            --weight-decay 0.1 \
+            --clip-grad 1.0 \
+            --override-opt_param-scheduler \
+            --use-mcore-models \
+            --spec megatron.core.models.mamba.mamba_layer_specs mamba_stack_spec \
+            --is-hybrid-model \
+            --mamba-num-heads 128 \
+            --num-layers 88 \
+            --hidden-size 4096 \
+            --ffn-hidden-size 2688 \
+            --num-attention-heads 32 \
+            --group-query-attention \
+            --num-query-groups 2 \
+            --kv-channels 128 \
+            --hybrid-override-pattern MEMEMEM*EMEMEMEM*EMEMEMEM*EMEMEMEMEM*EMEMEMEMEM*EMEMEMEMEM*EMEMEMEMEM*EMEMEMEM*EMEMEMEME \
+            --position-embedding-type none \
+            --normalization RMSNorm \
+            --untie-embeddings-and-output-weights \
+            --init-method-std 0.014 \
+            --disable-bias-linear \
+            --squared-relu \
+            --use-fused-weighted-squared-relu \
+            --seq-length 8192 \
+            --max-position-embeddings 8192 \
+            --num-experts 512 \
+            --moe-router-topk 22 \
+            --moe-router-topk-scaling-factor 5.0 \
+            --moe-router-score-function sigmoid \
+            --moe-router-enable-expert-bias \
+            --moe-router-dtype fp32 \
+            --moe-router-load-balancing-type seq_aux_loss \
+            --moe-aux-loss-coeff 1e-4 \
+            ${MOE_DISPATCHER_ARGS} \
+            --moe-grouped-gemm \
+            --moe-permute-fusion \
+            --moe-latent-size 1024 \
+            --moe-shared-expert-intermediate-size 5376 \
+            --calculate-per-token-loss \
+            --bf16 \
+            --first-last-layers-bf16 \
+            --num-layers-at-start-in-bf16 0 \
+            --num-layers-at-end-in-bf16 14 \
+            --fp4-format e2m1 \
+            --fp4-recipe nvfp4 \
+            --attention-dropout 0.0 \
+            --hidden-dropout 0.0 \
+            --sequence-parallel \
+            --use-distributed-optimizer \
+            --overlap-grad-reduce \
+            --overlap-param-gather \
+            --ddp-num-buckets 10 \
+            --ddp-pad-buckets-for-high-nccl-busbw \
+            --high-priority-stream-groups ep \
+            --tensor-model-parallel-size 4 \
+            --pipeline-model-parallel-size 1 \
+            --expert-model-parallel-size 32 \
+            --expert-tensor-parallel-size 1 \
+            --cross-entropy-loss-fusion \
+            --cross-entropy-fusion-impl native \
+            --attention-backend flash \
+            ${CUDA_GRAPH_ARGS} \
+            --te-rng-tracker \
+            --manual-gc \
+            --manual-gc-interval 10 \
+            --num-workers 1 \
+            --eval-interval 1000 \
+            --eval-iters 14 \
+            --log-interval 1 \
+            --log-params-norm \
+            --log-num-zeros-in-grad \
+            --log-timers-to-tensorboard \
+            --log-memory-to-tensorboard \
+            --log-throughput \
+            --log-progress \
+            --log-energy \
+            --log-memory-interval 500 \
+            --logging-level 20 \
+            --timing-log-option minmax \
+            --check-weight-hash-across-dp-replicas-interval 20000 \
+            --tensorboard-dir /tensorboard \
+            --local-rank ${SLURM_LOCALID} \
+            --distributed-timeout-seconds-after-init 1 \
+            --flight-recorder-dump-path /checkpoint-save \
+        "
+        WORKLOAD_ARGS="${WORKLOAD_ARGS} ${FAULT_INJECTOR_ARGS}"
+        $LAUNCHER_CMD $LAUNCHER_ARGS $WORKLOAD_CMD $WORKLOAD_ARGS
+    '
+log_msg "END main_workload"
+
+log_msg "END SBATCH"
+
+set +x
diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/pools/n3_super_8node.txt b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/pools/n3_super_8node.txt
new file mode 100644
index 00000000..dc6a75cd
--- /dev/null
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/pools/n3_super_8node.txt
@@ -0,0 +1,10 @@
+# n3_super minimum supported size is 8 nodes.
+# Format: FAULT_TYPE:RANK:ITER:NODES
+
+GPU_SLEEP:1:5:8
+GPU_SLEEP:0:5:8
+GPU_SLEEP:16:5:8
+GPU_SLEEP:31:5:8
+GPU_ERROR:1:5:8
+GPU_ERROR:0:5:8
+SIGKILL:1:5:8
diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/workloads.conf b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/workloads.conf
index dcc1dc62..8d638e78 100644
--- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/workloads.conf
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/workloads.conf
@@ -15,5 +15,10 @@
 #                        "-" means use the TIME env var or prepare_node_alloc.sh default (00:30:00)
 #
 # Fields are whitespace-separated. Lines starting with # are ignored.
+#
+# Workload-specific notes:
+# - llama4_scout requires a minimum of 2 nodes.
+# - n3_super requires a minimum of 8 nodes; use its dedicated 8-node pool file.
 
 llama4_scout  l4_gb200_reduced.sh   -                                           Llama4-Scout_(reduced_layers)_on_GB200     -                    -
+n3_super      n3_super_gb200_fi.sh  -                                           Nemotron3-Super_on_GB200                   n3_super_8node.txt   -

From 9a12531425eb95bd4a52eefd2c3a85dbc7d25eed Mon Sep 17 00:00:00 2001
From: Seonmyeong Bak <sbak@nvidia.com>
Date: Fri, 24 Apr 2026 15:22:01 -0700
Subject: [PATCH 11/21] fix(skills): harden n3 fault-loop analysis

---
 .../skills/nvrx-attr/scripts/n3_super_gb200_fi.sh             | 2 +-
 .../skills/nvrx-attr/scripts/watch_and_analyze.sh             | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/n3_super_gb200_fi.sh b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/n3_super_gb200_fi.sh
index 799e8c98..ff91debc 100644
--- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/n3_super_gb200_fi.sh
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/n3_super_gb200_fi.sh
@@ -234,7 +234,7 @@ srun \
             else
                 TRITON_CACHE_WAS_WARM=0
             fi
-            for r in 0 1 2 3; do
+            for ((r=0; r<GPUS_PER_NODE; r++)); do
                 mkdir -p /tmp/triton_${r} /tmp/inductor_${r}
                 [[ -d "${NFS_TRITON_CACHE}" ]] && rsync -a --ignore-existing "${NFS_TRITON_CACHE}/" "/tmp/triton_${r}/" 2>/dev/null || true
                 [[ -d "${NFS_INDUCTOR_CACHE}" ]] && rsync -a --ignore-existing "${NFS_INDUCTOR_CACHE}/" "/tmp/inductor_${r}/" 2>/dev/null || true
diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/watch_and_analyze.sh b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/watch_and_analyze.sh
index 27685ee7..b3cc1df2 100755
--- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/watch_and_analyze.sh
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/watch_and_analyze.sh
@@ -36,7 +36,7 @@ export PYTHONPATH="${NVRX_SRC_DIR}${PYTHONPATH:+:$PYTHONPATH}"
 strip_injection_markers() {
     local input_log="$1"
     local output_log="$2"
-    grep -v -E 'FAULT INJECTION|nvidia_resiliency_ext\.shared_utils\.inject_fault' \
+    grep -a -v -E 'FAULT INJECTION|nvidia_resiliency_ext\.shared_utils\.inject_fault' \
         "${input_log}" > "${output_log}" 2>/dev/null || true
 }
 
@@ -100,7 +100,7 @@ while true; do
         STRIPPED_LOG=""
         if [[ -n "${LOG_FILE}" && -f "${LOG_FILE}" ]]; then
             echo "    log: ${LOG_FILE}"
-            if grep -q "FAULT INJECTION" "${LOG_FILE}" 2>/dev/null; then
+            if grep -a -q "FAULT INJECTION" "${LOG_FILE}" 2>/dev/null; then
                 RUN_VALID="true"
             fi
             echo "    run_valid: ${RUN_VALID}"

From 1e23cb903a94fba3474628185b16c16ed78ce9a1 Mon Sep 17 00:00:00 2001
From: Seonmyeong Bak <sbak@nvidia.com>
Date: Fri, 24 Apr 2026 15:24:40 -0700
Subject: [PATCH 12/21] fix(skills): repair fr-analysis wrapper symlink

---
 .../skills/nvrx-attr/fr-analysis/scripts/fr_attribution.py      | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/fr-analysis/scripts/fr_attribution.py b/src/nvidia_resiliency_ext/skills/nvrx-attr/fr-analysis/scripts/fr_attribution.py
index cfac8e34..d98699dd 120000
--- a/src/nvidia_resiliency_ext/skills/nvrx-attr/fr-analysis/scripts/fr_attribution.py
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/fr-analysis/scripts/fr_attribution.py
@@ -1 +1 @@
-../../../trace_analyzer/fr_attribution.py
\ No newline at end of file
+../../../../attribution/trace_analyzer/fr_attribution.py
\ No newline at end of file

From 5e30a25247a62907e93b49adfe2c2b4620376087 Mon Sep 17 00:00:00 2001
From: Seonmyeong Bak <sbak@nvidia.com>
Date: Fri, 24 Apr 2026 15:29:41 -0700
Subject: [PATCH 13/21] docs(skills): clarify local env configuration

---
 .../skills/nvrx-attr/fault-injection-loop/SKILL.md  |  5 +++++
 .../skills/nvrx-attr/scripts/user.env.example       | 13 +++++++++++++
 2 files changed, 18 insertions(+)

diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md b/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md
index ce7736f6..ed908b05 100644
--- a/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md
@@ -128,6 +128,11 @@ and the judge. Use per-run environment overrides for experiment-specific
 controls such as `POOL`, `WORKLOAD`, `BATCH_SIZE`, `FAULT_TYPE`,
 `FAULT_AT_ITER`, or `FAULT_DELAY`.
 
+If you use local Triton/Inductor cache staging, set the cache variables in
+`scripts/user.env`. See `scripts/user.env.example` for the supported
+`ENABLE_NFS_CACHE_STAGING`, `NFS_TRITON_CACHE`, and `NFS_INDUCTOR_CACHE`
+entries and workload-specific path examples.
+
 Environment variables:
 
 | Variable | Default | Description |
diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/user.env.example b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/user.env.example
index a00999fb..72a2efca 100644
--- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/user.env.example
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/user.env.example
@@ -24,8 +24,21 @@ CONTAINER_IMAGE="nvcr.io/nvidia/nemo:26.04"
 # CONTAINER_NAME=
 # CONTAINER_WORKDIR=/
 
+# Optional NFS-backed cache staging
+# Set ENABLE_NFS_CACHE_STAGING=1 to pre-stage Triton/Inductor caches to local /tmp.
+# Pick the workload-specific cache roots that match the workload you are running:
+# NFS_TRITON_CACHE="/home/sbak/experiments/llama4-scout-gb200/triton_cache"
+# NFS_INDUCTOR_CACHE="/home/sbak/experiments/llama4-scout-gb200/inductor_cache"
+# NFS_TRITON_CACHE="/home/sbak/experiments/n3-super-gb200/triton_cache"
+# NFS_INDUCTOR_CACHE="/home/sbak/experiments/n3-super-gb200/inductor_cache"
+# ENABLE_NFS_CACHE_STAGING=1
+# NFS_TRITON_CACHE="/path/to/<workload>/triton_cache"
+# NFS_INDUCTOR_CACHE="/path/to/<workload>/inductor_cache"
+
 # Log-analysis / judge LLM settings
 # Keep these local. Prefer *_API_KEY_FILE over inline secrets.
+# NVIDIA_API_KEY="..."
+# JUDGE_API_KEY="..."
 # NVIDIA_API_KEY_FILE="${HOME}/.nvidia_api_key"
 # JUDGE_API_KEY_FILE="${HOME}/.nvidia_api_key"
 # NVRX_LLM_MODEL="nvidia/nemotron-3-super-120b-a12b"

From 810c534abed5b231b7a030fa10abb1159f1d05e3 Mon Sep 17 00:00:00 2001
From: Seonmyeong Bak <sbak@nvidia.com>
Date: Mon, 27 Apr 2026 10:27:59 -0700
Subject: [PATCH 14/21] refactor(skills): simplify local config wiring

---
 .../attribution/log_analyzer/nvrx_logsage.py          |  8 --------
 .../skills/nvrx-attr/fault-injection-loop/SKILL.md    |  4 ++--
 .../skills/nvrx-attr/scripts/prepare_node_alloc.sh    |  5 -----
 .../skills/nvrx-attr/scripts/slurm.conf               | 11 -----------
 .../skills/nvrx-attr/scripts/user.env.example         |  4 ----
 5 files changed, 2 insertions(+), 30 deletions(-)
 delete mode 100644 src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/slurm.conf

diff --git a/src/nvidia_resiliency_ext/attribution/log_analyzer/nvrx_logsage.py b/src/nvidia_resiliency_ext/attribution/log_analyzer/nvrx_logsage.py
index 0340a6b3..30d90e8c 100644
--- a/src/nvidia_resiliency_ext/attribution/log_analyzer/nvrx_logsage.py
+++ b/src/nvidia_resiliency_ext/attribution/log_analyzer/nvrx_logsage.py
@@ -194,14 +194,6 @@ def _with_exponential_backoff(llm_call, checkpoint_saved: bool) -> tuple[str, st
 
         backoff = _sleep_with_backoff(attempt, retries, backoff, max_backoff, jitter)
 
-    return (
-        ATTR_LLM_FAILURE,
-        ATTR_LLM_FAILURE,
-        ATTR_LLM_FAILURE,
-        ATTR_LLM_FAILURE,
-        str(checkpoint_saved),
-    )
-
 
 class NVRxLogAnalyzer(NVRxAttribution):
     def __init__(self, args: Union[argparse.Namespace, Mapping[str, Any]]):
diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md b/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md
index ed908b05..3cb4a93b 100644
--- a/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md
@@ -138,8 +138,8 @@ Environment variables:
 | Variable | Default | Description |
 |---|---|---|
 | `WORKLOAD` | `llama4_scout` | Select a registered workload by name (see `scripts/workloads.conf`) |
-| `ACCOUNT` | _(cluster default or `scripts/slurm.conf`)_ | SLURM account |
-| `PARTITION` | _(cluster default or `scripts/slurm.conf`)_ | SLURM partition |
+| `ACCOUNT` | _(cluster default or `scripts/user.env`)_ | SLURM account |
+| `PARTITION` | _(cluster default or `scripts/user.env`)_ | SLURM partition |
 | `GPUS_PER_NODE` | `4` | GPUs per node |
 | `TIME` | `00:30:00` | Per-job wall-clock limit |
 | `BATCH_SIZE` | `2` | Jobs submitted per round |
diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/prepare_node_alloc.sh b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/prepare_node_alloc.sh
index 67d80be1..9deac578 100755
--- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/prepare_node_alloc.sh
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/prepare_node_alloc.sh
@@ -25,7 +25,6 @@ set -euo pipefail
 
 SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
 WORKLOADS_CONF="${SCRIPT_DIR}/workloads.conf"
-SLURM_DEFAULTS_CONF="${SCRIPT_DIR}/slurm.conf"
 USER_ENV_FILE="${SCRIPT_DIR}/user.env"
 ACCOUNT_FROM_ENV="${ACCOUNT-}"
 PARTITION_FROM_ENV="${PARTITION-}"
@@ -35,10 +34,6 @@ CONTAINER_IMAGE_FROM_ENV="${CONTAINER_IMAGE-}"
 SHARED_TMP_BASE_DIR_FROM_ENV="${SHARED_TMP_BASE_DIR-}"
 WORKSPACE_HOST_PATH_FROM_ENV="${WORKSPACE_HOST_PATH-}"
 
-if [[ -f "${SLURM_DEFAULTS_CONF}" ]]; then
-    # shellcheck disable=SC1090
-    source "${SLURM_DEFAULTS_CONF}"
-fi
 if [[ -f "${USER_ENV_FILE}" ]]; then
     # shellcheck disable=SC1090
     source "${USER_ENV_FILE}"
diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/slurm.conf b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/slurm.conf
deleted file mode 100644
index 764003dc..00000000
--- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/slurm.conf
+++ /dev/null
@@ -1,11 +0,0 @@
-# Optional site-specific Slurm defaults for nvrx-attr scripts.
-#
-# This file is sourced by prepare_node_alloc.sh. Environment variables still
-# take precedence, so you can override these per invocation:
-#
-#   ACCOUNT=myacct PARTITION=gpu bash scripts/prepare_node_alloc.sh
-#
-# Leave values empty to rely on the cluster's default account / partition.
-
-ACCOUNT=""
-PARTITION=""
diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/user.env.example b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/user.env.example
index 72a2efca..cf8f8f5e 100644
--- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/user.env.example
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/user.env.example
@@ -27,10 +27,6 @@ CONTAINER_IMAGE="nvcr.io/nvidia/nemo:26.04"
 # Optional NFS-backed cache staging
 # Set ENABLE_NFS_CACHE_STAGING=1 to pre-stage Triton/Inductor caches to local /tmp.
 # Pick the workload-specific cache roots that match the workload you are running:
-# NFS_TRITON_CACHE="/home/sbak/experiments/llama4-scout-gb200/triton_cache"
-# NFS_INDUCTOR_CACHE="/home/sbak/experiments/llama4-scout-gb200/inductor_cache"
-# NFS_TRITON_CACHE="/home/sbak/experiments/n3-super-gb200/triton_cache"
-# NFS_INDUCTOR_CACHE="/home/sbak/experiments/n3-super-gb200/inductor_cache"
 # ENABLE_NFS_CACHE_STAGING=1
 # NFS_TRITON_CACHE="/path/to/<workload>/triton_cache"
 # NFS_INDUCTOR_CACHE="/path/to/<workload>/inductor_cache"

From d0fac001702801a510621bad168b6a0c9b7675e7 Mon Sep 17 00:00:00 2001
From: Seonmyeong Bak <sbak@nvidia.com>
Date: Mon, 27 Apr 2026 10:30:34 -0700
Subject: [PATCH 15/21] fix(fr): restore logger level on analysis errors

---
 .../trace_analyzer/fr_attribution.py          | 43 ++++++++++---------
 1 file changed, 22 insertions(+), 21 deletions(-)

diff --git a/src/nvidia_resiliency_ext/attribution/trace_analyzer/fr_attribution.py b/src/nvidia_resiliency_ext/attribution/trace_analyzer/fr_attribution.py
index f4584cb4..8fe6c134 100644
--- a/src/nvidia_resiliency_ext/attribution/trace_analyzer/fr_attribution.py
+++ b/src/nvidia_resiliency_ext/attribution/trace_analyzer/fr_attribution.py
@@ -279,31 +279,32 @@ def gather_head_nodes(grouped_pgs):
         original_level = logger.level
         if logger.getEffectiveLevel() > logging.INFO:
             logger.setLevel(logging.INFO)
+        try:
+            with capture_logs(logger.name) as output:
 
-        with capture_logs(logger.name) as output:
-
-            def print_ranks_in_pgs(head_nodes, pg_dict, missing_or_completed="Missing"):
-                logger.info(
-                    f"{'PGID':<6} | {'Process Group Desc':<25} | {'Op Type':<10} | {'Size':<8} \
-                        | {'Dtype':<8} | {missing_or_completed} Ranks"
-                )
-                for pg_idx in head_nodes:
-                    entry = list(pg_dict[pg_idx][0])
-                    entry.remove(entry[-2])
-                    if missing_or_completed == "Missing":
-                        ranks_to_print = entry[6]
-                    else:
-                        ranks_to_print = entry[5]
+                def print_ranks_in_pgs(head_nodes, pg_dict, missing_or_completed="Missing"):
                     logger.info(
-                        f"{entry[0]:<6} | {entry[1]:<25} | {entry[2]:<10} | {entry[3]:<8} \
-                            | {entry[4]:<8} | {ranks_to_print}"
+                        f"{'PGID':<6} | {'Process Group Desc':<25} | {'Op Type':<10} | {'Size':<8} \
+                            | {'Dtype':<8} | {missing_or_completed} Ranks"
                     )
+                    for pg_idx in head_nodes:
+                        entry = list(pg_dict[pg_idx][0])
+                        entry.remove(entry[-2])
+                        if missing_or_completed == "Missing":
+                            ranks_to_print = entry[6]
+                        else:
+                            ranks_to_print = entry[5]
+                        logger.info(
+                            f"{entry[0]:<6} | {entry[1]:<25} | {entry[2]:<10} | {entry[3]:<8} \
+                                | {entry[4]:<8} | {ranks_to_print}"
+                        )
 
-            if head_nodes_missing:
-                logger.debug(f"head_nodes_missing: {head_nodes_missing}")
-                print_ranks_in_pgs(head_nodes_missing, missing_pg, "Missing")
-        analysis_output = output.getvalue()
-        logger.setLevel(original_level)
+                if head_nodes_missing:
+                    logger.debug(f"head_nodes_missing: {head_nodes_missing}")
+                    print_ranks_in_pgs(head_nodes_missing, missing_pg, "Missing")
+            analysis_output = output.getvalue()
+        finally:
+            logger.setLevel(original_level)
         return analysis_output
 
     async def collective_analysis(self, analysis_output: str) -> Optional[str]:

From 950b97b2e9cecea53531d8c4c4563ad308bf4983 Mon Sep 17 00:00:00 2001
From: Seonmyeong Bak <sbak@nvidia.com>
Date: Mon, 27 Apr 2026 10:44:04 -0700
Subject: [PATCH 16/21] refactor(skills): require local env for fault loop

---
 .../skills/nvrx-attr/fault-injection-loop/SKILL.md       | 5 +++--
 .../skills/nvrx-attr/scripts/l4_gb200_reduced.sh         | 9 ++++++---
 .../skills/nvrx-attr/scripts/n3_super_gb200_fi.sh        | 9 ++++++---
 .../skills/nvrx-attr/scripts/prepare_node_alloc.sh       | 9 ++++++---
 .../skills/nvrx-attr/scripts/run_session.sh              | 9 ++++++---
 .../skills/nvrx-attr/scripts/watch_and_analyze.sh        | 9 ++++++---
 6 files changed, 33 insertions(+), 17 deletions(-)

diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md b/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md
index 3cb4a93b..1598f221 100644
--- a/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md
@@ -101,8 +101,9 @@ cp scripts/user.env.example scripts/user.env
 ```
 
 Then edit `scripts/user.env` with cluster-specific settings. This file is
-sourced by `run_session.sh`, `prepare_node_alloc.sh`, `watch_and_analyze.sh`, and
-`l4_gb200_reduced.sh`, and it is intended to stay local and untracked.
+sourced by `run_session.sh`, `prepare_node_alloc.sh`, `watch_and_analyze.sh`,
+`l4_gb200_reduced.sh`, and `n3_super_gb200_fi.sh`. It is required for this skill
+to run and is intended to stay local and untracked.
 
 Recommended contents:
 
diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/l4_gb200_reduced.sh b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/l4_gb200_reduced.sh
index f91f99ce..319053fc 100644
--- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/l4_gb200_reduced.sh
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/l4_gb200_reduced.sh
@@ -21,10 +21,13 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 USER_ENV_FILE="${SCRIPT_DIR}/user.env"
 NVRX_SRC_ROOT_DEFAULT="$(cd "${SCRIPT_DIR}/../../../.." && pwd)"
 NVRX_REPO_ROOT_DEFAULT="$(cd "${SCRIPT_DIR}/../../../../.." && pwd)"
-if [[ -f "${USER_ENV_FILE}" ]]; then
-    # shellcheck disable=SC1090
-    source "${USER_ENV_FILE}"
+if [[ ! -f "${USER_ENV_FILE}" ]]; then
+    echo "ERROR: required local config not found: ${USER_ENV_FILE}" >&2
+    echo "Create it from ${SCRIPT_DIR}/user.env.example and fill in your local settings." >&2
+    exit 1
 fi
+# shellcheck disable=SC1090
+source "${USER_ENV_FILE}"
 
 log_msg() {
     local msg="$1"
diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/n3_super_gb200_fi.sh b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/n3_super_gb200_fi.sh
index ff91debc..7f077575 100644
--- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/n3_super_gb200_fi.sh
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/n3_super_gb200_fi.sh
@@ -19,10 +19,13 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 USER_ENV_FILE="${SCRIPT_DIR}/user.env"
 NVRX_SRC_ROOT_DEFAULT="$(cd "${SCRIPT_DIR}/../../../.." && pwd)"
 NVRX_REPO_ROOT_DEFAULT="$(cd "${SCRIPT_DIR}/../../../../.." && pwd)"
-if [[ -f "${USER_ENV_FILE}" ]]; then
-    # shellcheck disable=SC1090
-    source "${USER_ENV_FILE}"
+if [[ ! -f "${USER_ENV_FILE}" ]]; then
+    echo "ERROR: required local config not found: ${USER_ENV_FILE}" >&2
+    echo "Create it from ${SCRIPT_DIR}/user.env.example and fill in your local settings." >&2
+    exit 1
 fi
+# shellcheck disable=SC1090
+source "${USER_ENV_FILE}"
 
 log_msg() {
     local msg="$1"
diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/prepare_node_alloc.sh b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/prepare_node_alloc.sh
index 9deac578..9ce92b67 100755
--- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/prepare_node_alloc.sh
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/prepare_node_alloc.sh
@@ -34,10 +34,13 @@ CONTAINER_IMAGE_FROM_ENV="${CONTAINER_IMAGE-}"
 SHARED_TMP_BASE_DIR_FROM_ENV="${SHARED_TMP_BASE_DIR-}"
 WORKSPACE_HOST_PATH_FROM_ENV="${WORKSPACE_HOST_PATH-}"
 
-if [[ -f "${USER_ENV_FILE}" ]]; then
-    # shellcheck disable=SC1090
-    source "${USER_ENV_FILE}"
+if [[ ! -f "${USER_ENV_FILE}" ]]; then
+    echo "ERROR: required local config not found: ${USER_ENV_FILE}" >&2
+    echo "Create it from ${SCRIPT_DIR}/user.env.example and fill in your local settings." >&2
+    exit 1
 fi
+# shellcheck disable=SC1090
+source "${USER_ENV_FILE}"
 if [[ -n "${ACCOUNT_FROM_ENV}" ]]; then
     ACCOUNT="${ACCOUNT_FROM_ENV}"
 fi
diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/run_session.sh b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/run_session.sh
index a8145d6c..df3b2c10 100755
--- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/run_session.sh
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/run_session.sh
@@ -12,10 +12,13 @@ set -euo pipefail
 
 SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
 USER_ENV_FILE="${SCRIPT_DIR}/user.env"
-if [[ -f "${USER_ENV_FILE}" ]]; then
-    # shellcheck disable=SC1090
-    source "${USER_ENV_FILE}"
+if [[ ! -f "${USER_ENV_FILE}" ]]; then
+    echo "ERROR: required local config not found: ${USER_ENV_FILE}" >&2
+    echo "Create it from ${SCRIPT_DIR}/user.env.example and fill in your local settings." >&2
+    exit 1
 fi
+# shellcheck disable=SC1090
+source "${USER_ENV_FILE}"
 WORKLOAD="${WORKLOAD:-llama4_scout}"
 
 # ---- Phase 1: submit and wait for all experiments ----
diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/watch_and_analyze.sh b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/watch_and_analyze.sh
index b3cc1df2..249e8606 100755
--- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/watch_and_analyze.sh
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/watch_and_analyze.sh
@@ -14,10 +14,13 @@ POLL_INTERVAL=30
 
 SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
 USER_ENV_FILE="${SCRIPT_DIR}/user.env"
-if [[ -f "${USER_ENV_FILE}" ]]; then
-    # shellcheck disable=SC1090
-    source "${USER_ENV_FILE}"
+if [[ ! -f "${USER_ENV_FILE}" ]]; then
+    echo "ERROR: required local config not found: ${USER_ENV_FILE}" >&2
+    echo "Create it from ${SCRIPT_DIR}/user.env.example and fill in your local settings." >&2
+    exit 1
 fi
+# shellcheck disable=SC1090
+source "${USER_ENV_FILE}"
 SKILL_DIR="$(dirname "${SCRIPT_DIR}")"
 NVRX_SRC_DIR="$(cd "${SKILL_DIR}/../../.." && pwd)"
 

From 269a3e3f46be5e1ec22910e15d7f069a568b65db Mon Sep 17 00:00:00 2001
From: Seonmyeong Bak <sbak@nvidia.com>
Date: Mon, 27 Apr 2026 14:26:37 -0700
Subject: [PATCH 17/21] fix(skills): normalize FR segment scoring

---
 .../skills/nvrx-attr/SKILL.md                 | 12 +++
 .../nvrx-attr/fault-injection-loop/SKILL.md   | 14 +--
 .../nvrx-attr/scripts/score_attribution.py    | 98 ++++++++++++++++---
 .../skills/nvrx-attr/scripts/user.env.example |  3 +-
 4 files changed, 108 insertions(+), 19 deletions(-)

diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/SKILL.md b/src/nvidia_resiliency_ext/skills/nvrx-attr/SKILL.md
index 6884f96f..1f018b2a 100644
--- a/src/nvidia_resiliency_ext/skills/nvrx-attr/SKILL.md
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/SKILL.md
@@ -50,3 +50,15 @@ full coalescing stack.
 - `logsage` package installed (required by `log_analysis`)
 - Package installed: `pip install nvidia-resiliency-ext` or `pip install -e .` from repo root
 - The fault-injection loop has only been validated with Megatron-LM training scripts
+
+## Fault-Loop Local Setup
+
+Before using `fault-injection-loop/`, create the local config file from the tracked
+template and fill in your site-specific values:
+
+```bash
+cp scripts/user.env.example scripts/user.env
+```
+
+The feedback-loop scripts require `src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/user.env`
+to exist at runtime. Keep `user.env` local and untracked.
diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md b/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md
index 1598f221..79b3637e 100644
--- a/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md
@@ -120,7 +120,7 @@ NVRX_LLM_MODEL="nvidia/nemotron-3-super-120b-a12b"
 NVRX_LLM_BASE_URL="https://integrate.api.nvidia.com/v1"
 JUDGE_MODEL="qwen/qwen3.5-397b-a17b"
 JUDGE_BASE_URL="https://integrate.api.nvidia.com/v1"
-FR_RACK_SIZE=32
+FR_SEGMENT_SIZE=32
 ```
 
 Use `user.env` for stable site defaults such as partition, container image, and
@@ -156,7 +156,7 @@ Environment variables:
 | `NVRX_LLM_BASE_URL` | `https://integrate.api.nvidia.com/v1` | Base URL for log-analysis |
 | `JUDGE_MODEL` | `qwen/qwen3.5-397b-a17b` | Model for judge scoring |
 | `JUDGE_BASE_URL` | `https://integrate.api.nvidia.com/v1` | Base URL for judge scoring |
-| `FR_RACK_SIZE` | `32` | Ranks per rack for coarse FR scoring |
+| `FR_SEGMENT_SIZE` | `32` | Ranks per segment for coarse FR scoring |
 | `SBATCH_SCRIPT` | `scripts/l4_gb200_reduced.sh` | Job script to submit |
 | `POOL` | _(default pool above)_ | Space-separated experiment triplets |
 
@@ -255,7 +255,7 @@ analysis output, then returns structured JSON scores with a reasoning note.
 | **rank_primary** | `true` / `false` / `partial` | Injected rank is the primary root-cause in attribution |
 | **rank_any** | `true` / `false` | Injected rank mentioned anywhere in attribution |
 | **fault_described** | `true` / `false` / `partial` | Fault nature (hang/crash/signal/exception) correctly described |
-| **fr_rank_correct** | `rank` / `node` / `rack` / `false` / `no_dumps` | FR analysis narrows correctly to the injected rank, node, rack, or fails to narrow usefully |
+| **fr_rank_correct** | `rank` / `node` / `segment` / `false` / `no_dumps` | FR analysis narrows correctly to the injected rank, exactly one `GPUS_PER_NODE` rank block containing that rank, the configured `FR_SEGMENT_SIZE` rank block containing the injected rank, or fails to narrow usefully |
 | **judge_notes** | string | One-sentence summary of the main gap or confirmation |
 
 The judge is given:
@@ -264,10 +264,10 @@ The judge is given:
 3. Filtered raw log (last 400 lines, same `exclude_nvrx_logs` filtering as logsage)
 4. Raw logsage stdout (5-field text format)
 5. Raw FR analysis table output from `fr_attribution.py --fr-path ... -p "_dump_*"`
-6. `GPUS_PER_NODE` and `FR_RACK_SIZE` to map the injected rank to node and rack scopes for FR scoring
+6. `GPUS_PER_NODE` and `FR_SEGMENT_SIZE` to map the injected rank to exact node-sized and segment-sized scopes for FR scoring
 
 Default judge model: `qwen/qwen3.5-397b-a17b`. Override with `--model` in `score_attribution.py`.
-Default rack size for FR scope scoring: `32` ranks. Override with `FR_RACK_SIZE`.
+Default segment size for FR scope scoring: `32` ranks. Override with `FR_SEGMENT_SIZE`.
 
 ---
 
@@ -298,8 +298,8 @@ Common failure mode patterns and their meaning:
 | `fault_described=partial` for crash types | Crash keywords present but fault type not specifically named |
 | `restart_correct=false` for GPU_ERROR | LLM conflating hardware error with recoverable hang |
 | `fr_rank_correct=no_dumps` | NCCL watchdog did not fire before job ended — adjust `TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC` |
-| `fr_rank_correct=node` | FR isolated the correct node but not the exact rank |
-| `fr_rank_correct=rack` | FR isolated the correct rack-sized rank group but not the exact node/rank |
+| `fr_rank_correct=node` | FR isolated exactly one `GPUS_PER_NODE` rank block containing the injected rank, but not the exact rank |
+| `fr_rank_correct=segment` | FR isolated the configured `FR_SEGMENT_SIZE` rank block containing the injected rank, but not the exact node/rank |
 
 ---
 
diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/score_attribution.py b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/score_attribution.py
index b699096f..16c6faec 100644
--- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/score_attribution.py
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/score_attribution.py
@@ -22,6 +22,7 @@
 import json
 import logging
 import os
+import re
 import sys
 
 from langchain_openai import ChatOpenAI
@@ -40,7 +41,7 @@
 # Default judge model — override with --model
 DEFAULT_JUDGE_MODEL = "qwen/qwen3.5-397b-a17b"
 DEFAULT_GPUS_PER_NODE = int(os.getenv("GPUS_PER_NODE", "4"))
-DEFAULT_FR_RACK_SIZE = int(os.getenv("FR_RACK_SIZE", "32"))
+DEFAULT_FR_SEGMENT_SIZE = int(os.getenv("FR_SEGMENT_SIZE", os.getenv("FR_RACK_SIZE", "32")))
 
 # Expected restart decision and rationale per fault type
 _RESTART_TABLE = {
@@ -95,6 +96,64 @@ def load_log_excerpt(log_path, max_lines=400):
         return f"(could not read log file: {exc})"
 
 
+def parse_fr_missing_ranks(fr_output: str) -> set[int]:
+    if not fr_output or fr_output.strip() in ("", "no_dumps", "no results", "run_invalid"):
+        return set()  # empty input or a sentinel string: nothing to parse
+    # Collect every integer found in the last "|"-separated column of table rows.
+    ranks: set[int] = set()
+    for line in fr_output.splitlines():
+        if "|" not in line or "Missing Ranks" in line:  # not a table row, or the header row
+            continue
+        parts = [part.strip() for part in line.split("|")]
+        if len(parts) < 6:
+            continue  # rows with fewer than six columns are ignored
+        last_col = parts[-1]
+        for match in re.finditer(r"\d+", last_col):
+            ranks.add(int(match.group(0)))
+    return ranks
+
+
+def normalize_fr_rank_correct(  # re-check a node/segment scope label against ranks parsed from fr_output
+    raw_label: str,
+    fr_output: str,
+    rank: int,
+    total_ranks: int,
+    gpus_per_node: int,
+    segment_size: int,
+) -> str:
+    label = (raw_label or "").strip().lower()
+    if label in {"n/a", ""}:
+        return raw_label  # blank or "n/a": hand back the caller's value untouched
+    if label == "rack":
+        label = "segment"  # "rack" is handled as the same scope as "segment"
+    if label == "no_dumps":
+        return "no_dumps"
+
+    fr_ranks = parse_fr_missing_ranks(fr_output)
+    if not fr_ranks:
+        return "no_dumps"  # no rank numbers could be parsed from fr_output
+    # Inclusive rank ranges of the node-sized and segment-sized blocks containing `rank`, capped at total_ranks - 1.
+    node_start = (rank // gpus_per_node) * gpus_per_node
+    node_end = min(node_start + gpus_per_node - 1, total_ranks - 1)
+    segment_start = (rank // segment_size) * segment_size
+    segment_end = min(segment_start + segment_size - 1, total_ranks - 1)
+    # Each flag is True only when every parsed FR rank lies inside the corresponding block.
+    in_node = all(node_start <= fr_rank <= node_end for fr_rank in fr_ranks)
+    in_segment = all(segment_start <= fr_rank <= segment_end for fr_rank in fr_ranks)
+
+    if label == "node":
+        if in_node:
+            return "node"
+        if in_segment:
+            return "segment"  # parsed ranks spill past the node block but stay inside the segment block
+        return "false"
+
+    if label == "segment":
+        return "segment" if in_segment else "false"
+
+    return label  # any other label (e.g. "rank", "false") is returned stripped and lower-cased
+
+
 def build_judge_prompt(
     fault_type,
     rank,
@@ -105,12 +164,12 @@ def build_judge_prompt(
     fr_output,
     log_excerpt,
     gpus_per_node,
-    rack_size,
+    segment_size,
 ):
     total_ranks = nodes * gpus_per_node
     node_index = rank // gpus_per_node
-    rack_start = (rank // rack_size) * rack_size
-    rack_end = min(rack_start + rack_size - 1, total_ranks - 1)
+    segment_start = (rank // segment_size) * segment_size
+    segment_end = min(segment_start + segment_size - 1, total_ranks - 1)
     expected_restart, restart_rationale = _RESTART_TABLE.get(
         fault_type, ("unknown", "unknown fault type")
     )
@@ -141,7 +200,7 @@ def build_judge_prompt(
 - Fault type : {fault_type}
 - Injected rank : {rank}  (global rank index, 0-based; total ranks = {total_ranks})
 - Injected node : {node_index}  (using {gpus_per_node} GPUs per node)
-- Injected rack : ranks {rack_start}-{rack_end}  (using rack size {rack_size})
+- Injected segment : ranks {segment_start}-{segment_end}  (using segment size {segment_size})
 - Injected at iteration : {iter_}
 - Cluster : {nodes} nodes × {gpus_per_node} GPUs = {total_ranks} total ranks
 
@@ -152,7 +211,7 @@ def build_judge_prompt(
 - FR scope scoring:
   - "rank" if FR points directly to rank {rank}
   - "node" if FR does not isolate rank {rank} but correctly narrows to node {node_index}
-  - "rack" if FR does not isolate rank {rank} or node {node_index} but correctly narrows to rack ranks {rack_start}-{rack_end}
+  - "segment" if FR does not isolate rank {rank} or node {node_index} but correctly narrows to segment ranks {segment_start}-{segment_end}
   - "false" if FR points elsewhere or is not useful
   - "no_dumps" if there is no actionable FR output
 
@@ -182,10 +241,10 @@ def build_judge_prompt(
    Values: "true" | "false" | "partial" (category right but specifics wrong)
 
 5. **fr_rank_correct** — How precise is the FR analysis output?
-   Values: "rank" | "node" | "rack" | "false" | "no_dumps"
+   Values: "rank" | "node" | "segment" | "false" | "no_dumps"
    Use "rank" only if rank {rank} is explicitly implicated.
    Use "node" only if the FR output narrows correctly to node {node_index} but not the exact rank.
-   Use "rack" only if the FR output narrows correctly to rack ranks {rack_start}-{rack_end} but not the exact node or rank.
+   Use "segment" only if the FR output narrows correctly to segment ranks {segment_start}-{segment_end} but not the exact node or rank.
    Use "false" if the FR output points somewhere else, is misleading, or does not narrow correctly.
    Use "no_dumps" if there is no actionable FR output.
 
@@ -243,7 +302,7 @@ def score(args):
         fr_output=args.fr_output,
         log_excerpt=log_excerpt,
         gpus_per_node=args.gpus_per_node,
-        rack_size=args.rack_size,
+        segment_size=args.segment_size,
     )
 
     # build_judge_prompt returns a dict directly for invalid runs (no LLM call needed)
@@ -259,6 +318,15 @@ def score(args):
         text = "\n".join(line for line in lines if not line.startswith("```")).strip()
 
     result = json.loads(text)
+    total_ranks = args.nodes * args.gpus_per_node
+    result["fr_rank_correct"] = normalize_fr_rank_correct(
+        raw_label=result.get("fr_rank_correct", ""),
+        fr_output=args.fr_output,
+        rank=args.rank,
+        total_ranks=total_ranks,
+        gpus_per_node=args.gpus_per_node,
+        segment_size=args.segment_size,
+    )
     return result
 
 
@@ -284,13 +352,21 @@ def main():
         default=DEFAULT_GPUS_PER_NODE,
         help="GPUs per node for rank-to-node mapping",
     )
+    parser.add_argument(
+        "--segment-size",
+        type=int,
+        default=DEFAULT_FR_SEGMENT_SIZE,
+        help="Ranks per segment for coarse FR scope scoring",
+    )
     parser.add_argument(
         "--rack-size",
         type=int,
-        default=DEFAULT_FR_RACK_SIZE,
-        help="Ranks per rack for coarse FR scope scoring",
+        default=None,
+        help="Deprecated alias for --segment-size",
     )
     args = parser.parse_args()
+    if args.rack_size is not None:
+        args.segment_size = args.rack_size
 
     try:
         result = score(args)
diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/user.env.example b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/user.env.example
index cf8f8f5e..cf060f45 100644
--- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/user.env.example
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/user.env.example
@@ -41,4 +41,5 @@ CONTAINER_IMAGE="nvcr.io/nvidia/nemo:26.04"
 # NVRX_LLM_BASE_URL="https://integrate.api.nvidia.com/v1"
 # JUDGE_MODEL="qwen/qwen3.5-397b-a17b"
 # JUDGE_BASE_URL="https://integrate.api.nvidia.com/v1"
-# FR_RACK_SIZE=32
+# FR_SEGMENT_SIZE=32
+# FR_RACK_SIZE=32  # deprecated alias

From 59ebefe855091d06c58c21c1660b38ce96c23d4a Mon Sep 17 00:00:00 2001
From: Seonmyeong Bak <sbak@nvidia.com>
Date: Tue, 28 Apr 2026 13:27:55 -0700
Subject: [PATCH 18/21] fix(log-analysis): handle zero LLM retries

---
 .../attribution/log_analyzer/nvrx_logsage.py  | 23 ++++++++----
 .../unit/test_nvrx_logsage_retry.py           | 37 +++++++++++++++++++
 2 files changed, 53 insertions(+), 7 deletions(-)
 create mode 100644 tests/attribution/unit/test_nvrx_logsage_retry.py

diff --git a/src/nvidia_resiliency_ext/attribution/log_analyzer/nvrx_logsage.py b/src/nvidia_resiliency_ext/attribution/log_analyzer/nvrx_logsage.py
index 30d90e8c..39a5177c 100644
--- a/src/nvidia_resiliency_ext/attribution/log_analyzer/nvrx_logsage.py
+++ b/src/nvidia_resiliency_ext/attribution/log_analyzer/nvrx_logsage.py
@@ -167,6 +167,14 @@ def _retry_return_application_errors(
 def _with_exponential_backoff(llm_call, checkpoint_saved: bool) -> tuple[str, str, str, str, str]:
     retries, initial_backoff, max_backoff, jitter = _log_analysis_retry_config()
     backoff = initial_backoff
+    last_error = "no attempts made (retries=0)"
+    fallback = (
+        ATTR_LLM_FAILURE,
+        ATTR_LLM_FAILURE,
+        ATTR_LLM_FAILURE,
+        ATTR_LLM_FAILURE,
+        str(checkpoint_saved),
+    )
 
     for attempt in range(1, retries + 1):
         try:
@@ -184,16 +192,17 @@ def _with_exponential_backoff(llm_call, checkpoint_saved: bool) -> tuple[str, st
                 retries,
                 last_error,
             )
-            return (
-                ATTR_LLM_FAILURE,
-                ATTR_LLM_FAILURE,
-                ATTR_LLM_FAILURE,
-                ATTR_LLM_FAILURE,
-                str(checkpoint_saved),
-            )
+            return fallback
 
         backoff = _sleep_with_backoff(attempt, retries, backoff, max_backoff, jitter)
 
+    logger.error(
+        "Log-analysis LLM failed after %d attempts; last error: %s",
+        retries,
+        last_error,
+    )
+    return fallback
+
 
 class NVRxLogAnalyzer(NVRxAttribution):
     def __init__(self, args: Union[argparse.Namespace, Mapping[str, Any]]):
diff --git a/tests/attribution/unit/test_nvrx_logsage_retry.py b/tests/attribution/unit/test_nvrx_logsage_retry.py
new file mode 100644
index 00000000..7dd8ce94
--- /dev/null
+++ b/tests/attribution/unit/test_nvrx_logsage_retry.py
@@ -0,0 +1,37 @@
+import importlib
+import os
+import unittest
+from unittest.mock import patch
+
+
+try:
+    nvrx_logsage = importlib.import_module(
+        "nvidia_resiliency_ext.attribution.log_analyzer.nvrx_logsage"
+    )
+    IMPORT_ERROR = None
+except ImportError as exc:
+    nvrx_logsage = None
+    IMPORT_ERROR = exc
+
+
+@unittest.skipIf(nvrx_logsage is None, f"missing optional dependency: {IMPORT_ERROR}")
+class TestNVRxLogSageRetry(unittest.TestCase):  # skipped when the nvrx_logsage import above failed
+    def test_with_exponential_backoff_returns_failure_when_retries_zero(self):
+        def llm_call():
+            raise AssertionError("llm_call should not run when retries=0")
+        # NVRX_LOG_ANALYSIS_LLM_RETRIES is set to "0" only for the duration of this call.
+        with patch.dict(os.environ, {"NVRX_LOG_ANALYSIS_LLM_RETRIES": "0"}):
+            self.assertEqual(
+                nvrx_logsage._with_exponential_backoff(llm_call, checkpoint_saved=True),
+                (
+                    nvrx_logsage.ATTR_LLM_FAILURE,
+                    nvrx_logsage.ATTR_LLM_FAILURE,
+                    nvrx_logsage.ATTR_LLM_FAILURE,
+                    nvrx_logsage.ATTR_LLM_FAILURE,
+                    "True",  # str(checkpoint_saved)
+                ),
+            )
+
+
+if __name__ == "__main__":
+    unittest.main()

From 0e1d6db6693c44b67702190581dd66572c3d89f7 Mon Sep 17 00:00:00 2001
From: Seonmyeong Bak <sbak@nvidia.com>
Date: Tue, 28 Apr 2026 13:51:23 -0700
Subject: [PATCH 19/21] fix(skills): use LLM API key names

---
 src/nvidia_resiliency_ext/skills/nvrx-attr/SKILL.md       | 4 ++--
 .../skills/nvrx-attr/fault-injection-loop/SKILL.md        | 8 ++++----
 .../skills/nvrx-attr/fr-analysis/SKILL.md                 | 4 ++--
 .../skills/nvrx-attr/log-analysis/SKILL.md                | 4 ++--
 .../skills/nvrx-attr/scripts/score_attribution.py         | 6 +++---
 .../skills/nvrx-attr/scripts/user.env.example             | 6 +++---
 6 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/SKILL.md b/src/nvidia_resiliency_ext/skills/nvrx-attr/SKILL.md
index 1f018b2a..5da1c9db 100644
--- a/src/nvidia_resiliency_ext/skills/nvrx-attr/SKILL.md
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/SKILL.md
@@ -4,7 +4,7 @@ description: >
   Orchestration layer over nvidia_resiliency_ext attribution modules. Provides
   log-analysis, fr-analysis, and a Megatron-LM-oriented fault-injection feedback
   loop for benchmarking attribution quality on SLURM workloads.
-compatibility: Requires Python 3.10+, nvidia-resiliency-ext installed, logsage, langchain-openai, and NVIDIA_API_KEY (env var, NVIDIA_API_KEY_FILE, or ~/.nvidia_api_key). The fault-injection loop has only been validated with Megatron-LM workloads.
+compatibility: Requires Python 3.10+, nvidia-resiliency-ext installed, logsage, langchain-openai, and LLM_API_KEY (env var, LLM_API_KEY_FILE, or ~/.llm_api_key). The fault-injection loop has only been validated with Megatron-LM workloads.
 metadata:
   author: nvidia
 ---
@@ -45,7 +45,7 @@ full coalescing stack.
 
 ## Common prerequisites
 
-- `NVIDIA_API_KEY` environment variable, `NVIDIA_API_KEY_FILE`, or `~/.nvidia_api_key`
+- `LLM_API_KEY` environment variable, `LLM_API_KEY_FILE`, or `~/.llm_api_key`
 - `langchain-openai` installed
 - `logsage` package installed (required by `log_analysis`)
 - Package installed: `pip install nvidia-resiliency-ext` or `pip install -e .` from repo root
diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md b/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md
index 79b3637e..063bb1cb 100644
--- a/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md
@@ -8,7 +8,7 @@ description: >
   After all jobs complete, runs /log-analysis and /fr-analysis on every experiment,
   scores attribution vs. ground truth, aggregates gaps, and iterates on attribution
   modules to close them.
-compatibility: Requires SLURM cluster access, sbatch, NVIDIA_API_KEY, langchain-openai, logsage, and nvidia-resiliency-ext installed. This workflow has only been validated with Megatron-LM workloads.
+compatibility: Requires SLURM cluster access, sbatch, LLM_API_KEY, langchain-openai, logsage, and nvidia-resiliency-ext installed. This workflow has only been validated with Megatron-LM workloads.
 metadata:
   author: nvidia
   sub-skills: [log-analysis, fr-analysis]
@@ -114,8 +114,8 @@ MEGATRON_REPO_HOST_PATH="${HOME}/megatron-lm-main"
 SHARED_TMP_BASE_DIR="${HOME}/tmp"
 WORKSPACE_HOST_PATH="${HOME}/tmp"
 CONTAINER_IMAGE="nvcr.io/nvidia/nemo:26.04"
-NVIDIA_API_KEY_FILE="${HOME}/.nvidia_api_key"
-JUDGE_API_KEY_FILE="${HOME}/.nvidia_api_key"
+LLM_API_KEY_FILE="${HOME}/.llm_api_key"
+JUDGE_API_KEY_FILE="${HOME}/.llm_api_key"
 NVRX_LLM_MODEL="nvidia/nemotron-3-super-120b-a12b"
 NVRX_LLM_BASE_URL="https://integrate.api.nvidia.com/v1"
 JUDGE_MODEL="qwen/qwen3.5-397b-a17b"
@@ -150,7 +150,7 @@ Environment variables:
 | `SHARED_TMP_BASE_DIR` | `${HOME}/tmp` | Shared filesystem path used for cross-step coordination |
 | `WORKSPACE_HOST_PATH` | `${HOME}/tmp` | Host path mounted at `/workspace` inside the container |
 | `CONTAINER_IMAGE` | `nvcr.io/nvidia/nemo:26.04` | Container image used by the workload script |
-| `NVIDIA_API_KEY_FILE` | _unset_ | File containing the log-analysis API key |
+| `LLM_API_KEY_FILE` | _unset_ | File containing the log-analysis API key |
 | `JUDGE_API_KEY_FILE` | _unset_ | File containing the judge API key |
 | `NVRX_LLM_MODEL` | `nvidia/nemotron-3-super-120b-a12b` | Model for log-analysis |
 | `NVRX_LLM_BASE_URL` | `https://integrate.api.nvidia.com/v1` | Base URL for log-analysis |
diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/fr-analysis/SKILL.md b/src/nvidia_resiliency_ext/skills/nvrx-attr/fr-analysis/SKILL.md
index 17cc7de5..8fcf559c 100644
--- a/src/nvidia_resiliency_ext/skills/nvrx-attr/fr-analysis/SKILL.md
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/fr-analysis/SKILL.md
@@ -5,7 +5,7 @@ description: >
   isolate the responsible ranks using CollectiveAnalyzer. Use when a distributed training job
   hangs due to an NCCL collective timeout and FR dump files are available. Detects the wavefront
   process group where collectives diverge and returns the root-cause suspect ranks.
-compatibility: Requires PyTorch NCCL FR dumps (TORCH_NCCL_TRACE_BUFFER_SIZE > 0 must be set during training). NVIDIA_API_KEY and langchain-openai are required only when using --llm-analyze.
+compatibility: Requires PyTorch NCCL FR dumps (TORCH_NCCL_TRACE_BUFFER_SIZE > 0 must be set during training). LLM_API_KEY and langchain-openai are required only when using --llm-analyze.
 metadata:
   entry-point: CollectiveAnalyzer
   script: scripts/fr_attribution.py
@@ -108,6 +108,6 @@ or triggered automatically on NCCL timeout.
 ## Prerequisites
 
 - FR dump files produced by PyTorch NCCL (set `TORCH_NCCL_TRACE_BUFFER_SIZE` > 0)
-- `NVIDIA_API_KEY` required only when using `--llm-analyze`
+- `LLM_API_KEY` required only when using `--llm-analyze`
 - `langchain-openai` required only when using `--llm-analyze`
 - `FR_DEBUG=1` env var enables verbose debug logging in the script
diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/log-analysis/SKILL.md b/src/nvidia_resiliency_ext/skills/nvrx-attr/log-analysis/SKILL.md
index e793d5de..a1199edc 100644
--- a/src/nvidia_resiliency_ext/skills/nvrx-attr/log-analysis/SKILL.md
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/log-analysis/SKILL.md
@@ -5,7 +5,7 @@ description: >
   NVRxLogAnalyzer. Use when you have a SLURM training job log and need to determine why the
   job failed and whether it should be restarted. Performs per-cycle chunking, fast-path pattern
   matching, and LLM-based classification.
-compatibility: Requires NVIDIA_API_KEY, langchain-openai, and logsage packages installed. nvidia-resiliency-ext must be installed.
+compatibility: Requires LLM_API_KEY to be set and the langchain-openai and logsage packages to be installed. nvidia-resiliency-ext must be installed.
 metadata:
   entry-point: NVRxLogAnalyzer
   script: scripts/nvrx_logsage.py
@@ -108,5 +108,5 @@ fields joined by `\n`:
 
 ## Prerequisites
 
-- `NVIDIA_API_KEY` set (env var, `NVIDIA_API_KEY_FILE`, or `~/.nvidia_api_key`)
+- `LLM_API_KEY` set (env var, `LLM_API_KEY_FILE`, or `~/.llm_api_key`)
 - `langchain-openai` and `logsage` packages installed
diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/score_attribution.py b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/score_attribution.py
index 16c6faec..fcc5c3f3 100644
--- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/score_attribution.py
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/score_attribution.py
@@ -28,7 +28,7 @@
 from langchain_openai import ChatOpenAI
 
 sys.path.insert(0, str(__import__("pathlib").Path(__file__).resolve().parents[4]))
-from nvidia_resiliency_ext.attribution.api_keys import load_nvidia_api_key
+from nvidia_resiliency_ext.attribution.api_keys import load_llm_api_key
 from nvidia_resiliency_ext.attribution.svc.config import DEFAULT_LLM_BASE_URL
 
 logger = logging.getLogger(__name__)
@@ -273,11 +273,11 @@ def score(args):
             except OSError:
                 api_key = ""
     if not api_key:
-        api_key = load_nvidia_api_key()
+        api_key = load_llm_api_key()
     if not api_key:
         raise ValueError(
             "Judge API key not found. Set JUDGE_API_KEY/JUDGE_API_KEY_FILE, "
-            "or NVIDIA_API_KEY/NVIDIA_API_KEY_FILE, or create ~/.nvidia_api_key"
+            "or LLM_API_KEY/LLM_API_KEY_FILE, or create ~/.llm_api_key"
         )
 
     base_url = os.getenv("JUDGE_BASE_URL", "").strip() or args.base_url
diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/user.env.example b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/user.env.example
index cf060f45..c4b05bc9 100644
--- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/user.env.example
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/user.env.example
@@ -33,10 +33,10 @@ CONTAINER_IMAGE="nvcr.io/nvidia/nemo:26.04"
 
 # Log-analysis / judge LLM settings
 # Keep these local. Prefer *_API_KEY_FILE over inline secrets.
-# NVIDIA_API_KEY="..."
+# LLM_API_KEY="..."
 # JUDGE_API_KEY="..."
-# NVIDIA_API_KEY_FILE="${HOME}/.nvidia_api_key"
-# JUDGE_API_KEY_FILE="${HOME}/.nvidia_api_key"
+# LLM_API_KEY_FILE="${HOME}/.llm_api_key"
+# JUDGE_API_KEY_FILE="${HOME}/.llm_api_key"
 # NVRX_LLM_MODEL="nvidia/nemotron-3-super-120b-a12b"
 # NVRX_LLM_BASE_URL="https://integrate.api.nvidia.com/v1"
 # JUDGE_MODEL="qwen/qwen3.5-397b-a17b"

From 26e642987a6bade9fb5b620bd2533621d7d2d5c3 Mon Sep 17 00:00:00 2001
From: Seonmyeong Bak <sbak@nvidia.com>
Date: Tue, 28 Apr 2026 14:35:41 -0700
Subject: [PATCH 20/21] fix(skills): resolve user env for spooled jobs

---
 .../nvrx-attr/scripts/l4_gb200_reduced.sh     | 21 ++++++++++++++++++-
 .../nvrx-attr/scripts/n3_super_gb200_fi.sh    | 21 ++++++++++++++++++-
 .../nvrx-attr/scripts/prepare_node_alloc.sh   |  2 +-
 3 files changed, 41 insertions(+), 3 deletions(-)

diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/l4_gb200_reduced.sh b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/l4_gb200_reduced.sh
index 319053fc..4da69f2a 100644
--- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/l4_gb200_reduced.sh
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/l4_gb200_reduced.sh
@@ -17,7 +17,26 @@
 #SBATCH --exclusive
 #SBATCH --mem=0
 
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+resolve_script_dir() {  # Print the absolute path of the directory that holds user.env.
+    local candidate
+    # Probe in priority order; SLURM_SUBMIT_DIR-derived entries expand to "" (skipped) when it is unset.
+    for candidate in \
+        "${NVRX_ATTR_SCRIPT_DIR:-}" \
+        "$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)" \
+        "${SLURM_SUBMIT_DIR:-}" \
+        "${SLURM_SUBMIT_DIR:+${SLURM_SUBMIT_DIR}/scripts}" \
+        "${SLURM_SUBMIT_DIR:+${SLURM_SUBMIT_DIR}/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts}"
+    do
+        if [[ -n "${candidate}" && -f "${candidate}/user.env" ]]; then
+            cd "${candidate}" && pwd
+            return
+        fi
+    done
+    # No candidate contains user.env: fall back to the directory of the running script.
+    cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd
+}
+
+SCRIPT_DIR="$(resolve_script_dir)"
 USER_ENV_FILE="${SCRIPT_DIR}/user.env"
 NVRX_SRC_ROOT_DEFAULT="$(cd "${SCRIPT_DIR}/../../../.." && pwd)"
 NVRX_REPO_ROOT_DEFAULT="$(cd "${SCRIPT_DIR}/../../../../.." && pwd)"
diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/n3_super_gb200_fi.sh b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/n3_super_gb200_fi.sh
index 7f077575..f1e7d818 100644
--- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/n3_super_gb200_fi.sh
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/n3_super_gb200_fi.sh
@@ -15,7 +15,26 @@
 #SBATCH --exclusive
 #SBATCH --mem=0
 
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+resolve_script_dir() {  # Print the absolute path of the directory that holds user.env.
+    local candidate
+    # Probe in priority order; SLURM_SUBMIT_DIR-derived entries expand to "" (skipped) when it is unset.
+    for candidate in \
+        "${NVRX_ATTR_SCRIPT_DIR:-}" \
+        "$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)" \
+        "${SLURM_SUBMIT_DIR:-}" \
+        "${SLURM_SUBMIT_DIR:+${SLURM_SUBMIT_DIR}/scripts}" \
+        "${SLURM_SUBMIT_DIR:+${SLURM_SUBMIT_DIR}/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts}"
+    do
+        if [[ -n "${candidate}" && -f "${candidate}/user.env" ]]; then
+            cd "${candidate}" && pwd
+            return
+        fi
+    done
+    # No candidate contains user.env: fall back to the directory of the running script.
+    cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd
+}
+
+SCRIPT_DIR="$(resolve_script_dir)"
 USER_ENV_FILE="${SCRIPT_DIR}/user.env"
 NVRX_SRC_ROOT_DEFAULT="$(cd "${SCRIPT_DIR}/../../../.." && pwd)"
 NVRX_REPO_ROOT_DEFAULT="$(cd "${SCRIPT_DIR}/../../../../.." && pwd)"
diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/prepare_node_alloc.sh b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/prepare_node_alloc.sh
index 9ce92b67..c2d43752 100755
--- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/prepare_node_alloc.sh
+++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/prepare_node_alloc.sh
@@ -179,7 +179,7 @@ submit_one() {
         --mem=0 \
         --output="${EXPERIMENT_DIR}/logs/slurm/%j.launch.out" \
         --error="${EXPERIMENT_DIR}/logs/slurm/%j.launch.err" \
-        --export=ALL,FAULT_TYPE="${FAULT_TYPE}",FAULT_RANK="${RANK}",FAULT_AT_ITER="${ITER}",GPUS_PER_NODE="${GPUS_PER_NODE}",EXPERIMENT_DIR="${EXPERIMENT_DIR}",BASE_EXPERIMENTS_DIR="${BASE_EXPERIMENTS_DIR}",MEGATRON_REPO_HOST_PATH="${MEGATRON_REPO_HOST_PATH:-}",CONTAINER_IMAGE="${CONTAINER_IMAGE:-}",SHARED_TMP_BASE_DIR="${SHARED_TMP_BASE_DIR:-}",WORKSPACE_HOST_PATH="${WORKSPACE_HOST_PATH:-}" \
+        --export=ALL,FAULT_TYPE="${FAULT_TYPE}",FAULT_RANK="${RANK}",FAULT_AT_ITER="${ITER}",GPUS_PER_NODE="${GPUS_PER_NODE}",EXPERIMENT_DIR="${EXPERIMENT_DIR}",BASE_EXPERIMENTS_DIR="${BASE_EXPERIMENTS_DIR}",MEGATRON_REPO_HOST_PATH="${MEGATRON_REPO_HOST_PATH:-}",CONTAINER_IMAGE="${CONTAINER_IMAGE:-}",SHARED_TMP_BASE_DIR="${SHARED_TMP_BASE_DIR:-}",WORKSPACE_HOST_PATH="${WORKSPACE_HOST_PATH:-}",NVRX_ATTR_SCRIPT_DIR="${SCRIPT_DIR}" \
         --parsable
     )
     if [[ -n "${ACCOUNT}" ]]; then

From d4a3999b80bf0e0a456bfbf73176f66b5a583de1 Mon Sep 17 00:00:00 2001
From: Seonmyeong Bak <sbak@nvidia.com>
Date: Tue, 28 Apr 2026 15:11:39 -0700
Subject: [PATCH 21/21] style(tests): sort nvrx logsage retry imports

---
 tests/attribution/unit/test_nvrx_logsage_retry.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/attribution/unit/test_nvrx_logsage_retry.py b/tests/attribution/unit/test_nvrx_logsage_retry.py
index 7dd8ce94..4c5e5890 100644
--- a/tests/attribution/unit/test_nvrx_logsage_retry.py
+++ b/tests/attribution/unit/test_nvrx_logsage_retry.py
@@ -3,7 +3,6 @@
 import unittest
 from unittest.mock import patch
 
-
 try:
     nvrx_logsage = importlib.import_module(
         "nvidia_resiliency_ext.attribution.log_analyzer.nvrx_logsage"