From b2a035ebc6fefe007a36dd9b2595dfb26178569c Mon Sep 17 00:00:00 2001 From: Seonmyeong Bak Date: Wed, 22 Apr 2026 11:23:47 -0700 Subject: [PATCH 01/21] feat(skills): add nvrx-attr skill bundle --- src/nvidia_resiliency_ext/skills/__init__.py | 1 + .../nvrx-attr/SESSION_REPORT_20260409_13.md | 137 +++++++ .../skills/nvrx-attr/SKILL.md | 52 +++ .../nvrx-attr/fault-injection-loop/SKILL.md | 336 ++++++++++++++++ .../skills/nvrx-attr/fr-analysis/SKILL.md | 113 ++++++ .../fr-analysis/scripts/fr_attribution.py | 1 + .../skills/nvrx-attr/l4_gb200_reduced.sh | 363 +++++++++++++++++ .../skills/nvrx-attr/log-analysis/SKILL.md | 112 ++++++ .../log-analysis/scripts/nvrx_logsage.py | 1 + .../nvrx-attr/scripts/l4_gb200_reduced.sh | 362 +++++++++++++++++ .../nvrx-attr/scripts/n3_super_gb200.sh | 166 ++++++++ .../scripts/n3_super_gb200_shm_test.sh | 369 ++++++++++++++++++ .../scripts/pools/n3_super_8n_16n.pool | 40 ++ .../nvrx-attr/scripts/prepare_node_alloc.sh | 209 ++++++++++ .../skills/nvrx-attr/scripts/run_session.sh | 39 ++ .../nvrx-attr/scripts/score_attribution.py | 237 +++++++++++ .../nvrx-attr/scripts/watch_and_analyze.sh | 202 ++++++++++ .../skills/nvrx-attr/scripts/workloads.conf | 17 + 18 files changed, 2757 insertions(+) create mode 100644 src/nvidia_resiliency_ext/skills/__init__.py create mode 100644 src/nvidia_resiliency_ext/skills/nvrx-attr/SESSION_REPORT_20260409_13.md create mode 100644 src/nvidia_resiliency_ext/skills/nvrx-attr/SKILL.md create mode 100644 src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md create mode 100644 src/nvidia_resiliency_ext/skills/nvrx-attr/fr-analysis/SKILL.md create mode 120000 src/nvidia_resiliency_ext/skills/nvrx-attr/fr-analysis/scripts/fr_attribution.py create mode 100644 src/nvidia_resiliency_ext/skills/nvrx-attr/l4_gb200_reduced.sh create mode 100644 src/nvidia_resiliency_ext/skills/nvrx-attr/log-analysis/SKILL.md create mode 120000 
src/nvidia_resiliency_ext/skills/nvrx-attr/log-analysis/scripts/nvrx_logsage.py create mode 100644 src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/l4_gb200_reduced.sh create mode 100644 src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/n3_super_gb200.sh create mode 100755 src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/n3_super_gb200_shm_test.sh create mode 100644 src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/pools/n3_super_8n_16n.pool create mode 100755 src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/prepare_node_alloc.sh create mode 100755 src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/run_session.sh create mode 100644 src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/score_attribution.py create mode 100755 src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/watch_and_analyze.sh create mode 100644 src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/workloads.conf diff --git a/src/nvidia_resiliency_ext/skills/__init__.py b/src/nvidia_resiliency_ext/skills/__init__.py new file mode 100644 index 00000000..1670aafe --- /dev/null +++ b/src/nvidia_resiliency_ext/skills/__init__.py @@ -0,0 +1 @@ +"""Agent skills bundled with nvidia_resiliency_ext.""" diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/SESSION_REPORT_20260409_13.md b/src/nvidia_resiliency_ext/skills/nvrx-attr/SESSION_REPORT_20260409_13.md new file mode 100644 index 00000000..657cbbd5 --- /dev/null +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/SESSION_REPORT_20260409_13.md @@ -0,0 +1,137 @@ +# Fault Injection Session Report — April 9–13, 2026 + +## Summary + +End-to-end validation of the fault-injection attribution pipeline across 48 experiments. +Identified and fixed three pipeline bugs, confirmed FR analysis is solid, and isolated the +remaining attribution gap to a single issue: **logsage returns RESTART IMMEDIATE for +crash/exception-type faults that should be STOP**. 
+ +--- + +## Pipeline Fixes Applied + +| File | Fix | +|---|---| +| `trace_analyzer/capture.py` | `capture_logs()` now saves/restores logger level and lowers it to INFO — previously, root logger at WARNING silently dropped all `logger.info()` calls inside the capture block, producing empty `analysis_text` from `CollectiveAnalyzer` | +| `trace_analyzer/fr_attribution.py` | `main()` now prints `analysis_text` + `hanging_ranks` to stdout (was discarding results) | +| `scripts/watch_and_analyze.sh` | FR inline Python block: import from installed package (not local skill copy), correctly extract `analysis_text`/`hanging_ranks` from returned dict, redirect stderr to `/dev/null` instead of mixing into FR output | +| `scripts/score_attribution.py` | **New file** — LLM judge (Claude Sonnet) that scores 5 attribution dimensions per experiment and returns structured JSON | + +--- + +## Experiment Sessions + +### Session 1 — Mini-batch validation (Apr 9, `20260409_160245`) + +6 experiments: GPU_SLEEP×2, GPU_ERROR×2, SIGKILL×1, SIGTERM×1 — all 2-node. +Purpose: confirm pipeline works end-to-end after fixes. + +| # | FAULT_TYPE | RANK | restart | rank_p | rank_a | fault_d | fr_rank | +|---|---|---|---|---|---|---|---| +| 1 | GPU_SLEEP | 1 | ✅ | ✅ | ✅ | ✅ | ✅ | +| 2 | GPU_SLEEP | 0 | ✅ | ✅ | ✅ | partial | ✅ | +| 3 | GPU_ERROR | 1 | ❌ | ❌ | ❌ | partial | ✅ | +| 4 | GPU_ERROR | 0 | ❌ | ❌ | ❌ | partial | ✅ | +| 5 | SIGKILL | 1 | ❌ | ✅ | ✅ | partial | ✅ | +| 6 | SIGTERM | 1 | ✅ | ❌ | ❌ | partial | ✅ | + +FR analysis: 6/6 correct. Pipeline confirmed working. + +--- + +### Session 2 — Full default pool (Apr 9, `20260409_170603`) + +34 experiments across all fault types and node counts (2/4/8 nodes). 
+ +**Infrastructure issue:** 18/34 jobs failed at container startup due to a pyxis/enroot +`nvidia-container-cli ldcache` error on certain compute nodes: + +``` +nvidia-container-cli: ldcache error: process /usr/sbin/ldconfig.real failed with error code: 1 +[ERROR] /etc/enroot/hooks.d/98-nvidia.sh exited with return code 1 +pyxis: couldn't start container +rm: cannot remove '/usr/local/cuda/compat/lib': Read-only file system +``` + +The CUDA compat overlay was not being applied on those nodes — `ldconfig` could not write its +cache inside the read-only squashfs container. These jobs produced no FR dumps and their logs +contained only the container error, which logsage misattributed as a disk/storage fault. +The issue was transient and node-specific; jobs submitted the next day ran cleanly. + +**Clean-run results (16/34):** see full table in +`/home/sbak/experiments/llama4-scout-gb200/fault_injection/20260409_170603/experiments_report.md` + +Aggregate for clean-run jobs: + +| FAULT_TYPE | N (clean) | restart% | rank_primary% | fr_rank% | +|---|---|---|---|---| +| GPU_SLEEP | 5 | 80% | 40% | 60% | +| GPU_ERROR | 4 | 0% | 25% | 75% | +| SIGKILL | 3 | 33% | 33% | 100% | +| OS_ABORT | 1 | 0% | 0% | 100% | + +--- + +### Session 3 — SEGFAULT cluster health check (Apr 10, `20260410_135216`) + +2 experiments: SEGFAULT rank=0 and rank=1, 2-node. Purpose: confirm cluster healthy after +the Apr 9 enroot issue. + +| # | FAULT_TYPE | RANK | restart | rank_p | rank_a | fault_d | fr_rank | +|---|---|---|---|---|---|---|---| +| 1 | SEGFAULT | 1 | ❌ | ✅ | ✅ | ✅ | ✅ | +| 2 | SEGFAULT | 0 | ❌ | ✅ | ✅ | ✅ | ✅ | + +Cluster healthy (both COMPLETED, 7 FR dumps each). Rank and fault description correct; +restart decision wrong (RESTART instead of STOP). + +--- + +### Session 4 — Python fault types (Apr 10, `20260410_143501`) + +4 experiments: LOCK_GIL×2, WORKLOAD_EXC×1, ASYNC_EXC×1 — all 2-node. +These were skipped in the full session due to the enroot issue. 
+ +| # | FAULT_TYPE | RANK | restart | rank_p | rank_a | fault_d | fr_rank | +|---|---|---|---|---|---|---|---| +| 1 | LOCK_GIL | 1 | ✅ | ✅ | ✅ | partial | ✅ | +| 2 | LOCK_GIL | 0 | ✅ | ✅ | ✅ | partial | ✅ | +| 3 | WORKLOAD_EXC | 1 | ❌ | ✅ | ✅ | partial | ❌ (rank 7) | +| 4 | ASYNC_EXC | 1 | ❌ | ❌ | ❌ | false | ✅ | + +Note on WORKLOAD_EXC FR result: FR flagged rank 7 instead of rank 1. When a rank throws an +application exception and crashes, the last rank detected as missing by NCCL's collective +timeout isn't necessarily the originating rank — FR is identifying the symptom rank. + +--- + +## Attribution Quality Summary (clean runs only) + +| Dimension | Assessment | +|---|---| +| **FR rank identification** | Solid — correctly identified the hanging rank in all clean-run experiments where NCCL completed enough to produce dumps. The `capture_logs()` fix was the key enabler. | +| **Log rank identification** | Good for hang types (GPU_SLEEP, LOCK_GIL); weaker for crash/signal types where all ranks see a simultaneous NCCL timeout masking the originator. FR compensates for this gap. | +| **Restart decision** | ✅ Correct for hang/recoverable types: GPU_SLEEP, LOCK_GIL, SIGTERM. ❌ Wrong for crash/exception types: GPU_ERROR, SIGKILL, SEGFAULT, WORKLOAD_EXC, ASYNC_EXC — logsage consistently returns RESTART IMMEDIATE when the correct decision is STOP. | +| **Fault description** | Consistently `partial` — logsage describes the observable NCCL collective timeout symptom, not the underlying injected fault (GPU hang, kill signal, exception). This is expected given the log contains only symptoms. | + +--- + +## Open Gap + +**Single actionable fix:** logsage restart decision for crash/exception-type faults. + +Logsage sees the same NCCL collective timeout pattern whether the root cause is a recoverable +GPU hang or a hard crash (SIGKILL, SEGFAULT, CUDA error, application exception). 
It needs +keyword-based fast-path rules to detect crash signals before the LLM runs: + +| Fault type | Expected | Currently returns | +|---|---|---| +| GPU_ERROR | STOP | RESTART IMMEDIATE | +| SIGKILL | STOP | RESTART IMMEDIATE | +| SEGFAULT | STOP | RESTART IMMEDIATE | +| WORKLOAD_EXC | STOP | RESTART IMMEDIATE | +| ASYNC_EXC | STOP | RESTART IMMEDIATE | +| OS_ABORT | STOP | RESTART IMMEDIATE | + +Target file: `attribution/log_analyzer/nvrx_logsage.py` diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/SKILL.md b/src/nvidia_resiliency_ext/skills/nvrx-attr/SKILL.md new file mode 100644 index 00000000..6884f96f --- /dev/null +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/SKILL.md @@ -0,0 +1,52 @@ +--- +name: nvrx-attr +description: > + Orchestration layer over nvidia_resiliency_ext attribution modules. Provides + log-analysis, fr-analysis, and a Megatron-LM-oriented fault-injection feedback + loop for benchmarking attribution quality on SLURM workloads. +compatibility: Requires Python 3.10+, nvidia-resiliency-ext installed, logsage, langchain-openai, and NVIDIA_API_KEY (env var, NVIDIA_API_KEY_FILE, or ~/.nvidia_api_key). The fault-injection loop has only been validated with Megatron-LM workloads. +metadata: + author: nvidia +--- + +# Attribution Skills + +High-level orchestration layer over the `nvidia_resiliency_ext.attribution` modules. +Each subdirectory is a self-contained skill with its own `SKILL.md` and helper scripts. 
+ +## Skills + +| Directory | Purpose | Entry point | +|-----------|---------|------------| +| [`log-analysis/`](./log-analysis/SKILL.md) | Analyze SLURM job logs for failure root-cause and restart decisions | `NVRxLogAnalyzer` (`nvrx_logsage.py`) | +| [`fr-analysis/`](./fr-analysis/SKILL.md) | Analyze NCCL flight-recorder dumps for collective-hang root-cause | `CollectiveAnalyzer` (`fr_attribution.py`) | +| [`fault-injection-loop/`](./fault-injection-loop/SKILL.md) | Run a batched SLURM fault-injection feedback loop and score attribution accuracy | `prepare_node_alloc.sh` / `watch_and_analyze.sh` | + +## How skills relate to the library + +``` +src/nvidia_resiliency_ext/ +├── attribution/ +│ ├── log_analyzer/nvrx_logsage.py ← log-analysis implementation +│ ├── trace_analyzer/fr_attribution.py ← fr-analysis implementation +│ ├── analyzer/engine.py ← combined orchestration entry point +│ └── combined_log_fr/ ← optional log + FR fusion +└── skills/ + └── nvrx-attr/ ← this skill bundle + ├── log-analysis/ + ├── fr-analysis/ + └── fault-injection-loop/ +``` + +The `Analyzer` (`analyzer/engine.py`) is the recommended entry point when you need +request coalescing, result caching, or the combined `LOG_AND_TRACE` pipeline. +Use the individual skills when you want to run one analysis type directly without the +full coalescing stack. 
+ +## Common prerequisites + +- `NVIDIA_API_KEY` environment variable, `NVIDIA_API_KEY_FILE`, or `~/.nvidia_api_key` +- `langchain-openai` installed +- `logsage` package installed (required by `log_analysis`) +- Package installed: `pip install nvidia-resiliency-ext` or `pip install -e .` from repo root +- The fault-injection loop has only been validated with Megatron-LM training scripts diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md b/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md new file mode 100644 index 00000000..abec6a91 --- /dev/null +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md @@ -0,0 +1,336 @@ +--- +name: fault-injection-loop +description: > + Closed-loop fault injection and attribution accuracy benchmark. Draws from a + prioritized pool of (fault_type, rank, iter, nodes) experiments and submits them + 2 at a time via sbatch — waiting for each pair to finish before submitting the + next — to bound filesystem load. GPU-related faults are front-loaded in the pool. + After all jobs complete, runs /log-analysis and /fr-analysis on every experiment, + scores attribution vs. ground truth, aggregates gaps, and iterates on attribution + modules to close them. +compatibility: Requires SLURM cluster access, sbatch, NVIDIA_API_KEY, langchain-openai, logsage, and nvidia-resiliency-ext installed. This workflow has only been validated with Megatron-LM workloads. +metadata: + author: nvidia + sub-skills: [log-analysis, fr-analysis] +--- + +# Skill: fault-injection-loop + +Iterative closed-loop skill that runs a prioritized fault-injection experiment pool +2 jobs at a time, analyzes every artifact, scores attribution accuracy, aggregates +gaps across the matrix, and proposes targeted improvements to attribution modules. + +--- + +## Overview + +``` +┌───────────────────────────────────────────────────────────────────────┐ +│ 0. 
POOL → build ordered pool of (fault_type, rank, iter, nodes) │ +│ GPU faults first, then crash, Python-hang, signal │ +│ │ +│ repeat until pool exhausted: │ +│ 1. SUBMIT → sbatch 2 jobs from pool head │ +│ 2. WAIT → poll until both jobs leave RUNNING/PENDING │ +│ │ +│ after all jobs done: │ +│ 3. ANALYZE → watch_and_analyze.sh: /log-analysis + /fr-analysis │ +│ per completed job, streaming as jobs finish │ +│ 4. SCORE → compare attribution output vs injected ground truth │ +│ 5. AGGREGATE→ build results table; identify systematic failure modes │ +│ 6. IMPROVE → patch log_analyzer/nvrx_logsage.py │ +│ 7. LOOP → re-run same pool with updated attribution code │ +└───────────────────────────────────────────────────────────────────────┘ +``` + +--- + +## Step 0 — Fault Pool Design + +The pool is defined as an ordered list of `FAULT_TYPE:RANK:ITER:NODES` entries +inside `scripts/prepare_node_alloc.sh`. Default pool (34 experiments, 17 batches): + +``` +# GPU hangs — highest priority; full rank sweep across all node counts +GPU_SLEEP:1:5:2 GPU_SLEEP:0:5:2 # 2-node: rank-1, rank-0 +GPU_SLEEP:4:5:2 GPU_SLEEP:7:5:2 # 2-node: mid-rank, last-rank +GPU_SLEEP:1:5:4 GPU_SLEEP:0:5:4 # 4-node: rank-1, rank-0 +GPU_SLEEP:8:5:4 GPU_SLEEP:15:5:4 # 4-node: mid, last +GPU_SLEEP:1:5:8 GPU_SLEEP:0:5:8 # 8-node: rank-1, rank-0 +GPU_SLEEP:16:5:8 GPU_SLEEP:31:5:8 # 8-node: mid, last + +# GPU errors — high priority; rank-0 and rank-1 across all node counts +GPU_ERROR:1:5:2 GPU_ERROR:0:5:2 +GPU_ERROR:1:5:4 GPU_ERROR:0:5:4 +GPU_ERROR:1:5:8 GPU_ERROR:0:5:8 + +# Crash faults +SIGKILL:1:5:2 SIGKILL:0:5:2 +SIGKILL:1:5:4 SIGKILL:1:5:8 +SEGFAULT:1:5:2 SEGFAULT:0:5:2 +SEGFAULT:1:5:4 OS_ABORT:1:5:2 + +# Python-level hangs +LOCK_GIL:1:5:2 LOCK_GIL:0:5:2 +WORKLOAD_EXC:1:5:2 ASYNC_EXC:1:5:2 + +# Signals +SIGTERM:1:5:2 SIGINT:1:5:2 +SIGSTOP:1:5:2 SIGNAL_EXC:1:5:2 +``` + +Rank coverage per node count (4 GPUs/node): + +| Nodes | Total ranks | rank-0 | rank-1 | mid | last | 
+|-------|-------------|--------|--------|-----|------| +| 2 | 8 | 0 | 1 | 4 | 7 | +| 4 | 16 | 0 | 1 | 8 | 15 | +| 8 | 32 | 0 | 1 | 16 | 31 | + +To run a custom subset, override `POOL` before calling the script: +```bash +POOL="GPU_SLEEP:0:5:2 GPU_SLEEP:1:5:2" bash scripts/prepare_node_alloc.sh +``` + +Environment variables: + +| Variable | Default | Description | +|---|---|---| +| `WORKLOAD` | `llama4_scout` | Select a registered workload by name (see `scripts/workloads.conf`) | +| `ACCOUNT` | `root` | SLURM account | +| `PARTITION` | `gb-nvl-134-135` | SLURM partition | +| `GPUS_PER_NODE` | `4` | GPUs per node | +| `TIME` | `00:30:00` | Per-job wall-clock limit | +| `BATCH_SIZE` | `2` | Jobs submitted per round | +| `POLL_INTERVAL` | `30` | Seconds between queue polls | +| `BASE_EXPERIMENTS_DIR` | _(from workloads.conf or `llama4-scout-gb200`)_ | Root for all output | +| `SBATCH_SCRIPT` | `scripts/l4_gb200_reduced.sh` | Job script to submit | +| `POOL` | _(default pool above)_ | Space-separated `FAULT_TYPE:RANK:ITER:NODES` entries | + +### Registered workloads (`scripts/workloads.conf`) + +| Name | Script | Base dir | Description | +|---|---|---|---| +| `llama4_scout` | `l4_gb200_reduced.sh` | `.../llama4-scout-gb200` | Llama4-Scout (reduced layers) on GB200 | + +```bash +# Run the full pool against the validated example workload +bash scripts/prepare_node_alloc.sh + +# Run a custom subset against llama4_scout +POOL="GPU_SLEEP:1:5:2 SIGKILL:1:5:2" WORKLOAD=llama4_scout bash scripts/prepare_node_alloc.sh +``` + +--- + +## Step 1 & 2 — Batched Submission + Wait (automated) + +```bash +bash scripts/prepare_node_alloc.sh +``` + +The script loops: submit 2 jobs → poll `squeue` every 30 s until both finish → +submit next 2. Progress is printed inline: + +``` +>>> Batch 1: experiments 1–2 of 34 + submitted: GPU_SLEEP rank=1 iter=5 nodes=2 -> job=1850 + submitted: GPU_SLEEP rank=0 iter=5 nodes=2 -> job=1851 + waiting for GPU_SLEEP:1:5:2 GPU_SLEEP:0:5:2 (1850,1851) ... 30s 60s done. 
+>>> Batch 2: experiments 3–4 of 34 + ... +``` + +A session directory and TSV tracking file are created at launch time: +``` +${BASE_EXPERIMENTS_DIR}/fault_injection/<SESSION_ID>/ + experiments.tsv ← tracking file (all job IDs + paths) + n<NODES>_<FAULT_TYPE>_r<RANK>_i<ITER>/ ← one subdir per experiment + logs/slurm/<JOB_ID>.launch.out + logs/slurm/<JOB_ID>.*.1.main_workload.log ← log-analysis input + checkpoints/ ← fr-analysis input (FR dumps) + tensorboard/ + experiments_report.md ← generated by watch_and_analyze.sh +``` + +Tracking file columns: `JOB_ID FAULT_TYPE RANK ITER NODES EXPERIMENT_DIR` + +--- + +## Step 3 — Analyze All Experiments + +Run the watcher/analyzer — it reads the tracking file and processes each experiment +as its job state leaves RUNNING/PENDING (works whether jobs are still running or +already done): + +```bash +bash scripts/watch_and_analyze.sh \ + ${BASE_EXPERIMENTS_DIR}/fault_injection/<SESSION_ID>/experiments.tsv +``` + +The watcher: +1. Reads each row from the tracking TSV +2. Calls `nvrx_logsage.py --exclude_nvrx_logs` and parses the text output to get + `restart_decision` and `attribution_text` +3. Calls `CollectiveAnalyzer` from `fr_attribution.py` to get suspect ranks +4. Scores 5 dimensions (restart correctness, rank primary, rank any, fault described, FR rank) +5. Appends a scored row to `experiments_report.md` +6. Repeats until all experiments are analyzed + +To also run the sub-skills interactively for a single experiment: +```bash +/log-analysis --log-path "${EXPERIMENT_DIR}/logs/slurm/${JOB_ID}.*.1.main_workload.log" +/fr-analysis --fr-path "${EXPERIMENT_DIR}/checkpoints/" +``` + +--- + +## Step 4 — Score Each Experiment + +Scoring is performed by `scripts/score_attribution.py`, an LLM judge (Sonnet or Opus) that +receives the ground truth, the filtered raw log, the logsage attribution output, and the FR +analysis output, then returns structured JSON scores with a reasoning note. 
+ +| Column | Values | Meaning | +|---|---|---| +| **restart_correct** | `true` / `false` / `N/A` | Restart decision matches expected for this fault type | +| **rank_primary** | `true` / `false` / `partial` | Injected rank is the primary root-cause in attribution | +| **rank_any** | `true` / `false` | Injected rank mentioned anywhere in attribution | +| **fault_described** | `true` / `false` / `partial` | Fault nature (hang/crash/signal/exception) correctly described | +| **fr_rank_correct** | `true` / `false` / `no_dumps` | FR analysis identifies injected rank as suspect | +| **judge_notes** | string | One-sentence summary of the main gap or confirmation | + +The judge is given: +1. Ground truth: `fault_type`, `rank`, `iter`, `nodes` +2. Expected restart decision + rationale (derived from `score_attribution.py:_RESTART_TABLE`) +3. Filtered raw log (last 400 lines, same `exclude_nvrx_logs` filtering as logsage) +4. Raw logsage stdout (5-field text format) +5. Raw CollectiveAnalyzer text output + +Default judge model: `azure/anthropic/claude-sonnet-4-6`. Override with `--model` in `score_attribution.py`. + +--- + +## Step 5 — Aggregate Results + +The report markdown table from `watch_and_analyze.sh` gives a matrix view. 
Look for +patterns across rows: + +``` +| FAULT_TYPE | NODES | RANK | restart_correct | rank_primary | rank_any | fault_described | fr_rank_correct | judge_notes | +|------------|-------|------|-----------------|--------------|----------|-----------------|-----------------|-------------| +| GPU_SLEEP | 2 | 0 | true | false | true | true | true | rank-0 identified only in secondary issues | +| GPU_SLEEP | 2 | 1 | true | true | true | true | true | correct on all dimensions | +| GPU_ERROR | 2 | 1 | false | false | false | partial | true | LLM issued RESTART; rank not mentioned | +| SIGKILL | 2 | 0 | true | false | false | false | true | attribution describes timeout not kill signal | +``` + +Common failure mode patterns and their meaning: + +| Pattern | Interpretation | +|---|---| +| `rank_primary=false`, `rank_any=true` | Rank detected but treated as collateral; logsage putting it in secondary issues | +| `rank_any=false` for rank-0 | Rank-0 hang silences watchdog on other ranks; logsage lacks rank-0 signal | +| `fault_described=partial` for crash types | Crash keywords present but fault type not specifically named | +| `restart_correct=false` for GPU_ERROR | LLM conflating hardware error with recoverable hang | +| `fr_rank_correct=no_dumps` | NCCL watchdog did not fire before job ended — adjust `TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC` | + +--- + +## Step 6 — Identify and Apply Improvements + +### FR analysis +Deterministic graph algorithm — **do not modify automatically**. +Note misidentifications and escalate to the team. 
+ +### Log analysis (safe to modify) + +| Observation | Target location | Suggested fix | +|---|---|---| +| Wrong restart for hang | `nvrx_logsage.py` fast-path | Strengthen NCCL timeout → `RESTART IMMEDIATE` mapping | +| Missing rank in attr text | `nvrx_logsage.py` prompt | Extract rank from NCCL watchdog lines; add regex | +| Crash misclassified as hang | `nvrx_logsage.py` | Add SIGKILL/SEGFAULT/GPU_ERROR keyword patterns | +| `ERRORS NOT FOUND` when errors exist | `return_application_errors` config | Loosen error extraction filter | +| rank-0 not detected | prompt or fast-path | Add explicit rank-0 hang heuristic (other ranks silent) | +| attr off by many iters | prompt context | Increase weight of iteration-stamped log lines | +| LLM wrong on GPU_ERROR | prompt | Distinguish `cudaError` → crash from NCCL timeout → hang | + +Editable file: `attribution/log_analyzer/nvrx_logsage.py` + +After each patch, re-run the same pool subset that previously failed: +```bash +POOL="GPU_SLEEP:0:5:2 GPU_ERROR:1:5:2" bash scripts/prepare_node_alloc.sh +``` + +--- + +## Step 7 — Loop + +Increment experiment counter. Suggested sweep order across code-change iterations: + +1. **Iteration 1**: full default pool (34 experiments) +2. **Iteration 2**: targeted re-run of all failing cells from iteration 1 +3. **Iteration 3**: expand iter dimension (FAULT_AT_ITER=2 and 10) for remaining gaps +4. **Iteration 4**: add SEGFAULT and LOCK_GIL 4-node/8-node coverage + +Stop condition: all cells pass all five scoring dimensions for two consecutive +code-change iterations. + +--- + +## Adapting A SLURM Script For The Feedback Loop + +The feedback loop is not tied to `l4_gb200_reduced.sh`, but your sbatch script must +match a small contract so the loop can submit, analyze, and score each run. + +Required changes for a custom workload script: + +1. 
Accept these exported variables from `prepare_node_alloc.sh`: + `FAULT_TYPE`, `FAULT_RANK`, `FAULT_AT_ITER`, `EXPERIMENT_DIR`, `BASE_EXPERIMENTS_DIR`, + and `GPUS_PER_NODE`. +2. Write the main training log to: + `${EXPERIMENT_DIR}/logs/slurm/${SLURM_JOB_ID}.*.1.main_workload.log` + so `watch_and_analyze.sh` can find it. +3. Write NCCL flight-recorder dumps under `${EXPERIMENT_DIR}/checkpoints/`. +4. Emit a `[MEGATRON_FAULT] ...` marker when the fault is injected. + `watch_and_analyze.sh` uses this to decide whether the run reached the injection point. +5. Preserve the per-experiment directory layout: + `logs/slurm/`, `checkpoints/`, and `tensorboard/`. + +This has only been validated with Megatron-LM because the current run-valid check and +fault markers depend on Megatron's `debug_fault_injection.py` behavior. If you adapt the +loop to another framework, update both the sbatch script and `watch_and_analyze.sh`. + +## Appendix A: SBATCH_SCRIPT fault parameters + +The example `SBATCH_SCRIPT` reads these env vars from `prepare_node_alloc.sh` via `--export`: + +| Variable | Default | Description | +|---|---|---| +| `FAULT_AT_ITER` | `5` | Training iteration at which to inject | +| `FAULT_RANK` | `1` | Global rank to inject `[0, total_ranks)` | +| `FAULT_TYPE` | `GPU_SLEEP` | Megatron fault type enum name | +| `GPUS_PER_NODE` | `4` | GPUs per node (used to compute `TOTAL_TASKS`) | +| `EXPERIMENT_DIR` | `${BASE_EXPERIMENTS_DIR}/fault_injection/n${SLURM_NNODES}_${FAULT_TYPE}_r${FAULT_RANK}_i${FAULT_AT_ITER}` | Per-experiment output root | +| `BASE_EXPERIMENTS_DIR` | `/home/sbak/experiments/llama4-scout-gb200` | Shared root (datacache, triton/inductor caches) | + +Valid `FAULT_TYPE` values: +`GPU_ERROR`, `GPU_SLEEP`, `WORKLOAD_EXC`, `ASYNC_EXC`, `SIGNAL_EXC`, `OS_ABORT`, +`LOCK_GIL`, `SEGFAULT`, `SIGINT`, `SIGKILL`, `SIGTERM`, `SIGSTOP` + +--- + +## Appendix B: Single-experiment manual run + +```bash +# Manual runs land under fault_injection/manual/ by default (no session 
dir needed) +EXPERIMENT_DIR=/home/sbak/experiments/llama4-scout-gb200/fault_injection/manual/n2_GPU_SLEEP_r1_i5 +mkdir -p ${EXPERIMENT_DIR}/logs/slurm ${EXPERIMENT_DIR}/checkpoints ${EXPERIMENT_DIR}/tensorboard + +sbatch \ + --nodes=2 \ + --output=${EXPERIMENT_DIR}/logs/slurm/%j.launch.out \ + --error=${EXPERIMENT_DIR}/logs/slurm/%j.launch.err \ + --export=ALL,FAULT_TYPE=GPU_SLEEP,FAULT_RANK=1,FAULT_AT_ITER=5,GPUS_PER_NODE=4,EXPERIMENT_DIR=${EXPERIMENT_DIR} \ + scripts/l4_gb200_reduced.sh +``` diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/fr-analysis/SKILL.md b/src/nvidia_resiliency_ext/skills/nvrx-attr/fr-analysis/SKILL.md new file mode 100644 index 00000000..df038451 --- /dev/null +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/fr-analysis/SKILL.md @@ -0,0 +1,113 @@ +--- +name: fr-analysis +description: > + Analyze PyTorch NCCL flight-recorder (FR) dumps to identify collective operation hangs and + isolate the responsible ranks using CollectiveAnalyzer. Use when a distributed training job + hangs due to an NCCL collective timeout and FR dump files are available. Detects the wavefront + process group where collectives diverge and returns the root-cause suspect ranks. +compatibility: Requires PyTorch NCCL FR dumps (TORCH_NCCL_TRACE_BUFFER_SIZE > 0 must be set during training). NVIDIA_API_KEY and langchain-openai are required only when using --llm-analyze. +metadata: + entry-point: CollectiveAnalyzer + script: scripts/fr_attribution.py +--- + +# Skill: fr_analysis + +Analyze PyTorch NCCL flight-recorder (FR) dumps to identify the collective operation hang +and isolate the ranks responsible, using `CollectiveAnalyzer`. + +**Script:** [`scripts/fr_attribution.py`](./scripts/fr_attribution.py) → `attribution/trace_analyzer/fr_attribution.py` + +--- + +## What it does + +1. Loads all FR dump files (JSON or binary pickle) matching a glob pattern under `--fr-path`. +2. Parses each dump into `Collective` records (op type, ranks, process group, timing, state). 
+3. Groups collectives by process group and sequence ID across ranks to detect mismatches. +4. Identifies the **wavefront** — the process group boundary where collectives diverge — and + returns the missing ranks at that boundary as the root-cause suspects. +5. Optionally runs an LLM pass (`--llm-analyze`) over the structured findings for a + human-readable summary. + +--- + +## CLI + +```bash +python scripts/fr_attribution.py \ + --fr-path /path/to/fr_dumps/ \ + [--pattern "*.json"] \ + [--verbose] \ + [--health-check] \ + [--llm-analyze] \ + [--model MODEL] \ + [--debug] +``` + +| Flag | Default | Description | +|------|---------|-------------| +| `--fr-path` | required | Path to a directory (or single file) containing FR dump files | +| `--pattern` | `*.json` | Glob pattern for dump files within `--fr-path` | +| `--verbose`, `-v` | off | Print detailed per-rank collective tables | +| `--health-check`, `-c` | off | Include node health check results in output | +| `--llm-analyze`, `-l` | off | Pass structured findings to the LLM for a narrative summary | +| `--model`, `-m` | `nvdev/nvidia/llama-3.3-nemotron-super-49b-v1` | LLM model (only used with `--llm-analyze`) | +| `--debug` | off | Convert binary trace files to JSON for inspection | + +--- + +## Programmatic API + +```python +from nvidia_resiliency_ext.attribution.trace_analyzer.fr_attribution import CollectiveAnalyzer + +analyzer = CollectiveAnalyzer({ + "fr_path": "/path/to/fr_dumps/", + "pattern": "*.json", + "verbose": False, + "health_check": False, + "llm_analyze": False, + "model": "nvdev/nvidia/llama-3.3-nemotron-super-49b-v1", +}) +results = analyzer.run_sync({ + "fr_path": "/path/to/fr_dumps/", +}) +# results: list[tuple[str, AttributionState]] +``` + +--- + +## Output + +Returns `(text, AttributionState)` pairs where `text` describes: + +- The **wavefront process group** where collectives diverged +- **Missing ranks** at the wavefront (root-cause suspects) +- Per-rank collective status tables 
(when `--verbose`) +- Node health summary (when `--health-check`) +- LLM narrative (when `--llm-analyze`) + +`AttributionState.STOP` indicates the hang is unrecoverable; `CONTINUE` indicates the job +may be restartable after isolating the identified ranks. + +--- + +## Dump file formats + +| Format | Notes | +|--------|-------| +| JSON (`.json`) | Standard PyTorch FR export; default glob pattern | +| Binary pickle | Detected automatically; use `--debug` to convert to JSON | + +FR dumps are typically written to the directory specified by `TORCH_NCCL_DEBUG_INFO_TEMP_FILE` +or triggered automatically on NCCL timeout. + +--- + +## Prerequisites + +- FR dump files produced by PyTorch NCCL (set `TORCH_NCCL_TRACE_BUFFER_SIZE` > 0) +- `NVIDIA_API_KEY` required only when using `--llm-analyze` +- `langchain-openai` required only when using `--llm-analyze` +- `FR_DEBUG=1` env var enables verbose debug logging in the script diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/fr-analysis/scripts/fr_attribution.py b/src/nvidia_resiliency_ext/skills/nvrx-attr/fr-analysis/scripts/fr_attribution.py new file mode 120000 index 00000000..cfac8e34 --- /dev/null +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/fr-analysis/scripts/fr_attribution.py @@ -0,0 +1 @@ +../../../trace_analyzer/fr_attribution.py \ No newline at end of file diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/l4_gb200_reduced.sh b/src/nvidia_resiliency_ext/skills/nvrx-attr/l4_gb200_reduced.sh new file mode 100644 index 00000000..5c903e7a --- /dev/null +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/l4_gb200_reduced.sh @@ -0,0 +1,363 @@ +#!/bin/bash + +#SBATCH --account=root +#SBATCH --partition=gb-nvl-134-135 +#SBATCH --time=00:30:00 + +#SBATCH --job-name=llama4-scout-gb200 +#SBATCH --output=/tmp/slurm-%j.launch.out +#SBATCH --error=/tmp/slurm-%j.launch.err + +#SBATCH --nodes=2 +#SBATCH --ntasks-per-node=4 +#SBATCH --gpus-per-node=4 +#SBATCH --exclusive +#SBATCH --mem=0 + +log_msg() { + local msg="$1" + 
UNIX_DATETIME=$(date +%s) + HUMAN_DATETIME=$(date -d "@$UNIX_DATETIME" '+%Y-%m-%d %H:%M:%S.%3N') + echo ">>> ${msg} ${UNIX_DATETIME} (${HUMAN_DATETIME})" +} + +log_msg "START SBATCH" +echo "Running on nodes: ${SLURM_NODELIST}" +export RITS_PLATFORM_TYPE=gb200 +export RITS_GPUS_PER_NODE=4 +export RITS_NVL_DOMAIN_SIZE=72 +export NCCL_IB_DISABLE=0 +export NCCL_NET_GDR_LEVEL=3 +export RITS_CLUSTER_NAME=nvl72 +export PYXIS_LOG_LEVEL=debug +export NCCL_IB_SL=1 +export NCCL_IB_TIMEOUT=19 +export UB_TIMEOUT=720 +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export NVTE_FWD_LAYERNORM_SM_MARGIN=16 +export NVTE_BWD_LAYERNORM_SM_MARGIN=16 +export NCCL_P2P_NET_CHUNKSIZE=2097152 +export NCCL_DEBUG=WARN +export PYTHONUNBUFFERED=1 +export ONE_LOGGER_JOB_CATEGORY=test +export LOGLEVEL=DEBUG +export TORCHINDUCTOR_WORKER_START=fork +export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True +export TORCH_CPP_LOG_LEVEL=INFO +export TORCH_NCCL_TRACE_BUFFER_SIZE=2000 +export TORCH_NCCL_RETHROW_CUDA_ERRORS=0 +export TORCH_NCCL_ENABLE_MONITORING=1 +export TORCH_NCCL_DUMP_ON_TIMEOUT=1 +export TORCH_NCCL_LOG_CPP_STACK_ON_UNCLEAN_SHUTDOWN=0 +export TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC=20 +export TORCH_DIST_INIT_BARRIER=0 +export TORCH_INCLUDE_STACK_TRACE=0 +export TORCH_INCLUDE_ONLY_ACTIVE=1 +export TORCH_NCCL_EXTRA_DUMP_ON_EXEC=1 + +# Checkpoint settings (overridable via sbatch --export) +export DIST_TIMEOUT_AFTER_INIT="${DIST_TIMEOUT_AFTER_INIT:-1800}" +# USE_ASYNC_CKPT=1: enable async checkpointing every CKPT_SAVE_INTERVAL iters +export USE_ASYNC_CKPT="${USE_ASYNC_CKPT:-0}" +export CKPT_SAVE_INTERVAL="${CKPT_SAVE_INTERVAL:-100}" +export USE_CPU_SHM="${USE_CPU_SHM:-1}" + +# Quantization mode (overridable via sbatch --export) +export USE_FP8="${USE_FP8:-1}" +export USE_FP4="${USE_FP4:-0}" + +# Overlap comm (overridable via sbatch --export) +export USE_OVERLAP_COMM="${USE_OVERLAP_COMM:-0}" + +# Node / task geometry (SLURM_NNODES is set by SLURM from --nodes override) +export 
GPUS_PER_NODE="${GPUS_PER_NODE:-4}" +TOTAL_TASKS=$((SLURM_NNODES * GPUS_PER_NODE)) + +# Per-experiment output directory (overridable via sbatch --export) +export BASE_EXPERIMENTS_DIR="${BASE_EXPERIMENTS_DIR:-/home/sbak/experiments/llama4-scout-gb200}" +export EXPERIMENT_DIR="${EXPERIMENT_DIR:-${BASE_EXPERIMENTS_DIR}/ckpt_test/n${SLURM_NNODES}}" + +mkdir -p ${BASE_EXPERIMENTS_DIR}/datacache +mkdir -p ${EXPERIMENT_DIR}/tensorboard + +: "${SLURM_RESTART_COUNT:=0}" + +LOG_DIR=${EXPERIMENT_DIR}/logs +mkdir -p ${LOG_DIR}/slurm  # srun does not create the parent dir of --output/--error (LOG_FILE_BASE) +echo "Writing logs to ${LOG_DIR}" +LOG_FILE_BASE="${LOG_DIR}/slurm/${SLURM_JOB_ID}.${SLURM_RESTART_COUNT}" + +# ── Shared-tmp directory (NFS, for cross-srun-step communication) ───────────── +# Mounted to /shared_tmp (NOT /tmp) so the container keeps its native fast /tmp. +SHARED_TMP_HOST=/home/sbak/tmp/${SLURM_JOB_ID} +mkdir -p ${SHARED_TMP_HOST} + +# ── Pre-populate .myenv with all variables that must reach the container ─────── +# Pyxis env forwarding is unreliable for vars set via sbatch --export; writing +# them into .myenv guarantees the inner bash picks them up via `source`. +MYENV_FILE=${SHARED_TMP_HOST}/.myenv_${SLURM_JOB_ID}.sh +cat > ${MYENV_FILE} << MYENVEOF +# Auto-generated by l4_gb200_reduced.sh — do not edit by hand. +export DIST_TIMEOUT_AFTER_INIT=${DIST_TIMEOUT_AFTER_INIT} +export USE_ASYNC_CKPT=${USE_ASYNC_CKPT} +export CKPT_SAVE_INTERVAL=${CKPT_SAVE_INTERVAL} +export USE_CPU_SHM=${USE_CPU_SHM} +export USE_FP8=${USE_FP8} +export USE_FP4=${USE_FP4} +export USE_OVERLAP_COMM=${USE_OVERLAP_COMM} +# Prepend local nvrx src so container picks up our changes without a pip install step.
+export PYTHONPATH=/home/sbak/nvidia-resiliency-ext/src:\${PYTHONPATH} +MYENVEOF + +# Mounts +LUSTRE=/home:/home +SHARED_TMP=${SHARED_TMP_HOST}:/shared_tmp +LOGS=${EXPERIMENT_DIR}/logs:/logs +MEGATRON_REPO=/home/sbak/megatron-lm-main:/megatron-lm_repo +DATACACHE=${BASE_EXPERIMENTS_DIR}/datacache:/datacache +TENSORBOARD=${EXPERIMENT_DIR}/tensorboard:/tensorboard +WORKSPACE=/home/sbak/tmp:/workspace +FR_DUMP=${EXPERIMENT_DIR}/flight_recorder:/flight_recorder +mkdir -p ${EXPERIMENT_DIR}/flight_recorder +CONTAINER_MOUNTS=$LUSTRE,$SHARED_TMP,$LOGS,$MEGATRON_REPO,$DATACACHE,$TENSORBOARD,$WORKSPACE,$FR_DUMP + +# ── Disk cleanup: remove stale enroot containers from prior jobs ────────────── +log_msg "START disk_cleanup" +srun \ + --label \ + --ntasks-per-node=1 \ + --ntasks=${SLURM_NNODES} \ + --kill-on-bad-exit=0 \ + --mpi=none \ + bash -c ' + ENROOT_DIR="/var/lib/enroot/data/$(id -u)" + rm -rf "${ENROOT_DIR:?}"/* 2>/dev/null || true + echo "$(hostname): / $(df -h / | tail -1 | awk "{print \$3\" used, \"\$4\" avail\"}")" + ' +log_msg "END disk_cleanup" + +# all node setup +#-------------------------------- +log_msg "START all_node_setup" +srun \ + --label \ + --container-mounts ${CONTAINER_MOUNTS} \ + --container-image /home/sbak/mcore_ci_0415.sqsh \ + --container-name ${SLURM_JOB_ID} \ + --container-workdir / \ + --exclusive \ + --error=${LOG_FILE_BASE}.0.all_node_setup.log \ + --output=${LOG_FILE_BASE}.0.all_node_setup.log \ + --ntasks-per-node=1 \ + --ntasks=${SLURM_NNODES} \ + --kill-on-bad-exit=0 \ + --mpi=none \ + bash -c ' + # Use a per-node NFS path so all ranks on each node find the right clone. + MEGATRON_PATH=/shared_tmp/megatron_$(hostname)_${SLURM_JOB_ID} + mkdir -p ${MEGATRON_PATH} + pushd $MEGATRON_PATH + CURRENT_BRANCH=$(git -C /megatron-lm_repo branch --show-current) + echo "Cloning Megatron branch $CURRENT_BRANCH to ${MEGATRON_PATH}" + git clone --single-branch --branch $CURRENT_BRANCH /megatron-lm_repo . 
+ popd + ' +log_msg "END all_node_setup" + +# main workload +#-------------------------------- +log_msg "START main_workload" +srun \ + --label \ + --container-mounts ${CONTAINER_MOUNTS} \ + --container-image /home/sbak/mcore_ci_0415.sqsh \ + --container-name ${SLURM_JOB_ID} \ + --container-workdir / \ + --error=${LOG_FILE_BASE}.1.main_workload.log \ + --output=${LOG_FILE_BASE}.1.main_workload.log \ + --ntasks-per-node=${GPUS_PER_NODE} \ + --ntasks=${TOTAL_TASKS} \ + --kill-on-bad-exit=0 \ + --mpi=none \ + bash -c ' + source /shared_tmp/.myenv_${SLURM_JOB_ID}.sh + + # Match the per-node path used in all_node_setup. + MEGATRON_PATH=/shared_tmp/megatron_$(hostname)_${SLURM_JOB_ID} + + NFS_TRITON_CACHE=/home/sbak/experiments/llama4-scout-gb200/triton_cache + NFS_INDUCTOR_CACHE=/home/sbak/experiments/llama4-scout-gb200/inductor_cache + + # Per-rank Triton/inductor cache on the container native /tmp (local fast storage). + export TRITON_CACHE_DIR=/tmp/triton_${SLURM_LOCALID} + export TORCHINDUCTOR_CACHE_DIR=/tmp/inductor_${SLURM_LOCALID} + mkdir -p ${TRITON_CACHE_DIR} ${TORCHINDUCTOR_CACHE_DIR} + + # Pre-stage: warm local cache from NFS (one rank per node) + if [[ "${SLURM_LOCALID}" == "0" ]]; then + if [[ -d "${NFS_TRITON_CACHE}" ]]; then + echo "Pre-staging triton cache from NFS..." + rsync -a --ignore-existing "${NFS_TRITON_CACHE}/" "${TRITON_CACHE_DIR}/" 2>/dev/null || true + fi + if [[ -d "${NFS_INDUCTOR_CACHE}" ]]; then + echo "Pre-staging inductor cache from NFS..." + rsync -a --ignore-existing "${NFS_INDUCTOR_CACHE}/" "${TORCHINDUCTOR_CACHE_DIR}/" 2>/dev/null || true + fi + fi + + # Post-stage: write back to NFS on exit (one rank per node) + _stage_back() { + if [[ "${SLURM_LOCALID}" == "0" ]]; then + echo "Staging triton cache back to NFS..." 
+ mkdir -p "${NFS_TRITON_CACHE}" "${NFS_INDUCTOR_CACHE}" + rsync -a --ignore-existing "${TRITON_CACHE_DIR}/" "${NFS_TRITON_CACHE}/" 2>/dev/null || true + rsync -a --ignore-existing "${TORCHINDUCTOR_CACHE_DIR}/" "${NFS_INDUCTOR_CACHE}/" 2>/dev/null || true + echo "Cache staged back." + fi + } + trap _stage_back EXIT + + # Checkpoint directory — node-local /tmp (cleaned up by the cleanup job). + CKPT_DIR=/tmp/ckpt_${SLURM_JOB_ID} + mkdir -p ${CKPT_DIR} + + if [[ "${USE_FP8:-1}" == "1" ]]; then + QUANT_ARGS="--fp8-format hybrid \ + --fp8-recipe delayed \ + --fp8-param-gather \ + --fp8-amax-history-len 1024 \ + --fp8-amax-compute-algo max \ + --fp8-margin 0" + elif [[ "${USE_FP4:-0}" == "1" ]]; then + QUANT_ARGS="--fp4-format e2m1 \ + --fp4-recipe nvfp4" + else + QUANT_ARGS="" + fi + + if [[ "${USE_OVERLAP_COMM:-0}" == "1" ]]; then + OVERLAP_ARGS="--overlap-grad-reduce --overlap-param-gather" + else + OVERLAP_ARGS="" + fi + + # Build checkpoint args (controlled by USE_ASYNC_CKPT from .myenv). + # No --load: we only want to test save here. 
+ CKPT_SAVE_ARGS="" + if [[ "${USE_ASYNC_CKPT}" == "1" ]]; then + CKPT_SAVE_ARGS="--save ${CKPT_DIR} --save-interval ${CKPT_SAVE_INTERVAL} --async-save --use-persistent-ckpt-worker --use-dist-ckpt --ckpt-fully-parallel-save --ckpt-assume-constant-structure $([[ "${USE_CPU_SHM}" == "1" ]] && echo "--async-ckpt-use-cpu-shm")" + fi + + pushd $MEGATRON_PATH + LAUNCHER_CMD="python3" + LAUNCHER_ARGS=" \ + " + WORKLOAD_CMD=${MEGATRON_PATH}/pretrain_gpt.py + WORKLOAD_ARGS=" \ + --exit-duration-in-mins 5750 \ + --distributed-timeout-minutes 10 \ + --disable-gloo-process-groups \ + --mock-data \ + --data-cache-path /datacache \ + --no-create-attention-mask-in-dataloader \ + --no-mmap-bin-files \ + --tokenizer-type NullTokenizer \ + --tiktoken-pattern v2 \ + --tokenizer-model /lustre/fsw/portfolios/llmservice/projects/llmservice_nlp_fm/nemotron6/tokenizers/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json \ + --micro-batch-size 1 \ + --global-batch-size 64 \ + --train-samples 10240000 \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-05 \ + --lr-decay-style cosine \ + --lr-warmup-samples 1024000 \ + --lr-decay-samples 20480000 \ + --lr 0.0003 \ + --min-lr 2.9999999999999997e-05 \ + --weight-decay 0.1 \ + --clip-grad 1.0 \ + --loss-scale 1.0 \ + --use-mcore-models \ + --untie-embeddings-and-output-weights \ + --disable-bias-linear \ + --attention-backend flash \ + --transformer-impl transformer_engine \ + --position-embedding-type rope \ + --rotary-base 500000 \ + --rotary-interleaved \ + --use-rope-scaling \ + --rope-scaling-factor 8.0 \ + --no-rope-fusion \ + --no-rope-freq 4 \ + --use-flash-attn \ + --cross-entropy-fusion-impl te \ + --cross-entropy-loss-fusion \ + --seq-length 8192 \ + --max-position-embeddings 8192 \ + --num-layers 12 \ + --swiglu \ + --hidden-size 5120 \ + --num-attention-heads 40 \ + --group-query-attention \ + --num-query-groups 8 \ + --ffn-hidden-size 16384 \ + --kv-channels 128 \ + --normalization RMSNorm \ + --attention-dropout 0.0 \ + 
--hidden-dropout 0.0 \ + --grad-reduce-in-bf16 \ + --qk-l2-norm \ + --num-experts 16 \ + --moe-layer-freq 1 \ + --moe-ffn-hidden-size 8192 \ + --moe-shared-expert-intermediate-size 8192 \ + --moe-router-topk 1 \ + --moe-router-score-function sigmoid \ + --moe-token-dispatcher-type alltoall \ + --moe-grouped-gemm \ + --moe-shared-expert-overlap \ + --moe-router-bias-update-rate 0.001 \ + --moe-router-load-balancing-type aux_loss \ + --moe-aux-loss-coeff 0.01 \ + --moe-router-enable-expert-bias \ + --moe-apply-probs-on-input \ + --moe-router-force-load-balancing \ + --bf16 \ + ${QUANT_ARGS} \ + --te-rng-tracker \ + --sequence-parallel \ + --use-distributed-optimizer \ + ${OVERLAP_ARGS} \ + --ddp-num-buckets 5 \ + --tensor-model-parallel-size 1 \ + --pipeline-model-parallel-size 1 \ + --expert-model-parallel-size 8 \ + --expert-tensor-parallel-size 1 \ + --ddp-average-in-collective \ + --log-interval 1 \ + --timing-log-option minmax \ + --log-params-norm \ + --log-num-zeros-in-grad \ + --log-throughput \ + --check-weight-hash-across-dp-replicas-interval 20000 \ + --tensorboard-dir /tensorboard \ + --logging-level 10 \ + --eval-iters 14 \ + --eval-interval 2000 \ + --manual-gc \ + --manual-gc-interval 100 \ + --num-workers 1 \ + --local-rank ${SLURM_LOCALID} \ + --context-parallel-size 1 \ + --vocab-size 238600 \ + --distributed-timeout-seconds-after-init ${DIST_TIMEOUT_AFTER_INIT} \ + --flight-recorder-dump-path /flight_recorder \ + " + $LAUNCHER_CMD $LAUNCHER_ARGS $WORKLOAD_CMD $WORKLOAD_ARGS $CKPT_SAVE_ARGS + ' +log_msg "END main_workload" + +log_msg "END SBATCH" + +set +x diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/log-analysis/SKILL.md b/src/nvidia_resiliency_ext/skills/nvrx-attr/log-analysis/SKILL.md new file mode 100644 index 00000000..a86e2ff7 --- /dev/null +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/log-analysis/SKILL.md @@ -0,0 +1,112 @@ +--- +name: log-analysis +description: > + Analyze a SLURM job log file for failure root-cause attribution 
and restart decisions using + NVRxLogAnalyzer. Use when you have a SLURM training job log and need to determine why the + job failed and whether it should be restarted. Performs per-cycle chunking, fast-path pattern + matching, and LLM-based classification. +compatibility: Requires NVIDIA_API_KEY, langchain-openai, and logsage packages installed. nvidia-resiliency-ext must be installed. +metadata: + entry-point: NVRxLogAnalyzer + script: scripts/nvrx_logsage.py +--- + +# Skill: log_analysis + +Analyze a SLURM job log file for failure root-cause attribution and restart decisions using `NVRxLogAnalyzer`. + +**Script:** [`scripts/nvrx_logsage.py`](./scripts/nvrx_logsage.py) → `attribution/log_analyzer/nvrx_logsage.py` + +--- + +## What it does + +1. Reads the log file (UTF-8, falls back to latin-1). +2. Splits into per-cycle chunks using `chunk_logs_strict` (scans for `profiling.py:.*Cycle:\s*N` markers). Falls back to a single chunk when no markers are found. +3. For each chunk, extracts application errors via `return_application_errors` (logsage). +4. Classifies each chunk with fast-path pattern matching (training done, SLURM cancelled, preemption, time limit) or calls the LLM via `get_proposed_solution_cat`. +5. Returns one result tuple per cycle. 
+ +--- + +## CLI + +```bash +python scripts/nvrx_logsage.py \ + --log-path /path/to/job.log \ + [--model MODEL] \ + [--temperature 0.2] \ + [--top_p 0.7] \ + [--max_tokens 8192] \ + [--exclude_nvrx_logs] \ + [--is_per_cycle] +``` + +| Flag | Default | Description | +|------|---------|-------------| +| `--log-path` | required | Path to the job log file | +| `--model` | `nvidia/qwen/qwen3.5-35b-a3b` | LLM model | +| `--temperature` | `0.2` | Sampling temperature | +| `--top_p` | `0.7` | Top-p nucleus sampling | +| `--max_tokens` | `8192` | Max output tokens | +| `--exclude_nvrx_logs` / `--no-exclude_nvrx_logs` | on | Strip `nvidia_resiliency_ext` / `[workload:]` lines before chunking (default on; use `--no-exclude_nvrx_logs` to disable) | +| `--is_per_cycle` | off | Skip chunking — treat the whole file as a single pre-split cycle | + +--- + +## Programmatic API + +```python +from nvidia_resiliency_ext.attribution.log_analyzer.nvrx_logsage import NVRxLogAnalyzer + +analyzer = NVRxLogAnalyzer({ + "log_path": "/path/to/job.log", + "model": "nvidia/qwen/qwen3.5-35b-a3b", + "temperature": 0.2, + "top_p": 0.7, + "max_tokens": 8192, + "exclude_nvrx_logs": False, + "is_per_cycle": False, +}) +results = analyzer.run_sync({"log_path": "/path/to/job.log"}) +# results: list[tuple[str, AttributionState]] +``` + +Run-time overrides take precedence over constructor config (see `base.effective_run_or_init_config`). + +--- + +## Output + +Each element of the returned list is a `(text, AttributionState)` pair where `text` is five +fields joined by `\n`: + +``` + # "RESTART IMMEDIATE" | "STOP - DONT RESTART IMMEDIATE" + # short string or "" + # "Attribution: Primary issues: [...], Secondary issues: [...]" + # extended text or "" + # "True" | "False" +``` + +`AttributionState.STOP` is set when `restart_decision` contains `"STOP"`; otherwise `CONTINUE`. 
+ +### Fast-path decisions (no LLM call) + +| Detected condition | restart_decision | attribution_text | +|--------------------|-----------------|-----------------| +| Training complete | `STOP - DONT RESTART IMMEDIATE` | `TRAINING DONE` | +| SLURM preemption | `RESTART IMMEDIATE` | `SLURM CANCELLED DUE TO PREEMPTION` | +| SLURM step cancelled | `RESTART IMMEDIATE` | `SLURM STEP CANCELLED` | +| SLURM job requeue | `RESTART IMMEDIATE` | `SLURM STEP CANCELLED JOB REQUEUE` | +| Time-limit exceeded | `STOP - DONT RESTART IMMEDIATE` | status string | +| Empty log | — | `NO LOGS` | +| No errors found | — | `ERRORS NOT FOUND` | +| LLM failure | — | `LLM FAILURE` | + +--- + +## Prerequisites + +- `NVIDIA_API_KEY` set (env var, `NVIDIA_API_KEY_FILE`, or `~/.nvidia_api_key`) +- `langchain-openai` and `logsage` packages installed diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/log-analysis/scripts/nvrx_logsage.py b/src/nvidia_resiliency_ext/skills/nvrx-attr/log-analysis/scripts/nvrx_logsage.py new file mode 120000 index 00000000..528751d1 --- /dev/null +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/log-analysis/scripts/nvrx_logsage.py @@ -0,0 +1 @@ +../../../../attribution/log_analyzer/nvrx_logsage.py \ No newline at end of file diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/l4_gb200_reduced.sh b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/l4_gb200_reduced.sh new file mode 100644 index 00000000..9fd39ab8 --- /dev/null +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/l4_gb200_reduced.sh @@ -0,0 +1,362 @@ +#!/bin/bash + +# Validated only with Megatron-LM as the feedback-loop example workload. 
+ +#SBATCH --account=root +#SBATCH --partition=gb-nvl-134-135 +#SBATCH --time=00:30:00 + +#SBATCH --job-name=llama4-scout-gb200 +#SBATCH --output=/tmp/slurm-%j.launch.out +#SBATCH --error=/tmp/slurm-%j.launch.err + +#SBATCH --nodes=2 +#SBATCH --ntasks-per-node=4 +#SBATCH --gpus-per-node=4 +#SBATCH --exclusive +#SBATCH --mem=0 + +log_msg() { + local msg="$1" + UNIX_DATETIME=$(date +%s) + HUMAN_DATETIME=$(date -d "@$UNIX_DATETIME" '+%Y-%m-%d %H:%M:%S.%3N') + echo ">>> ${msg} ${UNIX_DATETIME} (${HUMAN_DATETIME})" +} + +log_msg "START SBATCH" +echo "Running on nodes: ${SLURM_NODELIST}" +export RITS_PLATFORM_TYPE=gb200 +export RITS_GPUS_PER_NODE=4 +export RITS_NVL_DOMAIN_SIZE=72 +export NCCL_IB_DISABLE=0 +export NCCL_NET_GDR_LEVEL=3 +export RITS_CLUSTER_NAME=nvl72 +export PYXIS_LOG_LEVEL=debug +export NCCL_IB_SL=1 +export NCCL_IB_TIMEOUT=19 +export UB_TIMEOUT=720 +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export NVTE_FWD_LAYERNORM_SM_MARGIN=16 +export NVTE_BWD_LAYERNORM_SM_MARGIN=16 +export NCCL_P2P_NET_CHUNKSIZE=2097152 +export NCCL_DEBUG=WARN +export PYTHONUNBUFFERED=1 +export ONE_LOGGER_JOB_CATEGORY=test +export LOGLEVEL=DEBUG +export TORCHINDUCTOR_WORKER_START=fork +export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True +export TORCH_CPP_LOG_LEVEL=INFO +export TORCH_NCCL_TRACE_BUFFER_SIZE=2000 +export TORCH_NCCL_RETHROW_CUDA_ERRORS=0 +export TORCH_NCCL_ENABLE_MONITORING=1 +export TORCH_NCCL_DUMP_ON_TIMEOUT=1 +export TORCH_NCCL_LOG_CPP_STACK_ON_UNCLEAN_SHUTDOWN=0 +export TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC=20 +export TORCH_DIST_INIT_BARRIER=0 +export TORCH_INCLUDE_STACK_TRACE=0 +export TORCH_INCLUDE_ONLY_ACTIVE=1 +export TORCH_NCCL_EXTRA_DUMP_ON_EXEC=1 + +# Fault injection parameters (overridable via sbatch --export or environment) +export FAULT_AT_ITER="${FAULT_AT_ITER:-5}" +export FAULT_RANK="${FAULT_RANK:-1}" +export FAULT_TYPE="${FAULT_TYPE:-GPU_SLEEP}" + +# Checkpoint settings (overridable via sbatch --export) +export 
NVRX_CKPT_USE_CPU_SHM="${NVRX_CKPT_USE_CPU_SHM:-0}" +# Enable GPU-IPC cached-data-structure path without cpu-shm (for comparison baseline) +export NVRX_CKPT_USE_CACHED_STRUCTURE="${NVRX_CKPT_USE_CACHED_STRUCTURE:-0}" +export DIST_TIMEOUT_AFTER_INIT="${DIST_TIMEOUT_AFTER_INIT:-1}" +# USE_ASYNC_CKPT=1: enable async checkpointing every CKPT_SAVE_INTERVAL iters +export USE_ASYNC_CKPT="${USE_ASYNC_CKPT:-0}" +export CKPT_SAVE_INTERVAL="${CKPT_SAVE_INTERVAL:-100}" + +# Node / task geometry (SLURM_NNODES is set by SLURM from --nodes override) +export GPUS_PER_NODE="${GPUS_PER_NODE:-4}" +TOTAL_TASKS=$((SLURM_NNODES * GPUS_PER_NODE)) + +# Per-experiment output directory (overridable via sbatch --export) +export BASE_EXPERIMENTS_DIR="${BASE_EXPERIMENTS_DIR:-/home/sbak/experiments/llama4-scout-gb200}" +export EXPERIMENT_DIR="${EXPERIMENT_DIR:-${BASE_EXPERIMENTS_DIR}/fault_injection/manual/n${SLURM_NNODES}_${FAULT_TYPE}_r${FAULT_RANK}_i${FAULT_AT_ITER}}" + +mkdir -p ${BASE_EXPERIMENTS_DIR}/datacache +mkdir -p ${EXPERIMENT_DIR}/tensorboard + +: "${SLURM_RESTART_COUNT:=0}" + +LOG_DIR=${EXPERIMENT_DIR}/logs +mkdir -p ${LOG_DIR}/slurm  # srun does not create the parent dir of --output/--error (LOG_FILE_BASE) +echo "Writing logs to ${LOG_DIR}" +LOG_FILE_BASE="${LOG_DIR}/slurm/${SLURM_JOB_ID}.${SLURM_RESTART_COUNT}" + +# ── Shared-tmp directory (NFS, for cross-srun-step communication) ───────────── +# Mounted to /shared_tmp (NOT /tmp) so the container keeps its native fast /tmp. +SHARED_TMP_HOST=/home/sbak/tmp/${SLURM_JOB_ID} +mkdir -p ${SHARED_TMP_HOST} + +# ── Pre-populate .myenv with all variables that must reach the container ─────── +# Pyxis env forwarding is unreliable for vars set via sbatch --export; writing +# them into .myenv guarantees the inner bash picks them up via `source`. +MYENV_FILE=${SHARED_TMP_HOST}/.myenv_${SLURM_JOB_ID}.sh +cat > ${MYENV_FILE} << MYENVEOF +# Auto-generated by l4_gb200_reduced.sh — do not edit by hand.
+export NVRX_CKPT_USE_CPU_SHM=${NVRX_CKPT_USE_CPU_SHM} +export NVRX_CKPT_USE_CACHED_STRUCTURE=${NVRX_CKPT_USE_CACHED_STRUCTURE} +export DIST_TIMEOUT_AFTER_INIT=${DIST_TIMEOUT_AFTER_INIT} +export USE_ASYNC_CKPT=${USE_ASYNC_CKPT} +export CKPT_SAVE_INTERVAL=${CKPT_SAVE_INTERVAL} +export FAULT_AT_ITER=${FAULT_AT_ITER} +export FAULT_RANK=${FAULT_RANK} +export FAULT_TYPE=${FAULT_TYPE} +# Prepend local nvrx src so container picks up our changes without a pip install step. +export PYTHONPATH=/home/sbak/nvidia-resiliency-ext/src:\${PYTHONPATH} +MYENVEOF + +# Mounts +LUSTRE=/home:/home +SHARED_TMP=${SHARED_TMP_HOST}:/shared_tmp +LOGS=${EXPERIMENT_DIR}/logs:/logs +MEGATRON_REPO=/home/sbak/megatron-lm-main:/megatron-lm_repo +DATACACHE=${BASE_EXPERIMENTS_DIR}/datacache:/datacache +TENSORBOARD=${EXPERIMENT_DIR}/tensorboard:/tensorboard +WORKSPACE=/home/sbak/tmp:/workspace +CHECKPOINTS=${EXPERIMENT_DIR}/checkpoints:/checkpoints +mkdir -p ${EXPERIMENT_DIR}/checkpoints +CONTAINER_MOUNTS=$LUSTRE,$SHARED_TMP,$LOGS,$MEGATRON_REPO,$DATACACHE,$TENSORBOARD,$WORKSPACE,$CHECKPOINTS + +# ── Disk cleanup: remove stale enroot containers from prior jobs ────────────── +log_msg "START disk_cleanup" +srun \ + --label \ + --ntasks-per-node=1 \ + --ntasks=${SLURM_NNODES} \ + --kill-on-bad-exit=0 \ + --mpi=none \ + bash -c ' + ENROOT_DIR="/var/lib/enroot/data/$(id -u)" + rm -rf "${ENROOT_DIR:?}"/* 2>/dev/null || true + echo "$(hostname): / $(df -h / | tail -1 | awk "{print \$3\" used, \"\$4\" avail\"}")" + ' +log_msg "END disk_cleanup" + +# all node setup +#-------------------------------- +log_msg "START all_node_setup" +srun \ + --label \ + --container-mounts ${CONTAINER_MOUNTS} \ + --container-image /home/sbak/mcore_ci_0415.sqsh \ + --container-name ${SLURM_JOB_ID} \ + --container-workdir / \ + --exclusive \ + --error=${LOG_FILE_BASE}.0.all_node_setup.log \ + --output=${LOG_FILE_BASE}.0.all_node_setup.log \ + --ntasks-per-node=1 \ + --ntasks=${SLURM_NNODES} \ + --kill-on-bad-exit=0 \ + 
--mpi=none \ + bash -c ' + # Use a per-node NFS path so all ranks on each node find the right clone. + MEGATRON_PATH=/shared_tmp/megatron_$(hostname)_${SLURM_JOB_ID} + mkdir -p ${MEGATRON_PATH} + pushd $MEGATRON_PATH + CURRENT_BRANCH=$(git -C /megatron-lm_repo branch --show-current) + echo "Cloning Megatron branch $CURRENT_BRANCH to ${MEGATRON_PATH}" + git clone --single-branch --branch $CURRENT_BRANCH /megatron-lm_repo . + popd + ' +log_msg "END all_node_setup" + +# main workload +#-------------------------------- +log_msg "START main_workload" +srun \ + --label \ + --container-mounts ${CONTAINER_MOUNTS} \ + --container-image /home/sbak/mcore_ci_0415.sqsh \ + --container-name ${SLURM_JOB_ID} \ + --container-workdir / \ + --error=${LOG_FILE_BASE}.1.main_workload.log \ + --output=${LOG_FILE_BASE}.1.main_workload.log \ + --ntasks-per-node=${GPUS_PER_NODE} \ + --ntasks=${TOTAL_TASKS} \ + --kill-on-bad-exit=0 \ + --mpi=none \ + bash -c ' + source /shared_tmp/.myenv_${SLURM_JOB_ID}.sh + + # Match the per-node path used in all_node_setup. + MEGATRON_PATH=/shared_tmp/megatron_$(hostname)_${SLURM_JOB_ID} + + NFS_TRITON_CACHE=/home/sbak/experiments/llama4-scout-gb200/triton_cache + NFS_INDUCTOR_CACHE=/home/sbak/experiments/llama4-scout-gb200/inductor_cache + + # Per-rank Triton/inductor cache on the container native /tmp (local fast storage). + export TRITON_CACHE_DIR=/tmp/triton_${SLURM_LOCALID} + export TORCHINDUCTOR_CACHE_DIR=/tmp/inductor_${SLURM_LOCALID} + mkdir -p ${TRITON_CACHE_DIR} ${TORCHINDUCTOR_CACHE_DIR} + + # Pre-stage: warm local cache from NFS (one rank per node) + if [[ "${SLURM_LOCALID}" == "0" ]]; then + if [[ -d "${NFS_TRITON_CACHE}" ]]; then + echo "Pre-staging triton cache from NFS..." + rsync -a --ignore-existing "${NFS_TRITON_CACHE}/" "${TRITON_CACHE_DIR}/" 2>/dev/null || true + fi + if [[ -d "${NFS_INDUCTOR_CACHE}" ]]; then + echo "Pre-staging inductor cache from NFS..." 
+ rsync -a --ignore-existing "${NFS_INDUCTOR_CACHE}/" "${TORCHINDUCTOR_CACHE_DIR}/" 2>/dev/null || true + fi + fi + + # Post-stage: write back to NFS on exit (one rank per node) + _stage_back() { + if [[ "${SLURM_LOCALID}" == "0" ]]; then + echo "Staging triton cache back to NFS..." + mkdir -p "${NFS_TRITON_CACHE}" "${NFS_INDUCTOR_CACHE}" + rsync -a --ignore-existing "${TRITON_CACHE_DIR}/" "${NFS_TRITON_CACHE}/" 2>/dev/null || true + rsync -a --ignore-existing "${TORCHINDUCTOR_CACHE_DIR}/" "${NFS_INDUCTOR_CACHE}/" 2>/dev/null || true + echo "Cache staged back." + fi + } + trap _stage_back EXIT + + # Checkpoint directory — NFS path mounted to /checkpoints inside the container. + # /dev/shm is reserved for IPC shm tensors and the DataLoader. + # Note: --log-progress is NOT set. Megatron will not write/read progress.txt + # (which would be per-node and invisible across nodes). + CKPT_DIR=/checkpoints + mkdir -p ${CKPT_DIR} + + # Build checkpoint args (controlled by USE_ASYNC_CKPT from .myenv). + # No --load: we only want to test save here. 
+ CKPT_SAVE_ARGS="" + if [[ "${USE_ASYNC_CKPT}" == "1" ]]; then + CKPT_SAVE_ARGS="--save ${CKPT_DIR} --save-interval ${CKPT_SAVE_INTERVAL} --async-save --use-persistent-ckpt-worker --use-dist-ckpt --ckpt-fully-parallel-save --ckpt-assume-constant-structure" + fi + + pushd $MEGATRON_PATH + LAUNCHER_CMD="python3" + LAUNCHER_ARGS=" \ + " + WORKLOAD_CMD=${MEGATRON_PATH}/pretrain_gpt.py + WORKLOAD_ARGS=" \ + --exit-duration-in-mins 5750 \ + --distributed-timeout-minutes 10 \ + --disable-gloo-process-groups \ + --mock-data \ + --data-cache-path /datacache \ + --no-create-attention-mask-in-dataloader \ + --no-mmap-bin-files \ + --tokenizer-type NullTokenizer \ + --tiktoken-pattern v2 \ + --tokenizer-model /lustre/fsw/portfolios/llmservice/projects/llmservice_nlp_fm/nemotron6/tokenizers/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json \ + --micro-batch-size 1 \ + --global-batch-size 64 \ + --train-samples 10240000 \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-05 \ + --lr-decay-style cosine \ + --lr-warmup-samples 1024000 \ + --lr-decay-samples 20480000 \ + --lr 0.0003 \ + --min-lr 2.9999999999999997e-05 \ + --weight-decay 0.1 \ + --clip-grad 1.0 \ + --loss-scale 1.0 \ + --use-mcore-models \ + --untie-embeddings-and-output-weights \ + --disable-bias-linear \ + --attention-backend flash \ + --transformer-impl transformer_engine \ + --position-embedding-type rope \ + --rotary-base 500000 \ + --rotary-interleaved \ + --use-rope-scaling \ + --rope-scaling-factor 8.0 \ + --no-rope-fusion \ + --no-rope-freq 4 \ + --use-flash-attn \ + --cross-entropy-fusion-impl te \ + --cross-entropy-loss-fusion \ + --seq-length 8192 \ + --max-position-embeddings 8192 \ + --num-layers 12 \ + --swiglu \ + --hidden-size 5120 \ + --num-attention-heads 40 \ + --group-query-attention \ + --num-query-groups 8 \ + --ffn-hidden-size 16384 \ + --kv-channels 128 \ + --normalization RMSNorm \ + --attention-dropout 0.0 \ + --hidden-dropout 0.0 \ + --grad-reduce-in-bf16 \ + --qk-l2-norm \ + 
--num-experts 16 \ + --moe-layer-freq 1 \ + --moe-ffn-hidden-size 8192 \ + --moe-shared-expert-intermediate-size 8192 \ + --moe-router-topk 1 \ + --moe-router-score-function sigmoid \ + --moe-token-dispatcher-type alltoall \ + --moe-grouped-gemm \ + --moe-shared-expert-overlap \ + --moe-router-bias-update-rate 0.001 \ + --moe-router-load-balancing-type aux_loss \ + --moe-aux-loss-coeff 0.01 \ + --moe-router-enable-expert-bias \ + --moe-apply-probs-on-input \ + --moe-router-force-load-balancing \ + --bf16 \ + --fp8-format hybrid \ + --fp8-recipe delayed \ + --fp8-param-gather \ + --fp8-amax-history-len 1024 \ + --fp8-amax-compute-algo max \ + --fp8-margin 0 \ + --te-rng-tracker \ + --sequence-parallel \ + --use-distributed-optimizer \ + --overlap-grad-reduce \ + --overlap-param-gather \ + --ddp-num-buckets 5 \ + --tensor-model-parallel-size 1 \ + --pipeline-model-parallel-size 1 \ + --expert-model-parallel-size 8 \ + --expert-tensor-parallel-size 1 \ + --ddp-average-in-collective \ + --log-interval 1 \ + --timing-log-option minmax \ + --log-params-norm \ + --log-num-zeros-in-grad \ + --log-throughput \ + --check-weight-hash-across-dp-replicas-interval 20000 \ + --tensorboard-dir /tensorboard \ + --logging-level 10 \ + --eval-iters 14 \ + --eval-interval 2000 \ + --manual-gc \ + --manual-gc-interval 100 \ + --num-workers 1 \ + --rerun-mode validate_results \ + --log-straggler \ + --disable-straggler-on-startup \ + --straggler-minmax-count 16 \ + --local-rank ${SLURM_LOCALID} \ + --context-parallel-size 1 \ + --vocab-size 238600 \ + --megatron-fault-at-iter ${FAULT_AT_ITER} \ + --megatron-fault-rank ${FAULT_RANK} \ + --megatron-fault-type ${FAULT_TYPE} \ + --distributed-timeout-seconds-after-init ${DIST_TIMEOUT_AFTER_INIT} \ + --flight-recorder-dump-path ${CKPT_DIR} \ + " + $LAUNCHER_CMD $LAUNCHER_ARGS $WORKLOAD_CMD $WORKLOAD_ARGS $CKPT_SAVE_ARGS + ' +log_msg "END main_workload" + +log_msg "END SBATCH" + +set +x diff --git 
a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/n3_super_gb200.sh b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/n3_super_gb200.sh new file mode 100644 index 00000000..1147dda6 --- /dev/null +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/n3_super_gb200.sh @@ -0,0 +1,166 @@ +ENV_VARS: + NVTE_FWD_LAYERNORM_SM_MARGIN: 16 + NVTE_BWD_LAYERNORM_SM_MARGIN: 16 + TORCHINDUCTOR_WORKER_START: fork + QUANTIZATION_TYPE_DEBUG: 1 + PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True + NUM_OF_HYBRID_EP_RANKS_PER_NVLINK_DOMAIN: 64 + USE_MNNVL: 1 +TEST_TYPE: "release" +MODEL_ARGS: + # Distributed args + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 1 + --expert-model-parallel-size: 64 + --expert-tensor-parallel-size: 1 + --use-distributed-optimizer: true + --overlap-grad-reduce: true + --overlap-param-gather: true + --sequence-parallel: true + --ddp-num-buckets: 10 + --ddp-pad-buckets-for-high-nccl-busbw: true + --high-priority-stream-groups: ep + --distributed-timeout-minutes: 10 + --disable-gloo-process-groups: true + + # Training args + --micro-batch-size: 1 + --global-batch-size: 3072 + --train-samples: 12207031 + --cross-entropy-loss-fusion: true + --cross-entropy-fusion-impl: native + --attention-backend: flash + --enable-cuda-graph: true + --cuda-graph-scope: mamba attn moe_router + --te-rng-tracker: true + --manual-gc: true + --manual-gc-interval: 10 + --no-create-attention-mask-in-dataloader: true + --num-workers: 1 + --exit-interval: 51000 + --override-opt_param-scheduler: true + + # Network size args + --use-mcore-models: true + --spec: megatron.core.models.mamba.mamba_layer_specs mamba_stack_spec + --is-hybrid-model: true + --mamba-num-heads: 128 + --num-layers: 88 + --hidden-size: 4096 + --ffn-hidden-size: 2688 + --num-attention-heads: 32 + --group-query-attention: true + --num-query-groups: 2 + --kv-channels: 128 + --hybrid-override-pattern: MEMEMEM*EMEMEMEM*EMEMEMEM*EMEMEMEMEM*EMEMEMEMEM*EMEMEMEMEM*EMEMEMEMEM*EMEMEMEM*EMEMEMEME + 
--position-embedding-type: none + --normalization: RMSNorm + --untie-embeddings-and-output-weights: true + --init-method-std: 0.014 + --disable-bias-linear: true + --squared-relu: true + --use-fused-weighted-squared-relu: true + + # Data args + --seq-length: 8192 + --max-position-embeddings: 8192 + --data-path: ${DATA_BLEND} + --data-cache-path: ${DATA_CACHE_PATH} + --tiktoken-pattern: v2 + --tokenizer-type: ${TOKENIZER_TYPE} + --tokenizer-model: ${TOKENIZER_MODEL_PATH} + --no-mmap-bin-files: true + + # MoE args + --num-experts: 512 + --moe-router-topk: 22 + --moe-router-topk-scaling-factor: 5.0 + --moe-router-score-function: sigmoid + --moe-router-enable-expert-bias: true + --moe-router-dtype: fp32 + --moe-router-load-balancing-type: seq_aux_loss + --moe-aux-loss-coeff: 1e-4 + --moe-token-dispatcher-type: flex + --moe-flex-dispatcher-backend: hybridep + --moe-hybridep-num-sms: 32 + --moe-grouped-gemm: true + --moe-permute-fusion: true + --moe-latent-size: 1024 + --moe-shared-expert-intermediate-size: 5376 + --moe-shared-expert-compute-before-router: true + + # MTP args + --mtp-spec: megatron.core.models.mamba.mamba_layer_specs mamba_stack_spec + --mtp-num-layers: 2 + --mtp-hybrid-override-pattern: \"*E\" + --calculate-per-token-loss: true + --mtp-loss-scaling-factor: 0.3 + + # Mixed precision / quantization args + --bf16: true + --keep-mtp-spec-in-bf16: true + --keep-mamba-stack-attention-linear-in-bf16: true + --keep-mamba-out-proj-in-mxfp8: true + --keep-moe-latent-projections-in-bf16: true + --first-last-layers-bf16: true + --num-layers-at-start-in-bf16: 0 + --num-layers-at-end-in-bf16: 14 + --fp4-format: e2m1 + --fp4-recipe: nvfp4 + + # Regularization args + --attention-dropout: 0.0 + --hidden-dropout: 0.0 + --clip-grad: 1.0 + --weight-decay: 0.1 + + # Learning rate args + --lr: 4.5e-4 + --min-lr: 4.5e-6 + --lr-decay-style: WSD + --lr-warmup-samples: 24414063 + --lr-decay-samples: 3048706055 + --lr-wsd-decay-style: minus_sqrt + --lr-wsd-decay-samples: 
610351563 + --adam-beta1: 0.9 + --adam-beta2: 0.95 + + # Checkpointing args + --save: ${CHECKPOINT_SAVE_PATH} + --load: ${CHECKPOINT_LOAD_PATH} + --ckpt-format: torch_dist + --ckpt-fully-parallel-save: true + --ckpt-fully-parallel-load: true + --ckpt-assume-constant-structure: true + --async-save: true + --use-persistent-ckpt-worker: true + --save-interval: 1000 + --save-retain-interval: 5000 + + # Validation args + --eval-interval: 1000 + --eval-iters: 14 + + # Logging args + --log-interval: 100 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-timers-to-tensorboard: true + --log-memory-to-tensorboard: true + --log-throughput: true + --log-progress: true + --log-energy: true + --log-memory-interval: 500 + --logging-level: 20 + --timing-log-option: minmax + --check-weight-hash-across-dp-replicas-interval: 20000 + --tensorboard-dir: ${TENSORBOARD_PATH} + --wandb-project: megatron-core-release-runs + --wandb-entity: adlr + --wandb-exp-name: ${WANDB_EXPERIMENT} + --wandb-save-dir: ${WANDB_SAVE_PATH} +METRICS: + - "iteration-time" + - "lm loss" + - "mem-allocated-bytes" + - "mem-max-allocated-bytes" diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/n3_super_gb200_shm_test.sh b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/n3_super_gb200_shm_test.sh new file mode 100755 index 00000000..673965f2 --- /dev/null +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/n3_super_gb200_shm_test.sh @@ -0,0 +1,369 @@ +#!/bin/bash +# n3_super_gb200_shm_test.sh — one-time validation: Nemotron Super 8N with async cpu-shm ckpt. +# Model/infra config mirrors n3_super_gb200_fi.sh. No fault injection. +# Checkpoints to node-local /tmp (discardable — not cross-node accessible). 
+ +#SBATCH --account=root +#SBATCH --partition=gb-nvl-134-135 +#SBATCH --time=00:45:00 + +#SBATCH --job-name=n3-super-shm-test +#SBATCH --output=/tmp/slurm-%j.launch.out +#SBATCH --error=/tmp/slurm-%j.launch.err + +#SBATCH --nodes=8 +#SBATCH --ntasks-per-node=4 +#SBATCH --gpus-per-node=4 +#SBATCH --exclusive +#SBATCH --mem=0 + +log_msg() { + local msg="$1" + UNIX_DATETIME=$(date +%s) + HUMAN_DATETIME=$(date -d "@$UNIX_DATETIME" '+%Y-%m-%d %H:%M:%S.%3N') + echo ">>> ${msg} ${UNIX_DATETIME} (${HUMAN_DATETIME})" +} + +log_msg "START SBATCH" +echo "Running on nodes: ${SLURM_NODELIST}" + +# ── Platform / NCCL / RITS ──────────────────────────────────────────────────── +export RITS_PLATFORM_TYPE=gb200 +export RITS_GPUS_PER_NODE=4 +export RITS_NVL_DOMAIN_SIZE=72 +export NCCL_IB_DISABLE=0 +export NCCL_NET_GDR_LEVEL=3 +export RITS_CLUSTER_NAME=nvl72 +export PYXIS_LOG_LEVEL=debug +export NCCL_IB_SL=1 +export NCCL_IB_TIMEOUT=19 +export UB_TIMEOUT=720 +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export NCCL_P2P_NET_CHUNKSIZE=2097152 +export NCCL_DEBUG=WARN + +# ── PyTorch / TE / inductor (from n3_super_gb200.sh ENV_VARS) ───────────────── +export NVTE_FWD_LAYERNORM_SM_MARGIN=16 +export NVTE_BWD_LAYERNORM_SM_MARGIN=16 +export TORCHINDUCTOR_WORKER_START=fork +export QUANTIZATION_TYPE_DEBUG=1 +export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True +export USE_MNNVL=1 + +# ── DeepEP (hybridep MoE routing) — set USE_DEEPEP=0 to use alltoall instead ── +USE_DEEPEP="${USE_DEEPEP:-1}" +if [[ "${USE_DEEPEP}" == "1" ]]; then + export NUM_OF_HYBRID_EP_RANKS_PER_NVLINK_DOMAIN=32 +fi + +# ── Logging / debugging ─────────────────────────────────────────────────────── +export PYTHONUNBUFFERED=1 +export ONE_LOGGER_JOB_CATEGORY=test +export LOGLEVEL=DEBUG +export TORCH_CPP_LOG_LEVEL=INFO +export TORCH_NCCL_TRACE_BUFFER_SIZE=2000 +export TORCH_NCCL_RETHROW_CUDA_ERRORS=0 +export TORCH_NCCL_ENABLE_MONITORING=1 +export TORCH_NCCL_DUMP_ON_TIMEOUT=1 +export TORCH_NCCL_LOG_CPP_STACK_ON_UNCLEAN_SHUTDOWN=0 
+export TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC=30 +export TORCH_DIST_INIT_BARRIER=0 +export TORCH_INCLUDE_STACK_TRACE=0 +export TORCH_INCLUDE_ONLY_ACTIVE=1 +export TORCH_NCCL_EXTRA_DUMP_ON_EXEC=1 + +# ── CUDA graph ──────────────────────────────────────────────────────────────── +export ENABLE_CUDA_GRAPH="${ENABLE_CUDA_GRAPH:-1}" + +# ── Quantization mode: set USE_FP8=1 to use FP8, USE_FP4=1 for FP4 (default) ─ +# Only one may be active at a time. +export USE_FP4="${USE_FP4:-0}" +export USE_FP8="${USE_FP8:-1}" + +# ── Async checkpoint shm mode (default on) ──────────────────────────────────── +export USE_CPU_SHM="${USE_CPU_SHM:-1}" + +# ── Overlap comm (default off) ──────────────────────────────────────────────── +export USE_OVERLAP_COMM="${USE_OVERLAP_COMM:-0}" + +# ── Node / task geometry ───────────────────────────────────────────────────── +export GPUS_PER_NODE=4 +TOTAL_TASKS=$((SLURM_NNODES * GPUS_PER_NODE)) + +# ── Per-experiment output directory ─────────────────────────────────────────── +export BASE_EXPERIMENTS_DIR="${BASE_EXPERIMENTS_DIR:-/home/sbak/experiments/n3-super-gb200}" +export EXPERIMENT_DIR="${EXPERIMENT_DIR:-${BASE_EXPERIMENTS_DIR}/shm_test}" + +mkdir -p ${BASE_EXPERIMENTS_DIR}/datacache +mkdir -p ${EXPERIMENT_DIR}/tensorboard + +: "${SLURM_RESTART_COUNT:=0}" + +LOG_DIR=${EXPERIMENT_DIR}/logs +mkdir -p ${LOG_DIR} +echo "Writing logs to ${LOG_DIR}" +LOG_FILE_BASE="${LOG_DIR}/slurm/${SLURM_JOB_ID}.${SLURM_RESTART_COUNT}" + +# ── Container mounts ────────────────────────────────────────────────────────── +LUSTRE=/home:/home +SHARED_TMP=/home/sbak/tmp/${SLURM_JOB_ID}:/shared_tmp +LOGS=${EXPERIMENT_DIR}/logs:/logs +MEGATRON_REPO=/home/sbak/megatron-lm-main:/megatron-lm_repo +DATACACHE=${BASE_EXPERIMENTS_DIR}/datacache:/datacache +TENSORBOARD=${EXPERIMENT_DIR}/tensorboard:/tensorboard +WORKSPACE=/home/sbak/tmp:/workspace +# No /checkpoints mount — saves go to node-local /tmp inside the container. 
+CONTAINER_MOUNTS=$LUSTRE,$SHARED_TMP,$LOGS,$MEGATRON_REPO,$DATACACHE,$TENSORBOARD,$WORKSPACE +mkdir -p /home/sbak/tmp/${SLURM_JOB_ID} + +# ── Disk cleanup: remove stale enroot containers from prior jobs ────────────── +log_msg "START disk_cleanup" +srun \ + --label \ + --ntasks-per-node=1 \ + --ntasks=${SLURM_NNODES} \ + --kill-on-bad-exit=0 \ + --mpi=none \ + bash -c ' + ENROOT_DIR="/var/lib/enroot/data/$(id -u)" + rm -rf "${ENROOT_DIR:?}"/* 2>/dev/null || true + echo "$(hostname): / $(df -h / | tail -1 | awk "{print \$3\" used, \"\$4\" avail\"}")" + ' +log_msg "END disk_cleanup" + +# ── All-node setup: clone Megatron into a per-node tmpdir ───────────────────── +log_msg "START all_node_setup" +srun \ + --label \ + --container-mounts ${CONTAINER_MOUNTS} \ + --container-image /home/sbak/mcore_ci_040825.sqsh \ + --container-name ${SLURM_JOB_ID} \ + --container-workdir / \ + --error=${LOG_FILE_BASE}.0.all_node_setup.log \ + --output=${LOG_FILE_BASE}.0.all_node_setup.log \ + --ntasks-per-node=1 \ + --ntasks=${SLURM_NNODES} \ + --kill-on-bad-exit=0 \ + --mpi=none \ + bash -c ' + MEGATRON_PATH=/shared_tmp/megatron_${SLURM_NODEID} + rm -rf "${MEGATRON_PATH}" + mkdir -p "${MEGATRON_PATH}" + pushd $MEGATRON_PATH + CURRENT_BRANCH=$(git -C /megatron-lm_repo branch --show-current) + echo "Cloning Megatron branch $CURRENT_BRANCH into $MEGATRON_PATH" + git clone --single-branch --branch $CURRENT_BRANCH /megatron-lm_repo . + popd + + # Install local nvidia-resiliency-ext so container picks up src changes. 
+ uv pip install -e /home/sbak/nvidia-resiliency-ext + ' +log_msg "END all_node_setup" + +# ── Main workload ───────────────────────────────────────────────────────────── +log_msg "START main_workload" +srun \ + --label \ + --container-mounts ${CONTAINER_MOUNTS} \ + --container-image /home/sbak/mcore_ci_040825.sqsh \ + --container-name ${SLURM_JOB_ID} \ + --container-workdir / \ + --error=${LOG_FILE_BASE}.1.main_workload.log \ + --output=${LOG_FILE_BASE}.1.main_workload.log \ + --ntasks-per-node=${GPUS_PER_NODE} \ + --ntasks=${TOTAL_TASKS} \ + --kill-on-bad-exit=0 \ + --mpi=none \ + bash -c ' + MEGATRON_PATH=/shared_tmp/megatron_${SLURM_NODEID} + + NFS_TRITON_CACHE=/home/sbak/experiments/n3-super-gb200/triton_cache + NFS_INDUCTOR_CACHE=/home/sbak/experiments/n3-super-gb200/inductor_cache + TRITON_READY=/tmp/.triton_ready_${SLURM_JOB_ID} + + export TRITON_CACHE_DIR=/tmp/triton_${SLURM_LOCALID} + export TORCHINDUCTOR_CACHE_DIR=/tmp/inductor_${SLURM_LOCALID} + + if [[ "${SLURM_LOCALID}" == "0" ]]; then + if [[ -d "${NFS_TRITON_CACHE}" ]] && [[ -n "$(ls -A ${NFS_TRITON_CACHE} 2>/dev/null)" ]]; then + TRITON_CACHE_WAS_WARM=1 + else + TRITON_CACHE_WAS_WARM=0 + fi + for r in $(seq 0 $((GPUS_PER_NODE - 1))); do + mkdir -p /tmp/triton_${r} /tmp/inductor_${r} + [[ -d "${NFS_TRITON_CACHE}" ]] && rsync -a --ignore-existing "${NFS_TRITON_CACHE}/" "/tmp/triton_${r}/" 2>/dev/null || true + [[ -d "${NFS_INDUCTOR_CACHE}" ]] && rsync -a --ignore-existing "${NFS_INDUCTOR_CACHE}/" "/tmp/inductor_${r}/" 2>/dev/null || true + done + touch "${TRITON_READY}" + echo "Pre-staged triton/inductor cache for all local ranks (was_warm=${TRITON_CACHE_WAS_WARM})." + else + until [[ -f "${TRITON_READY}" ]]; do sleep 1; done + fi + + mkdir -p ${TRITON_CACHE_DIR} ${TORCHINDUCTOR_CACHE_DIR} + + _stage_back() { + if [[ "${SLURM_LOCALID}" == "0" && "${SLURM_NODEID}" == "0" && "${TRITON_CACHE_WAS_WARM}" == "0" ]]; then + echo "Staging triton cache back to NFS (cold start)..." 
+ mkdir -p "${NFS_TRITON_CACHE}" "${NFS_INDUCTOR_CACHE}" + rsync -a --ignore-existing "${TRITON_CACHE_DIR}/" "${NFS_TRITON_CACHE}/" 2>/dev/null || true + rsync -a --ignore-existing "${TORCHINDUCTOR_CACHE_DIR}/" "${NFS_INDUCTOR_CACHE}/" 2>/dev/null || true + echo "Cache staged back." + fi + } + trap _stage_back EXIT + + if [[ "${ENABLE_CUDA_GRAPH}" == "1" ]]; then + CUDA_GRAPH_ARGS="--enable-cuda-graph --cuda-graph-scope mamba attn" + else + CUDA_GRAPH_ARGS="" + fi + + if [[ "${USE_DEEPEP:-1}" == "1" ]]; then + MOE_DISPATCHER_ARGS="--moe-token-dispatcher-type flex --moe-flex-dispatcher-backend hybridep --moe-hybridep-num-sms 32" + else + MOE_DISPATCHER_ARGS="--moe-token-dispatcher-type alltoall" + fi + + if [[ "${USE_FP8:-0}" == "1" ]]; then + QUANT_ARGS="--fp8-param-gather \ + --reuse-grad-buf-for-mxfp8-param-ag \ + --fp8-recipe mxfp8 \ + --fp8-format hybrid \ + --fp8-amax-history-len 1024 \ + --fp8-amax-compute-algo max" + elif [[ "${USE_FP4:-1}" == "1" ]]; then + QUANT_ARGS="--first-last-layers-bf16 \ + --num-layers-at-start-in-bf16 0 \ + --num-layers-at-end-in-bf16 14 \ + --fp4-format e2m1 \ + --fp4-recipe nvfp4" + else + QUANT_ARGS="" + fi + + # Checkpoint directory — node-local /tmp inside the container. + # Shards are not cross-node accessible; intentional for one-time shm validation. 
+ CKPT_DIR=/tmp/ckpt_${SLURM_JOB_ID} + mkdir -p ${CKPT_DIR} + + pushd $MEGATRON_PATH + LAUNCHER_CMD="python3" + WORKLOAD_CMD=${MEGATRON_PATH}/pretrain_mamba.py + WORKLOAD_ARGS=" \ + --exit-duration-in-mins 40 \ + --exit-interval 100 \ + --distributed-timeout-minutes 30 \ + --distributed-timeout-seconds-after-init 1800 \ + --disable-gloo-process-groups \ + --mock-data \ + --data-cache-path /datacache \ + --no-create-attention-mask-in-dataloader \ + --no-mmap-bin-files \ + --tokenizer-type NullTokenizer \ + --tiktoken-pattern v2 \ + --vocab-size 128000 \ + --micro-batch-size 1 \ + --global-batch-size 32 \ + --train-samples 12207031 \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --lr 4.5e-4 \ + --min-lr 4.5e-6 \ + --lr-decay-style WSD \ + --lr-warmup-samples 24414063 \ + --lr-decay-samples 3048706055 \ + --lr-wsd-decay-style minus_sqrt \ + --lr-wsd-decay-samples 610351563 \ + --weight-decay 0.1 \ + --clip-grad 1.0 \ + --override-opt_param-scheduler \ + --use-mcore-models \ + --spec megatron.core.models.mamba.mamba_layer_specs mamba_stack_spec \ + --is-hybrid-model \ + --mamba-num-heads 128 \ + --num-layers 88 \ + --hidden-size 4096 \ + --ffn-hidden-size 2688 \ + --num-attention-heads 32 \ + --group-query-attention \ + --num-query-groups 2 \ + --kv-channels 128 \ + --hybrid-override-pattern MEMEMEM*EMEMEMEM*EMEMEMEM*EMEMEMEMEM*EMEMEMEMEM*EMEMEMEMEM*EMEMEMEMEM*EMEMEMEM*EMEMEMEME \ + --position-embedding-type none \ + --normalization RMSNorm \ + --untie-embeddings-and-output-weights \ + --init-method-std 0.014 \ + --disable-bias-linear \ + --squared-relu \ + --use-fused-weighted-squared-relu \ + --seq-length 8192 \ + --max-position-embeddings 8192 \ + --num-experts 512 \ + --moe-router-topk 22 \ + --moe-router-topk-scaling-factor 5.0 \ + --moe-router-score-function sigmoid \ + --moe-router-enable-expert-bias \ + --moe-router-dtype fp32 \ + --moe-router-load-balancing-type seq_aux_loss \ + --moe-aux-loss-coeff 1e-4 \ + ${MOE_DISPATCHER_ARGS} \ + --moe-grouped-gemm \ + 
--moe-permute-fusion \ + --moe-latent-size 1024 \ + --moe-shared-expert-intermediate-size 5376 \ + --calculate-per-token-loss \ + --bf16 \ + ${QUANT_ARGS} \ + --attention-dropout 0.0 \ + --hidden-dropout 0.0 \ + --sequence-parallel \ + --use-distributed-optimizer \ + $([[ "${USE_OVERLAP_COMM}" == "1" ]] && echo "--overlap-grad-reduce --overlap-param-gather") \ + --ddp-num-buckets 10 \ + --ddp-pad-buckets-for-high-nccl-busbw \ + --high-priority-stream-groups ep \ + --tensor-model-parallel-size 4 \ + --pipeline-model-parallel-size 1 \ + --expert-model-parallel-size 32 \ + --expert-tensor-parallel-size 1 \ + --cross-entropy-loss-fusion \ + --cross-entropy-fusion-impl native \ + --attention-backend flash \ + ${CUDA_GRAPH_ARGS} \ + --te-rng-tracker \ + --manual-gc \ + --manual-gc-interval 10 \ + --num-workers 1 \ + --eval-interval 1000 \ + --eval-iters 14 \ + --log-interval 1 \ + --log-params-norm \ + --log-num-zeros-in-grad \ + --log-timers-to-tensorboard \ + --log-memory-to-tensorboard \ + --log-throughput \ + --log-energy \ + --log-memory-interval 500 \ + --logging-level 10 \ + --timing-log-option minmax \ + --check-weight-hash-across-dp-replicas-interval 20000 \ + --tensorboard-dir /tensorboard \ + --local-rank ${SLURM_LOCALID} \ + --save ${CKPT_DIR} \ + --save-interval 10 \ + --ckpt-format torch_dist \ + --ckpt-fully-parallel-save \ + --ckpt-assume-constant-structure \ + --async-save \ + --use-persistent-ckpt-worker \ + $([[ "${USE_CPU_SHM}" == "1" ]] && echo "--async-ckpt-use-cpu-shm") \ + " + $LAUNCHER_CMD $WORKLOAD_CMD $WORKLOAD_ARGS + ' +log_msg "END main_workload" + +log_msg "END SBATCH" + +set +x diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/pools/n3_super_8n_16n.pool b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/pools/n3_super_8n_16n.pool new file mode 100644 index 00000000..1d700863 --- /dev/null +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/pools/n3_super_8n_16n.pool @@ -0,0 +1,40 @@ +# n3_super_8n_16n.pool — 
fault-injection pool for Nemotron-3 Super (TP=4, EP=32) +# Minimum scale: 8 nodes (32 ranks, EP=32 requires exactly 32 ranks) +# Maximum scale: 16 nodes (64 ranks) +# +# Rank coverage per node count (4 GPUs/node): +# 8 nodes → 32 ranks: rank-0=0, rank-1=1, mid=16, last=31 +# 16 nodes → 64 ranks: rank-0=0, rank-1=1, mid=32, last=63 +# +# NOTE: 16-node jobs require ~20 min for NCCL init + CUDA graph capture before iter 1. +# With 5-min watchdog timeout after fault + FR dumps, total is ~30+ min. +# Use TIME=00:45:00 (set in workloads.conf) to avoid SLURM wall-time kills. +# +# Format: FAULT_TYPE:RANK:ITER:NODES (one per line, # comments ignored) +# GPU faults — highest priority; rank sweep across both node counts +GPU_SLEEP:1:5:8 +GPU_SLEEP:0:5:8 +GPU_SLEEP:16:5:8 +GPU_SLEEP:31:5:8 +GPU_SLEEP:1:5:16 +GPU_SLEEP:32:5:16 +GPU_ERROR:1:5:8 +GPU_ERROR:0:5:8 +GPU_ERROR:16:5:8 +GPU_ERROR:1:5:16 +# Crash faults +SIGKILL:1:5:8 +SIGKILL:0:5:8 +SIGKILL:1:5:16 +SEGFAULT:1:5:8 +OS_ABORT:1:5:8 +# Python-level hangs +LOCK_GIL:1:5:8 +LOCK_GIL:0:5:8 +# Application exceptions +WORKLOAD_EXC:1:5:8 +ASYNC_EXC:1:5:8 +# Signal-based +SIGTERM:1:5:8 +SIGINT:1:5:8 +SIGNAL_EXC:1:5:8 diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/prepare_node_alloc.sh b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/prepare_node_alloc.sh new file mode 100755 index 00000000..f2c90a64 --- /dev/null +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/prepare_node_alloc.sh @@ -0,0 +1,209 @@ +#!/bin/bash +# prepare_node_alloc.sh +# Submit fault-injection experiments from a prioritized pool, 2 jobs at a time, +# waiting for each pair to complete before submitting the next pair. +# This limits peak filesystem stress to 2 concurrent jobs while still covering +# the full experiment matrix end-to-end in one unattended run. +# +# Pool ordering: GPU-related faults first (higher attribution coverage priority), +# then crash faults, Python-level hangs, and signal-based faults. 
+# Each tier covers node counts 2→4→8 and sweeps rank-0, rank-1, mid, and last. +# +# Usage: +# bash scripts/prepare_node_alloc.sh +# WORKLOAD=llama4_scout TIME=00:45:00 bash scripts/prepare_node_alloc.sh +# +# WORKLOAD selects the job script and base experiments dir from scripts/workloads.conf. +# Override POOL (space-separated FAULT_TYPE:RANK:ITER:NODES) to run a custom set. +# Override SBATCH_SCRIPT or BASE_EXPERIMENTS_DIR directly to bypass workloads.conf. +# +# Validated only with Megatron-LM workloads that emit [MEGATRON_FAULT] markers +# and write logs / FR dumps using the directory layout expected by +# watch_and_analyze.sh. + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +WORKLOADS_CONF="${SCRIPT_DIR}/workloads.conf" + +# ── Workload resolution from workloads.conf ──────────────────────────────────── +# If WORKLOAD is set, look it up in workloads.conf and derive SBATCH_SCRIPT and +# BASE_EXPERIMENTS_DIR from it (unless those are already set explicitly). +if [[ -n "${WORKLOAD:-}" ]]; then + if [[ ! 
-f "${WORKLOADS_CONF}" ]]; then + echo "ERROR: workloads.conf not found at ${WORKLOADS_CONF}" >&2 + exit 1 + fi + _CONF_LINE=$(grep -E "^${WORKLOAD}\s" "${WORKLOADS_CONF}" | grep -v "^#" | head -1 || true) + if [[ -z "${_CONF_LINE}" ]]; then + echo "ERROR: workload '${WORKLOAD}' not found in ${WORKLOADS_CONF}" >&2 + echo "Available workloads:" >&2 + grep -v "^#\|^$" "${WORKLOADS_CONF}" | awk '{print " " $1 " — " $4}' >&2 + exit 1 + fi + _CONF_SCRIPT=$(echo "${_CONF_LINE}" | awk '{print $2}') + _CONF_BASE=$(echo "${_CONF_LINE}" | awk '{print $3}') + _CONF_DESC=$(echo "${_CONF_LINE}" | awk '{print $4}') + _CONF_POOL=$(echo "${_CONF_LINE}" | awk '{print $5}') + _CONF_TIME=$(echo "${_CONF_LINE}" | awk '{print $6}') + # Only set if not already overridden in the environment + SBATCH_SCRIPT="${SBATCH_SCRIPT:-${SCRIPT_DIR}/${_CONF_SCRIPT}}" + BASE_EXPERIMENTS_DIR="${BASE_EXPERIMENTS_DIR:-${_CONF_BASE}}" + if [[ -n "${_CONF_TIME}" && "${_CONF_TIME}" != "-" ]]; then + TIME="${TIME:-${_CONF_TIME}}" + fi + # Load workload-specific pool file if POOL not already set and pool file is specified + if [[ -z "${POOL:-}" && -n "${_CONF_POOL}" && "${_CONF_POOL}" != "-" ]]; then + _POOL_FILE="${SCRIPT_DIR}/pools/${_CONF_POOL}" + if [[ -f "${_POOL_FILE}" ]]; then + POOL=$(grep -v "^#\|^$" "${_POOL_FILE}" | tr '\n' ' ') + echo ">>> Pool: ${_POOL_FILE}" + else + echo "WARN: pool file ${_POOL_FILE} not found, using built-in default pool" >&2 + fi + fi + echo ">>> Workload: ${WORKLOAD} (${_CONF_DESC//_/ })" +fi + +ACCOUNT="${ACCOUNT:-root}" +PARTITION="${PARTITION:-gb-nvl-134-135}" +GPUS_PER_NODE="${GPUS_PER_NODE:-4}" +TIME="${TIME:-00:30:00}" +BATCH_SIZE="${BATCH_SIZE:-2}" +POLL_INTERVAL="${POLL_INTERVAL:-30}" +BASE_EXPERIMENTS_DIR="${BASE_EXPERIMENTS_DIR:-/home/sbak/experiments/llama4-scout-gb200}" + +# --------------------------------------------------------------------------- +# Fault pool — ordered by priority (GPU-related first, then crash, then other) +# Format: 
FAULT_TYPE:RANK:ITER:NODES +# +# Rank coverage per node count (4 GPUs/node): +# 2 nodes → 8 ranks: rank-0, rank-1, mid=4, last=7 +# 4 nodes → 16 ranks: rank-0, rank-1, mid=8, last=15 +# 8 nodes → 32 ranks: rank-0, rank-1, mid=16, last=31 +# --------------------------------------------------------------------------- +DEFAULT_POOL=" +GPU_SLEEP:1:5:2 GPU_SLEEP:0:5:2 +GPU_SLEEP:4:5:2 GPU_SLEEP:7:5:2 +GPU_SLEEP:1:5:4 GPU_SLEEP:0:5:4 +GPU_SLEEP:8:5:4 GPU_SLEEP:15:5:4 +GPU_SLEEP:1:5:8 GPU_SLEEP:0:5:8 +GPU_SLEEP:16:5:8 GPU_SLEEP:31:5:8 +GPU_ERROR:1:5:2 GPU_ERROR:0:5:2 +GPU_ERROR:1:5:4 GPU_ERROR:0:5:4 +GPU_ERROR:1:5:8 GPU_ERROR:0:5:8 +SIGKILL:1:5:2 SIGKILL:0:5:2 +SIGKILL:1:5:4 SIGKILL:1:5:8 +SEGFAULT:1:5:2 SEGFAULT:0:5:2 +SEGFAULT:1:5:4 OS_ABORT:1:5:2 +LOCK_GIL:1:5:2 LOCK_GIL:0:5:2 +WORKLOAD_EXC:1:5:2 ASYNC_EXC:1:5:2 +SIGTERM:1:5:2 SIGINT:1:5:2 +SIGSTOP:1:5:2 SIGNAL_EXC:1:5:2 +" + +# Flatten pool into an array (strips comments and blank lines) +POOL=(${POOL:-$DEFAULT_POOL}) + +SBATCH_SCRIPT="${SBATCH_SCRIPT:-${SCRIPT_DIR}/l4_gb200_reduced.sh}" +SESSION_TAG="$(date +%Y%m%d_%H%M%S)" +SESSION_DIR="${BASE_EXPERIMENTS_DIR}/fault_injection/${SESSION_TAG}" +TRACKING_FILE="${SESSION_DIR}/experiments.tsv" + +mkdir -p "${SESSION_DIR}" +printf "JOB_ID\tFAULT_TYPE\tRANK\tITER\tNODES\tEXPERIMENT_DIR\n" > "${TRACKING_FILE}" + +TOTAL=${#POOL[@]} +echo ">>> Fault-injection pool: ${TOTAL} experiments, ${BATCH_SIZE} at a time" +echo ">>> Script: ${SBATCH_SCRIPT}" +echo ">>> Partition: ${PARTITION} GPUs/node: ${GPUS_PER_NODE} Time: ${TIME}" +echo ">>> Session: ${SESSION_DIR}" +echo ">>> Tracking: ${TRACKING_FILE}" +echo "" + +submit_one() { + local EXPERIMENT="$1" + IFS=':' read -r FAULT_TYPE RANK ITER NODES <<< "${EXPERIMENT}" + + local EXPERIMENT_DIR="${SESSION_DIR}/n${NODES}_${FAULT_TYPE}_r${RANK}_i${ITER}" + mkdir -p "${EXPERIMENT_DIR}/logs/slurm" + mkdir -p "${EXPERIMENT_DIR}/checkpoints" + mkdir -p "${EXPERIMENT_DIR}/tensorboard" + + local JOB_ID + JOB_ID=$(sbatch \ + 
--account="${ACCOUNT}" \ + --partition="${PARTITION}" \ + --nodes="${NODES}" \ + --ntasks-per-node="${GPUS_PER_NODE}" \ + --gpus-per-node="${GPUS_PER_NODE}" \ + --time="${TIME}" \ + --exclusive \ + --mem=0 \ + --output="${EXPERIMENT_DIR}/logs/slurm/%j.launch.out" \ + --error="${EXPERIMENT_DIR}/logs/slurm/%j.launch.err" \ + --export=ALL,FAULT_TYPE="${FAULT_TYPE}",FAULT_RANK="${RANK}",FAULT_AT_ITER="${ITER}",GPUS_PER_NODE="${GPUS_PER_NODE}",EXPERIMENT_DIR="${EXPERIMENT_DIR}",BASE_EXPERIMENTS_DIR="${BASE_EXPERIMENTS_DIR}" \ + --parsable \ + "${SBATCH_SCRIPT}") + + # Print to stderr so callers using $(...) capture only the job ID on stdout + printf " submitted: %s rank=%-2s iter=%s nodes=%s -> job=%s\n" \ + "${FAULT_TYPE}" "${RANK}" "${ITER}" "${NODES}" "${JOB_ID}" >&2 + printf "%s\t%s\t%s\t%s\t%s\t%s\n" \ + "${JOB_ID}" "${FAULT_TYPE}" "${RANK}" "${ITER}" "${NODES}" "${EXPERIMENT_DIR}" \ + >> "${TRACKING_FILE}" + echo "${JOB_ID}" # only the bare job ID goes to stdout +} + +wait_for_jobs() { + local JOB_LIST="$1" + local LABEL="$2" + printf " waiting for %s (%s) ..." "${LABEL}" "${JOB_LIST}" + while true; do + local REMAINING + # squeue returns non-zero for unknown job IDs on some SLURM versions; + # || true keeps set -e / pipefail from aborting the script once jobs leave the queue (wc -l still prints 0). + REMAINING=$(squeue -j "${JOB_LIST}" --noheader 2>/dev/null | wc -l || true) + if [[ "${REMAINING}" -eq 0 ]]; then + echo " done." + break + fi + printf " %ds" "${POLL_INTERVAL}" + sleep "${POLL_INTERVAL}" + done +} + +ALL_SUBMITTED_JOBS=() +BATCH_NUM=0 +i=0 + +while [[ $i -lt ${TOTAL} ]]; do + BATCH_NUM=$((BATCH_NUM + 1)) + BATCH_END=$((i + BATCH_SIZE)) + [[ ${BATCH_END} -gt ${TOTAL} ]] && BATCH_END=${TOTAL} + BATCH_COUNT=$((BATCH_END - i)) + + echo ">>> Batch ${BATCH_NUM}: experiments $((i+1))–${BATCH_END} of ${TOTAL}" + + BATCH_JOB_IDS=() + for ((b=i; b>> All ${TOTAL} experiments complete." 
+echo ">>> Session: ${SESSION_DIR}" +echo ">>> Tracking: ${TRACKING_FILE}" +echo "" +echo ">>> Run analysis on all results:" +echo " bash scripts/watch_and_analyze.sh '${TRACKING_FILE}'" diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/run_session.sh b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/run_session.sh new file mode 100755 index 00000000..ca5251bc --- /dev/null +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/run_session.sh @@ -0,0 +1,39 @@ +#!/bin/bash +# run_session.sh +# End-to-end fault-injection session: submit all experiments from the pool +# (2 at a time, waiting for each pair), then analyze every completed job and +# produce a scored report. Designed to be run unattended via nohup. +# +# Usage: +# nohup bash scripts/run_session.sh > /path/to/session.log 2>&1 & +# POOL="GPU_SLEEP:1:5:2 SIGKILL:1:5:4" nohup bash scripts/run_session.sh ... + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +WORKLOAD="${WORKLOAD:-llama4_scout}" + +# ---- Phase 1: submit and wait for all experiments ---- +echo "========================================" +echo "PHASE 1: Fault injection" +echo "========================================" +WORKLOAD="${WORKLOAD}" bash "${SCRIPT_DIR}/prepare_node_alloc.sh" + +# prepare_node_alloc.sh prints the tracking file path; re-derive it the same way +# (SESSION_TAG is the timestamp when prepare_node_alloc ran, which is a few seconds +# before this line — find the newest session dir instead of recomputing the tag) +BASE_EXPERIMENTS_DIR="${BASE_EXPERIMENTS_DIR:-/home/sbak/experiments/llama4-scout-gb200}" +TRACKING_FILE=$(ls -td "${BASE_EXPERIMENTS_DIR}/fault_injection"/[0-9]* 2>/dev/null \ + | head -1)/experiments.tsv + +if [[ ! 
-f "${TRACKING_FILE}" ]]; then + echo "ERROR: could not locate experiments.tsv in latest session dir" >&2 + exit 1 +fi + +echo "" +echo "========================================" +echo "PHASE 2: Analysis" +echo "Tracking: ${TRACKING_FILE}" +echo "========================================" +bash "${SCRIPT_DIR}/watch_and_analyze.sh" "${TRACKING_FILE}" diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/score_attribution.py b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/score_attribution.py new file mode 100644 index 00000000..15417a6c --- /dev/null +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/score_attribution.py @@ -0,0 +1,237 @@ +#!/usr/bin/env python3 +"""LLM-judge scorer for fault-injection attribution experiments. + +Uses the same ChatOpenAI / NVIDIA-inference-API setup as nvrx_logsage.py. +Reads ground-truth fault parameters and the raw text outputs of nvrx_logsage +and CollectiveAnalyzer, then asks a Sonnet/Opus judge to score each attribution +dimension and return structured JSON. 
+ +Usage (called by watch_and_analyze.sh): + python3 score_attribution.py \ + --fault-type GPU_SLEEP --rank 0 --iter 5 --nodes 2 \ + --log-output "$LOG_OUT" \ + --fr-output "$FR_OUT" \ + [--model claude-sonnet-4-6] \ + [--base-url https://inference-api.nvidia.com/v1] + +Stdout: one line of JSON with keys: + restart_correct, rank_primary, rank_any, fault_described, fr_rank_correct, notes +""" + +import argparse +import json +import logging +import sys +from typing import Union + +from langchain_openai import ChatOpenAI + +sys.path.insert(0, str(__import__("pathlib").Path(__file__).resolve().parents[4])) +from nvidia_resiliency_ext.attribution.api_keys import load_nvidia_api_key +from nvidia_resiliency_ext.attribution.svc.config import DEFAULT_LLM_BASE_URL + +logger = logging.getLogger(__name__) + +# Default judge model — override with --model +DEFAULT_JUDGE_MODEL = "azure/anthropic/claude-sonnet-4-6" + +# Expected restart decision and rationale per fault type +_RESTART_TABLE = { + "GPU_SLEEP": ("RESTART IMMEDIATE", "transient GPU hang, recoverable"), + "LOCK_GIL": ("RESTART IMMEDIATE", "transient Python GIL hang, recoverable"), + "SIGTERM": ("RESTART IMMEDIATE", "external termination signal, recoverable"), + "SIGINT": ("RESTART IMMEDIATE", "external interrupt signal, recoverable"), + "SIGSTOP": ("RESTART IMMEDIATE", "external stop signal, recoverable"), + "SIGNAL_EXC": ("RESTART IMMEDIATE", "signal-based exception, typically recoverable"), + "GPU_ERROR": ("STOP - DONT RESTART IMMEDIATE", "hardware GPU error, may be persistent"), + "SIGKILL": ("STOP - DONT RESTART IMMEDIATE", "hard kill, possible external pressure or OOM"), + "SEGFAULT": ("STOP - DONT RESTART IMMEDIATE", "segmentation fault, likely code or memory corruption"), + "OS_ABORT": ("STOP - DONT RESTART IMMEDIATE", "OS abort, likely severe system or hardware fault"), + "WORKLOAD_EXC": ("STOP - DONT RESTART IMMEDIATE", "application exception, likely a code bug"), + "ASYNC_EXC": ("STOP - DONT RESTART 
IMMEDIATE", "async exception in workload, likely a code bug"), +} + + +def load_log_excerpt(log_path, max_lines=400): + """Return up to max_lines from the log, keeping the tail (where errors appear). + + Applies the same exclude_nvrx_logs filtering as nvrx_logsage.py:analyze_logs(). + """ + if not log_path: + return "(log file not provided)" + try: + try: + with open(log_path, "r", encoding="utf-8") as f: + lines = f.readlines() + except UnicodeDecodeError: + with open(log_path, "r", encoding="latin-1") as f: + lines = f.readlines() + # Mirrors nvrx_logsage.py exclude_nvrx_logs logic exactly + lines = [line for line in lines if "nvidia_resiliency_ext" not in line] + lines = [line for line in lines if "[workload:" not in line or 'Cycle:' in line] + # Strip fault-injection markers — the judge must not see which rank/fault was + # injected in the raw log; it knows the ground truth from the structured args. + lines = [line for line in lines if "[MEGATRON_FAULT]" not in line] + if len(lines) > max_lines: + lines = lines[-max_lines:] + return "".join(lines).strip() + except Exception as exc: + return f"(could not read log file: {exc})" + + +def build_judge_prompt(fault_type, rank, iter_, nodes, run_valid, log_output, fr_output, log_excerpt): + total_ranks = nodes * 4 # GPUS_PER_NODE=4 in the example SBATCH_SCRIPT + expected_restart, restart_rationale = _RESTART_TABLE.get( + fault_type, ("unknown", "unknown fault type") + ) + + if not run_valid: + # Return early dict — caller will skip LLM call + return { + "restart_correct": "N/A", + "rank_primary": "N/A", + "rank_any": "N/A", + "fault_described": "N/A", + "fr_rank_correct": "N/A", + "notes": "run_invalid: training did not reach the fault injection point; scores not meaningful", + } + + fr_section = ( + fr_output + if fr_output and fr_output.strip() not in ("no_dumps", "no results", "run_invalid", "") + else "(no flight-recorder dumps available for this experiment)" + ) + + log_section = log_excerpt.strip() if 
log_excerpt.strip() else "(not provided)" + + return f"""You are evaluating the accuracy of an AI-based fault attribution system for \ +distributed ML training. + +## Ground truth (injected fault) +- Fault type : {fault_type} +- Injected rank : {rank} (global rank index, 0-based; total ranks = {total_ranks}) +- Injected at iteration : {iter_} +- Cluster : {nodes} nodes × 4 GPUs = {total_ranks} total ranks + +## Expected correct behavior +- restart_decision should be : {expected_restart} + Rationale: {restart_rationale} +- Rank {rank} should appear in Primary issues as the root cause + +## Raw job log (filtered, last 400 lines) +{log_section} + +## Log attribution output (from nvrx_logsage) +{log_output if log_output.strip() else "(no log output — analyzer produced no output)"} + +## FR (flight recorder) analysis output (from CollectiveAnalyzer) +{fr_section} + +## Scoring instructions +Score each dimension below. Use only the values listed for each. + +1. **restart_correct** — Is the restart decision in the log output correct for {fault_type}? + Values: "true" | "false" | "N/A" (if log output is empty or unparseable) + +2. **rank_primary** — Is rank {rank} identified as the PRIMARY root cause (in Primary issues)? + Values: "true" | "false" | "partial" (rank mentioned but only as secondary/collateral) + +3. **rank_any** — Is rank {rank} mentioned anywhere in the log attribution output? + Values: "true" | "false" + +4. **fault_described** — Does the log output correctly describe the nature of the fault + (e.g., GPU hang, segfault, signal kill) appropriate for {fault_type}? + Values: "true" | "false" | "partial" (category right but specifics wrong) + +5. **fr_rank_correct** — Does the FR analysis output identify rank {rank} as a suspect? + Values: "true" | "false" | "no_dumps" (no FR dumps available) + +6. **notes** — One concise sentence summarizing the main gap or confirming correctness. 
def score(args):
    """Score one experiment with the LLM judge and return the result dict.

    Args:
        args: Parsed CLI namespace providing ``fault_type``, ``rank``, ``iter``,
            ``nodes``, ``run_valid`` (the string ``"true"``/``"false"`` or a
            bool), ``log_path``, ``log_output``, ``fr_output``, ``model`` and
            ``base_url``. ``args.run_valid`` is normalised to a bool in place.

    Returns:
        The judge's scores as a dict. For an invalid run this is the N/A
        record produced by ``build_judge_prompt``; no API key is required and
        no LLM call is made in that case.

    Raises:
        ValueError: If no NVIDIA API key can be found for a valid run, or the
            judge's reply is not a JSON object (``json.JSONDecodeError`` is a
            ``ValueError`` subclass).
    """
    # Accept both the CLI string and an already-normalised bool so that calling
    # score() twice with the same namespace does not fail on bool.lower().
    run_valid = args.run_valid
    if isinstance(run_valid, str):
        run_valid = run_valid.strip().lower() == "true"
    run_valid = bool(run_valid)
    args.run_valid = run_valid

    # The excerpt only feeds the prompt, so skip reading the log for invalid runs.
    log_excerpt = load_log_excerpt(args.log_path) if (run_valid and args.log_path) else ""

    prompt_or_result = build_judge_prompt(
        fault_type=args.fault_type,
        rank=args.rank,
        iter_=args.iter,
        nodes=args.nodes,
        run_valid=run_valid,
        log_output=args.log_output,
        fr_output=args.fr_output,
        log_excerpt=log_excerpt,
    )

    # build_judge_prompt returns a dict directly for invalid runs (no LLM call
    # needed), so check this before demanding credentials.
    if isinstance(prompt_or_result, dict):
        return prompt_or_result

    api_key = load_nvidia_api_key()
    if not api_key:
        raise ValueError(
            "NVIDIA_API_KEY not found. Set NVIDIA_API_KEY env var, "
            "NVIDIA_API_KEY_FILE, or create ~/.nvidia_api_key"
        )

    llm = ChatOpenAI(
        model=args.model,
        api_key=api_key,
        base_url=args.base_url,
        temperature=0.0,
        max_completion_tokens=512,
    )

    response = llm.invoke(prompt_or_result)
    text = response.content.strip()

    # Strip markdown code fences if present
    if text.startswith("```"):
        text = "\n".join(
            line for line in text.splitlines() if not line.startswith("```")
        ).strip()

    # Tolerate prose around the object: keep the outermost {...} span.
    if not text.startswith("{"):
        start, end = text.find("{"), text.rfind("}")
        if start != -1 and end > start:
            text = text[start:end + 1]

    result = json.loads(text)
    if not isinstance(result, dict):
        raise ValueError(f"judge returned {type(result).__name__}, expected a JSON object")
    return result
def main():
    """Parse the CLI, run the judge, and print exactly one JSON object to stdout.

    A judge failure is reported as ``{"notes": "judge_failed: ..."}`` and the
    process still exits with status 0, because the calling shell script treats
    missing score keys as "N/A".
    """
    cli = argparse.ArgumentParser(description="LLM judge for fault attribution scoring")
    option_specs = (
        ("--fault-type", {"required": True, "help": "Injected fault type"}),
        ("--rank", {"type": int, "required": True, "help": "Injected global rank"}),
        ("--iter", {"type": int, "required": True, "help": "Injected iteration"}),
        ("--nodes", {"type": int, "required": True, "help": "Node count"}),
        (
            "--run-valid",
            {
                "default": "true",
                "help": "'true' if training reached the fault injection point, 'false' otherwise",
            },
        ),
        ("--log-path", {"default": "", "help": "Path to the raw job log file"}),
        ("--log-output", {"default": "", "help": "Raw stdout from nvrx_logsage"}),
        ("--fr-output", {"default": "no_dumps", "help": "Raw text from CollectiveAnalyzer"}),
        ("--model", {"default": DEFAULT_JUDGE_MODEL, "help": "Judge LLM model"}),
        ("--base-url", {"default": DEFAULT_LLM_BASE_URL, "help": "API base URL"}),
    )
    for flag, options in option_specs:
        cli.add_argument(flag, **options)
    args = cli.parse_args()

    try:
        print(json.dumps(score(args)))
    except Exception as exc:
        logger.warning("Judge failed: %s", exc)
        print(json.dumps({"notes": f"judge_failed: {exc}"}))
        sys.exit(0)  # non-fatal — caller handles missing keys gracefully


if __name__ == "__main__":
    # Configure logging only when nothing upstream has installed a handler.
    if not logging.root.handlers:
        logging.basicConfig(level=logging.WARNING)
    main()
#!/bin/bash
# watch_and_analyze.sh
# Poll SLURM for job completions from a fault-injection session tracking file,
# run log-analysis and fr-analysis on each completed job, then call the LLM judge
# (score_attribution.py) to score each attribution dimension.
#
# Usage:
#   bash scripts/watch_and_analyze.sh <tracking_file.tsv>
#
# The tracking file is tab-separated with one header line followed by rows of:
#   JOB_ID  FAULT_TYPE  RANK  ITER  NODES  EXPERIMENT_DIR
#
# Outputs (next to the tracking file):
#   <tracking>_report.md  Markdown table, one row per analyzed job
#   <tracking>_done.txt   job IDs already analyzed; lets an interrupted session
#                         be resumed without re-scoring or losing report rows

set -euo pipefail

TRACKING_FILE="${1:?Usage: $0 <tracking_file.tsv>}"
POLL_INTERVAL="${POLL_INTERVAL:-30}"

if [[ ! -f "${TRACKING_FILE}" ]]; then
    echo "ERROR: tracking file not found: ${TRACKING_FILE}" >&2
    exit 1
fi

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
SKILL_DIR="$(dirname "${SCRIPT_DIR}")"
NVRX_SRC_DIR="$(cd "${SKILL_DIR}/../../.." && pwd)"

LOGSAGE_PY="${SKILL_DIR}/log-analysis/scripts/nvrx_logsage.py"
SCORE_PY="${SCRIPT_DIR}/score_attribution.py"

# Ensure nvidia_resiliency_ext is importable from source tree
export PYTHONPATH="${NVRX_SRC_DIR}${PYTHONPATH:+:$PYTHONPATH}"

REPORT_FILE="${TRACKING_FILE%.tsv}_report.md"
DONE_JOBS_FILE="${TRACKING_FILE%.tsv}_done.txt"

touch "${DONE_JOBS_FILE}"

# Start a fresh report unless we are resuming: when the done-list already has
# entries and a report exists, keep the rows written by the previous invocation.
if [[ ! -s "${DONE_JOBS_FILE}" || ! -s "${REPORT_FILE}" ]]; then
    cat > "${REPORT_FILE}" <<'EOF'
# Fault Injection Experiment Report

| # | FAULT_TYPE | NODES | RANK | ITER | JOB_ID | STATE | run_valid | restart_correct | rank_primary | rank_any | fault_described | fr_rank_correct | judge_notes |
|---|------------|-------|------|------|--------|-------|-----------|-----------------|--------------|----------|-----------------|-----------------|-------------|
EOF
fi

# Temp file holding the marker-stripped log of the job currently being analyzed;
# removed on any exit so an aborted run does not leave it behind.
STRIPPED_LOG=""
_cleanup() {
    if [[ -n "${STRIPPED_LOG}" ]]; then
        rm -f "${STRIPPED_LOG}"
    fi
    return 0
}
trap _cleanup EXIT

# Print field $1 of the judge's JSON (read from SCORE_JSON), or "N/A".
_get() {
    printf '%s\n' "${SCORE_JSON}" | python3 -c \
        'import sys, json; d = json.load(sys.stdin); print(d.get(sys.argv[1], "N/A"))' \
        "$1" 2>/dev/null || echo "N/A"
}

# Make a value safe for a one-line Markdown table cell: newlines become spaces
# and '|' is escaped so it cannot split the row into extra columns.
_md_cell() {
    local nl=$'\n' cr=$'\r'
    local cell=${1//${nl}/ }
    cell=${cell//${cr}/ }
    cell=${cell//|/\\|}
    printf '%s' "${cell}"
}

echo ">>> Watching tracking file: ${TRACKING_FILE}"
echo ">>> Report: ${REPORT_FILE}"
echo ">>> Polling every ${POLL_INTERVAL}s ..."

# Count only non-blank data rows so a trailing empty line cannot keep the loop alive.
TOTAL=$(tail -n +2 "${TRACKING_FILE}" | grep -c '[^[:space:]]' || true)
# Continue numbering after jobs analyzed by a previous invocation.
EXP_NUM=$(( $(wc -l < "${DONE_JOBS_FILE}") ))

while true; do
    PENDING=0

    # The tracking file is read on fd 3 so commands inside the loop cannot
    # consume its rows through their inherited stdin.
    while IFS=$'\t' read -r -u 3 JOB_ID FAULT_TYPE RANK ITER NODES EXPERIMENT_DIR; do
        # Skip blank rows
        [[ -n "${JOB_ID//[[:space:]]/}" ]] || continue

        # Skip already-analyzed jobs (exact, literal, whole-line match)
        if grep -qxF -- "${JOB_ID}" "${DONE_JOBS_FILE}" 2>/dev/null; then
            continue
        fi

        # Check job state
        STATE=$(scontrol show job "${JOB_ID}" 2>/dev/null \
            | grep -oP 'JobState=\K\S+' || echo "UNKNOWN")

        case "${STATE}" in
            RUNNING|PENDING|COMPLETING|CONFIGURING|SUSPENDED|REQUEUED|RESIZING|STAGE_OUT)
                # Still in flight (or will run again) — look at it on the next poll.
                PENDING=$((PENDING + 1))
                continue
                ;;
            COMPLETED|FAILED|TIMEOUT|CANCELLED|NODE_FAIL)
                ;;
            *)
                # Job left the queue — treat as done
                ;;
        esac

        EXP_NUM=$((EXP_NUM + 1))
        echo ""
        echo ">>> [${EXP_NUM}/${TOTAL}] Analyzing: ${FAULT_TYPE} n=${NODES} rank=${RANK} iter=${ITER} job=${JOB_ID} state=${STATE}"

        # ---- Log analysis ----
        LOG_GLOB="${EXPERIMENT_DIR}/logs/slurm/${JOB_ID}.*.1.main_workload.log"
        # compgen -G expands the pattern without word-splitting the directory part.
        LOG_FILE=$(compgen -G "${LOG_GLOB}" | head -1 || true)
        LOG_OUT=""

        # ---- Check run validity: did the fault actually fire? ----
        # The fault injection prints: [MEGATRON_FAULT] global_rank=RANK/...: injecting FAULT_TYPE at iteration ITER
        RUN_VALID="false"
        STRIPPED_LOG=""
        if [[ -n "${LOG_FILE}" && -f "${LOG_FILE}" ]]; then
            echo "    log: ${LOG_FILE}"
            if grep -qF "[MEGATRON_FAULT]" "${LOG_FILE}" 2>/dev/null; then
                RUN_VALID="true"
            fi
            echo "    run_valid: ${RUN_VALID}"

            # Strip fault-injection markers so neither nvrx_logsage nor the judge
            # can see which rank/fault was injected — evaluation must be fair.
            # [MEGATRON_FAULT] lines are printed by Megatron's debug_fault_injection.py
            # and are not covered by --exclude_nvrx_logs.
            STRIPPED_LOG=$(mktemp "${TMPDIR:-/tmp}/fi_log_stripped.XXXXXX")
            grep -vF "[MEGATRON_FAULT]" "${LOG_FILE}" > "${STRIPPED_LOG}" 2>/dev/null || true

            # nvrx_logsage.py prints 5 newline-joined fields to stdout:
            #   line 1: restart_decision
            #   line 2: error_explanation (often empty)
            #   line 3+: attribution_text (multi-line, starts with "Attribution:")
            #   then: additional_detail (often empty)
            #   last line: checkpoint_saved ("True" / "False")
            LOG_OUT=$(python3 "${LOGSAGE_PY}" \
                --log-path "${STRIPPED_LOG}" \
                --exclude_nvrx_logs 2>/dev/null || echo "")
            LOG_RESTART=$(printf '%s\n' "${LOG_OUT}" | head -1)
            echo "    restart_decision: ${LOG_RESTART:-<empty>}"
        else
            echo "    WARN: no log file at ${LOG_GLOB}"
            echo "    run_valid: false (no log)"
        fi

        # ---- FR analysis (only when run is valid) ----
        FR_DIR="${EXPERIMENT_DIR}/checkpoints"
        FR_OUT="no_dumps"

        shopt -s nullglob
        FR_DUMPS=("${FR_DIR}"/_dump_*)
        shopt -u nullglob

        if [[ "${RUN_VALID}" == "true" && ${#FR_DUMPS[@]} -gt 0 ]]; then
            echo "    FR dumps: ${#FR_DUMPS[@]} files"
            # Paths reach Python through the environment, not by being pasted
            # into the source, so quotes or backslashes in them cannot break it.
            FR_OUT=$(NVRX_SRC_DIR="${NVRX_SRC_DIR}" FR_DIR="${FR_DIR}" python3 -c '
import logging
import os
import sys

logging.basicConfig(level=logging.WARNING)
sys.path.insert(0, os.environ["NVRX_SRC_DIR"])
from nvidia_resiliency_ext.attribution.trace_analyzer.fr_attribution import CollectiveAnalyzer

fr_path = os.environ["FR_DIR"]
try:
    ca = CollectiveAnalyzer({"fr_path": fr_path})
    results = ca.run_sync({"fr_path": fr_path})
    if results:
        result_data = results[0]
        if isinstance(result_data, dict):
            text = result_data.get("analysis_text", "")
            ranks = result_data.get("hanging_ranks", "")
            if text:
                print(text)
            if ranks:
                print(ranks)
        else:
            print(str(result_data))
    else:
        print("no results")
except Exception as e:
    print("error: " + str(e), file=sys.stderr)
    print("no_dumps")
' 2>/dev/null || echo "no_dumps")
        elif [[ "${RUN_VALID}" == "false" ]]; then
            FR_OUT="run_invalid"
            echo "    FR analysis skipped (run did not reach fault injection point)"
        fi

        # ---- LLM judge scoring ----
        echo "    scoring with judge..."
        SCORE_JSON=$(python3 "${SCORE_PY}" \
            --fault-type "${FAULT_TYPE}" \
            --rank "${RANK}" \
            --iter "${ITER}" \
            --nodes "${NODES}" \
            --run-valid "${RUN_VALID}" \
            --log-path "${STRIPPED_LOG:-}" \
            --log-output "${LOG_OUT}" \
            --fr-output "${FR_OUT}" 2>/dev/null || echo '{"notes":"judge_failed"}')

        # Clean up temp stripped log
        if [[ -n "${STRIPPED_LOG}" && -f "${STRIPPED_LOG}" ]]; then
            rm -f "${STRIPPED_LOG}"
        fi
        STRIPPED_LOG=""

        RESTART_CORRECT=$(_get restart_correct)
        RANK_PRIMARY=$(_get rank_primary)
        RANK_ANY=$(_get rank_any)
        FAULT_DESC=$(_get fault_described)
        FR_RANK=$(_get fr_rank_correct)
        JUDGE_NOTES=$(_get notes)

        echo "    run_valid=${RUN_VALID} restart_correct=${RESTART_CORRECT} rank_primary=${RANK_PRIMARY} rank_any=${RANK_ANY} fault_described=${FAULT_DESC} fr_rank=${FR_RANK}"
        echo "    judge: ${JUDGE_NOTES}"

        # Append to report (judge-controlled text is sanitized for the table)
        printf "| %d | %s | %s | %s | %s | %s | %s | %s | %s | %s | %s | %s | %s | %s |\n" \
            "${EXP_NUM}" "${FAULT_TYPE}" "${NODES}" "${RANK}" "${ITER}" \
            "${JOB_ID}" "${STATE}" "${RUN_VALID}" \
            "$(_md_cell "${RESTART_CORRECT}")" "$(_md_cell "${RANK_PRIMARY}")" "$(_md_cell "${RANK_ANY}")" \
            "$(_md_cell "${FAULT_DESC}")" "$(_md_cell "${FR_RANK}")" \
            "$(_md_cell "${JUDGE_NOTES}")" >> "${REPORT_FILE}"

        echo "${JOB_ID}" >> "${DONE_JOBS_FILE}"

    done 3< <(tail -n +2 "${TRACKING_FILE}")

    DONE_COUNT=$(( $(wc -l < "${DONE_JOBS_FILE}") ))
    echo "$(date '+%H:%M:%S') >>> ${DONE_COUNT}/${TOTAL} done, ${PENDING} still running"

    if [[ ${DONE_COUNT} -ge ${TOTAL} ]]; then
        break
    fi

    sleep "${POLL_INTERVAL}"
done

echo ""
echo ">>> All ${TOTAL} experiments analyzed."
echo ">>> Report: ${REPORT_FILE}"
echo ""
cat "${REPORT_FILE}"
+ +llama4_scout l4_gb200_reduced.sh /home/sbak/experiments/llama4-scout-gb200 Llama4-Scout_(reduced_layers)_on_GB200 - - From df5f5c740867c5f02745316cfb6c9d98d364ea33 Mon Sep 17 00:00:00 2001 From: Seonmyeong Bak Date: Wed, 22 Apr 2026 11:24:12 -0700 Subject: [PATCH 02/21] chore(skills): remove extraneous nvrx-attr artifacts --- .../nvrx-attr/SESSION_REPORT_20260409_13.md | 137 ------- .../skills/nvrx-attr/l4_gb200_reduced.sh | 363 ----------------- .../nvrx-attr/scripts/n3_super_gb200.sh | 166 -------- .../scripts/n3_super_gb200_shm_test.sh | 369 ------------------ .../scripts/pools/n3_super_8n_16n.pool | 40 -- 5 files changed, 1075 deletions(-) delete mode 100644 src/nvidia_resiliency_ext/skills/nvrx-attr/SESSION_REPORT_20260409_13.md delete mode 100644 src/nvidia_resiliency_ext/skills/nvrx-attr/l4_gb200_reduced.sh delete mode 100644 src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/n3_super_gb200.sh delete mode 100755 src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/n3_super_gb200_shm_test.sh delete mode 100644 src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/pools/n3_super_8n_16n.pool diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/SESSION_REPORT_20260409_13.md b/src/nvidia_resiliency_ext/skills/nvrx-attr/SESSION_REPORT_20260409_13.md deleted file mode 100644 index 657cbbd5..00000000 --- a/src/nvidia_resiliency_ext/skills/nvrx-attr/SESSION_REPORT_20260409_13.md +++ /dev/null @@ -1,137 +0,0 @@ -# Fault Injection Session Report — April 9–13, 2026 - -## Summary - -End-to-end validation of the fault-injection attribution pipeline across 48 experiments. -Identified and fixed three pipeline bugs, confirmed FR analysis is solid, and isolated the -remaining attribution gap to a single issue: **logsage returns RESTART IMMEDIATE for -crash/exception-type faults that should be STOP**. 
- ---- - -## Pipeline Fixes Applied - -| File | Fix | -|---|---| -| `trace_analyzer/capture.py` | `capture_logs()` now saves/restores logger level and lowers it to INFO — previously, root logger at WARNING silently dropped all `logger.info()` calls inside the capture block, producing empty `analysis_text` from `CollectiveAnalyzer` | -| `trace_analyzer/fr_attribution.py` | `main()` now prints `analysis_text` + `hanging_ranks` to stdout (was discarding results) | -| `scripts/watch_and_analyze.sh` | FR inline Python block: import from installed package (not local skill copy), correctly extract `analysis_text`/`hanging_ranks` from returned dict, redirect stderr to `/dev/null` instead of mixing into FR output | -| `scripts/score_attribution.py` | **New file** — LLM judge (Claude Sonnet) that scores 5 attribution dimensions per experiment and returns structured JSON | - ---- - -## Experiment Sessions - -### Session 1 — Mini-batch validation (Apr 9, `20260409_160245`) - -6 experiments: GPU_SLEEP×2, GPU_ERROR×2, SIGKILL×1, SIGTERM×1 — all 2-node. -Purpose: confirm pipeline works end-to-end after fixes. - -| # | FAULT_TYPE | RANK | restart | rank_p | rank_a | fault_d | fr_rank | -|---|---|---|---|---|---|---|---| -| 1 | GPU_SLEEP | 1 | ✅ | ✅ | ✅ | ✅ | ✅ | -| 2 | GPU_SLEEP | 0 | ✅ | ✅ | ✅ | partial | ✅ | -| 3 | GPU_ERROR | 1 | ❌ | ❌ | ❌ | partial | ✅ | -| 4 | GPU_ERROR | 0 | ❌ | ❌ | ❌ | partial | ✅ | -| 5 | SIGKILL | 1 | ❌ | ✅ | ✅ | partial | ✅ | -| 6 | SIGTERM | 1 | ✅ | ❌ | ❌ | partial | ✅ | - -FR analysis: 6/6 correct. Pipeline confirmed working. - ---- - -### Session 2 — Full default pool (Apr 9, `20260409_170603`) - -34 experiments across all fault types and node counts (2/4/8 nodes). 
- -**Infrastructure issue:** 18/34 jobs failed at container startup due to a pyxis/enroot -`nvidia-container-cli ldcache` error on certain compute nodes: - -``` -nvidia-container-cli: ldcache error: process /usr/sbin/ldconfig.real failed with error code: 1 -[ERROR] /etc/enroot/hooks.d/98-nvidia.sh exited with return code 1 -pyxis: couldn't start container -rm: cannot remove '/usr/local/cuda/compat/lib': Read-only file system -``` - -The CUDA compat overlay was not being applied on those nodes — `ldconfig` could not write its -cache inside the read-only squashfs container. These jobs produced no FR dumps and their logs -contained only the container error, which logsage misattributed as a disk/storage fault. -The issue was transient and node-specific; jobs submitted the next day ran cleanly. - -**Clean-run results (16/34):** see full table in -`/home/sbak/experiments/llama4-scout-gb200/fault_injection/20260409_170603/experiments_report.md` - -Aggregate for clean-run jobs: - -| FAULT_TYPE | N (clean) | restart% | rank_primary% | fr_rank% | -|---|---|---|---|---| -| GPU_SLEEP | 5 | 80% | 40% | 60% | -| GPU_ERROR | 4 | 0% | 25% | 75% | -| SIGKILL | 3 | 33% | 33% | 100% | -| OS_ABORT | 1 | 0% | 0% | 100% | - ---- - -### Session 3 — SEGFAULT cluster health check (Apr 10, `20260410_135216`) - -2 experiments: SEGFAULT rank=0 and rank=1, 2-node. Purpose: confirm cluster healthy after -the Apr 9 enroot issue. - -| # | FAULT_TYPE | RANK | restart | rank_p | rank_a | fault_d | fr_rank | -|---|---|---|---|---|---|---|---| -| 1 | SEGFAULT | 1 | ❌ | ✅ | ✅ | ✅ | ✅ | -| 2 | SEGFAULT | 0 | ❌ | ✅ | ✅ | ✅ | ✅ | - -Cluster healthy (both COMPLETED, 7 FR dumps each). Rank and fault description correct; -restart decision wrong (RESTART instead of STOP). - ---- - -### Session 4 — Python fault types (Apr 10, `20260410_143501`) - -4 experiments: LOCK_GIL×2, WORKLOAD_EXC×1, ASYNC_EXC×1 — all 2-node. -These were skipped in the full session due to the enroot issue. 
- -| # | FAULT_TYPE | RANK | restart | rank_p | rank_a | fault_d | fr_rank | -|---|---|---|---|---|---|---|---| -| 1 | LOCK_GIL | 1 | ✅ | ✅ | ✅ | partial | ✅ | -| 2 | LOCK_GIL | 0 | ✅ | ✅ | ✅ | partial | ✅ | -| 3 | WORKLOAD_EXC | 1 | ❌ | ✅ | ✅ | partial | ❌ (rank 7) | -| 4 | ASYNC_EXC | 1 | ❌ | ❌ | ❌ | false | ✅ | - -Note on WORKLOAD_EXC FR result: FR flagged rank 7 instead of rank 1. When a rank throws an -application exception and crashes, the last rank detected as missing by NCCL's collective -timeout isn't necessarily the originating rank — FR is identifying the symptom rank. - ---- - -## Attribution Quality Summary (clean runs only) - -| Dimension | Assessment | -|---|---| -| **FR rank identification** | Solid — correctly identified the hanging rank in all clean-run experiments where NCCL completed enough to produce dumps. The `capture_logs()` fix was the key enabler. | -| **Log rank identification** | Good for hang types (GPU_SLEEP, LOCK_GIL); weaker for crash/signal types where all ranks see a simultaneous NCCL timeout masking the originator. FR compensates for this gap. | -| **Restart decision** | ✅ Correct for hang/recoverable types: GPU_SLEEP, LOCK_GIL, SIGTERM. ❌ Wrong for crash/exception types: GPU_ERROR, SIGKILL, SEGFAULT, WORKLOAD_EXC, ASYNC_EXC — logsage consistently returns RESTART IMMEDIATE when the correct decision is STOP. | -| **Fault description** | Consistently `partial` — logsage describes the observable NCCL collective timeout symptom, not the underlying injected fault (GPU hang, kill signal, exception). This is expected given the log contains only symptoms. | - ---- - -## Open Gap - -**Single actionable fix:** logsage restart decision for crash/exception-type faults. - -Logsage sees the same NCCL collective timeout pattern whether the root cause is a recoverable -GPU hang or a hard crash (SIGKILL, SEGFAULT, CUDA error, application exception). 
It needs -keyword-based fast-path rules to detect crash signals before the LLM runs: - -| Fault type | Expected | Currently returns | -|---|---|---| -| GPU_ERROR | STOP | RESTART IMMEDIATE | -| SIGKILL | STOP | RESTART IMMEDIATE | -| SEGFAULT | STOP | RESTART IMMEDIATE | -| WORKLOAD_EXC | STOP | RESTART IMMEDIATE | -| ASYNC_EXC | STOP | RESTART IMMEDIATE | -| OS_ABORT | STOP | RESTART IMMEDIATE | - -Target file: `attribution/log_analyzer/nvrx_logsage.py` diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/l4_gb200_reduced.sh b/src/nvidia_resiliency_ext/skills/nvrx-attr/l4_gb200_reduced.sh deleted file mode 100644 index 5c903e7a..00000000 --- a/src/nvidia_resiliency_ext/skills/nvrx-attr/l4_gb200_reduced.sh +++ /dev/null @@ -1,363 +0,0 @@ -#!/bin/bash - -#SBATCH --account=root -#SBATCH --partition=gb-nvl-134-135 -#SBATCH --time=00:30:00 - -#SBATCH --job-name=llama4-scout-gb200 -#SBATCH --output=/tmp/slurm-%j.launch.out -#SBATCH --error=/tmp/slurm-%j.launch.err - -#SBATCH --nodes=2 -#SBATCH --ntasks-per-node=4 -#SBATCH --gpus-per-node=4 -#SBATCH --exclusive -#SBATCH --mem=0 - -log_msg() { - local msg="$1" - UNIX_DATETIME=$(date +%s) - HUMAN_DATETIME=$(date -d "@$UNIX_DATETIME" '+%Y-%m-%d %H:%M:%S.%3N') - echo ">>> ${msg} ${UNIX_DATETIME} (${HUMAN_DATETIME})" -} - -log_msg "START SBATCH" -echo "Running on nodes: ${SLURM_NODELIST}" -export RITS_PLATFORM_TYPE=gb200 -export RITS_GPUS_PER_NODE=4 -export RITS_NVL_DOMAIN_SIZE=72 -export NCCL_IB_DISABLE=0 -export NCCL_NET_GDR_LEVEL=3 -export RITS_CLUSTER_NAME=nvl72 -export PYXIS_LOG_LEVEL=debug -export NCCL_IB_SL=1 -export NCCL_IB_TIMEOUT=19 -export UB_TIMEOUT=720 -export CUDA_DEVICE_MAX_CONNECTIONS=1 -export NVTE_FWD_LAYERNORM_SM_MARGIN=16 -export NVTE_BWD_LAYERNORM_SM_MARGIN=16 -export NCCL_P2P_NET_CHUNKSIZE=2097152 -export NCCL_DEBUG=WARN -export PYTHONUNBUFFERED=1 -export ONE_LOGGER_JOB_CATEGORY=test -export LOGLEVEL=DEBUG -export TORCHINDUCTOR_WORKER_START=fork -export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True 
-export TORCH_CPP_LOG_LEVEL=INFO -export TORCH_NCCL_TRACE_BUFFER_SIZE=2000 -export TORCH_NCCL_RETHROW_CUDA_ERRORS=0 -export TORCH_NCCL_ENABLE_MONITORING=1 -export TORCH_NCCL_DUMP_ON_TIMEOUT=1 -export TORCH_NCCL_LOG_CPP_STACK_ON_UNCLEAN_SHUTDOWN=0 -export TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC=20 -export TORCH_DIST_INIT_BARRIER=0 -export TORCH_INCLUDE_STACK_TRACE=0 -export TORCH_INCLUDE_ONLY_ACTIVE=1 -export TORCH_NCCL_EXTRA_DUMP_ON_EXEC=1 - -# Checkpoint settings (overridable via sbatch --export) -export DIST_TIMEOUT_AFTER_INIT="${DIST_TIMEOUT_AFTER_INIT:-1800}" -# USE_ASYNC_CKPT=1: enable async checkpointing every CKPT_SAVE_INTERVAL iters -export USE_ASYNC_CKPT="${USE_ASYNC_CKPT:-0}" -export CKPT_SAVE_INTERVAL="${CKPT_SAVE_INTERVAL:-100}" -export USE_CPU_SHM="${USE_CPU_SHM:-1}" - -# Quantization mode (overridable via sbatch --export) -export USE_FP8="${USE_FP8:-1}" -export USE_FP4="${USE_FP4:-0}" - -# Overlap comm (overridable via sbatch --export) -export USE_OVERLAP_COMM="${USE_OVERLAP_COMM:-0}" - -# Node / task geometry (SLURM_NNODES is set by SLURM from --nodes override) -export GPUS_PER_NODE="${GPUS_PER_NODE:-4}" -TOTAL_TASKS=$((SLURM_NNODES * GPUS_PER_NODE)) - -# Per-experiment output directory (overridable via sbatch --export) -export BASE_EXPERIMENTS_DIR="${BASE_EXPERIMENTS_DIR:-/home/sbak/experiments/llama4-scout-gb200}" -export EXPERIMENT_DIR="${EXPERIMENT_DIR:-${BASE_EXPERIMENTS_DIR}/ckpt_test/n${SLURM_NNODES}}" - -mkdir -p ${BASE_EXPERIMENTS_DIR}/datacache -mkdir -p ${EXPERIMENT_DIR}/tensorboard - -: "${SLURM_RESTART_COUNT:=0}" - -LOG_DIR=${EXPERIMENT_DIR}/logs -mkdir -p ${LOG_DIR} -echo "Writing logs to ${LOG_DIR}" -LOG_FILE_BASE="${LOG_DIR}/slurm/${SLURM_JOB_ID}.${SLURM_RESTART_COUNT}" - -# ── Shared-tmp directory (NFS, for cross-srun-step communication) ───────────── -# Mounted to /shared_tmp (NOT /tmp) so the container keeps its native fast /tmp. 
-SHARED_TMP_HOST=/home/sbak/tmp/${SLURM_JOB_ID} -mkdir -p ${SHARED_TMP_HOST} - -# ── Pre-populate .myenv with all variables that must reach the container ─────── -# Pyxis env forwarding is unreliable for vars set via sbatch --export; writing -# them into .myenv guarantees the inner bash picks them up via `source`. -MYENV_FILE=${SHARED_TMP_HOST}/.myenv_${SLURM_JOB_ID}.sh -cat > ${MYENV_FILE} << MYENVEOF -# Auto-generated by l4_gb200_reduced.sh — do not edit by hand. -export DIST_TIMEOUT_AFTER_INIT=${DIST_TIMEOUT_AFTER_INIT} -export USE_ASYNC_CKPT=${USE_ASYNC_CKPT} -export CKPT_SAVE_INTERVAL=${CKPT_SAVE_INTERVAL} -export USE_CPU_SHM=${USE_CPU_SHM} -export USE_FP8=${USE_FP8} -export USE_FP4=${USE_FP4} -export USE_OVERLAP_COMM=${USE_OVERLAP_COMM} -# Prepend local nvrx src so container picks up our changes without a pip install step. -export PYTHONPATH=/home/sbak/nvidia-resiliency-ext/src:\${PYTHONPATH} -MYENVEOF - -# Mounts -LUSTRE=/home:/home -SHARED_TMP=${SHARED_TMP_HOST}:/shared_tmp -LOGS=${EXPERIMENT_DIR}/logs:/logs -MEGATRON_REPO=/home/sbak/megatron-lm-main:/megatron-lm_repo -DATACACHE=${BASE_EXPERIMENTS_DIR}/datacache:/datacache -TENSORBOARD=${EXPERIMENT_DIR}/tensorboard:/tensorboard -WORKSPACE=/home/sbak/tmp:/workspace -FR_DUMP=${EXPERIMENT_DIR}/flight_recorder:/flight_recorder -mkdir -p ${EXPERIMENT_DIR}/flight_recorder -CONTAINER_MOUNTS=$LUSTRE,$SHARED_TMP,$LOGS,$MEGATRON_REPO,$DATACACHE,$TENSORBOARD,$WORKSPACE,$FR_DUMP - -# ── Disk cleanup: remove stale enroot containers from prior jobs ────────────── -log_msg "START disk_cleanup" -srun \ - --label \ - --ntasks-per-node=1 \ - --ntasks=${SLURM_NNODES} \ - --kill-on-bad-exit=0 \ - --mpi=none \ - bash -c ' - ENROOT_DIR="/var/lib/enroot/data/$(id -u)" - rm -rf "${ENROOT_DIR:?}"/* 2>/dev/null || true - echo "$(hostname): / $(df -h / | tail -1 | awk "{print \$3\" used, \"\$4\" avail\"}")" - ' -log_msg "END disk_cleanup" - -# all node setup -#-------------------------------- -log_msg "START all_node_setup" -srun \ - 
--label \ - --container-mounts ${CONTAINER_MOUNTS} \ - --container-image /home/sbak/mcore_ci_0415.sqsh \ - --container-name ${SLURM_JOB_ID} \ - --container-workdir / \ - --exclusive \ - --error=${LOG_FILE_BASE}.0.all_node_setup.log \ - --output=${LOG_FILE_BASE}.0.all_node_setup.log \ - --ntasks-per-node=1 \ - --ntasks=${SLURM_NNODES} \ - --kill-on-bad-exit=0 \ - --mpi=none \ - bash -c ' - # Use a per-node NFS path so all ranks on each node find the right clone. - MEGATRON_PATH=/shared_tmp/megatron_$(hostname)_${SLURM_JOB_ID} - mkdir -p ${MEGATRON_PATH} - pushd $MEGATRON_PATH - CURRENT_BRANCH=$(git -C /megatron-lm_repo branch --show-current) - echo "Cloning Megatron branch $CURRENT_BRANCH to ${MEGATRON_PATH}" - git clone --single-branch --branch $CURRENT_BRANCH /megatron-lm_repo . - popd - ' -log_msg "END all_node_setup" - -# main workload -#-------------------------------- -log_msg "START main_workload" -srun \ - --label \ - --container-mounts ${CONTAINER_MOUNTS} \ - --container-image /home/sbak/mcore_ci_0415.sqsh \ - --container-name ${SLURM_JOB_ID} \ - --container-workdir / \ - --error=${LOG_FILE_BASE}.1.main_workload.log \ - --output=${LOG_FILE_BASE}.1.main_workload.log \ - --ntasks-per-node=${GPUS_PER_NODE} \ - --ntasks=${TOTAL_TASKS} \ - --kill-on-bad-exit=0 \ - --mpi=none \ - bash -c ' - source /shared_tmp/.myenv_${SLURM_JOB_ID}.sh - - # Match the per-node path used in all_node_setup. - MEGATRON_PATH=/shared_tmp/megatron_$(hostname)_${SLURM_JOB_ID} - - NFS_TRITON_CACHE=/home/sbak/experiments/llama4-scout-gb200/triton_cache - NFS_INDUCTOR_CACHE=/home/sbak/experiments/llama4-scout-gb200/inductor_cache - - # Per-rank Triton/inductor cache on the container native /tmp (local fast storage). 
- export TRITON_CACHE_DIR=/tmp/triton_${SLURM_LOCALID} - export TORCHINDUCTOR_CACHE_DIR=/tmp/inductor_${SLURM_LOCALID} - mkdir -p ${TRITON_CACHE_DIR} ${TORCHINDUCTOR_CACHE_DIR} - - # Pre-stage: warm local cache from NFS (one rank per node) - if [[ "${SLURM_LOCALID}" == "0" ]]; then - if [[ -d "${NFS_TRITON_CACHE}" ]]; then - echo "Pre-staging triton cache from NFS..." - rsync -a --ignore-existing "${NFS_TRITON_CACHE}/" "${TRITON_CACHE_DIR}/" 2>/dev/null || true - fi - if [[ -d "${NFS_INDUCTOR_CACHE}" ]]; then - echo "Pre-staging inductor cache from NFS..." - rsync -a --ignore-existing "${NFS_INDUCTOR_CACHE}/" "${TORCHINDUCTOR_CACHE_DIR}/" 2>/dev/null || true - fi - fi - - # Post-stage: write back to NFS on exit (one rank per node) - _stage_back() { - if [[ "${SLURM_LOCALID}" == "0" ]]; then - echo "Staging triton cache back to NFS..." - mkdir -p "${NFS_TRITON_CACHE}" "${NFS_INDUCTOR_CACHE}" - rsync -a --ignore-existing "${TRITON_CACHE_DIR}/" "${NFS_TRITON_CACHE}/" 2>/dev/null || true - rsync -a --ignore-existing "${TORCHINDUCTOR_CACHE_DIR}/" "${NFS_INDUCTOR_CACHE}/" 2>/dev/null || true - echo "Cache staged back." - fi - } - trap _stage_back EXIT - - # Checkpoint directory — node-local /tmp (cleaned up by the cleanup job). - CKPT_DIR=/tmp/ckpt_${SLURM_JOB_ID} - mkdir -p ${CKPT_DIR} - - if [[ "${USE_FP8:-1}" == "1" ]]; then - QUANT_ARGS="--fp8-format hybrid \ - --fp8-recipe delayed \ - --fp8-param-gather \ - --fp8-amax-history-len 1024 \ - --fp8-amax-compute-algo max \ - --fp8-margin 0" - elif [[ "${USE_FP4:-0}" == "1" ]]; then - QUANT_ARGS="--fp4-format e2m1 \ - --fp4-recipe nvfp4" - else - QUANT_ARGS="" - fi - - if [[ "${USE_OVERLAP_COMM:-0}" == "1" ]]; then - OVERLAP_ARGS="--overlap-grad-reduce --overlap-param-gather" - else - OVERLAP_ARGS="" - fi - - # Build checkpoint args (controlled by USE_ASYNC_CKPT from .myenv). - # No --load: we only want to test save here. 
- CKPT_SAVE_ARGS="" - if [[ "${USE_ASYNC_CKPT}" == "1" ]]; then - CKPT_SAVE_ARGS="--save ${CKPT_DIR} --save-interval ${CKPT_SAVE_INTERVAL} --async-save --use-persistent-ckpt-worker --use-dist-ckpt --ckpt-fully-parallel-save --ckpt-assume-constant-structure $([[ "${USE_CPU_SHM}" == "1" ]] && echo "--async-ckpt-use-cpu-shm")" - fi - - pushd $MEGATRON_PATH - LAUNCHER_CMD="python3" - LAUNCHER_ARGS=" \ - " - WORKLOAD_CMD=${MEGATRON_PATH}/pretrain_gpt.py - WORKLOAD_ARGS=" \ - --exit-duration-in-mins 5750 \ - --distributed-timeout-minutes 10 \ - --disable-gloo-process-groups \ - --mock-data \ - --data-cache-path /datacache \ - --no-create-attention-mask-in-dataloader \ - --no-mmap-bin-files \ - --tokenizer-type NullTokenizer \ - --tiktoken-pattern v2 \ - --tokenizer-model /lustre/fsw/portfolios/llmservice/projects/llmservice_nlp_fm/nemotron6/tokenizers/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json \ - --micro-batch-size 1 \ - --global-batch-size 64 \ - --train-samples 10240000 \ - --adam-beta1 0.9 \ - --adam-beta2 0.95 \ - --adam-eps 1e-05 \ - --lr-decay-style cosine \ - --lr-warmup-samples 1024000 \ - --lr-decay-samples 20480000 \ - --lr 0.0003 \ - --min-lr 2.9999999999999997e-05 \ - --weight-decay 0.1 \ - --clip-grad 1.0 \ - --loss-scale 1.0 \ - --use-mcore-models \ - --untie-embeddings-and-output-weights \ - --disable-bias-linear \ - --attention-backend flash \ - --transformer-impl transformer_engine \ - --position-embedding-type rope \ - --rotary-base 500000 \ - --rotary-interleaved \ - --use-rope-scaling \ - --rope-scaling-factor 8.0 \ - --no-rope-fusion \ - --no-rope-freq 4 \ - --use-flash-attn \ - --cross-entropy-fusion-impl te \ - --cross-entropy-loss-fusion \ - --seq-length 8192 \ - --max-position-embeddings 8192 \ - --num-layers 12 \ - --swiglu \ - --hidden-size 5120 \ - --num-attention-heads 40 \ - --group-query-attention \ - --num-query-groups 8 \ - --ffn-hidden-size 16384 \ - --kv-channels 128 \ - --normalization RMSNorm \ - --attention-dropout 0.0 \ - 
--hidden-dropout 0.0 \ - --grad-reduce-in-bf16 \ - --qk-l2-norm \ - --num-experts 16 \ - --moe-layer-freq 1 \ - --moe-ffn-hidden-size 8192 \ - --moe-shared-expert-intermediate-size 8192 \ - --moe-router-topk 1 \ - --moe-router-score-function sigmoid \ - --moe-token-dispatcher-type alltoall \ - --moe-grouped-gemm \ - --moe-shared-expert-overlap \ - --moe-router-bias-update-rate 0.001 \ - --moe-router-load-balancing-type aux_loss \ - --moe-aux-loss-coeff 0.01 \ - --moe-router-enable-expert-bias \ - --moe-apply-probs-on-input \ - --moe-router-force-load-balancing \ - --bf16 \ - ${QUANT_ARGS} \ - --te-rng-tracker \ - --sequence-parallel \ - --use-distributed-optimizer \ - ${OVERLAP_ARGS} \ - --ddp-num-buckets 5 \ - --tensor-model-parallel-size 1 \ - --pipeline-model-parallel-size 1 \ - --expert-model-parallel-size 8 \ - --expert-tensor-parallel-size 1 \ - --ddp-average-in-collective \ - --log-interval 1 \ - --timing-log-option minmax \ - --log-params-norm \ - --log-num-zeros-in-grad \ - --log-throughput \ - --check-weight-hash-across-dp-replicas-interval 20000 \ - --tensorboard-dir /tensorboard \ - --logging-level 10 \ - --eval-iters 14 \ - --eval-interval 2000 \ - --manual-gc \ - --manual-gc-interval 100 \ - --num-workers 1 \ - --local-rank ${SLURM_LOCALID} \ - --context-parallel-size 1 \ - --vocab-size 238600 \ - --distributed-timeout-seconds-after-init ${DIST_TIMEOUT_AFTER_INIT} \ - --flight-recorder-dump-path /flight_recorder \ - " - $LAUNCHER_CMD $LAUNCHER_ARGS $WORKLOAD_CMD $WORKLOAD_ARGS $CKPT_SAVE_ARGS - ' -log_msg "END main_workload" - -log_msg "END SBATCH" - -set +x diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/n3_super_gb200.sh b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/n3_super_gb200.sh deleted file mode 100644 index 1147dda6..00000000 --- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/n3_super_gb200.sh +++ /dev/null @@ -1,166 +0,0 @@ -ENV_VARS: - NVTE_FWD_LAYERNORM_SM_MARGIN: 16 - NVTE_BWD_LAYERNORM_SM_MARGIN: 16 - 
TORCHINDUCTOR_WORKER_START: fork - QUANTIZATION_TYPE_DEBUG: 1 - PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True - NUM_OF_HYBRID_EP_RANKS_PER_NVLINK_DOMAIN: 64 - USE_MNNVL: 1 -TEST_TYPE: "release" -MODEL_ARGS: - # Distributed args - --tensor-model-parallel-size: 2 - --pipeline-model-parallel-size: 1 - --expert-model-parallel-size: 64 - --expert-tensor-parallel-size: 1 - --use-distributed-optimizer: true - --overlap-grad-reduce: true - --overlap-param-gather: true - --sequence-parallel: true - --ddp-num-buckets: 10 - --ddp-pad-buckets-for-high-nccl-busbw: true - --high-priority-stream-groups: ep - --distributed-timeout-minutes: 10 - --disable-gloo-process-groups: true - - # Training args - --micro-batch-size: 1 - --global-batch-size: 3072 - --train-samples: 12207031 - --cross-entropy-loss-fusion: true - --cross-entropy-fusion-impl: native - --attention-backend: flash - --enable-cuda-graph: true - --cuda-graph-scope: mamba attn moe_router - --te-rng-tracker: true - --manual-gc: true - --manual-gc-interval: 10 - --no-create-attention-mask-in-dataloader: true - --num-workers: 1 - --exit-interval: 51000 - --override-opt_param-scheduler: true - - # Network size args - --use-mcore-models: true - --spec: megatron.core.models.mamba.mamba_layer_specs mamba_stack_spec - --is-hybrid-model: true - --mamba-num-heads: 128 - --num-layers: 88 - --hidden-size: 4096 - --ffn-hidden-size: 2688 - --num-attention-heads: 32 - --group-query-attention: true - --num-query-groups: 2 - --kv-channels: 128 - --hybrid-override-pattern: MEMEMEM*EMEMEMEM*EMEMEMEM*EMEMEMEMEM*EMEMEMEMEM*EMEMEMEMEM*EMEMEMEMEM*EMEMEMEM*EMEMEMEME - --position-embedding-type: none - --normalization: RMSNorm - --untie-embeddings-and-output-weights: true - --init-method-std: 0.014 - --disable-bias-linear: true - --squared-relu: true - --use-fused-weighted-squared-relu: true - - # Data args - --seq-length: 8192 - --max-position-embeddings: 8192 - --data-path: ${DATA_BLEND} - --data-cache-path: ${DATA_CACHE_PATH} - 
--tiktoken-pattern: v2 - --tokenizer-type: ${TOKENIZER_TYPE} - --tokenizer-model: ${TOKENIZER_MODEL_PATH} - --no-mmap-bin-files: true - - # MoE args - --num-experts: 512 - --moe-router-topk: 22 - --moe-router-topk-scaling-factor: 5.0 - --moe-router-score-function: sigmoid - --moe-router-enable-expert-bias: true - --moe-router-dtype: fp32 - --moe-router-load-balancing-type: seq_aux_loss - --moe-aux-loss-coeff: 1e-4 - --moe-token-dispatcher-type: flex - --moe-flex-dispatcher-backend: hybridep - --moe-hybridep-num-sms: 32 - --moe-grouped-gemm: true - --moe-permute-fusion: true - --moe-latent-size: 1024 - --moe-shared-expert-intermediate-size: 5376 - --moe-shared-expert-compute-before-router: true - - # MTP args - --mtp-spec: megatron.core.models.mamba.mamba_layer_specs mamba_stack_spec - --mtp-num-layers: 2 - --mtp-hybrid-override-pattern: \"*E\" - --calculate-per-token-loss: true - --mtp-loss-scaling-factor: 0.3 - - # Mixed precision / quantization args - --bf16: true - --keep-mtp-spec-in-bf16: true - --keep-mamba-stack-attention-linear-in-bf16: true - --keep-mamba-out-proj-in-mxfp8: true - --keep-moe-latent-projections-in-bf16: true - --first-last-layers-bf16: true - --num-layers-at-start-in-bf16: 0 - --num-layers-at-end-in-bf16: 14 - --fp4-format: e2m1 - --fp4-recipe: nvfp4 - - # Regularization args - --attention-dropout: 0.0 - --hidden-dropout: 0.0 - --clip-grad: 1.0 - --weight-decay: 0.1 - - # Learning rate args - --lr: 4.5e-4 - --min-lr: 4.5e-6 - --lr-decay-style: WSD - --lr-warmup-samples: 24414063 - --lr-decay-samples: 3048706055 - --lr-wsd-decay-style: minus_sqrt - --lr-wsd-decay-samples: 610351563 - --adam-beta1: 0.9 - --adam-beta2: 0.95 - - # Checkpointing args - --save: ${CHECKPOINT_SAVE_PATH} - --load: ${CHECKPOINT_LOAD_PATH} - --ckpt-format: torch_dist - --ckpt-fully-parallel-save: true - --ckpt-fully-parallel-load: true - --ckpt-assume-constant-structure: true - --async-save: true - --use-persistent-ckpt-worker: true - --save-interval: 1000 - 
--save-retain-interval: 5000 - - # Validation args - --eval-interval: 1000 - --eval-iters: 14 - - # Logging args - --log-interval: 100 - --log-params-norm: true - --log-num-zeros-in-grad: true - --log-timers-to-tensorboard: true - --log-memory-to-tensorboard: true - --log-throughput: true - --log-progress: true - --log-energy: true - --log-memory-interval: 500 - --logging-level: 20 - --timing-log-option: minmax - --check-weight-hash-across-dp-replicas-interval: 20000 - --tensorboard-dir: ${TENSORBOARD_PATH} - --wandb-project: megatron-core-release-runs - --wandb-entity: adlr - --wandb-exp-name: ${WANDB_EXPERIMENT} - --wandb-save-dir: ${WANDB_SAVE_PATH} -METRICS: - - "iteration-time" - - "lm loss" - - "mem-allocated-bytes" - - "mem-max-allocated-bytes" diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/n3_super_gb200_shm_test.sh b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/n3_super_gb200_shm_test.sh deleted file mode 100755 index 673965f2..00000000 --- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/n3_super_gb200_shm_test.sh +++ /dev/null @@ -1,369 +0,0 @@ -#!/bin/bash -# n3_super_gb200_shm_test.sh — one-time validation: Nemotron Super 8N with async cpu-shm ckpt. -# Model/infra config mirrors n3_super_gb200_fi.sh. No fault injection. -# Checkpoints to node-local /tmp (discardable — not cross-node accessible). 
- -#SBATCH --account=root -#SBATCH --partition=gb-nvl-134-135 -#SBATCH --time=00:45:00 - -#SBATCH --job-name=n3-super-shm-test -#SBATCH --output=/tmp/slurm-%j.launch.out -#SBATCH --error=/tmp/slurm-%j.launch.err - -#SBATCH --nodes=8 -#SBATCH --ntasks-per-node=4 -#SBATCH --gpus-per-node=4 -#SBATCH --exclusive -#SBATCH --mem=0 - -log_msg() { - local msg="$1" - UNIX_DATETIME=$(date +%s) - HUMAN_DATETIME=$(date -d "@$UNIX_DATETIME" '+%Y-%m-%d %H:%M:%S.%3N') - echo ">>> ${msg} ${UNIX_DATETIME} (${HUMAN_DATETIME})" -} - -log_msg "START SBATCH" -echo "Running on nodes: ${SLURM_NODELIST}" - -# ── Platform / NCCL / RITS ──────────────────────────────────────────────────── -export RITS_PLATFORM_TYPE=gb200 -export RITS_GPUS_PER_NODE=4 -export RITS_NVL_DOMAIN_SIZE=72 -export NCCL_IB_DISABLE=0 -export NCCL_NET_GDR_LEVEL=3 -export RITS_CLUSTER_NAME=nvl72 -export PYXIS_LOG_LEVEL=debug -export NCCL_IB_SL=1 -export NCCL_IB_TIMEOUT=19 -export UB_TIMEOUT=720 -export CUDA_DEVICE_MAX_CONNECTIONS=1 -export NCCL_P2P_NET_CHUNKSIZE=2097152 -export NCCL_DEBUG=WARN - -# ── PyTorch / TE / inductor (from n3_super_gb200.sh ENV_VARS) ───────────────── -export NVTE_FWD_LAYERNORM_SM_MARGIN=16 -export NVTE_BWD_LAYERNORM_SM_MARGIN=16 -export TORCHINDUCTOR_WORKER_START=fork -export QUANTIZATION_TYPE_DEBUG=1 -export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True -export USE_MNNVL=1 - -# ── DeepEP (hybridep MoE routing) — set USE_DEEPEP=0 to use alltoall instead ── -USE_DEEPEP="${USE_DEEPEP:-1}" -if [[ "${USE_DEEPEP}" == "1" ]]; then - export NUM_OF_HYBRID_EP_RANKS_PER_NVLINK_DOMAIN=32 -fi - -# ── Logging / debugging ─────────────────────────────────────────────────────── -export PYTHONUNBUFFERED=1 -export ONE_LOGGER_JOB_CATEGORY=test -export LOGLEVEL=DEBUG -export TORCH_CPP_LOG_LEVEL=INFO -export TORCH_NCCL_TRACE_BUFFER_SIZE=2000 -export TORCH_NCCL_RETHROW_CUDA_ERRORS=0 -export TORCH_NCCL_ENABLE_MONITORING=1 -export TORCH_NCCL_DUMP_ON_TIMEOUT=1 -export TORCH_NCCL_LOG_CPP_STACK_ON_UNCLEAN_SHUTDOWN=0 
-export TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC=30 -export TORCH_DIST_INIT_BARRIER=0 -export TORCH_INCLUDE_STACK_TRACE=0 -export TORCH_INCLUDE_ONLY_ACTIVE=1 -export TORCH_NCCL_EXTRA_DUMP_ON_EXEC=1 - -# ── CUDA graph ──────────────────────────────────────────────────────────────── -export ENABLE_CUDA_GRAPH="${ENABLE_CUDA_GRAPH:-1}" - -# ── Quantization mode: set USE_FP8=1 to use FP8, USE_FP4=1 for FP4 (default) ─ -# Only one may be active at a time. -export USE_FP4="${USE_FP4:-0}" -export USE_FP8="${USE_FP8:-1}" - -# ── Async checkpoint shm mode (default on) ──────────────────────────────────── -export USE_CPU_SHM="${USE_CPU_SHM:-1}" - -# ── Overlap comm (default off) ──────────────────────────────────────────────── -export USE_OVERLAP_COMM="${USE_OVERLAP_COMM:-0}" - -# ── Node / task geometry ───────────────────────────────────────────────────── -export GPUS_PER_NODE=4 -TOTAL_TASKS=$((SLURM_NNODES * GPUS_PER_NODE)) - -# ── Per-experiment output directory ─────────────────────────────────────────── -export BASE_EXPERIMENTS_DIR="${BASE_EXPERIMENTS_DIR:-/home/sbak/experiments/n3-super-gb200}" -export EXPERIMENT_DIR="${EXPERIMENT_DIR:-${BASE_EXPERIMENTS_DIR}/shm_test}" - -mkdir -p ${BASE_EXPERIMENTS_DIR}/datacache -mkdir -p ${EXPERIMENT_DIR}/tensorboard - -: "${SLURM_RESTART_COUNT:=0}" - -LOG_DIR=${EXPERIMENT_DIR}/logs -mkdir -p ${LOG_DIR} -echo "Writing logs to ${LOG_DIR}" -LOG_FILE_BASE="${LOG_DIR}/slurm/${SLURM_JOB_ID}.${SLURM_RESTART_COUNT}" - -# ── Container mounts ────────────────────────────────────────────────────────── -LUSTRE=/home:/home -SHARED_TMP=/home/sbak/tmp/${SLURM_JOB_ID}:/shared_tmp -LOGS=${EXPERIMENT_DIR}/logs:/logs -MEGATRON_REPO=/home/sbak/megatron-lm-main:/megatron-lm_repo -DATACACHE=${BASE_EXPERIMENTS_DIR}/datacache:/datacache -TENSORBOARD=${EXPERIMENT_DIR}/tensorboard:/tensorboard -WORKSPACE=/home/sbak/tmp:/workspace -# No /checkpoints mount — saves go to node-local /tmp inside the container. 
-CONTAINER_MOUNTS=$LUSTRE,$SHARED_TMP,$LOGS,$MEGATRON_REPO,$DATACACHE,$TENSORBOARD,$WORKSPACE -mkdir -p /home/sbak/tmp/${SLURM_JOB_ID} - -# ── Disk cleanup: remove stale enroot containers from prior jobs ────────────── -log_msg "START disk_cleanup" -srun \ - --label \ - --ntasks-per-node=1 \ - --ntasks=${SLURM_NNODES} \ - --kill-on-bad-exit=0 \ - --mpi=none \ - bash -c ' - ENROOT_DIR="/var/lib/enroot/data/$(id -u)" - rm -rf "${ENROOT_DIR:?}"/* 2>/dev/null || true - echo "$(hostname): / $(df -h / | tail -1 | awk "{print \$3\" used, \"\$4\" avail\"}")" - ' -log_msg "END disk_cleanup" - -# ── All-node setup: clone Megatron into a per-node tmpdir ───────────────────── -log_msg "START all_node_setup" -srun \ - --label \ - --container-mounts ${CONTAINER_MOUNTS} \ - --container-image /home/sbak/mcore_ci_040825.sqsh \ - --container-name ${SLURM_JOB_ID} \ - --container-workdir / \ - --error=${LOG_FILE_BASE}.0.all_node_setup.log \ - --output=${LOG_FILE_BASE}.0.all_node_setup.log \ - --ntasks-per-node=1 \ - --ntasks=${SLURM_NNODES} \ - --kill-on-bad-exit=0 \ - --mpi=none \ - bash -c ' - MEGATRON_PATH=/shared_tmp/megatron_${SLURM_NODEID} - rm -rf "${MEGATRON_PATH}" - mkdir -p "${MEGATRON_PATH}" - pushd $MEGATRON_PATH - CURRENT_BRANCH=$(git -C /megatron-lm_repo branch --show-current) - echo "Cloning Megatron branch $CURRENT_BRANCH into $MEGATRON_PATH" - git clone --single-branch --branch $CURRENT_BRANCH /megatron-lm_repo . - popd - - # Install local nvidia-resiliency-ext so container picks up src changes. 
- uv pip install -e /home/sbak/nvidia-resiliency-ext - ' -log_msg "END all_node_setup" - -# ── Main workload ───────────────────────────────────────────────────────────── -log_msg "START main_workload" -srun \ - --label \ - --container-mounts ${CONTAINER_MOUNTS} \ - --container-image /home/sbak/mcore_ci_040825.sqsh \ - --container-name ${SLURM_JOB_ID} \ - --container-workdir / \ - --error=${LOG_FILE_BASE}.1.main_workload.log \ - --output=${LOG_FILE_BASE}.1.main_workload.log \ - --ntasks-per-node=${GPUS_PER_NODE} \ - --ntasks=${TOTAL_TASKS} \ - --kill-on-bad-exit=0 \ - --mpi=none \ - bash -c ' - MEGATRON_PATH=/shared_tmp/megatron_${SLURM_NODEID} - - NFS_TRITON_CACHE=/home/sbak/experiments/n3-super-gb200/triton_cache - NFS_INDUCTOR_CACHE=/home/sbak/experiments/n3-super-gb200/inductor_cache - TRITON_READY=/tmp/.triton_ready_${SLURM_JOB_ID} - - export TRITON_CACHE_DIR=/tmp/triton_${SLURM_LOCALID} - export TORCHINDUCTOR_CACHE_DIR=/tmp/inductor_${SLURM_LOCALID} - - if [[ "${SLURM_LOCALID}" == "0" ]]; then - if [[ -d "${NFS_TRITON_CACHE}" ]] && [[ -n "$(ls -A ${NFS_TRITON_CACHE} 2>/dev/null)" ]]; then - TRITON_CACHE_WAS_WARM=1 - else - TRITON_CACHE_WAS_WARM=0 - fi - for r in $(seq 0 $((GPUS_PER_NODE - 1))); do - mkdir -p /tmp/triton_${r} /tmp/inductor_${r} - [[ -d "${NFS_TRITON_CACHE}" ]] && rsync -a --ignore-existing "${NFS_TRITON_CACHE}/" "/tmp/triton_${r}/" 2>/dev/null || true - [[ -d "${NFS_INDUCTOR_CACHE}" ]] && rsync -a --ignore-existing "${NFS_INDUCTOR_CACHE}/" "/tmp/inductor_${r}/" 2>/dev/null || true - done - touch "${TRITON_READY}" - echo "Pre-staged triton/inductor cache for all local ranks (was_warm=${TRITON_CACHE_WAS_WARM})." - else - until [[ -f "${TRITON_READY}" ]]; do sleep 1; done - fi - - mkdir -p ${TRITON_CACHE_DIR} ${TORCHINDUCTOR_CACHE_DIR} - - _stage_back() { - if [[ "${SLURM_LOCALID}" == "0" && "${SLURM_NODEID}" == "0" && "${TRITON_CACHE_WAS_WARM}" == "0" ]]; then - echo "Staging triton cache back to NFS (cold start)..." 
- mkdir -p "${NFS_TRITON_CACHE}" "${NFS_INDUCTOR_CACHE}" - rsync -a --ignore-existing "${TRITON_CACHE_DIR}/" "${NFS_TRITON_CACHE}/" 2>/dev/null || true - rsync -a --ignore-existing "${TORCHINDUCTOR_CACHE_DIR}/" "${NFS_INDUCTOR_CACHE}/" 2>/dev/null || true - echo "Cache staged back." - fi - } - trap _stage_back EXIT - - if [[ "${ENABLE_CUDA_GRAPH}" == "1" ]]; then - CUDA_GRAPH_ARGS="--enable-cuda-graph --cuda-graph-scope mamba attn" - else - CUDA_GRAPH_ARGS="" - fi - - if [[ "${USE_DEEPEP:-1}" == "1" ]]; then - MOE_DISPATCHER_ARGS="--moe-token-dispatcher-type flex --moe-flex-dispatcher-backend hybridep --moe-hybridep-num-sms 32" - else - MOE_DISPATCHER_ARGS="--moe-token-dispatcher-type alltoall" - fi - - if [[ "${USE_FP8:-0}" == "1" ]]; then - QUANT_ARGS="--fp8-param-gather \ - --reuse-grad-buf-for-mxfp8-param-ag \ - --fp8-recipe mxfp8 \ - --fp8-format hybrid \ - --fp8-amax-history-len 1024 \ - --fp8-amax-compute-algo max" - elif [[ "${USE_FP4:-1}" == "1" ]]; then - QUANT_ARGS="--first-last-layers-bf16 \ - --num-layers-at-start-in-bf16 0 \ - --num-layers-at-end-in-bf16 14 \ - --fp4-format e2m1 \ - --fp4-recipe nvfp4" - else - QUANT_ARGS="" - fi - - # Checkpoint directory — node-local /tmp inside the container. - # Shards are not cross-node accessible; intentional for one-time shm validation. 
- CKPT_DIR=/tmp/ckpt_${SLURM_JOB_ID} - mkdir -p ${CKPT_DIR} - - pushd $MEGATRON_PATH - LAUNCHER_CMD="python3" - WORKLOAD_CMD=${MEGATRON_PATH}/pretrain_mamba.py - WORKLOAD_ARGS=" \ - --exit-duration-in-mins 40 \ - --exit-interval 100 \ - --distributed-timeout-minutes 30 \ - --distributed-timeout-seconds-after-init 1800 \ - --disable-gloo-process-groups \ - --mock-data \ - --data-cache-path /datacache \ - --no-create-attention-mask-in-dataloader \ - --no-mmap-bin-files \ - --tokenizer-type NullTokenizer \ - --tiktoken-pattern v2 \ - --vocab-size 128000 \ - --micro-batch-size 1 \ - --global-batch-size 32 \ - --train-samples 12207031 \ - --adam-beta1 0.9 \ - --adam-beta2 0.95 \ - --lr 4.5e-4 \ - --min-lr 4.5e-6 \ - --lr-decay-style WSD \ - --lr-warmup-samples 24414063 \ - --lr-decay-samples 3048706055 \ - --lr-wsd-decay-style minus_sqrt \ - --lr-wsd-decay-samples 610351563 \ - --weight-decay 0.1 \ - --clip-grad 1.0 \ - --override-opt_param-scheduler \ - --use-mcore-models \ - --spec megatron.core.models.mamba.mamba_layer_specs mamba_stack_spec \ - --is-hybrid-model \ - --mamba-num-heads 128 \ - --num-layers 88 \ - --hidden-size 4096 \ - --ffn-hidden-size 2688 \ - --num-attention-heads 32 \ - --group-query-attention \ - --num-query-groups 2 \ - --kv-channels 128 \ - --hybrid-override-pattern MEMEMEM*EMEMEMEM*EMEMEMEM*EMEMEMEMEM*EMEMEMEMEM*EMEMEMEMEM*EMEMEMEMEM*EMEMEMEM*EMEMEMEME \ - --position-embedding-type none \ - --normalization RMSNorm \ - --untie-embeddings-and-output-weights \ - --init-method-std 0.014 \ - --disable-bias-linear \ - --squared-relu \ - --use-fused-weighted-squared-relu \ - --seq-length 8192 \ - --max-position-embeddings 8192 \ - --num-experts 512 \ - --moe-router-topk 22 \ - --moe-router-topk-scaling-factor 5.0 \ - --moe-router-score-function sigmoid \ - --moe-router-enable-expert-bias \ - --moe-router-dtype fp32 \ - --moe-router-load-balancing-type seq_aux_loss \ - --moe-aux-loss-coeff 1e-4 \ - ${MOE_DISPATCHER_ARGS} \ - --moe-grouped-gemm \ - 
--moe-permute-fusion \ - --moe-latent-size 1024 \ - --moe-shared-expert-intermediate-size 5376 \ - --calculate-per-token-loss \ - --bf16 \ - ${QUANT_ARGS} \ - --attention-dropout 0.0 \ - --hidden-dropout 0.0 \ - --sequence-parallel \ - --use-distributed-optimizer \ - $([[ "${USE_OVERLAP_COMM}" == "1" ]] && echo "--overlap-grad-reduce --overlap-param-gather") \ - --ddp-num-buckets 10 \ - --ddp-pad-buckets-for-high-nccl-busbw \ - --high-priority-stream-groups ep \ - --tensor-model-parallel-size 4 \ - --pipeline-model-parallel-size 1 \ - --expert-model-parallel-size 32 \ - --expert-tensor-parallel-size 1 \ - --cross-entropy-loss-fusion \ - --cross-entropy-fusion-impl native \ - --attention-backend flash \ - ${CUDA_GRAPH_ARGS} \ - --te-rng-tracker \ - --manual-gc \ - --manual-gc-interval 10 \ - --num-workers 1 \ - --eval-interval 1000 \ - --eval-iters 14 \ - --log-interval 1 \ - --log-params-norm \ - --log-num-zeros-in-grad \ - --log-timers-to-tensorboard \ - --log-memory-to-tensorboard \ - --log-throughput \ - --log-energy \ - --log-memory-interval 500 \ - --logging-level 10 \ - --timing-log-option minmax \ - --check-weight-hash-across-dp-replicas-interval 20000 \ - --tensorboard-dir /tensorboard \ - --local-rank ${SLURM_LOCALID} \ - --save ${CKPT_DIR} \ - --save-interval 10 \ - --ckpt-format torch_dist \ - --ckpt-fully-parallel-save \ - --ckpt-assume-constant-structure \ - --async-save \ - --use-persistent-ckpt-worker \ - $([[ "${USE_CPU_SHM}" == "1" ]] && echo "--async-ckpt-use-cpu-shm") \ - " - $LAUNCHER_CMD $WORKLOAD_CMD $WORKLOAD_ARGS - ' -log_msg "END main_workload" - -log_msg "END SBATCH" - -set +x diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/pools/n3_super_8n_16n.pool b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/pools/n3_super_8n_16n.pool deleted file mode 100644 index 1d700863..00000000 --- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/pools/n3_super_8n_16n.pool +++ /dev/null @@ -1,40 +0,0 @@ -# n3_super_8n_16n.pool — 
fault-injection pool for Nemotron-3 Super (TP=4, EP=32) -# Minimum scale: 8 nodes (32 ranks, EP=32 requires exactly 32 ranks) -# Maximum scale: 16 nodes (64 ranks) -# -# Rank coverage per node count (4 GPUs/node): -# 8 nodes → 32 ranks: rank-0=0, rank-1=1, mid=16, last=31 -# 16 nodes → 64 ranks: rank-0=0, rank-1=1, mid=32, last=63 -# -# NOTE: 16-node jobs require ~20 min for NCCL init + CUDA graph capture before iter 1. -# With 5-min watchdog timeout after fault + FR dumps, total is ~30+ min. -# Use TIME=00:45:00 (set in workloads.conf) to avoid SLURM wall-time kills. -# -# Format: FAULT_TYPE:RANK:ITER:NODES (one per line, # comments ignored) -# GPU faults — highest priority; rank sweep across both node counts -GPU_SLEEP:1:5:8 -GPU_SLEEP:0:5:8 -GPU_SLEEP:16:5:8 -GPU_SLEEP:31:5:8 -GPU_SLEEP:1:5:16 -GPU_SLEEP:32:5:16 -GPU_ERROR:1:5:8 -GPU_ERROR:0:5:8 -GPU_ERROR:16:5:8 -GPU_ERROR:1:5:16 -# Crash faults -SIGKILL:1:5:8 -SIGKILL:0:5:8 -SIGKILL:1:5:16 -SEGFAULT:1:5:8 -OS_ABORT:1:5:8 -# Python-level hangs -LOCK_GIL:1:5:8 -LOCK_GIL:0:5:8 -# Application exceptions -WORKLOAD_EXC:1:5:8 -ASYNC_EXC:1:5:8 -# Signal-based -SIGTERM:1:5:8 -SIGINT:1:5:8 -SIGNAL_EXC:1:5:8 From a146d54f293118824c30718aefb0af18d50f1cdd Mon Sep 17 00:00:00 2001 From: Seonmyeong Bak Date: Thu, 23 Apr 2026 16:01:14 -0700 Subject: [PATCH 03/21] feat(skills): harden nvrx-attr fault injection workflow --- .../attribution/log_analyzer/nvrx_logsage.py | 104 ++++++++++- .../nvrx-attr/fault-injection-loop/SKILL.md | 2 +- .../skills/nvrx-attr/fr-analysis/SKILL.md | 4 +- .../skills/nvrx-attr/log-analysis/SKILL.md | 4 +- .../nvrx-attr/scripts/l4_gb200_reduced.sh | 173 ++++++++++++------ .../nvrx-attr/scripts/prepare_node_alloc.sh | 31 +++- .../nvrx-attr/scripts/score_attribution.py | 38 +++- .../nvrx-attr/scripts/watch_and_analyze.sh | 21 ++- .../skills/nvrx-attr/scripts/workloads.conf | 6 +- 9 files changed, 294 insertions(+), 89 deletions(-) diff --git 
a/src/nvidia_resiliency_ext/attribution/log_analyzer/nvrx_logsage.py b/src/nvidia_resiliency_ext/attribution/log_analyzer/nvrx_logsage.py index c72aaae2..4ae1653b 100644 --- a/src/nvidia_resiliency_ext/attribution/log_analyzer/nvrx_logsage.py +++ b/src/nvidia_resiliency_ext/attribution/log_analyzer/nvrx_logsage.py @@ -1,7 +1,9 @@ import argparse import logging import os +import random import re +import time from typing import Any, Dict, Mapping, Union from langchain_openai import ChatOpenAI @@ -37,6 +39,7 @@ ATTR_ERRORS_NOT_FOUND = "ERRORS NOT FOUND" ATTR_NO_LOGS = "NO LOGS" ATTR_SLURM_CANCELLED_DUE_TO_PREEMPTION = "SLURM CANCELLED DUE TO PREEMPTION" +LOGSAGE_LLM_ENDPOINT_FAILED = "LLM ENDPOINT FAILED" MARKER_NEW_RUN_DIR_ADDED = "[sbatch_script]: New run dir added:" @@ -108,6 +111,98 @@ def chunk_logs_strict(lines): return final_chunks +def _log_analysis_retry_config() -> tuple[int, float, float, float]: + retries = int(os.getenv("NVRX_LOG_ANALYSIS_LLM_RETRIES", "3")) + initial_backoff = float(os.getenv("NVRX_LOG_ANALYSIS_LLM_INITIAL_BACKOFF_SEC", "1.0")) + max_backoff = float(os.getenv("NVRX_LOG_ANALYSIS_LLM_MAX_BACKOFF_SEC", "8.0")) + jitter = float(os.getenv("NVRX_LOG_ANALYSIS_LLM_JITTER_SEC", "0.25")) + return retries, initial_backoff, max_backoff, jitter + + +def _finished_status_name(status: Any) -> str: + return getattr(status, "name", status) + + +def _sleep_with_backoff(attempt: int, retries: int, backoff: float, max_backoff: float, jitter: float) -> float: + sleep_for = min(backoff, max_backoff) + random.uniform(0.0, jitter) + logger.info( + "Retrying log-analysis LLM in %.2fs after attempt %d/%d", + sleep_for, + attempt, + retries, + ) + time.sleep(sleep_for) + return min(backoff * 2, max_backoff) + + +def _retry_return_application_errors( + llm: ChatOpenAI, lines: list[str], cache_dict: LRUCache +) -> ApplicationData: + retries, initial_backoff, max_backoff, jitter = _log_analysis_retry_config() + backoff = initial_backoff + last_status = None + + for 
attempt in range(1, retries + 1): + app_data = return_application_errors(llm, lines, cache_dict) + status_name = _finished_status_name(app_data.finished) + if status_name != FINISHED_STATUS_LLM_FAILURE: + return app_data + + last_status = status_name + if attempt == retries: + logger.error( + "Log-analysis extraction failed after %d attempts; last status: %s", + retries, + last_status, + ) + return app_data + + backoff = _sleep_with_backoff(attempt, retries, backoff, max_backoff, jitter) + + return app_data + + +def _with_exponential_backoff(llm_call, checkpoint_saved: bool) -> tuple[str, str, str, str, str]: + retries, initial_backoff, max_backoff, jitter = _log_analysis_retry_config() + backoff = initial_backoff + + for attempt in range(1, retries + 1): + try: + result = llm_call() + if result and not any( + field == LOGSAGE_LLM_ENDPOINT_FAILED for field in result[:4] + ): + return result + last_error = LOGSAGE_LLM_ENDPOINT_FAILED + except Exception as exc: + last_error = str(exc) + logger.warning("Log-analysis LLM attempt %d/%d failed: %s", attempt, retries, exc) + + if attempt == retries: + logger.error( + "Log-analysis LLM failed after %d attempts; last error: %s", + retries, + last_error, + ) + return ( + ATTR_LLM_FAILURE, + ATTR_LLM_FAILURE, + ATTR_LLM_FAILURE, + ATTR_LLM_FAILURE, + str(checkpoint_saved), + ) + + backoff = _sleep_with_backoff(attempt, retries, backoff, max_backoff, jitter) + + return ( + ATTR_LLM_FAILURE, + ATTR_LLM_FAILURE, + ATTR_LLM_FAILURE, + ATTR_LLM_FAILURE, + str(checkpoint_saved), + ) + + class NVRxLogAnalyzer(NVRxAttribution): def __init__(self, args: Union[argparse.Namespace, Mapping[str, Any]]): from nvidia_resiliency_ext.attribution.api_keys import load_llm_api_key @@ -213,7 +308,7 @@ async def analyze_logs(self) -> list[ApplicationData]: current_chunk.append(line) output_list = [ - return_application_errors(self.llm, lines, self.lru_cache) + _retry_return_application_errors(self.llm, lines, self.lru_cache) for cycle, lines in 
chunks.items() ] return output_list @@ -248,7 +343,12 @@ async def llm_analyze(self, output_list: list[ApplicationData]) -> list[str]: ) else: if len(output.application_errors_list_full): - result.append(get_proposed_solution_cat(self.llm, output)) + result.append( + _with_exponential_backoff( + lambda: get_proposed_solution_cat(self.llm, output), + checkpoint_saved=output.checkpoint_saved, + ) + ) else: if output.finished == FINISHED_STATUS_LLM_FAILURE: result.append( diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md b/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md index abec6a91..228d7ce1 100644 --- a/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md @@ -207,7 +207,7 @@ The judge is given: 4. Raw logsage stdout (5-field text format) 5. Raw CollectiveAnalyzer text output -Default judge model: `azure/anthropic/claude-sonnet-4-6`. Override with `--model` in `score_attribution.py`. +Default judge model: `qwen/qwen3.5-397b-a17b`. Override with `--model` in `score_attribution.py`. 
--- diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/fr-analysis/SKILL.md b/src/nvidia_resiliency_ext/skills/nvrx-attr/fr-analysis/SKILL.md index df038451..d07911ec 100644 --- a/src/nvidia_resiliency_ext/skills/nvrx-attr/fr-analysis/SKILL.md +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/fr-analysis/SKILL.md @@ -52,7 +52,7 @@ python scripts/fr_attribution.py \ | `--verbose`, `-v` | off | Print detailed per-rank collective tables | | `--health-check`, `-c` | off | Include node health check results in output | | `--llm-analyze`, `-l` | off | Pass structured findings to the LLM for a narrative summary | -| `--model`, `-m` | `nvdev/nvidia/llama-3.3-nemotron-super-49b-v1` | LLM model (only used with `--llm-analyze`) | +| `--model`, `-m` | `nvidia/nemotron-3-super-120b-a12b` | LLM model (only used with `--llm-analyze`) | | `--debug` | off | Convert binary trace files to JSON for inspection | --- @@ -68,7 +68,7 @@ analyzer = CollectiveAnalyzer({ "verbose": False, "health_check": False, "llm_analyze": False, - "model": "nvdev/nvidia/llama-3.3-nemotron-super-49b-v1", + "model": "nvidia/nemotron-3-super-120b-a12b", }) results = analyzer.run_sync({ "fr_path": "/path/to/fr_dumps/", diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/log-analysis/SKILL.md b/src/nvidia_resiliency_ext/skills/nvrx-attr/log-analysis/SKILL.md index a86e2ff7..e793d5de 100644 --- a/src/nvidia_resiliency_ext/skills/nvrx-attr/log-analysis/SKILL.md +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/log-analysis/SKILL.md @@ -45,7 +45,7 @@ python scripts/nvrx_logsage.py \ | Flag | Default | Description | |------|---------|-------------| | `--log-path` | required | Path to the job log file | -| `--model` | `nvidia/qwen/qwen3.5-35b-a3b` | LLM model | +| `--model` | `nvidia/nemotron-3-super-120b-a12b` | LLM model | | `--temperature` | `0.2` | Sampling temperature | | `--top_p` | `0.7` | Top-p nucleus sampling | | `--max_tokens` | `8192` | Max output tokens | @@ -61,7 +61,7 @@ from 
nvidia_resiliency_ext.attribution.log_analyzer.nvrx_logsage import NVRxLogA analyzer = NVRxLogAnalyzer({ "log_path": "/path/to/job.log", - "model": "nvidia/qwen/qwen3.5-35b-a3b", + "model": "nvidia/nemotron-3-super-120b-a12b", "temperature": 0.2, "top_p": 0.7, "max_tokens": 8192, diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/l4_gb200_reduced.sh b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/l4_gb200_reduced.sh index 9fd39ab8..0c87a30d 100644 --- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/l4_gb200_reduced.sh +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/l4_gb200_reduced.sh @@ -1,9 +1,10 @@ #!/bin/bash # Validated only with Megatron-LM as the feedback-loop example workload. +# Direct sbatch usage: +# sbatch --account=<account> --partition=<partition> scripts/l4_gb200_reduced.sh +# If your cluster has defaults for those, the extra flags are not required. -#SBATCH --account=root -#SBATCH --partition=gb-nvl-134-135 #SBATCH --time=00:30:00 #SBATCH --job-name=llama4-scout-gb200 @@ -16,6 +17,10 @@ #SBATCH --exclusive #SBATCH --mem=0 +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +NVRX_SRC_ROOT_DEFAULT="$(cd "${SCRIPT_DIR}/../../../.." && pwd)" +NVRX_REPO_ROOT_DEFAULT="$(cd "${SCRIPT_DIR}/../../../../.." 
&& pwd)" + log_msg() { local msg="$1" UNIX_DATETIME=$(date +%s) @@ -25,12 +30,8 @@ log_msg() { log_msg "START SBATCH" echo "Running on nodes: ${SLURM_NODELIST}" -export RITS_PLATFORM_TYPE=gb200 -export RITS_GPUS_PER_NODE=4 -export RITS_NVL_DOMAIN_SIZE=72 export NCCL_IB_DISABLE=0 export NCCL_NET_GDR_LEVEL=3 -export RITS_CLUSTER_NAME=nvl72 export PYXIS_LOG_LEVEL=debug export NCCL_IB_SL=1 export NCCL_IB_TIMEOUT=19 @@ -58,26 +59,49 @@ export TORCH_INCLUDE_ONLY_ACTIVE=1 export TORCH_NCCL_EXTRA_DUMP_ON_EXEC=1 # Fault injection parameters (overridable via sbatch --export or environment) +# Current Megatron behavior: +# - FAULT_AT_ITER anchors the fault-delay timer after iteration N completes +# - FAULT_DELAY is the delay in seconds from that anchor (or from training start if unset) export FAULT_AT_ITER="${FAULT_AT_ITER:-5}" +export FAULT_DELAY="${FAULT_DELAY:-}" export FAULT_RANK="${FAULT_RANK:-1}" export FAULT_TYPE="${FAULT_TYPE:-GPU_SLEEP}" +export ENABLE_FAULT_INJECTION="${ENABLE_FAULT_INJECTION:-1}" # Checkpoint settings (overridable via sbatch --export) export NVRX_CKPT_USE_CPU_SHM="${NVRX_CKPT_USE_CPU_SHM:-0}" # Enable GPU-IPC cached-data-structure path without cpu-shm (for comparison baseline) export NVRX_CKPT_USE_CACHED_STRUCTURE="${NVRX_CKPT_USE_CACHED_STRUCTURE:-0}" export DIST_TIMEOUT_AFTER_INIT="${DIST_TIMEOUT_AFTER_INIT:-1}" +export ENABLE_NFS_CACHE_STAGING="${ENABLE_NFS_CACHE_STAGING:-0}" +export NFS_TRITON_CACHE="${NFS_TRITON_CACHE:-}" +export NFS_INDUCTOR_CACHE="${NFS_INDUCTOR_CACHE:-}" # USE_ASYNC_CKPT=1: enable async checkpointing every CKPT_SAVE_INTERVAL iters export USE_ASYNC_CKPT="${USE_ASYNC_CKPT:-0}" export CKPT_SAVE_INTERVAL="${CKPT_SAVE_INTERVAL:-100}" +export ENABLE_ENROOT_CLEANUP="${ENABLE_ENROOT_CLEANUP:-0}" # Node / task geometry (SLURM_NNODES is set by SLURM from --nodes override) export GPUS_PER_NODE="${GPUS_PER_NODE:-4}" TOTAL_TASKS=$((SLURM_NNODES * GPUS_PER_NODE)) # Per-experiment output directory (overridable via sbatch --export) -export 
BASE_EXPERIMENTS_DIR="${BASE_EXPERIMENTS_DIR:-/home/sbak/experiments/llama4-scout-gb200}" -export EXPERIMENT_DIR="${EXPERIMENT_DIR:-${BASE_EXPERIMENTS_DIR}/fault_injection/manual/n${SLURM_NNODES}_${FAULT_TYPE}_r${FAULT_RANK}_i${FAULT_AT_ITER}}" +export BASE_EXPERIMENTS_DIR="${BASE_EXPERIMENTS_DIR:-${HOME}/nvrx-attr-experiments}" +FAULT_LABEL="i${FAULT_AT_ITER}" +if [[ -n "${FAULT_DELAY}" ]]; then + FAULT_LABEL="d${FAULT_DELAY}" +fi +export EXPERIMENT_DIR="${EXPERIMENT_DIR:-${BASE_EXPERIMENTS_DIR}/fault_injection/manual/n${SLURM_NNODES}_${FAULT_TYPE}_r${FAULT_RANK}_${FAULT_LABEL}}" +export NVRX_REPO_ROOT="${NVRX_REPO_ROOT:-${NVRX_REPO_ROOT_DEFAULT}}" +export NVRX_SRC_ROOT="${NVRX_SRC_ROOT:-${NVRX_SRC_ROOT_DEFAULT}}" +export NVRX_CONTAINER_REPO_PATH="${NVRX_CONTAINER_REPO_PATH:-${HOME}/nvidia-resiliency-ext}" +export NVRX_CONTAINER_SRC_PATH="${NVRX_CONTAINER_SRC_PATH:-${NVRX_CONTAINER_REPO_PATH}/src}" +export SHARED_TMP_BASE_DIR="${SHARED_TMP_BASE_DIR:-${HOME}/tmp}" +export MEGATRON_REPO_HOST_PATH="${MEGATRON_REPO_HOST_PATH:-${HOME}/megatron-lm}" +export WORKSPACE_HOST_PATH="${WORKSPACE_HOST_PATH:-${HOME}/tmp}" +export CONTAINER_IMAGE="${CONTAINER_IMAGE:-nvcr.io/nvidia/nemo:26.04}" +export CONTAINER_NAME="${CONTAINER_NAME:-}" +export CONTAINER_WORKDIR="${CONTAINER_WORKDIR:-/}" mkdir -p ${BASE_EXPERIMENTS_DIR}/datacache mkdir -p ${EXPERIMENT_DIR}/tensorboard @@ -91,7 +115,7 @@ LOG_FILE_BASE="${LOG_DIR}/slurm/${SLURM_JOB_ID}.${SLURM_RESTART_COUNT}" # ── Shared-tmp directory (NFS, for cross-srun-step communication) ───────────── # Mounted to /shared_tmp (NOT /tmp) so the container keeps its native fast /tmp. 
-SHARED_TMP_HOST=/home/sbak/tmp/${SLURM_JOB_ID} +SHARED_TMP_HOST=${SHARED_TMP_BASE_DIR}/${SLURM_JOB_ID} mkdir -p ${SHARED_TMP_HOST} # ── Pre-populate .myenv with all variables that must reach the container ─────── @@ -106,48 +130,64 @@ export DIST_TIMEOUT_AFTER_INIT=${DIST_TIMEOUT_AFTER_INIT} export USE_ASYNC_CKPT=${USE_ASYNC_CKPT} export CKPT_SAVE_INTERVAL=${CKPT_SAVE_INTERVAL} export FAULT_AT_ITER=${FAULT_AT_ITER} +export FAULT_DELAY=${FAULT_DELAY} export FAULT_RANK=${FAULT_RANK} export FAULT_TYPE=${FAULT_TYPE} -# Prepend local nvrx src so container picks up our changes without a pip install step. -export PYTHONPATH=/home/sbak/nvidia-resiliency-ext/src:\${PYTHONPATH} +export ENABLE_FAULT_INJECTION=${ENABLE_FAULT_INJECTION} +export ENABLE_NFS_CACHE_STAGING=${ENABLE_NFS_CACHE_STAGING} +export NFS_TRITON_CACHE=${NFS_TRITON_CACHE} +export NFS_INDUCTOR_CACHE=${NFS_INDUCTOR_CACHE} +# Prepend local nvrx checkout so container picks up our changes without a pip install step. +export NVRX_REPO_ROOT=${NVRX_CONTAINER_REPO_PATH} +export NVRX_SRC_ROOT=${NVRX_CONTAINER_SRC_PATH} +export PYTHONPATH=\${NVRX_REPO_ROOT}:\${NVRX_SRC_ROOT}:\${PYTHONPATH} MYENVEOF # Mounts LUSTRE=/home:/home SHARED_TMP=${SHARED_TMP_HOST}:/shared_tmp LOGS=${EXPERIMENT_DIR}/logs:/logs -MEGATRON_REPO=/home/sbak/megatron-lm-main:/megatron-lm_repo +MEGATRON_REPO=${MEGATRON_REPO_HOST_PATH}:/megatron-lm_repo DATACACHE=${BASE_EXPERIMENTS_DIR}/datacache:/datacache TENSORBOARD=${EXPERIMENT_DIR}/tensorboard:/tensorboard -WORKSPACE=/home/sbak/tmp:/workspace +WORKSPACE=${WORKSPACE_HOST_PATH}:/workspace CHECKPOINTS=${EXPERIMENT_DIR}/checkpoints:/checkpoints mkdir -p ${EXPERIMENT_DIR}/checkpoints CONTAINER_MOUNTS=$LUSTRE,$SHARED_TMP,$LOGS,$MEGATRON_REPO,$DATACACHE,$TENSORBOARD,$WORKSPACE,$CHECKPOINTS +CONTAINER_ARGS=( + --container-mounts "${CONTAINER_MOUNTS}" + --container-image "${CONTAINER_IMAGE}" + --container-workdir "${CONTAINER_WORKDIR}" +) +if [[ -n "${CONTAINER_NAME}" ]]; then + 
CONTAINER_ARGS+=(--container-name "${CONTAINER_NAME}") +fi # ── Disk cleanup: remove stale enroot containers from prior jobs ────────────── -log_msg "START disk_cleanup" -srun \ - --label \ - --ntasks-per-node=1 \ - --ntasks=${SLURM_NNODES} \ - --kill-on-bad-exit=0 \ - --mpi=none \ - bash -c ' - ENROOT_DIR="/var/lib/enroot/data/$(id -u)" - rm -rf "${ENROOT_DIR:?}"/* 2>/dev/null || true - echo "$(hostname): / $(df -h / | tail -1 | awk "{print \$3\" used, \"\$4\" avail\"}")" - ' -log_msg "END disk_cleanup" +if [[ "${ENABLE_ENROOT_CLEANUP}" == "1" ]]; then + log_msg "START disk_cleanup" + srun \ + --label \ + --ntasks-per-node=1 \ + --ntasks=${SLURM_NNODES} \ + --kill-on-bad-exit=0 \ + --mpi=none \ + bash -c ' + ENROOT_DIR="/var/lib/enroot/data/$(id -u)" + rm -rf "${ENROOT_DIR:?}"/* 2>/dev/null || true + echo "$(hostname): / $(df -h / | tail -1 | awk "{print \$3\" used, \"\$4\" avail\"}")" + ' + log_msg "END disk_cleanup" +else + log_msg "SKIP disk_cleanup" +fi # all node setup #-------------------------------- log_msg "START all_node_setup" srun \ --label \ - --container-mounts ${CONTAINER_MOUNTS} \ - --container-image /home/sbak/mcore_ci_0415.sqsh \ - --container-name ${SLURM_JOB_ID} \ - --container-workdir / \ + "${CONTAINER_ARGS[@]}" \ --exclusive \ --error=${LOG_FILE_BASE}.0.all_node_setup.log \ --output=${LOG_FILE_BASE}.0.all_node_setup.log \ @@ -163,6 +203,8 @@ srun \ CURRENT_BRANCH=$(git -C /megatron-lm_repo branch --show-current) echo "Cloning Megatron branch $CURRENT_BRANCH to ${MEGATRON_PATH}" git clone --single-branch --branch $CURRENT_BRANCH /megatron-lm_repo . 
+ rm -rf ${MEGATRON_PATH}/nvidia_resiliency_ext + rsync -a ${NVRX_CONTAINER_SRC_PATH}/nvidia_resiliency_ext/ ${MEGATRON_PATH}/nvidia_resiliency_ext/ popd ' log_msg "END all_node_setup" @@ -172,10 +214,7 @@ log_msg "END all_node_setup" log_msg "START main_workload" srun \ --label \ - --container-mounts ${CONTAINER_MOUNTS} \ - --container-image /home/sbak/mcore_ci_0415.sqsh \ - --container-name ${SLURM_JOB_ID} \ - --container-workdir / \ + "${CONTAINER_ARGS[@]}" \ --error=${LOG_FILE_BASE}.1.main_workload.log \ --output=${LOG_FILE_BASE}.1.main_workload.log \ --ntasks-per-node=${GPUS_PER_NODE} \ @@ -184,38 +223,48 @@ srun \ --mpi=none \ bash -c ' source /shared_tmp/.myenv_${SLURM_JOB_ID}.sh - - # Match the per-node path used in all_node_setup. MEGATRON_PATH=/shared_tmp/megatron_$(hostname)_${SLURM_JOB_ID} - - NFS_TRITON_CACHE=/home/sbak/experiments/llama4-scout-gb200/triton_cache - NFS_INDUCTOR_CACHE=/home/sbak/experiments/llama4-scout-gb200/inductor_cache + export PYTHONPATH=${MEGATRON_PATH}:${NVRX_REPO_ROOT}:${NVRX_SRC_ROOT}:${PYTHONPATH} + echo "NVRX_REPO_ROOT=${NVRX_REPO_ROOT}" + echo "NVRX_SRC_ROOT=${NVRX_SRC_ROOT}" + echo "PYTHONPATH=${PYTHONPATH}" + python3 - <<'"'"'PY'"'"' +import sys +print(f"sys.path[:8]={sys.path[:8]}") +import nvidia_resiliency_ext +from nvidia_resiliency_ext.shared_utils.inject_fault import Fault +print(f"nvidia_resiliency_ext={nvidia_resiliency_ext.__file__}") +print(f"fault_enum={Fault}") +PY # Per-rank Triton/inductor cache on the container native /tmp (local fast storage). export TRITON_CACHE_DIR=/tmp/triton_${SLURM_LOCALID} export TORCHINDUCTOR_CACHE_DIR=/tmp/inductor_${SLURM_LOCALID} mkdir -p ${TRITON_CACHE_DIR} ${TORCHINDUCTOR_CACHE_DIR} - # Pre-stage: warm local cache from NFS (one rank per node) - if [[ "${SLURM_LOCALID}" == "0" ]]; then - if [[ -d "${NFS_TRITON_CACHE}" ]]; then - echo "Pre-staging triton cache from NFS..." + # Optional pre/post-stage between a shared cache and the node-local /tmp cache. 
+ if [[ "${ENABLE_NFS_CACHE_STAGING}" == "1" && "${SLURM_LOCALID}" == "0" ]]; then + if [[ -n "${NFS_TRITON_CACHE}" && -d "${NFS_TRITON_CACHE}" ]]; then + echo "Pre-staging triton cache from ${NFS_TRITON_CACHE}..." rsync -a --ignore-existing "${NFS_TRITON_CACHE}/" "${TRITON_CACHE_DIR}/" 2>/dev/null || true fi - if [[ -d "${NFS_INDUCTOR_CACHE}" ]]; then - echo "Pre-staging inductor cache from NFS..." + if [[ -n "${NFS_INDUCTOR_CACHE}" && -d "${NFS_INDUCTOR_CACHE}" ]]; then + echo "Pre-staging inductor cache from ${NFS_INDUCTOR_CACHE}..." rsync -a --ignore-existing "${NFS_INDUCTOR_CACHE}/" "${TORCHINDUCTOR_CACHE_DIR}/" 2>/dev/null || true fi fi # Post-stage: write back to NFS on exit (one rank per node) _stage_back() { - if [[ "${SLURM_LOCALID}" == "0" ]]; then - echo "Staging triton cache back to NFS..." - mkdir -p "${NFS_TRITON_CACHE}" "${NFS_INDUCTOR_CACHE}" - rsync -a --ignore-existing "${TRITON_CACHE_DIR}/" "${NFS_TRITON_CACHE}/" 2>/dev/null || true - rsync -a --ignore-existing "${TORCHINDUCTOR_CACHE_DIR}/" "${NFS_INDUCTOR_CACHE}/" 2>/dev/null || true - echo "Cache staged back." 
+ if [[ "${ENABLE_NFS_CACHE_STAGING}" == "1" && "${SLURM_LOCALID}" == "0" ]]; then + if [[ -n "${NFS_TRITON_CACHE}" ]]; then + mkdir -p "${NFS_TRITON_CACHE}" + rsync -a --ignore-existing "${TRITON_CACHE_DIR}/" "${NFS_TRITON_CACHE}/" 2>/dev/null || true + fi + if [[ -n "${NFS_INDUCTOR_CACHE}" ]]; then + mkdir -p "${NFS_INDUCTOR_CACHE}" + rsync -a --ignore-existing "${TORCHINDUCTOR_CACHE_DIR}/" "${NFS_INDUCTOR_CACHE}/" 2>/dev/null || true + fi fi } trap _stage_back EXIT @@ -239,6 +288,21 @@ srun \ LAUNCHER_ARGS=" \ " WORKLOAD_CMD=${MEGATRON_PATH}/pretrain_gpt.py + FAULT_INJECTOR_ARGS="" + if [[ "${ENABLE_FAULT_INJECTION}" == "1" ]]; then + FAULT_INJECTOR_ARGS=" \ + --fault-injector-ranks ${FAULT_RANK} \ + --fault-injector-fault-types ${FAULT_TYPE} \ + " + if [[ -n "${FAULT_DELAY}" ]]; then + FAULT_INJECTOR_ARGS="${FAULT_INJECTOR_ARGS} --fault-injector-fault-delay ${FAULT_DELAY}" + if [[ -n "${FAULT_AT_ITER}" ]]; then + FAULT_INJECTOR_ARGS="${FAULT_INJECTOR_ARGS} --fault-injector-delay-start-iteration ${FAULT_AT_ITER}" + fi + elif [[ -n "${FAULT_AT_ITER}" ]]; then + FAULT_INJECTOR_ARGS="${FAULT_INJECTOR_ARGS} --fault-injector-fault-delay 0 --fault-injector-delay-start-iteration ${FAULT_AT_ITER}" + fi + fi WORKLOAD_ARGS=" \ --exit-duration-in-mins 5750 \ --distributed-timeout-minutes 10 \ @@ -347,13 +411,12 @@ srun \ --local-rank ${SLURM_LOCALID} \ --context-parallel-size 1 \ --vocab-size 238600 \ - --megatron-fault-at-iter ${FAULT_AT_ITER} \ - --megatron-fault-rank ${FAULT_RANK} \ - --megatron-fault-type ${FAULT_TYPE} \ + ${FAULT_INJECTOR_ARGS} \ --distributed-timeout-seconds-after-init ${DIST_TIMEOUT_AFTER_INIT} \ --flight-recorder-dump-path ${CKPT_DIR} \ " - $LAUNCHER_CMD $LAUNCHER_ARGS $WORKLOAD_CMD $WORKLOAD_ARGS $CKPT_SAVE_ARGS + PYTHONPATH=${MEGATRON_PATH}:${NVRX_REPO_ROOT}:${NVRX_SRC_ROOT}:${PYTHONPATH} \ + $LAUNCHER_CMD $LAUNCHER_ARGS $WORKLOAD_CMD $WORKLOAD_ARGS $CKPT_SAVE_ARGS ' log_msg "END main_workload" diff --git 
a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/prepare_node_alloc.sh b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/prepare_node_alloc.sh index f2c90a64..8b8d7b01 100755 --- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/prepare_node_alloc.sh +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/prepare_node_alloc.sh @@ -25,6 +25,12 @@ set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" WORKLOADS_CONF="${SCRIPT_DIR}/workloads.conf" +SLURM_DEFAULTS_CONF="${SCRIPT_DIR}/slurm.conf" + +if [[ -f "${SLURM_DEFAULTS_CONF}" ]]; then + # shellcheck disable=SC1090 + source "${SLURM_DEFAULTS_CONF}" +fi # ── Workload resolution from workloads.conf ──────────────────────────────────── # If WORKLOAD is set, look it up in workloads.conf and derive SBATCH_SCRIPT and @@ -48,7 +54,9 @@ if [[ -n "${WORKLOAD:-}" ]]; then _CONF_TIME=$(echo "${_CONF_LINE}" | awk '{print $6}') # Only set if not already overridden in the environment SBATCH_SCRIPT="${SBATCH_SCRIPT:-${SCRIPT_DIR}/${_CONF_SCRIPT}}" - BASE_EXPERIMENTS_DIR="${BASE_EXPERIMENTS_DIR:-${_CONF_BASE}}" + if [[ -n "${_CONF_BASE}" && "${_CONF_BASE}" != "-" ]]; then + BASE_EXPERIMENTS_DIR="${BASE_EXPERIMENTS_DIR:-${_CONF_BASE}}" + fi if [[ -n "${_CONF_TIME}" && "${_CONF_TIME}" != "-" ]]; then TIME="${TIME:-${_CONF_TIME}}" fi @@ -65,13 +73,13 @@ if [[ -n "${WORKLOAD:-}" ]]; then echo ">>> Workload: ${WORKLOAD} (${_CONF_DESC//_/ })" fi -ACCOUNT="${ACCOUNT:-root}" -PARTITION="${PARTITION:-gb-nvl-134-135}" +ACCOUNT="${ACCOUNT:-}" +PARTITION="${PARTITION:-}" GPUS_PER_NODE="${GPUS_PER_NODE:-4}" TIME="${TIME:-00:30:00}" BATCH_SIZE="${BATCH_SIZE:-2}" POLL_INTERVAL="${POLL_INTERVAL:-30}" -BASE_EXPERIMENTS_DIR="${BASE_EXPERIMENTS_DIR:-/home/sbak/experiments/llama4-scout-gb200}" +BASE_EXPERIMENTS_DIR="${BASE_EXPERIMENTS_DIR:-${HOME}/nvrx-attr-experiments}" # --------------------------------------------------------------------------- # Fault pool — ordered by priority (GPU-related first, then crash, then other) @@ 
-131,9 +139,7 @@ submit_one() { mkdir -p "${EXPERIMENT_DIR}/tensorboard" local JOB_ID - JOB_ID=$(sbatch \ - --account="${ACCOUNT}" \ - --partition="${PARTITION}" \ + local SBATCH_ARGS=( --nodes="${NODES}" \ --ntasks-per-node="${GPUS_PER_NODE}" \ --gpus-per-node="${GPUS_PER_NODE}" \ @@ -143,8 +149,15 @@ submit_one() { --output="${EXPERIMENT_DIR}/logs/slurm/%j.launch.out" \ --error="${EXPERIMENT_DIR}/logs/slurm/%j.launch.err" \ --export=ALL,FAULT_TYPE="${FAULT_TYPE}",FAULT_RANK="${RANK}",FAULT_AT_ITER="${ITER}",GPUS_PER_NODE="${GPUS_PER_NODE}",EXPERIMENT_DIR="${EXPERIMENT_DIR}",BASE_EXPERIMENTS_DIR="${BASE_EXPERIMENTS_DIR}" \ - --parsable \ - "${SBATCH_SCRIPT}") + --parsable + ) + if [[ -n "${ACCOUNT}" ]]; then + SBATCH_ARGS+=(--account="${ACCOUNT}") + fi + if [[ -n "${PARTITION}" ]]; then + SBATCH_ARGS+=(--partition="${PARTITION}") + fi + JOB_ID=$(sbatch "${SBATCH_ARGS[@]}" "${SBATCH_SCRIPT}") # Print to stderr so callers using $(...) capture only the job ID on stdout printf " submitted: %s rank=%-2s iter=%s nodes=%s -> job=%s\n" \ diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/score_attribution.py b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/score_attribution.py index 15417a6c..4d36ad66 100644 --- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/score_attribution.py +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/score_attribution.py @@ -1,9 +1,9 @@ #!/usr/bin/env python3 """LLM-judge scorer for fault-injection attribution experiments. -Uses the same ChatOpenAI / NVIDIA-inference-API setup as nvrx_logsage.py. +Uses the same ChatOpenAI / NVIDIA inference API setup as nvrx_logsage.py. Reads ground-truth fault parameters and the raw text outputs of nvrx_logsage -and CollectiveAnalyzer, then asks a Sonnet/Opus judge to score each attribution +and CollectiveAnalyzer, then asks a judge model to score each attribution dimension and return structured JSON. 
Usage (called by watch_and_analyze.sh): @@ -11,8 +11,8 @@ --fault-type GPU_SLEEP --rank 0 --iter 5 --nodes 2 \ --log-output "$LOG_OUT" \ --fr-output "$FR_OUT" \ - [--model claude-sonnet-4-6] \ - [--base-url https://inference-api.nvidia.com/v1] + [--model qwen/qwen3.5-397b-a17b] \ + [--base-url https://inference.api.nvidia.com/v1] Stdout: one line of JSON with keys: restart_correct, rank_primary, rank_any, fault_described, fr_rank_correct, notes @@ -21,6 +21,7 @@ import argparse import json import logging +import os import sys from typing import Union @@ -32,8 +33,13 @@ logger = logging.getLogger(__name__) +INJECTION_MARKERS = ( + "FAULT INJECTION", + "nvidia_resiliency_ext.shared_utils.inject_fault", +) + # Default judge model — override with --model -DEFAULT_JUDGE_MODEL = "azure/anthropic/claude-sonnet-4-6" +DEFAULT_JUDGE_MODEL = "qwen/qwen3.5-397b-a17b" # Expected restart decision and rationale per fault type _RESTART_TABLE = { @@ -71,7 +77,7 @@ def load_log_excerpt(log_path, max_lines=400): lines = [line for line in lines if "[workload:" not in line or 'Cycle:' in line] # Strip fault-injection markers — the judge must not see which rank/fault was # injected in the raw log; it knows the ground truth from the structured args. 
- lines = [line for line in lines if "[MEGATRON_FAULT]" not in line] + lines = [line for line in lines if not any(marker in line for marker in INJECTION_MARKERS)] if len(lines) > max_lines: lines = lines[-max_lines:] return "".join(lines).strip() @@ -161,17 +167,29 @@ def build_judge_prompt(fault_type, rank, iter_, nodes, run_valid, log_output, fr def score(args): args.run_valid = args.run_valid.lower() == "true" - api_key = load_nvidia_api_key() + api_key = os.getenv("JUDGE_API_KEY", "").strip() + if not api_key: + judge_key_file = os.getenv("JUDGE_API_KEY_FILE", "").strip() + if judge_key_file: + try: + with open(judge_key_file, encoding="utf-8") as f: + api_key = f.read().strip() + except OSError: + api_key = "" + if not api_key: + api_key = load_nvidia_api_key() if not api_key: raise ValueError( - "NVIDIA_API_KEY not found. Set NVIDIA_API_KEY env var, " - "NVIDIA_API_KEY_FILE, or create ~/.nvidia_api_key" + "Judge API key not found. Set JUDGE_API_KEY/JUDGE_API_KEY_FILE, " + "or NVIDIA_API_KEY/NVIDIA_API_KEY_FILE, or create ~/.nvidia_api_key" ) + base_url = os.getenv("JUDGE_BASE_URL", "").strip() or args.base_url + llm = ChatOpenAI( model=args.model, api_key=api_key, - base_url=args.base_url, + base_url=base_url, temperature=0.0, max_completion_tokens=512, ) diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/watch_and_analyze.sh b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/watch_and_analyze.sh index 8a5e3a4d..6b2a13c9 100755 --- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/watch_and_analyze.sh +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/watch_and_analyze.sh @@ -22,6 +22,13 @@ SCORE_PY="${SCRIPT_DIR}/score_attribution.py" # Ensure nvidia_resiliency_ext is importable from source tree export PYTHONPATH="${NVRX_SRC_DIR}${PYTHONPATH:+:$PYTHONPATH}" +strip_injection_markers() { + local input_log="$1" + local output_log="$2" + grep -v -E 'FAULT INJECTION|nvidia_resiliency_ext\.shared_utils\.inject_fault' \ + "${input_log}" > 
"${output_log}" 2>/dev/null || true +} + REPORT_FILE="${TRACKING_FILE%.tsv}_report.md" DONE_JOBS_FILE="${TRACKING_FILE%.tsv}_done.txt" @@ -75,23 +82,25 @@ while true; do LOG_FILE=$(ls ${LOG_GLOB} 2>/dev/null | head -1 || true) LOG_OUT="" - # ---- Check run validity: did the fault actually fire? ---- - # The fault injection prints: [MEGATRON_FAULT] global_rank=RANK/...: injecting FAULT_TYPE at iteration ITER + # ---- Check run validity: did the fault actually arm/fire? ---- + # The fault injector prints: + # [timestamp] FAULT INJECTION: Rank R will inject fault TYPE at timestamp RUN_VALID="false" STRIPPED_LOG="" if [[ -n "${LOG_FILE}" && -f "${LOG_FILE}" ]]; then echo " log: ${LOG_FILE}" - if grep -qF "[MEGATRON_FAULT]" "${LOG_FILE}" 2>/dev/null; then + if grep -q "FAULT INJECTION" "${LOG_FILE}" 2>/dev/null; then RUN_VALID="true" fi echo " run_valid: ${RUN_VALID}" # Strip fault-injection markers so neither nvrx_logsage nor the judge # can see which rank/fault was injected — evaluation must be fair. - # [MEGATRON_FAULT] lines are printed by Megatron's debug_fault_injection.py - # and are not covered by --exclude_nvrx_logs. 
+ # This removes: + # - scheduler lines from megatron.core.fault_injector ("FAULT INJECTION") + # - direct fault-tool log lines from nvidia_resiliency_ext.shared_utils.inject_fault STRIPPED_LOG=$(mktemp /tmp/fi_log_stripped.XXXXXX) - grep -vF "[MEGATRON_FAULT]" "${LOG_FILE}" > "${STRIPPED_LOG}" 2>/dev/null || true + strip_injection_markers "${LOG_FILE}" "${STRIPPED_LOG}" # nvrx_logsage.py prints 5 newline-joined fields to stdout: # line 1: restart_decision diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/workloads.conf b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/workloads.conf index 7cea1674..dcc1dc62 100644 --- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/workloads.conf +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/workloads.conf @@ -5,7 +5,9 @@ # # NAME : identifier passed as WORKLOAD= to prepare_node_alloc.sh # SCRIPT : path to the sbatch job script (relative to the scripts/ dir) -# BASE_EXPERIMENTS_DIR : root directory for all experiment output (logs, checkpoints, etc.) +# BASE_EXPERIMENTS_DIR : root directory for all experiment output (logs, checkpoints, etc.); +# "-" means use BASE_EXPERIMENTS_DIR from the environment or +# prepare_node_alloc.sh default # DESCRIPTION : free-form human-readable label (no spaces; use underscores) # POOL_FILE : (optional) pool file under scripts/pools/ to use as default pool # when POOL env var is not set; "-" means use the built-in default pool @@ -14,4 +16,4 @@ # # Fields are whitespace-separated. Lines starting with # are ignored. 
-llama4_scout l4_gb200_reduced.sh /home/sbak/experiments/llama4-scout-gb200 Llama4-Scout_(reduced_layers)_on_GB200 - - +llama4_scout l4_gb200_reduced.sh - Llama4-Scout_(reduced_layers)_on_GB200 - - From f73ff8b16bc6eecad5766f70e8c1fdc2041aa668 Mon Sep 17 00:00:00 2001 From: Seonmyeong Bak Date: Thu, 23 Apr 2026 16:04:32 -0700 Subject: [PATCH 04/21] chore(skills): add slurm defaults template --- .../skills/nvrx-attr/scripts/slurm.conf | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/slurm.conf diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/slurm.conf b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/slurm.conf new file mode 100644 index 00000000..764003dc --- /dev/null +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/slurm.conf @@ -0,0 +1,11 @@ +# Optional site-specific Slurm defaults for nvrx-attr scripts. +# +# This file is sourced by prepare_node_alloc.sh. Environment variables still +# take precedence, so you can override these per invocation: +# +# ACCOUNT=myacct PARTITION=gpu bash scripts/prepare_node_alloc.sh +# +# Leave values empty to rely on the cluster's default account / partition. 
+ +ACCOUNT="" +PARTITION="" From 2275c6a6b33f04b832614a3be3a24bef915ebd57 Mon Sep 17 00:00:00 2001 From: Seonmyeong Bak Date: Thu, 23 Apr 2026 21:56:08 -0700 Subject: [PATCH 05/21] feat(skills): add local env support for fault loop --- .gitignore | 1 + .../nvrx-attr/fault-injection-loop/SKILL.md | 55 ++++++++++++++++--- .../nvrx-attr/scripts/l4_gb200_reduced.sh | 20 +++---- .../nvrx-attr/scripts/prepare_node_alloc.sh | 35 +++++++++++- .../skills/nvrx-attr/scripts/run_session.sh | 7 ++- 5 files changed, 97 insertions(+), 21 deletions(-) diff --git a/.gitignore b/.gitignore index a24dba38..28d90ebe 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,4 @@ ft_state.json *_pb2.pyi *_pb2_grpc.py .idea/ +src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/user.env diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md b/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md index 228d7ce1..0358f7ab 100644 --- a/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md @@ -92,18 +92,44 @@ To run a custom subset, override `POOL` before calling the script: POOL="GPU_SLEEP:0:5:2 GPU_SLEEP:1:5:2" bash scripts/prepare_node_alloc.sh ``` +## Local User Config + +Put cluster-specific settings in `scripts/user.env`. This file is sourced by +`run_session.sh`, `prepare_node_alloc.sh`, and `l4_gb200_reduced.sh`, and it is +intended to stay local and untracked. + +Recommended contents: + +```bash +PARTITION=gb-nvl-134-135 +BASE_EXPERIMENTS_DIR="${HOME}/nvrx-attr-experiments" +MEGATRON_REPO_HOST_PATH="${HOME}/megatron-lm-main" +SHARED_TMP_BASE_DIR="${HOME}/tmp" +WORKSPACE_HOST_PATH="${HOME}/tmp" +CONTAINER_IMAGE="nvcr.io/nvidia/nemo:26.04" +``` + +Use `user.env` for stable site defaults such as partition, container image, and +host paths. 
Use per-run environment overrides for experiment-specific controls +such as `POOL`, `WORKLOAD`, `BATCH_SIZE`, `FAULT_TYPE`, `FAULT_AT_ITER`, or +`FAULT_DELAY`. + Environment variables: | Variable | Default | Description | |---|---|---| | `WORKLOAD` | `llama4_scout` | Select a registered workload by name (see `scripts/workloads.conf`) | -| `ACCOUNT` | `root` | SLURM account | -| `PARTITION` | `gb-nvl-134-135` | SLURM partition | +| `ACCOUNT` | _(cluster default or `scripts/slurm.conf`)_ | SLURM account | +| `PARTITION` | _(cluster default or `scripts/slurm.conf`)_ | SLURM partition | | `GPUS_PER_NODE` | `4` | GPUs per node | | `TIME` | `00:30:00` | Per-job wall-clock limit | | `BATCH_SIZE` | `2` | Jobs submitted per round | | `POLL_INTERVAL` | `30` | Seconds between queue polls | -| `BASE_EXPERIMENTS_DIR` | _(from workloads.conf or `llama4-scout-gb200`)_ | Root for all output | +| `BASE_EXPERIMENTS_DIR` | `${HOME}/nvrx-attr-experiments` | Root for all output | +| `MEGATRON_REPO_HOST_PATH` | `${HOME}/megatron-lm-main` | Host path to the Megatron checkout mounted into the container | +| `SHARED_TMP_BASE_DIR` | `${HOME}/tmp` | Shared filesystem path used for cross-step coordination | +| `WORKSPACE_HOST_PATH` | `${HOME}/tmp` | Host path mounted at `/workspace` inside the container | +| `CONTAINER_IMAGE` | `nvcr.io/nvidia/nemo:26.04` | Container image used by the workload script | | `SBATCH_SCRIPT` | `scripts/l4_gb200_reduced.sh` | Job script to submit | | `POOL` | _(default pool above)_ | Space-separated experiment triplets | @@ -111,7 +137,7 @@ Environment variables: | Name | Script | Base dir | Description | |---|---|---|---| -| `llama4_scout` | `l4_gb200_reduced.sh` | `.../llama4-scout-gb200` | Llama4-Scout (reduced layers) on GB200 | +| `llama4_scout` | `l4_gb200_reduced.sh` | `${HOME}/nvrx-attr-experiments` | Llama4-Scout (reduced layers) on GB200 | ```bash # Run the full pool against the validated example workload @@ -187,7 +213,7 @@ To also run the sub-skills 
interactively for a single experiment: ## Step 4 — Score Each Experiment -Scoring is performed by `scripts/score_attribution.py`, an LLM judge (Sonnet or Opus) that +Scoring is performed by `scripts/score_attribution.py`, an LLM judge that receives the ground truth, the filtered raw log, the logsage attribution output, and the FR analysis output, then returns structured JSON scores with a reasoning note. @@ -292,7 +318,7 @@ Required changes for a custom workload script: `${EXPERIMENT_DIR}/logs/slurm/${SLURM_JOB_ID}.*.1.main_workload.log` so `watch_and_analyze.sh` can find it. 3. Write NCCL flight-recorder dumps under `${EXPERIMENT_DIR}/checkpoints/`. -4. Emit a `[MEGATRON_FAULT] ...` marker when the fault is injected. +4. Emit a fault-injection marker when the fault is injected. `watch_and_analyze.sh` uses this to decide whether the run reached the injection point. 5. Preserve the per-experiment directory layout: `logs/slurm/`, `checkpoints/`, and `tensorboard/`. @@ -308,11 +334,12 @@ The example `SBATCH_SCRIPT` reads these env vars from `prepare_node_alloc.sh` vi | Variable | Default | Description | |---|---|---| | `FAULT_AT_ITER` | `5` | Training iteration at which to inject | +| `FAULT_DELAY` | `15` | Delay in seconds before fault injection after the iteration anchor | | `FAULT_RANK` | `1` | Global rank to inject `[0, total_ranks)` | | `FAULT_TYPE` | `GPU_SLEEP` | Megatron fault type enum name | | `GPUS_PER_NODE` | `4` | GPUs per node (used to compute `TOTAL_TASKS`) | | `EXPERIMENT_DIR` | `${BASE_EXPERIMENTS_DIR}/fault_injection/n${SLURM_NNODES}_${FAULT_TYPE}_r${FAULT_RANK}_i${FAULT_AT_ITER}` | Per-experiment output root | -| `BASE_EXPERIMENTS_DIR` | `/home/sbak/experiments/llama4-scout-gb200` | Shared root (datacache, triton/inductor caches) | +| `BASE_EXPERIMENTS_DIR` | `${HOME}/nvrx-attr-experiments` | Shared root (datacache, triton/inductor caches) | Valid `FAULT_TYPE` values: `GPU_ERROR`, `GPU_SLEEP`, `WORKLOAD_EXC`, `ASYNC_EXC`, `SIGNAL_EXC`, `OS_ABORT`, 
@@ -324,13 +351,23 @@ Valid `FAULT_TYPE` values: ```bash # Manual runs land under fault_injection/manual/ by default (no session dir needed) -EXPERIMENT_DIR=/home/sbak/experiments/llama4-scout-gb200/fault_injection/manual/n2_GPU_SLEEP_r1_i5 +EXPERIMENT_DIR=${HOME}/nvrx-attr-experiments/fault_injection/manual/n2_GPU_SLEEP_r1_i5 mkdir -p ${EXPERIMENT_DIR}/logs/slurm ${EXPERIMENT_DIR}/checkpoints ${EXPERIMENT_DIR}/tensorboard sbatch \ --nodes=2 \ --output=${EXPERIMENT_DIR}/logs/slurm/%j.launch.out \ --error=${EXPERIMENT_DIR}/logs/slurm/%j.launch.err \ - --export=ALL,FAULT_TYPE=GPU_SLEEP,FAULT_RANK=1,FAULT_AT_ITER=5,GPUS_PER_NODE=4,EXPERIMENT_DIR=${EXPERIMENT_DIR} \ + --export=ALL,FAULT_TYPE=GPU_SLEEP,FAULT_RANK=1,FAULT_AT_ITER=5,FAULT_DELAY=15,GPUS_PER_NODE=4,EXPERIMENT_DIR=${EXPERIMENT_DIR} \ scripts/l4_gb200_reduced.sh ``` + +Optional site-specific cleanup: + +```bash +export CONTAINER_CLEANUP_CMD=' +ENROOT_DIR="/var/lib/enroot/data/$(id -u)" +rm -rf "${ENROOT_DIR:?}"/* 2>/dev/null || true +echo "$(hostname): / $(df -h / | tail -1 | awk "{print \$3\" used, \"\$4\" avail\"}")" +' +``` diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/l4_gb200_reduced.sh b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/l4_gb200_reduced.sh index 0c87a30d..a6e99b47 100644 --- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/l4_gb200_reduced.sh +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/l4_gb200_reduced.sh @@ -18,8 +18,13 @@ #SBATCH --mem=0 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +USER_ENV_FILE="${SCRIPT_DIR}/user.env" NVRX_SRC_ROOT_DEFAULT="$(cd "${SCRIPT_DIR}/../../../.." && pwd)" NVRX_REPO_ROOT_DEFAULT="$(cd "${SCRIPT_DIR}/../../../../.." 
&& pwd)" +if [[ -f "${USER_ENV_FILE}" ]]; then + # shellcheck disable=SC1090 + source "${USER_ENV_FILE}" +fi log_msg() { local msg="$1" @@ -63,7 +68,7 @@ export TORCH_NCCL_EXTRA_DUMP_ON_EXEC=1 # - FAULT_AT_ITER anchors the fault-delay timer after iteration N completes # - FAULT_DELAY is the delay in seconds from that anchor (or from training start if unset) export FAULT_AT_ITER="${FAULT_AT_ITER:-5}" -export FAULT_DELAY="${FAULT_DELAY:-}" +export FAULT_DELAY="${FAULT_DELAY:-15}" export FAULT_RANK="${FAULT_RANK:-1}" export FAULT_TYPE="${FAULT_TYPE:-GPU_SLEEP}" export ENABLE_FAULT_INJECTION="${ENABLE_FAULT_INJECTION:-1}" @@ -79,7 +84,7 @@ export NFS_INDUCTOR_CACHE="${NFS_INDUCTOR_CACHE:-}" # USE_ASYNC_CKPT=1: enable async checkpointing every CKPT_SAVE_INTERVAL iters export USE_ASYNC_CKPT="${USE_ASYNC_CKPT:-0}" export CKPT_SAVE_INTERVAL="${CKPT_SAVE_INTERVAL:-100}" -export ENABLE_ENROOT_CLEANUP="${ENABLE_ENROOT_CLEANUP:-0}" +export CONTAINER_CLEANUP_CMD="${CONTAINER_CLEANUP_CMD:-}" # Node / task geometry (SLURM_NNODES is set by SLURM from --nodes override) export GPUS_PER_NODE="${GPUS_PER_NODE:-4}" @@ -163,8 +168,8 @@ if [[ -n "${CONTAINER_NAME}" ]]; then CONTAINER_ARGS+=(--container-name "${CONTAINER_NAME}") fi -# ── Disk cleanup: remove stale enroot containers from prior jobs ────────────── -if [[ "${ENABLE_ENROOT_CLEANUP}" == "1" ]]; then +# ── Optional site-specific container cleanup hook ────────────────────────────── +if [[ -n "${CONTAINER_CLEANUP_CMD}" ]]; then log_msg "START disk_cleanup" srun \ --label \ @@ -172,11 +177,7 @@ if [[ "${ENABLE_ENROOT_CLEANUP}" == "1" ]]; then --ntasks=${SLURM_NNODES} \ --kill-on-bad-exit=0 \ --mpi=none \ - bash -c ' - ENROOT_DIR="/var/lib/enroot/data/$(id -u)" - rm -rf "${ENROOT_DIR:?}"/* 2>/dev/null || true - echo "$(hostname): / $(df -h / | tail -1 | awk "{print \$3\" used, \"\$4\" avail\"}")" - ' + bash -lc "${CONTAINER_CLEANUP_CMD}" log_msg "END disk_cleanup" else log_msg "SKIP disk_cleanup" @@ -313,7 +314,6 @@ PY 
--no-mmap-bin-files \ --tokenizer-type NullTokenizer \ --tiktoken-pattern v2 \ - --tokenizer-model /lustre/fsw/portfolios/llmservice/projects/llmservice_nlp_fm/nemotron6/tokenizers/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json \ --micro-batch-size 1 \ --global-batch-size 64 \ --train-samples 10240000 \ diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/prepare_node_alloc.sh b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/prepare_node_alloc.sh index 8b8d7b01..67d80be1 100755 --- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/prepare_node_alloc.sh +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/prepare_node_alloc.sh @@ -26,11 +26,44 @@ set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" WORKLOADS_CONF="${SCRIPT_DIR}/workloads.conf" SLURM_DEFAULTS_CONF="${SCRIPT_DIR}/slurm.conf" +USER_ENV_FILE="${SCRIPT_DIR}/user.env" +ACCOUNT_FROM_ENV="${ACCOUNT-}" +PARTITION_FROM_ENV="${PARTITION-}" +BASE_EXPERIMENTS_DIR_FROM_ENV="${BASE_EXPERIMENTS_DIR-}" +MEGATRON_REPO_HOST_PATH_FROM_ENV="${MEGATRON_REPO_HOST_PATH-}" +CONTAINER_IMAGE_FROM_ENV="${CONTAINER_IMAGE-}" +SHARED_TMP_BASE_DIR_FROM_ENV="${SHARED_TMP_BASE_DIR-}" +WORKSPACE_HOST_PATH_FROM_ENV="${WORKSPACE_HOST_PATH-}" if [[ -f "${SLURM_DEFAULTS_CONF}" ]]; then # shellcheck disable=SC1090 source "${SLURM_DEFAULTS_CONF}" fi +if [[ -f "${USER_ENV_FILE}" ]]; then + # shellcheck disable=SC1090 + source "${USER_ENV_FILE}" +fi +if [[ -n "${ACCOUNT_FROM_ENV}" ]]; then + ACCOUNT="${ACCOUNT_FROM_ENV}" +fi +if [[ -n "${PARTITION_FROM_ENV}" ]]; then + PARTITION="${PARTITION_FROM_ENV}" +fi +if [[ -n "${BASE_EXPERIMENTS_DIR_FROM_ENV}" ]]; then + BASE_EXPERIMENTS_DIR="${BASE_EXPERIMENTS_DIR_FROM_ENV}" +fi +if [[ -n "${MEGATRON_REPO_HOST_PATH_FROM_ENV}" ]]; then + MEGATRON_REPO_HOST_PATH="${MEGATRON_REPO_HOST_PATH_FROM_ENV}" +fi +if [[ -n "${CONTAINER_IMAGE_FROM_ENV}" ]]; then + CONTAINER_IMAGE="${CONTAINER_IMAGE_FROM_ENV}" +fi +if [[ -n "${SHARED_TMP_BASE_DIR_FROM_ENV}" ]]; then + 
SHARED_TMP_BASE_DIR="${SHARED_TMP_BASE_DIR_FROM_ENV}" +fi +if [[ -n "${WORKSPACE_HOST_PATH_FROM_ENV}" ]]; then + WORKSPACE_HOST_PATH="${WORKSPACE_HOST_PATH_FROM_ENV}" +fi # ── Workload resolution from workloads.conf ──────────────────────────────────── # If WORKLOAD is set, look it up in workloads.conf and derive SBATCH_SCRIPT and @@ -148,7 +181,7 @@ submit_one() { --mem=0 \ --output="${EXPERIMENT_DIR}/logs/slurm/%j.launch.out" \ --error="${EXPERIMENT_DIR}/logs/slurm/%j.launch.err" \ - --export=ALL,FAULT_TYPE="${FAULT_TYPE}",FAULT_RANK="${RANK}",FAULT_AT_ITER="${ITER}",GPUS_PER_NODE="${GPUS_PER_NODE}",EXPERIMENT_DIR="${EXPERIMENT_DIR}",BASE_EXPERIMENTS_DIR="${BASE_EXPERIMENTS_DIR}" \ + --export=ALL,FAULT_TYPE="${FAULT_TYPE}",FAULT_RANK="${RANK}",FAULT_AT_ITER="${ITER}",GPUS_PER_NODE="${GPUS_PER_NODE}",EXPERIMENT_DIR="${EXPERIMENT_DIR}",BASE_EXPERIMENTS_DIR="${BASE_EXPERIMENTS_DIR}",MEGATRON_REPO_HOST_PATH="${MEGATRON_REPO_HOST_PATH:-}",CONTAINER_IMAGE="${CONTAINER_IMAGE:-}",SHARED_TMP_BASE_DIR="${SHARED_TMP_BASE_DIR:-}",WORKSPACE_HOST_PATH="${WORKSPACE_HOST_PATH:-}" \ --parsable ) if [[ -n "${ACCOUNT}" ]]; then diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/run_session.sh b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/run_session.sh index ca5251bc..a8145d6c 100755 --- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/run_session.sh +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/run_session.sh @@ -11,6 +11,11 @@ set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +USER_ENV_FILE="${SCRIPT_DIR}/user.env" +if [[ -f "${USER_ENV_FILE}" ]]; then + # shellcheck disable=SC1090 + source "${USER_ENV_FILE}" +fi WORKLOAD="${WORKLOAD:-llama4_scout}" # ---- Phase 1: submit and wait for all experiments ---- @@ -22,7 +27,7 @@ WORKLOAD="${WORKLOAD}" bash "${SCRIPT_DIR}/prepare_node_alloc.sh" # prepare_node_alloc.sh prints the tracking file path; re-derive it the same way # (SESSION_TAG is the timestamp when prepare_node_alloc ran, which 
is a few seconds # before this line — find the newest session dir instead of recomputing the tag) -BASE_EXPERIMENTS_DIR="${BASE_EXPERIMENTS_DIR:-/home/sbak/experiments/llama4-scout-gb200}" +BASE_EXPERIMENTS_DIR="${BASE_EXPERIMENTS_DIR:-${HOME}/nvrx-attr-experiments}" TRACKING_FILE=$(ls -td "${BASE_EXPERIMENTS_DIR}/fault_injection"/[0-9]* 2>/dev/null \ | head -1)/experiments.tsv From 3f2016d3385ee116b808285ba6e6542d25466c18 Mon Sep 17 00:00:00 2001 From: Seonmyeong Bak Date: Thu, 23 Apr 2026 22:04:47 -0700 Subject: [PATCH 06/21] chore(skills): reduce torch cpp log verbosity --- .../skills/nvrx-attr/scripts/l4_gb200_reduced.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/l4_gb200_reduced.sh b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/l4_gb200_reduced.sh index a6e99b47..f91f99ce 100644 --- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/l4_gb200_reduced.sh +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/l4_gb200_reduced.sh @@ -51,7 +51,7 @@ export ONE_LOGGER_JOB_CATEGORY=test export LOGLEVEL=DEBUG export TORCHINDUCTOR_WORKER_START=fork export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True -export TORCH_CPP_LOG_LEVEL=INFO +export TORCH_CPP_LOG_LEVEL=WARNING export TORCH_NCCL_TRACE_BUFFER_SIZE=2000 export TORCH_NCCL_RETHROW_CUDA_ERRORS=0 export TORCH_NCCL_ENABLE_MONITORING=1 From 43e39f74f6a835edc9a663ffd569dda6bfa090fc Mon Sep 17 00:00:00 2001 From: Seonmyeong Bak Date: Thu, 23 Apr 2026 22:20:51 -0700 Subject: [PATCH 07/21] style(skills): format changed python files --- .../attribution/log_analyzer/nvrx_logsage.py | 8 ++-- .../nvrx-attr/scripts/score_attribution.py | 47 ++++++++++++------- 2 files changed, 33 insertions(+), 22 deletions(-) diff --git a/src/nvidia_resiliency_ext/attribution/log_analyzer/nvrx_logsage.py b/src/nvidia_resiliency_ext/attribution/log_analyzer/nvrx_logsage.py index 4ae1653b..be5aa7f1 100644 --- 
a/src/nvidia_resiliency_ext/attribution/log_analyzer/nvrx_logsage.py +++ b/src/nvidia_resiliency_ext/attribution/log_analyzer/nvrx_logsage.py @@ -123,7 +123,9 @@ def _finished_status_name(status: Any) -> str: return getattr(status, "name", status) -def _sleep_with_backoff(attempt: int, retries: int, backoff: float, max_backoff: float, jitter: float) -> float: +def _sleep_with_backoff( + attempt: int, retries: int, backoff: float, max_backoff: float, jitter: float +) -> float: sleep_for = min(backoff, max_backoff) + random.uniform(0.0, jitter) logger.info( "Retrying log-analysis LLM in %.2fs after attempt %d/%d", @@ -169,9 +171,7 @@ def _with_exponential_backoff(llm_call, checkpoint_saved: bool) -> tuple[str, st for attempt in range(1, retries + 1): try: result = llm_call() - if result and not any( - field == LOGSAGE_LLM_ENDPOINT_FAILED for field in result[:4] - ): + if result and not any(field == LOGSAGE_LLM_ENDPOINT_FAILED for field in result[:4]): return result last_error = LOGSAGE_LLM_ENDPOINT_FAILED except Exception as exc: diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/score_attribution.py b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/score_attribution.py index 4d36ad66..8588f018 100644 --- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/score_attribution.py +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/score_attribution.py @@ -23,7 +23,6 @@ import logging import os import sys -from typing import Union from langchain_openai import ChatOpenAI @@ -43,18 +42,27 @@ # Expected restart decision and rationale per fault type _RESTART_TABLE = { - "GPU_SLEEP": ("RESTART IMMEDIATE", "transient GPU hang, recoverable"), - "LOCK_GIL": ("RESTART IMMEDIATE", "transient Python GIL hang, recoverable"), - "SIGTERM": ("RESTART IMMEDIATE", "external termination signal, recoverable"), - "SIGINT": ("RESTART IMMEDIATE", "external interrupt signal, recoverable"), - "SIGSTOP": ("RESTART IMMEDIATE", "external stop signal, recoverable"), - 
"SIGNAL_EXC": ("RESTART IMMEDIATE", "signal-based exception, typically recoverable"), - "GPU_ERROR": ("STOP - DONT RESTART IMMEDIATE", "hardware GPU error, may be persistent"), - "SIGKILL": ("STOP - DONT RESTART IMMEDIATE", "hard kill, possible external pressure or OOM"), - "SEGFAULT": ("STOP - DONT RESTART IMMEDIATE", "segmentation fault, likely code or memory corruption"), - "OS_ABORT": ("STOP - DONT RESTART IMMEDIATE", "OS abort, likely severe system or hardware fault"), + "GPU_SLEEP": ("RESTART IMMEDIATE", "transient GPU hang, recoverable"), + "LOCK_GIL": ("RESTART IMMEDIATE", "transient Python GIL hang, recoverable"), + "SIGTERM": ("RESTART IMMEDIATE", "external termination signal, recoverable"), + "SIGINT": ("RESTART IMMEDIATE", "external interrupt signal, recoverable"), + "SIGSTOP": ("RESTART IMMEDIATE", "external stop signal, recoverable"), + "SIGNAL_EXC": ("RESTART IMMEDIATE", "signal-based exception, typically recoverable"), + "GPU_ERROR": ("STOP - DONT RESTART IMMEDIATE", "hardware GPU error, may be persistent"), + "SIGKILL": ("STOP - DONT RESTART IMMEDIATE", "hard kill, possible external pressure or OOM"), + "SEGFAULT": ( + "STOP - DONT RESTART IMMEDIATE", + "segmentation fault, likely code or memory corruption", + ), + "OS_ABORT": ( + "STOP - DONT RESTART IMMEDIATE", + "OS abort, likely severe system or hardware fault", + ), "WORKLOAD_EXC": ("STOP - DONT RESTART IMMEDIATE", "application exception, likely a code bug"), - "ASYNC_EXC": ("STOP - DONT RESTART IMMEDIATE", "async exception in workload, likely a code bug"), + "ASYNC_EXC": ( + "STOP - DONT RESTART IMMEDIATE", + "async exception in workload, likely a code bug", + ), } @@ -85,7 +93,9 @@ def load_log_excerpt(log_path, max_lines=400): return f"(could not read log file: {exc})" -def build_judge_prompt(fault_type, rank, iter_, nodes, run_valid, log_output, fr_output, log_excerpt): +def build_judge_prompt( + fault_type, rank, iter_, nodes, run_valid, log_output, fr_output, log_excerpt +): total_ranks 
= nodes * 4 # GPUS_PER_NODE=4 in the example SBATCH_SCRIPT expected_restart, restart_rationale = _RESTART_TABLE.get( fault_type, ("unknown", "unknown fault type") @@ -217,9 +227,7 @@ def score(args): # Strip markdown code fences if present if text.startswith("```"): lines = text.splitlines() - text = "\n".join( - line for line in lines if not line.startswith("```") - ).strip() + text = "\n".join(line for line in lines if not line.startswith("```")).strip() result = json.loads(text) return result @@ -231,8 +239,11 @@ def main(): parser.add_argument("--rank", type=int, required=True, help="Injected global rank") parser.add_argument("--iter", type=int, required=True, help="Injected iteration") parser.add_argument("--nodes", type=int, required=True, help="Node count") - parser.add_argument("--run-valid", default="true", - help="'true' if training reached the fault injection point, 'false' otherwise") + parser.add_argument( + "--run-valid", + default="true", + help="'true' if training reached the fault injection point, 'false' otherwise", + ) parser.add_argument("--log-path", default="", help="Path to the raw job log file") parser.add_argument("--log-output", default="", help="Raw stdout from nvrx_logsage") parser.add_argument("--fr-output", default="no_dumps", help="Raw text from CollectiveAnalyzer") From 251df4eb7039011d9951876716ea6934dbb4c5b3 Mon Sep 17 00:00:00 2001 From: Seonmyeong Bak Date: Fri, 24 Apr 2026 11:54:21 -0700 Subject: [PATCH 08/21] fix(skills): wire feedback-loop analysis outputs --- .../attribution/log_analyzer/nvrx_logsage.py | 15 +++- .../trace_analyzer/fr_attribution.py | 87 ++++++++++++++----- .../nvrx-attr/fault-injection-loop/SKILL.md | 18 ++-- .../skills/nvrx-attr/fr-analysis/SKILL.md | 18 ++-- .../skills/nvrx-attr/scripts/user.env.example | 25 ++++++ .../nvrx-attr/scripts/watch_and_analyze.sh | 51 +++++------ 6 files changed, 147 insertions(+), 67 deletions(-) create mode 100644 
src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/user.env.example diff --git a/src/nvidia_resiliency_ext/attribution/log_analyzer/nvrx_logsage.py b/src/nvidia_resiliency_ext/attribution/log_analyzer/nvrx_logsage.py index be5aa7f1..0340a6b3 100644 --- a/src/nvidia_resiliency_ext/attribution/log_analyzer/nvrx_logsage.py +++ b/src/nvidia_resiliency_ext/attribution/log_analyzer/nvrx_logsage.py @@ -461,11 +461,24 @@ def main(): action='store_true', help='Input is already per-cycle data (skip filtering and chunking)', ) + parser.add_argument( + '--emit-stdout', + action='store_true', + help='Print final attribution payload to stdout for machine consumers', + ) args = parser.parse_args() analyzer = NVRxLogAnalyzer(args) - analyzer.run_sync(args) + results = analyzer.run_sync(args) + + if args.emit_stdout: + for result in results: + if not result: + continue + payload = result[0] if isinstance(result, tuple) else result + if payload: + print(payload) if __name__ == "__main__": diff --git a/src/nvidia_resiliency_ext/attribution/trace_analyzer/fr_attribution.py b/src/nvidia_resiliency_ext/attribution/trace_analyzer/fr_attribution.py index a5d06560..f4584cb4 100644 --- a/src/nvidia_resiliency_ext/attribution/trace_analyzer/fr_attribution.py +++ b/src/nvidia_resiliency_ext/attribution/trace_analyzer/fr_attribution.py @@ -34,6 +34,42 @@ def eprint(*args, **kwargs): print(*args, file=sys.stderr, **kwargs) +def _parse_rank_list(rank_text: str) -> List[int]: + ranks = [] + for token in rank_text.split(','): + token = token.strip() + if not token: + continue + try: + ranks.append(int(token)) + except ValueError: + continue + return ranks + + +def _extract_missing_ranks_from_table(text: str) -> List[int]: + hanging_ranks = set() + capture = False + + for line in text.splitlines(): + stripped = line.strip() + if not stripped: + continue + if stripped.startswith("PGID") and "Missing Ranks" in stripped: + capture = True + continue + if not capture or "|" not in stripped: + continue + 
+ columns = [col.strip() for col in stripped.split("|")] + if len(columns) < 6: + continue + for rank in _parse_rank_list(columns[-1]): + hanging_ranks.add(rank) + + return sorted(hanging_ranks) + + @dataclass class Collective: """ @@ -134,12 +170,7 @@ async def print_output(self, attribution_result: Optional[str]): hanging_ranks_str = hanging_ranks.group(1).strip() hanging_ranks_list = list(map(int, hanging_ranks_str.split(','))) else: - for idx, line in enumerate(text.split('\n')): - line_list = line.split('|') - if len(line_list) >= 5: - logger.info(line) - if idx >= 1: - hanging_ranks_list.append(line_list[5]) + hanging_ranks_list = _extract_missing_ranks_from_table(text) hanging_ranks = f"hanging ranks: {hanging_ranks_list}" # Dict form preserves collective table text for MCP clients and FRAnalysisResult parity. return ( @@ -218,20 +249,18 @@ def build_collectives_to_order(): # analyze collectives to find process groups with missing and completed ranks completed_pg, missing_pg = self.analyze_matches(verbose=bool(cfg.get("verbose"))) grouped_missing_pgs = {} - grouped_completed_pgs = {} # if the dump file contains health check results, parse the health check results # and print them in a format if cfg.get("health_check"): self.print_node_health_status(verbose=bool(cfg.get("verbose"))) - # group the process groups with missing and completed ranks - # by finding longest paths in the graph + # Group only process groups with missing ranks. + # Completed-rank summaries are not actionable for attribution and create + # misleading output in the feedback loop. 
grouped_missing_pgs = self.group_pgs(missing_pg) - if len(grouped_missing_pgs) == 0: - grouped_completed_pgs = self.group_pgs(completed_pg) - # gather the head node of each group with missing and completed ranks + # gather the head node of each group with missing ranks # the head node is the first node in the group # the missing ranks in the head node of the missing process groups # are considered to cause the other nodes in the group to hang @@ -242,16 +271,16 @@ def gather_head_nodes(grouped_pgs): return head_nodes head_nodes_missing = None - head_nodes_completed = None - # Gather the head node of each group + # Gather the head node of each missing-rank group. if len(grouped_missing_pgs) > 0: head_nodes_missing = gather_head_nodes(grouped_missing_pgs) logger.debug(f"head_nodes of missing_pg: {head_nodes_missing}") - else: - head_nodes_completed = gather_head_nodes(grouped_completed_pgs) - logger.debug(f"head_nodes of completed_pg: {head_nodes_completed}") # Print the analysis output - with capture_logs() as output: + original_level = logger.level + if logger.getEffectiveLevel() > logging.INFO: + logger.setLevel(logging.INFO) + + with capture_logs(logger.name) as output: def print_ranks_in_pgs(head_nodes, pg_dict, missing_or_completed="Missing"): logger.info( @@ -273,10 +302,8 @@ def print_ranks_in_pgs(head_nodes, pg_dict, missing_or_completed="Missing"): if head_nodes_missing: logger.debug(f"head_nodes_missing: {head_nodes_missing}") print_ranks_in_pgs(head_nodes_missing, missing_pg, "Missing") - # TODO: using this completed pg needs to be updated with new algorithm for isolation - if head_nodes_completed: - print_ranks_in_pgs(head_nodes_completed, completed_pg, "Completed") analysis_output = output.getvalue() + logger.setLevel(original_level) return analysis_output async def collective_analysis(self, analysis_output: str) -> Optional[str]: @@ -1117,7 +1144,7 @@ def main(): '--fr-path', type=str, help='Path to JSON files or directories containing JSON files' ) 
parser.add_argument( - '-p', '--pattern', default="*.json", help='File pattern to match (default: *.json)' + '-p', '--pattern', default="_dump_*", help='File pattern to match (default: _dump_*)' ) parser.add_argument('-v', '--verbose', action='store_true', help='verbose output') parser.add_argument( @@ -1143,11 +1170,25 @@ def main(): action='store_true', help='Convert the trace file to json file, if the trace is binary, for debugging', ) + parser.add_argument( + '--emit-stdout', + action='store_true', + help='Print final FR summary table to stdout for machine consumers', + ) args = parser.parse_args() analyzer = CollectiveAnalyzer(args) - analyzer.run_sync(args) + result = analyzer.run_sync(args) + + if args.emit_stdout and isinstance(result, tuple) and result: + payload = result[0] + if isinstance(payload, dict): + text = payload.get("analysis_text", "") + if text: + print(text) + elif payload: + print(payload) if __name__ == "__main__": diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md b/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md index 0358f7ab..879cbe31 100644 --- a/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md @@ -94,9 +94,15 @@ POOL="GPU_SLEEP:0:5:2 GPU_SLEEP:1:5:2" bash scripts/prepare_node_alloc.sh ## Local User Config -Put cluster-specific settings in `scripts/user.env`. This file is sourced by -`run_session.sh`, `prepare_node_alloc.sh`, and `l4_gb200_reduced.sh`, and it is -intended to stay local and untracked. +Start from the tracked template: + +```bash +cp scripts/user.env.example scripts/user.env +``` + +Then edit `scripts/user.env` with cluster-specific settings. This file is +sourced by `run_session.sh`, `prepare_node_alloc.sh`, and +`l4_gb200_reduced.sh`, and it is intended to stay local and untracked. Recommended contents: @@ -198,7 +204,7 @@ The watcher: 1. 
Reads each row from the tracking TSV 2. Calls `nvrx_logsage.py --exclude_nvrx_logs` and parses the text output to get `restart_decision` and `attribution_text` -3. Calls `CollectiveAnalyzer` from `fr_attribution.py` to get suspect ranks +3. Calls FR analysis as `python -m nvidia_resiliency_ext.attribution.trace_analyzer.fr_attribution --fr-path "${EXPERIMENT_DIR}/checkpoints" -p "_dump_*"` and passes the raw table output to the judge 4. Scores 7 dimensions (restart correctness, rank primary, rank any, category, type, FR rank) 5. Appends a scored row to `_report.md` 6. Repeats until all experiments are analyzed @@ -206,7 +212,7 @@ The watcher: To also run the sub-skills interactively for a single experiment: ```bash /log-analysis --log-path "${EXPERIMENT_DIR}/logs/slurm/${JOB_ID}.*.1.main_workload.log" -/fr-analysis --fr-path "${EXPERIMENT_DIR}/checkpoints/" +/fr-analysis --fr-path "${EXPERIMENT_DIR}/checkpoints" -p "_dump_*" ``` --- @@ -231,7 +237,7 @@ The judge is given: 2. Expected restart decision + rationale (derived from `score_attribution.py:_RESTART_TABLE`) 3. Filtered raw log (last 400 lines, same `exclude_nvrx_logs` filtering as logsage) 4. Raw logsage stdout (5-field text format) -5. Raw CollectiveAnalyzer text output +5. Raw FR analysis table output from `fr_attribution.py --fr-path ... -p "_dump_*"` Default judge model: `qwen/qwen3.5-397b-a17b`. Override with `--model` in `score_attribution.py`. diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/fr-analysis/SKILL.md b/src/nvidia_resiliency_ext/skills/nvrx-attr/fr-analysis/SKILL.md index d07911ec..17cc7de5 100644 --- a/src/nvidia_resiliency_ext/skills/nvrx-attr/fr-analysis/SKILL.md +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/fr-analysis/SKILL.md @@ -22,7 +22,7 @@ and isolate the ranks responsible, using `CollectiveAnalyzer`. ## What it does -1. Loads all FR dump files (JSON or binary pickle) matching a glob pattern under `--fr-path`. +1. 
Loads all FR dump files matching a glob pattern under `--fr-path`. 2. Parses each dump into `Collective` records (op type, ranks, process group, timing, state). 3. Groups collectives by process group and sequence ID across ranks to detect mismatches. 4. Identifies the **wavefront** — the process group boundary where collectives diverge — and @@ -37,7 +37,7 @@ and isolate the ranks responsible, using `CollectiveAnalyzer`. ```bash python scripts/fr_attribution.py \ --fr-path /path/to/fr_dumps/ \ - [--pattern "*.json"] \ + [-p "_dump_*"] \ [--verbose] \ [--health-check] \ [--llm-analyze] \ @@ -48,7 +48,7 @@ python scripts/fr_attribution.py \ | Flag | Default | Description | |------|---------|-------------| | `--fr-path` | required | Path to a directory (or single file) containing FR dump files | -| `--pattern` | `*.json` | Glob pattern for dump files within `--fr-path` | +| `--pattern`, `-p` | `_dump_*` | Glob pattern for dump files within `--fr-path` | | `--verbose`, `-v` | off | Print detailed per-rank collective tables | | `--health-check`, `-c` | off | Include node health check results in output | | `--llm-analyze`, `-l` | off | Pass structured findings to the LLM for a narrative summary | @@ -64,7 +64,7 @@ from nvidia_resiliency_ext.attribution.trace_analyzer.fr_attribution import Coll analyzer = CollectiveAnalyzer({ "fr_path": "/path/to/fr_dumps/", - "pattern": "*.json", + "pattern": "_dump_*", "verbose": False, "health_check": False, "llm_analyze": False, @@ -80,10 +80,10 @@ results = analyzer.run_sync({ ## Output -Returns `(text, AttributionState)` pairs where `text` describes: +Returns `(text, AttributionState)` pairs where `text` is the FR analysis table and describes: -- The **wavefront process group** where collectives diverged -- **Missing ranks** at the wavefront (root-cause suspects) +- The selected wavefront/front process group +- **Missing ranks** at that process group (root-cause suspects) - Per-rank collective status tables (when `--verbose`) - Node 
health summary (when `--health-check`) - LLM narrative (when `--llm-analyze`) @@ -97,8 +97,8 @@ may be restartable after isolating the identified ranks. | Format | Notes | |--------|-------| -| JSON (`.json`) | Standard PyTorch FR export; default glob pattern | -| Binary pickle | Detected automatically; use `--debug` to convert to JSON | +| `_dump_*` files | PyTorch FR dump prefix pattern used by the feedback loop | +| Binary pickle / JSON payloads | Detected automatically; use `--debug` to convert binary traces to JSON | FR dumps are typically written to the directory specified by `TORCH_NCCL_DEBUG_INFO_TEMP_FILE` or triggered automatically on NCCL timeout. diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/user.env.example b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/user.env.example new file mode 100644 index 00000000..273a488d --- /dev/null +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/user.env.example @@ -0,0 +1,25 @@ +# Local site settings for the nvrx-attr fault-injection scripts. +# Copy to `user.env` and adjust for your cluster and host paths. +# +# This file is sourced by: +# - run_session.sh +# - prepare_node_alloc.sh +# - l4_gb200_reduced.sh +# +# Per-run overrides can still be provided as environment variables when invoking +# the scripts. 
+ +# SLURM defaults +# ACCOUNT=myacct +# PARTITION=my-partition + +# Host paths +BASE_EXPERIMENTS_DIR="${HOME}/nvrx-attr-experiments" +MEGATRON_REPO_HOST_PATH="${HOME}/megatron-lm-main" +SHARED_TMP_BASE_DIR="${HOME}/tmp" +WORKSPACE_HOST_PATH="${HOME}/tmp" + +# Container settings +CONTAINER_IMAGE="nvcr.io/nvidia/nemo:26.04" +# CONTAINER_NAME= +# CONTAINER_WORKDIR=/ diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/watch_and_analyze.sh b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/watch_and_analyze.sh index 6b2a13c9..591af5ea 100755 --- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/watch_and_analyze.sh +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/watch_and_analyze.sh @@ -17,7 +17,13 @@ SKILL_DIR="$(dirname "${SCRIPT_DIR}")" NVRX_SRC_DIR="$(cd "${SKILL_DIR}/../../.." && pwd)" LOGSAGE_PY="${SKILL_DIR}/log-analysis/scripts/nvrx_logsage.py" +FR_ANALYSIS_MODULE="nvidia_resiliency_ext.attribution.trace_analyzer.fr_attribution" SCORE_PY="${SCRIPT_DIR}/score_attribution.py" +LOG_ANALYSIS_MODEL="${LOG_ANALYSIS_MODEL:-${NVRX_LLM_MODEL:-nvidia/nemotron-3-super-120b-a12b}}" +LOG_ANALYSIS_BASE_URL="${LOG_ANALYSIS_BASE_URL:-${NVRX_LLM_BASE_URL:-https://inference-api.nvidia.com}}" +JUDGE_MODEL="${JUDGE_MODEL:-qwen/qwen3.5-397b-a17b}" +JUDGE_BASE_URL="${JUDGE_BASE_URL:-https://inference-api.nvidia.com}" +FR_PATTERN="${FR_PATTERN:-_dump_*}" # Ensure nvidia_resiliency_ext is importable from source tree export PYTHONPATH="${NVRX_SRC_DIR}${PYTHONPATH:+:$PYTHONPATH}" @@ -110,6 +116,9 @@ while true; do # last line: checkpoint_saved ("True" / "False") LOG_OUT=$(python3 "${LOGSAGE_PY}" \ --log-path "${STRIPPED_LOG}" \ + --model "${LOG_ANALYSIS_MODEL}" \ + --base_url "${LOG_ANALYSIS_BASE_URL}" \ + --emit-stdout \ --exclude_nvrx_logs 2>/dev/null || echo "") LOG_RESTART=$(echo "${LOG_OUT}" | head -1) echo " restart_decision: ${LOG_RESTART:-}" @@ -122,33 +131,17 @@ while true; do FR_DIR="${EXPERIMENT_DIR}/checkpoints" FR_OUT="no_dumps" - if [[ 
"${RUN_VALID}" == "true" ]] && ls "${FR_DIR}"/_dump_* 2>/dev/null | grep -q .; then - echo " FR dumps: $(ls "${FR_DIR}"/_dump_* 2>/dev/null | wc -l) files" - FR_OUT=$(python3 -c " -import sys, logging -logging.basicConfig(level=logging.WARNING) -sys.path.insert(0, '${NVRX_SRC_DIR}') -from nvidia_resiliency_ext.attribution.trace_analyzer.fr_attribution import CollectiveAnalyzer -try: - ca = CollectiveAnalyzer({'fr_path': '${FR_DIR}'}) - results = ca.run_sync({'fr_path': '${FR_DIR}'}) - if results: - result_data = results[0] - if isinstance(result_data, dict): - text = result_data.get('analysis_text', '') - ranks = result_data.get('hanging_ranks', '') - if text: - print(text) - if ranks: - print(ranks) - else: - print(str(result_data)) - else: - print('no results') -except Exception as e: - print('error: ' + str(e), file=sys.stderr) - print('no_dumps') -" 2>/dev/null || echo "no_dumps") + if [[ "${RUN_VALID}" == "true" ]] && ls "${FR_DIR}"/${FR_PATTERN} 2>/dev/null | grep -q .; then + echo " FR dumps: $(ls "${FR_DIR}"/${FR_PATTERN} 2>/dev/null | wc -l) files" + # Use the FR CLI contract directly: + # --fr-path -p '_dump_*' + FR_OUT=$(python3 -m "${FR_ANALYSIS_MODULE}" \ + --fr-path "${FR_DIR}" \ + --emit-stdout \ + -p "${FR_PATTERN}" 2>/dev/null || echo "no_dumps") + if [[ -z "${FR_OUT}" ]]; then + FR_OUT="no_dumps" + fi elif [[ "${RUN_VALID}" == "false" ]]; then FR_OUT="run_invalid" echo " FR analysis skipped (run did not reach fault injection point)" @@ -164,7 +157,9 @@ except Exception as e: --run-valid "${RUN_VALID}" \ --log-path "${STRIPPED_LOG:-}" \ --log-output "${LOG_OUT}" \ - --fr-output "${FR_OUT}" 2>/dev/null || echo '{"notes":"judge_failed"}') + --fr-output "${FR_OUT}" \ + --model "${JUDGE_MODEL}" \ + --base-url "${JUDGE_BASE_URL}" 2>/dev/null || echo '{"notes":"judge_failed"}') # Clean up temp stripped log [[ -n "${STRIPPED_LOG}" && -f "${STRIPPED_LOG}" ]] && rm -f "${STRIPPED_LOG}" From 3b791bc3f5d5622b3f946cdede3931c8b3cdf5f9 Mon Sep 17 00:00:00 2001 
From: Seonmyeong Bak Date: Fri, 24 Apr 2026 12:39:24 -0700 Subject: [PATCH 09/21] fix(skills): refine feedback-loop scoring config --- .../nvrx-attr/fault-injection-loop/SKILL.md | 35 ++++++++++--- .../nvrx-attr/scripts/score_attribution.py | 51 +++++++++++++++++-- .../skills/nvrx-attr/scripts/user.env.example | 10 ++++ .../nvrx-attr/scripts/watch_and_analyze.sh | 9 +++- 4 files changed, 92 insertions(+), 13 deletions(-) diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md b/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md index 879cbe31..857ba4f3 100644 --- a/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md @@ -101,7 +101,7 @@ cp scripts/user.env.example scripts/user.env ``` Then edit `scripts/user.env` with cluster-specific settings. This file is -sourced by `run_session.sh`, `prepare_node_alloc.sh`, and +sourced by `run_session.sh`, `prepare_node_alloc.sh`, `watch_and_analyze.sh`, and `l4_gb200_reduced.sh`, and it is intended to stay local and untracked. Recommended contents: @@ -113,12 +113,20 @@ MEGATRON_REPO_HOST_PATH="${HOME}/megatron-lm-main" SHARED_TMP_BASE_DIR="${HOME}/tmp" WORKSPACE_HOST_PATH="${HOME}/tmp" CONTAINER_IMAGE="nvcr.io/nvidia/nemo:26.04" +NVIDIA_API_KEY_FILE="${HOME}/.nvidia_api_key" +JUDGE_API_KEY_FILE="${HOME}/.nvidia_api_key" +NVRX_LLM_MODEL="nvidia/nemotron-3-super-120b-a12b" +NVRX_LLM_BASE_URL="https://integrate.api.nvidia.com/v1" +JUDGE_MODEL="qwen/qwen3.5-397b-a17b" +JUDGE_BASE_URL="https://integrate.api.nvidia.com/v1" +FR_RACK_SIZE=32 ``` Use `user.env` for stable site defaults such as partition, container image, and -host paths. Use per-run environment overrides for experiment-specific controls -such as `POOL`, `WORKLOAD`, `BATCH_SIZE`, `FAULT_TYPE`, `FAULT_AT_ITER`, or -`FAULT_DELAY`. 
+host paths, plus local LLM credentials and endpoint settings for log-analysis +and the judge. Use per-run environment overrides for experiment-specific +controls such as `POOL`, `WORKLOAD`, `BATCH_SIZE`, `FAULT_TYPE`, +`FAULT_AT_ITER`, or `FAULT_DELAY`. Environment variables: @@ -136,6 +144,13 @@ Environment variables: | `SHARED_TMP_BASE_DIR` | `${HOME}/tmp` | Shared filesystem path used for cross-step coordination | | `WORKSPACE_HOST_PATH` | `${HOME}/tmp` | Host path mounted at `/workspace` inside the container | | `CONTAINER_IMAGE` | `nvcr.io/nvidia/nemo:26.04` | Container image used by the workload script | +| `NVIDIA_API_KEY_FILE` | _unset_ | File containing the log-analysis API key | +| `JUDGE_API_KEY_FILE` | _unset_ | File containing the judge API key | +| `NVRX_LLM_MODEL` | `nvidia/nemotron-3-super-120b-a12b` | Model for log-analysis | +| `NVRX_LLM_BASE_URL` | `https://integrate.api.nvidia.com/v1` | Base URL for log-analysis | +| `JUDGE_MODEL` | `qwen/qwen3.5-397b-a17b` | Model for judge scoring | +| `JUDGE_BASE_URL` | `https://integrate.api.nvidia.com/v1` | Base URL for judge scoring | +| `FR_RACK_SIZE` | `32` | Ranks per rack for coarse FR scoring | | `SBATCH_SCRIPT` | `scripts/l4_gb200_reduced.sh` | Job script to submit | | `POOL` | _(default pool above)_ | Space-separated experiment triplets | @@ -206,7 +221,7 @@ The watcher: `restart_decision` and `attribution_text` 3. Calls FR analysis as `python -m nvidia_resiliency_ext.attribution.trace_analyzer.fr_attribution --fr-path "${EXPERIMENT_DIR}/checkpoints" -p "_dump_*"` and passes the raw table output to the judge 4. Scores 7 dimensions (restart correctness, rank primary, rank any, category, type, FR rank) -5. Appends a scored row to `_report.md` +5. Appends a scored row to `_report.md` as a markdown table row 6. 
Repeats until all experiments are analyzed To also run the sub-skills interactively for a single experiment: @@ -229,7 +244,7 @@ analysis output, then returns structured JSON scores with a reasoning note. | **rank_primary** | `true` / `false` / `partial` | Injected rank is the primary root-cause in attribution | | **rank_any** | `true` / `false` | Injected rank mentioned anywhere in attribution | | **fault_described** | `true` / `false` / `partial` | Fault nature (hang/crash/signal/exception) correctly described | -| **fr_rank_correct** | `true` / `false` / `no_dumps` | FR analysis identifies injected rank as suspect | +| **fr_rank_correct** | `rank` / `node` / `rack` / `false` / `no_dumps` | FR analysis narrows correctly to the injected rank, node, rack, or fails to narrow usefully | | **judge_notes** | string | One-sentence summary of the main gap or confirmation | The judge is given: @@ -238,13 +253,19 @@ The judge is given: 3. Filtered raw log (last 400 lines, same `exclude_nvrx_logs` filtering as logsage) 4. Raw logsage stdout (5-field text format) 5. Raw FR analysis table output from `fr_attribution.py --fr-path ... -p "_dump_*"` +6. `GPUS_PER_NODE` and `FR_RACK_SIZE` to map the injected rank to node and rack scopes for FR scoring Default judge model: `qwen/qwen3.5-397b-a17b`. Override with `--model` in `score_attribution.py`. +Default rack size for FR scope scoring: `32` ranks. Override with `FR_RACK_SIZE`. --- ## Step 5 — Aggregate Results +The canonical output of the loop is the markdown table in `_report.md`. +When summarizing results for users, prefer linking to that file and reproducing the +same table shape rather than flattening the results into plain prose. + The report markdown table from `watch_and_analyze.sh` gives a matrix view. 
Look for patterns across rows: @@ -266,6 +287,8 @@ Common failure mode patterns and their meaning: | `fault_described=partial` for crash types | Crash keywords present but fault type not specifically named | | `restart_correct=false` for GPU_ERROR | LLM conflating hardware error with recoverable hang | | `fr_rank_correct=no_dumps` | NCCL watchdog did not fire before job ended — adjust `TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC` | +| `fr_rank_correct=node` | FR isolated the correct node but not the exact rank | +| `fr_rank_correct=rack` | FR isolated the correct rack-sized rank group but not the exact node/rank | --- diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/score_attribution.py b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/score_attribution.py index 8588f018..b699096f 100644 --- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/score_attribution.py +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/score_attribution.py @@ -39,6 +39,8 @@ # Default judge model — override with --model DEFAULT_JUDGE_MODEL = "qwen/qwen3.5-397b-a17b" +DEFAULT_GPUS_PER_NODE = int(os.getenv("GPUS_PER_NODE", "4")) +DEFAULT_FR_RACK_SIZE = int(os.getenv("FR_RACK_SIZE", "32")) # Expected restart decision and rationale per fault type _RESTART_TABLE = { @@ -94,9 +96,21 @@ def load_log_excerpt(log_path, max_lines=400): def build_judge_prompt( - fault_type, rank, iter_, nodes, run_valid, log_output, fr_output, log_excerpt + fault_type, + rank, + iter_, + nodes, + run_valid, + log_output, + fr_output, + log_excerpt, + gpus_per_node, + rack_size, ): - total_ranks = nodes * 4 # GPUS_PER_NODE=4 in the example SBATCH_SCRIPT + total_ranks = nodes * gpus_per_node + node_index = rank // gpus_per_node + rack_start = (rank // rack_size) * rack_size + rack_end = min(rack_start + rack_size - 1, total_ranks - 1) expected_restart, restart_rationale = _RESTART_TABLE.get( fault_type, ("unknown", "unknown fault type") ) @@ -126,13 +140,21 @@ def build_judge_prompt( ## Ground truth 
(injected fault) - Fault type : {fault_type} - Injected rank : {rank} (global rank index, 0-based; total ranks = {total_ranks}) +- Injected node : {node_index} (using {gpus_per_node} GPUs per node) +- Injected rack : ranks {rack_start}-{rack_end} (using rack size {rack_size}) - Injected at iteration : {iter_} -- Cluster : {nodes} nodes × 4 GPUs = {total_ranks} total ranks +- Cluster : {nodes} nodes × {gpus_per_node} GPUs = {total_ranks} total ranks ## Expected correct behavior - restart_decision should be : {expected_restart} Rationale: {restart_rationale} - Rank {rank} should appear in Primary issues as the root cause +- FR scope scoring: + - "rank" if FR points directly to rank {rank} + - "node" if FR does not isolate rank {rank} but correctly narrows to node {node_index} + - "rack" if FR does not isolate rank {rank} or node {node_index} but correctly narrows to rack ranks {rack_start}-{rack_end} + - "false" if FR points elsewhere or is not useful + - "no_dumps" if there is no actionable FR output ## Raw job log (filtered, last 400 lines) {log_section} @@ -159,8 +181,13 @@ def build_judge_prompt( (e.g., GPU hang, segfault, signal kill) appropriate for {fault_type}? Values: "true" | "false" | "partial" (category right but specifics wrong) -5. **fr_rank_correct** — Does the FR analysis output identify rank {rank} as a suspect? - Values: "true" | "false" | "no_dumps" (no FR dumps available) +5. **fr_rank_correct** — How precise is the FR analysis output? + Values: "rank" | "node" | "rack" | "false" | "no_dumps" + Use "rank" only if rank {rank} is explicitly implicated. + Use "node" only if the FR output narrows correctly to node {node_index} but not the exact rank. + Use "rack" only if the FR output narrows correctly to rack ranks {rack_start}-{rack_end} but not the exact node or rank. + Use "false" if the FR output points somewhere else, is misleading, or does not narrow correctly. + Use "no_dumps" if there is no actionable FR output. 6. 
**notes** — One concise sentence summarizing the main gap or confirming correctness. @@ -215,6 +242,8 @@ def score(args): log_output=args.log_output, fr_output=args.fr_output, log_excerpt=log_excerpt, + gpus_per_node=args.gpus_per_node, + rack_size=args.rack_size, ) # build_judge_prompt returns a dict directly for invalid runs (no LLM call needed) @@ -249,6 +278,18 @@ def main(): parser.add_argument("--fr-output", default="no_dumps", help="Raw text from CollectiveAnalyzer") parser.add_argument("--model", default=DEFAULT_JUDGE_MODEL, help="Judge LLM model") parser.add_argument("--base-url", default=DEFAULT_LLM_BASE_URL, help="API base URL") + parser.add_argument( + "--gpus-per-node", + type=int, + default=DEFAULT_GPUS_PER_NODE, + help="GPUs per node for rank-to-node mapping", + ) + parser.add_argument( + "--rack-size", + type=int, + default=DEFAULT_FR_RACK_SIZE, + help="Ranks per rack for coarse FR scope scoring", + ) args = parser.parse_args() try: diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/user.env.example b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/user.env.example index 273a488d..a00999fb 100644 --- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/user.env.example +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/user.env.example @@ -23,3 +23,13 @@ WORKSPACE_HOST_PATH="${HOME}/tmp" CONTAINER_IMAGE="nvcr.io/nvidia/nemo:26.04" # CONTAINER_NAME= # CONTAINER_WORKDIR=/ + +# Log-analysis / judge LLM settings +# Keep these local. Prefer *_API_KEY_FILE over inline secrets. 
+# NVIDIA_API_KEY_FILE="${HOME}/.nvidia_api_key" +# JUDGE_API_KEY_FILE="${HOME}/.nvidia_api_key" +# NVRX_LLM_MODEL="nvidia/nemotron-3-super-120b-a12b" +# NVRX_LLM_BASE_URL="https://integrate.api.nvidia.com/v1" +# JUDGE_MODEL="qwen/qwen3.5-397b-a17b" +# JUDGE_BASE_URL="https://integrate.api.nvidia.com/v1" +# FR_RACK_SIZE=32 diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/watch_and_analyze.sh b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/watch_and_analyze.sh index 591af5ea..27685ee7 100755 --- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/watch_and_analyze.sh +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/watch_and_analyze.sh @@ -13,6 +13,11 @@ TRACKING_FILE="${1:?Usage: $0 <tracking_file>}" POLL_INTERVAL=30 SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +USER_ENV_FILE="${SCRIPT_DIR}/user.env" +if [[ -f "${USER_ENV_FILE}" ]]; then + # shellcheck disable=SC1090 + source "${USER_ENV_FILE}" +fi SKILL_DIR="$(dirname "${SCRIPT_DIR}")" NVRX_SRC_DIR="$(cd "${SKILL_DIR}/../../.."
&& pwd)" @@ -20,9 +25,9 @@ LOGSAGE_PY="${SKILL_DIR}/log-analysis/scripts/nvrx_logsage.py" FR_ANALYSIS_MODULE="nvidia_resiliency_ext.attribution.trace_analyzer.fr_attribution" SCORE_PY="${SCRIPT_DIR}/score_attribution.py" LOG_ANALYSIS_MODEL="${LOG_ANALYSIS_MODEL:-${NVRX_LLM_MODEL:-nvidia/nemotron-3-super-120b-a12b}}" -LOG_ANALYSIS_BASE_URL="${LOG_ANALYSIS_BASE_URL:-${NVRX_LLM_BASE_URL:-https://inference-api.nvidia.com}}" +LOG_ANALYSIS_BASE_URL="${LOG_ANALYSIS_BASE_URL:-${NVRX_LLM_BASE_URL:-https://integrate.api.nvidia.com/v1}}" JUDGE_MODEL="${JUDGE_MODEL:-qwen/qwen3.5-397b-a17b}" -JUDGE_BASE_URL="${JUDGE_BASE_URL:-https://inference-api.nvidia.com}" +JUDGE_BASE_URL="${JUDGE_BASE_URL:-https://integrate.api.nvidia.com/v1}" FR_PATTERN="${FR_PATTERN:-_dump_*}" # Ensure nvidia_resiliency_ext is importable from source tree From af9c3259b5fd02bcd2aad3bf1da60515e022b6f3 Mon Sep 17 00:00:00 2001 From: Seonmyeong Bak Date: Fri, 24 Apr 2026 14:32:39 -0700 Subject: [PATCH 10/21] feat(skills): add n3 super fault-loop workload --- .../nvrx-attr/fault-injection-loop/SKILL.md | 7 +- .../nvrx-attr/scripts/n3_super_gb200_fi.sh | 408 ++++++++++++++++++ .../scripts/pools/n3_super_8node.txt | 10 + .../skills/nvrx-attr/scripts/workloads.conf | 5 + 4 files changed, 429 insertions(+), 1 deletion(-) create mode 100644 src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/n3_super_gb200_fi.sh create mode 100644 src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/pools/n3_super_8node.txt diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md b/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md index 857ba4f3..ce7736f6 100644 --- a/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md @@ -158,7 +158,12 @@ Environment variables: | Name | Script | Base dir | Description | |---|---|---|---| -| `llama4_scout` | `l4_gb200_reduced.sh` | 
`${HOME}/nvrx-attr-experiments` | Llama4-Scout (reduced layers) on GB200 | +| `llama4_scout` | `l4_gb200_reduced.sh` | `${HOME}/nvrx-attr-experiments` | Llama4-Scout (reduced layers) on GB200; minimum supported size is 2 nodes | +| `n3_super` | `n3_super_gb200_fi.sh` | `${HOME}/nvrx-attr-experiments` | Nemotron3-Super on GB200; minimum supported size is 8 nodes | + +Workload note: +- `llama4_scout` requires at least 2 nodes. +- `n3_super` requires at least 8 nodes. Its default registered pool contains only 8-node experiments. ```bash # Run the full pool against the validated example workload diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/n3_super_gb200_fi.sh b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/n3_super_gb200_fi.sh new file mode 100644 index 00000000..799e8c98 --- /dev/null +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/n3_super_gb200_fi.sh @@ -0,0 +1,408 @@ +#!/bin/bash +# n3_super_gb200_fi.sh — fault-injection job script for the n3_super_gb200 workload. +# Production model args are kept aligned with the previously working nemotron config. +# Only path/container plumbing is adapted for the nvrx-attr feedback-loop workflow. + +#SBATCH --time=00:30:00 + +#SBATCH --job-name=n3-super-gb200-fi +#SBATCH --output=/tmp/slurm-%j.launch.out +#SBATCH --error=/tmp/slurm-%j.launch.err + +#SBATCH --nodes=8 +#SBATCH --ntasks-per-node=4 +#SBATCH --gpus-per-node=4 +#SBATCH --exclusive +#SBATCH --mem=0 + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +USER_ENV_FILE="${SCRIPT_DIR}/user.env" +NVRX_SRC_ROOT_DEFAULT="$(cd "${SCRIPT_DIR}/../../../.." && pwd)" +NVRX_REPO_ROOT_DEFAULT="$(cd "${SCRIPT_DIR}/../../../../.." 
&& pwd)" +if [[ -f "${USER_ENV_FILE}" ]]; then + # shellcheck disable=SC1090 + source "${USER_ENV_FILE}" +fi + +log_msg() { + local msg="$1" + UNIX_DATETIME=$(date +%s) + HUMAN_DATETIME=$(date -d "@$UNIX_DATETIME" '+%Y-%m-%d %H:%M:%S.%3N') + echo ">>> ${msg} ${UNIX_DATETIME} (${HUMAN_DATETIME})" +} + +log_msg "START SBATCH" +echo "Running on nodes: ${SLURM_NODELIST}" + +# ── Platform / NCCL ─────────────────────────────────────────────────────────── +export NCCL_IB_DISABLE=0 +export NCCL_NET_GDR_LEVEL=3 +export PYXIS_LOG_LEVEL=debug +export NCCL_IB_SL=1 +export NCCL_IB_TIMEOUT=19 +export UB_TIMEOUT=720 +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export NCCL_P2P_NET_CHUNKSIZE=2097152 +export NCCL_DEBUG=WARN + +# ── PyTorch / TE / inductor (from n3_super_gb200.sh ENV_VARS) ───────────────── +export NVTE_FWD_LAYERNORM_SM_MARGIN=16 +export NVTE_BWD_LAYERNORM_SM_MARGIN=16 +export TORCHINDUCTOR_WORKER_START=fork +export QUANTIZATION_TYPE_DEBUG=1 +export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True +export USE_MNNVL=1 + +# ── DeepEP (hybridep MoE routing) — set USE_DEEPEP=0 to use alltoall instead ── +export USE_DEEPEP="${USE_DEEPEP:-1}" +if [[ "${USE_DEEPEP}" == "1" ]]; then + export NUM_OF_HYBRID_EP_RANKS_PER_NVLINK_DOMAIN=32 +fi + +# ── Logging / debugging ─────────────────────────────────────────────────────── +export PYTHONUNBUFFERED=1 +export ONE_LOGGER_JOB_CATEGORY=test +export LOGLEVEL=DEBUG +export TORCH_CPP_LOG_LEVEL=WARNING +export TORCH_NCCL_TRACE_BUFFER_SIZE=2000 +export TORCH_NCCL_RETHROW_CUDA_ERRORS=0 +export TORCH_NCCL_ENABLE_MONITORING=1 +export TORCH_NCCL_DUMP_ON_TIMEOUT=1 +export TORCH_NCCL_LOG_CPP_STACK_ON_UNCLEAN_SHUTDOWN=0 +export TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC=30 +export TORCH_DIST_INIT_BARRIER=0 +export TORCH_INCLUDE_STACK_TRACE=0 +export TORCH_INCLUDE_ONLY_ACTIVE=1 +export TORCH_NCCL_EXTRA_DUMP_ON_EXEC=1 + +# ── Fault injection parameters (overridable via sbatch --export) ────────────── +export FAULT_AT_ITER="${FAULT_AT_ITER:-5}" +export 
FAULT_DELAY="${FAULT_DELAY:-15}" +export FAULT_RANK="${FAULT_RANK:-1}" +export FAULT_TYPE="${FAULT_TYPE:-GPU_SLEEP}" +export ENABLE_FAULT_INJECTION="${ENABLE_FAULT_INJECTION:-1}" + +# ── CUDA graph (set ENABLE_CUDA_GRAPH=0 to disable) ─────────────────────────── +export ENABLE_CUDA_GRAPH="${ENABLE_CUDA_GRAPH:-1}" + +# ── Node / task geometry (SLURM_NNODES is set by SLURM from --nodes override) ─ +export GPUS_PER_NODE="${GPUS_PER_NODE:-4}" +TOTAL_TASKS=$((SLURM_NNODES * GPUS_PER_NODE)) + +# ── Per-experiment output directory (overridable via sbatch --export) ───────── +export BASE_EXPERIMENTS_DIR="${BASE_EXPERIMENTS_DIR:-${HOME}/nvrx-attr-experiments}" +export EXPERIMENT_DIR="${EXPERIMENT_DIR:-${BASE_EXPERIMENTS_DIR}/fault_injection/manual/n${SLURM_NNODES}_${FAULT_TYPE}_r${FAULT_RANK}_i${FAULT_AT_ITER}}" +export NVRX_REPO_ROOT="${NVRX_REPO_ROOT:-${NVRX_REPO_ROOT_DEFAULT}}" +export NVRX_SRC_ROOT="${NVRX_SRC_ROOT:-${NVRX_SRC_ROOT_DEFAULT}}" +export NVRX_CONTAINER_REPO_PATH="${NVRX_CONTAINER_REPO_PATH:-${HOME}/nvidia-resiliency-ext}" +export NVRX_CONTAINER_SRC_PATH="${NVRX_CONTAINER_SRC_PATH:-${NVRX_CONTAINER_REPO_PATH}/src}" +export SHARED_TMP_BASE_DIR="${SHARED_TMP_BASE_DIR:-${HOME}/tmp}" +export MEGATRON_REPO_HOST_PATH="${MEGATRON_REPO_HOST_PATH:-${HOME}/megatron-lm-main}" +export WORKSPACE_HOST_PATH="${WORKSPACE_HOST_PATH:-${HOME}/tmp}" +export CONTAINER_IMAGE="${CONTAINER_IMAGE:-nvcr.io/nvidia/nemo:26.04}" +export CONTAINER_NAME="${CONTAINER_NAME:-}" +export CONTAINER_WORKDIR="${CONTAINER_WORKDIR:-/}" +export CONTAINER_CLEANUP_CMD="${CONTAINER_CLEANUP_CMD:-}" +export ENABLE_NFS_CACHE_STAGING="${ENABLE_NFS_CACHE_STAGING:-0}" +export NFS_TRITON_CACHE="${NFS_TRITON_CACHE:-}" +export NFS_INDUCTOR_CACHE="${NFS_INDUCTOR_CACHE:-}" + +mkdir -p ${BASE_EXPERIMENTS_DIR}/datacache +mkdir -p ${EXPERIMENT_DIR}/checkpoints +mkdir -p ${EXPERIMENT_DIR}/tensorboard + +: "${SLURM_RESTART_COUNT:=0}" + +LOG_DIR=${EXPERIMENT_DIR}/logs +mkdir -p ${LOG_DIR} +echo "Writing logs to 
${LOG_DIR}" +LOG_FILE_BASE="${LOG_DIR}/slurm/${SLURM_JOB_ID}.${SLURM_RESTART_COUNT}" + +# ── Container mounts ────────────────────────────────────────────────────────── +LUSTRE=/home:/home +SHARED_TMP_HOST=${SHARED_TMP_BASE_DIR}/${SLURM_JOB_ID} +mkdir -p ${SHARED_TMP_HOST} +SHARED_TMP=${SHARED_TMP_HOST}:/shared_tmp +LOGS=${EXPERIMENT_DIR}/logs:/logs +MEGATRON_REPO=${MEGATRON_REPO_HOST_PATH}:/megatron-lm_repo +DATACACHE=${BASE_EXPERIMENTS_DIR}/datacache:/datacache +CHECKPOINT_LOAD=${EXPERIMENT_DIR}/checkpoints:/checkpoint-load +CHECKPOINT_SAVE=${EXPERIMENT_DIR}/checkpoints:/checkpoint-save +TENSORBOARD=${EXPERIMENT_DIR}/tensorboard:/tensorboard +WORKSPACE=${WORKSPACE_HOST_PATH}:/workspace +CONTAINER_MOUNTS=$LUSTRE,$SHARED_TMP,$LOGS,$MEGATRON_REPO,$DATACACHE,$CHECKPOINT_LOAD,$CHECKPOINT_SAVE,$TENSORBOARD,$WORKSPACE +CONTAINER_ARGS=( + --container-mounts "${CONTAINER_MOUNTS}" + --container-image "${CONTAINER_IMAGE}" + --container-workdir "${CONTAINER_WORKDIR}" +) +if [[ -n "${CONTAINER_NAME}" ]]; then + CONTAINER_ARGS+=(--container-name "${CONTAINER_NAME}") +fi + +MYENV_FILE=${SHARED_TMP_HOST}/.myenv_${SLURM_JOB_ID}.sh +cat > ${MYENV_FILE} << MYENVEOF +export FAULT_AT_ITER=${FAULT_AT_ITER} +export FAULT_DELAY=${FAULT_DELAY} +export FAULT_RANK=${FAULT_RANK} +export FAULT_TYPE=${FAULT_TYPE} +export ENABLE_FAULT_INJECTION=${ENABLE_FAULT_INJECTION} +export ENABLE_CUDA_GRAPH=${ENABLE_CUDA_GRAPH} +export USE_DEEPEP=${USE_DEEPEP} +export ENABLE_NFS_CACHE_STAGING=${ENABLE_NFS_CACHE_STAGING} +export NFS_TRITON_CACHE=${NFS_TRITON_CACHE} +export NFS_INDUCTOR_CACHE=${NFS_INDUCTOR_CACHE} +export NVRX_REPO_ROOT=${NVRX_CONTAINER_REPO_PATH} +export NVRX_SRC_ROOT=${NVRX_CONTAINER_SRC_PATH} +export PYTHONPATH=\${NVRX_REPO_ROOT}:\${NVRX_SRC_ROOT}:\${PYTHONPATH} +MYENVEOF + +# ── Optional site-specific cleanup hook ─────────────────────────────────────── +if [[ -n "${CONTAINER_CLEANUP_CMD}" ]]; then + log_msg "START disk_cleanup" + srun \ + --label \ + --ntasks-per-node=1 \ + 
--ntasks=${SLURM_NNODES} \ + --kill-on-bad-exit=0 \ + --mpi=none \ + bash -lc "${CONTAINER_CLEANUP_CMD}" + log_msg "END disk_cleanup" +else + log_msg "SKIP disk_cleanup" +fi + +# ── All-node setup: clone Megatron into a per-job tmpdir ───────────────────── +log_msg "START all_node_setup" +srun \ + --label \ + "${CONTAINER_ARGS[@]}" \ + --exclusive \ + --error=${LOG_FILE_BASE}.0.all_node_setup.log \ + --output=${LOG_FILE_BASE}.0.all_node_setup.log \ + --ntasks-per-node=1 \ + --ntasks=${SLURM_NNODES} \ + --kill-on-bad-exit=0 \ + --mpi=none \ + bash -c ' + MEGATRON_PATH=/shared_tmp/megatron_${SLURM_NODEID} + rm -rf "${MEGATRON_PATH}" + mkdir -p "${MEGATRON_PATH}" + pushd $MEGATRON_PATH + CURRENT_BRANCH=$(git -C /megatron-lm_repo branch --show-current) + echo "Cloning Megatron branch $CURRENT_BRANCH into $MEGATRON_PATH" + git clone --single-branch --branch $CURRENT_BRANCH /megatron-lm_repo . + rm -rf "${MEGATRON_PATH}/nvidia_resiliency_ext" + if command -v rsync >/dev/null 2>&1; then + rsync -a "${NVRX_CONTAINER_SRC_PATH}/nvidia_resiliency_ext/" "${MEGATRON_PATH}/nvidia_resiliency_ext/" + else + cp -a "${NVRX_CONTAINER_SRC_PATH}/nvidia_resiliency_ext" "${MEGATRON_PATH}/" + fi + popd + ' +log_msg "END all_node_setup" + +# ── Main workload ───────────────────────────────────────────────────────────── +log_msg "START main_workload" +srun \ + --label \ + "${CONTAINER_ARGS[@]}" \ + --error=${LOG_FILE_BASE}.1.main_workload.log \ + --output=${LOG_FILE_BASE}.1.main_workload.log \ + --ntasks-per-node=${GPUS_PER_NODE} \ + --ntasks=${TOTAL_TASKS} \ + --kill-on-bad-exit=0 \ + --mpi=none \ + bash -c ' + source /shared_tmp/.myenv_${SLURM_JOB_ID}.sh + MEGATRON_PATH=/shared_tmp/megatron_${SLURM_NODEID} + export PYTHONPATH=${MEGATRON_PATH}:${NVRX_REPO_ROOT}:${NVRX_SRC_ROOT}:${PYTHONPATH} + + # Triton/inductor cache strategy: + # - /tmp inside the container is the node-local in-memory tmpfs (not NFS-backed) + # - Optional pre-stage from a persistent cache to each local rank /tmp dir + # 
- Barrier via marker file in /tmp ensures other ranks wait before Python starts + # - On exit: global rank 0 stages back to NFS only on cold start + TRITON_READY=/tmp/.triton_ready_${SLURM_JOB_ID} + + export TRITON_CACHE_DIR=/tmp/triton_${SLURM_LOCALID} + export TORCHINDUCTOR_CACHE_DIR=/tmp/inductor_${SLURM_LOCALID} + + if [[ "${ENABLE_NFS_CACHE_STAGING}" == "1" && "${SLURM_LOCALID}" == "0" ]]; then + if [[ -d "${NFS_TRITON_CACHE}" ]] && [[ -n "$(ls -A ${NFS_TRITON_CACHE} 2>/dev/null)" ]]; then + TRITON_CACHE_WAS_WARM=1 + else + TRITON_CACHE_WAS_WARM=0 + fi + for r in 0 1 2 3; do + mkdir -p /tmp/triton_${r} /tmp/inductor_${r} + [[ -d "${NFS_TRITON_CACHE}" ]] && rsync -a --ignore-existing "${NFS_TRITON_CACHE}/" "/tmp/triton_${r}/" 2>/dev/null || true + [[ -d "${NFS_INDUCTOR_CACHE}" ]] && rsync -a --ignore-existing "${NFS_INDUCTOR_CACHE}/" "/tmp/inductor_${r}/" 2>/dev/null || true + done + touch "${TRITON_READY}" + echo "Pre-staged triton/inductor cache for all local ranks (was_warm=${TRITON_CACHE_WAS_WARM})." + elif [[ "${SLURM_LOCALID}" != "0" ]]; then + until [[ -f "${TRITON_READY}" ]]; do sleep 1; done + fi + + mkdir -p ${TRITON_CACHE_DIR} ${TORCHINDUCTOR_CACHE_DIR} + + _stage_back() { + if [[ "${ENABLE_NFS_CACHE_STAGING}" == "1" && "${SLURM_LOCALID}" == "0" && "${SLURM_NODEID}" == "0" && "${TRITON_CACHE_WAS_WARM:-0}" == "0" ]]; then + echo "Staging triton cache back to NFS (cold start)..." + mkdir -p "${NFS_TRITON_CACHE}" "${NFS_INDUCTOR_CACHE}" + rsync -a --ignore-existing "${TRITON_CACHE_DIR}/" "${NFS_TRITON_CACHE}/" 2>/dev/null || true + rsync -a --ignore-existing "${TORCHINDUCTOR_CACHE_DIR}/" "${NFS_INDUCTOR_CACHE}/" 2>/dev/null || true + echo "Cache staged back." 
+ fi + } + trap _stage_back EXIT + + if [[ "${ENABLE_CUDA_GRAPH}" == "1" ]]; then + CUDA_GRAPH_ARGS="--enable-cuda-graph --cuda-graph-scope mamba attn" + else + CUDA_GRAPH_ARGS="" + fi + + if [[ "${USE_DEEPEP:-1}" == "1" ]]; then + MOE_DISPATCHER_ARGS="--moe-token-dispatcher-type flex --moe-flex-dispatcher-backend hybridep --moe-hybridep-num-sms 32" + else + MOE_DISPATCHER_ARGS="--moe-token-dispatcher-type alltoall" + fi + + pushd $MEGATRON_PATH + LAUNCHER_CMD="python3" + LAUNCHER_ARGS=" \ + " + WORKLOAD_CMD=${MEGATRON_PATH}/pretrain_mamba.py + FAULT_INJECTOR_ARGS="" + if [[ "${ENABLE_FAULT_INJECTION}" == "1" ]]; then + FAULT_INJECTOR_ARGS=" \ + --fault-injector-ranks ${FAULT_RANK} \ + --fault-injector-fault-types ${FAULT_TYPE} \ + " + if [[ -n "${FAULT_DELAY:-}" ]]; then + FAULT_INJECTOR_ARGS="${FAULT_INJECTOR_ARGS} --fault-injector-fault-delay ${FAULT_DELAY}" + if [[ -n "${FAULT_AT_ITER:-}" ]]; then + FAULT_INJECTOR_ARGS="${FAULT_INJECTOR_ARGS} --fault-injector-delay-start-iteration ${FAULT_AT_ITER}" + fi + elif [[ -n "${FAULT_AT_ITER:-}" ]]; then + FAULT_INJECTOR_ARGS="${FAULT_INJECTOR_ARGS} --fault-injector-fault-delay 0 --fault-injector-delay-start-iteration ${FAULT_AT_ITER}" + fi + fi + WORKLOAD_ARGS=" \ + --exit-duration-in-mins 5750 \ + --exit-interval 100 \ + --distributed-timeout-minutes 10 \ + --disable-gloo-process-groups \ + --mock-data \ + --data-cache-path /datacache \ + --no-create-attention-mask-in-dataloader \ + --no-mmap-bin-files \ + --tokenizer-type NullTokenizer \ + --tiktoken-pattern v2 \ + --vocab-size 128000 \ + --micro-batch-size 1 \ + --global-batch-size 32 \ + --train-samples 12207031 \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --lr 4.5e-4 \ + --min-lr 4.5e-6 \ + --lr-decay-style WSD \ + --lr-warmup-samples 24414063 \ + --lr-decay-samples 3048706055 \ + --lr-wsd-decay-style minus_sqrt \ + --lr-wsd-decay-samples 610351563 \ + --weight-decay 0.1 \ + --clip-grad 1.0 \ + --override-opt_param-scheduler \ + --use-mcore-models \ + --spec 
megatron.core.models.mamba.mamba_layer_specs mamba_stack_spec \ + --is-hybrid-model \ + --mamba-num-heads 128 \ + --num-layers 88 \ + --hidden-size 4096 \ + --ffn-hidden-size 2688 \ + --num-attention-heads 32 \ + --group-query-attention \ + --num-query-groups 2 \ + --kv-channels 128 \ + --hybrid-override-pattern MEMEMEM*EMEMEMEM*EMEMEMEM*EMEMEMEMEM*EMEMEMEMEM*EMEMEMEMEM*EMEMEMEMEM*EMEMEMEM*EMEMEMEME \ + --position-embedding-type none \ + --normalization RMSNorm \ + --untie-embeddings-and-output-weights \ + --init-method-std 0.014 \ + --disable-bias-linear \ + --squared-relu \ + --use-fused-weighted-squared-relu \ + --seq-length 8192 \ + --max-position-embeddings 8192 \ + --num-experts 512 \ + --moe-router-topk 22 \ + --moe-router-topk-scaling-factor 5.0 \ + --moe-router-score-function sigmoid \ + --moe-router-enable-expert-bias \ + --moe-router-dtype fp32 \ + --moe-router-load-balancing-type seq_aux_loss \ + --moe-aux-loss-coeff 1e-4 \ + ${MOE_DISPATCHER_ARGS} \ + --moe-grouped-gemm \ + --moe-permute-fusion \ + --moe-latent-size 1024 \ + --moe-shared-expert-intermediate-size 5376 \ + --calculate-per-token-loss \ + --bf16 \ + --first-last-layers-bf16 \ + --num-layers-at-start-in-bf16 0 \ + --num-layers-at-end-in-bf16 14 \ + --fp4-format e2m1 \ + --fp4-recipe nvfp4 \ + --attention-dropout 0.0 \ + --hidden-dropout 0.0 \ + --sequence-parallel \ + --use-distributed-optimizer \ + --overlap-grad-reduce \ + --overlap-param-gather \ + --ddp-num-buckets 10 \ + --ddp-pad-buckets-for-high-nccl-busbw \ + --high-priority-stream-groups ep \ + --tensor-model-parallel-size 4 \ + --pipeline-model-parallel-size 1 \ + --expert-model-parallel-size 32 \ + --expert-tensor-parallel-size 1 \ + --cross-entropy-loss-fusion \ + --cross-entropy-fusion-impl native \ + --attention-backend flash \ + ${CUDA_GRAPH_ARGS} \ + --te-rng-tracker \ + --manual-gc \ + --manual-gc-interval 10 \ + --num-workers 1 \ + --eval-interval 1000 \ + --eval-iters 14 \ + --log-interval 1 \ + --log-params-norm \ + 
--log-num-zeros-in-grad \ + --log-timers-to-tensorboard \ + --log-memory-to-tensorboard \ + --log-throughput \ + --log-progress \ + --log-energy \ + --log-memory-interval 500 \ + --logging-level 20 \ + --timing-log-option minmax \ + --check-weight-hash-across-dp-replicas-interval 20000 \ + --tensorboard-dir /tensorboard \ + --local-rank ${SLURM_LOCALID} \ + --distributed-timeout-seconds-after-init 1 \ + --flight-recorder-dump-path /checkpoint-save \ + " + WORKLOAD_ARGS="${WORKLOAD_ARGS} ${FAULT_INJECTOR_ARGS}" + $LAUNCHER_CMD $LAUNCHER_ARGS $WORKLOAD_CMD $WORKLOAD_ARGS + ' +log_msg "END main_workload" + +log_msg "END SBATCH" + +set +x diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/pools/n3_super_8node.txt b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/pools/n3_super_8node.txt new file mode 100644 index 00000000..dc6a75cd --- /dev/null +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/pools/n3_super_8node.txt @@ -0,0 +1,10 @@ +# n3_super minimum supported size is 8 nodes. +# Format: FAULT_TYPE:RANK:ITER:NODES + +GPU_SLEEP:1:5:8 +GPU_SLEEP:0:5:8 +GPU_SLEEP:16:5:8 +GPU_SLEEP:31:5:8 +GPU_ERROR:1:5:8 +GPU_ERROR:0:5:8 +SIGKILL:1:5:8 diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/workloads.conf b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/workloads.conf index dcc1dc62..8d638e78 100644 --- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/workloads.conf +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/workloads.conf @@ -15,5 +15,10 @@ # "-" means use the TIME env var or prepare_node_alloc.sh default (00:30:00) # # Fields are whitespace-separated. Lines starting with # are ignored. +# +# Workload-specific notes: +# - llama4_scout requires a minimum of 2 nodes. +# - n3_super requires a minimum of 8 nodes; use its dedicated 8-node pool file. 
llama4_scout l4_gb200_reduced.sh - Llama4-Scout_(reduced_layers)_on_GB200 - - +n3_super n3_super_gb200_fi.sh - Nemotron3-Super_on_GB200 n3_super_8node.txt - From 9a12531425eb95bd4a52eefd2c3a85dbc7d25eed Mon Sep 17 00:00:00 2001 From: Seonmyeong Bak Date: Fri, 24 Apr 2026 15:22:01 -0700 Subject: [PATCH 11/21] fix(skills): harden n3 fault-loop analysis --- .../skills/nvrx-attr/scripts/n3_super_gb200_fi.sh | 2 +- .../skills/nvrx-attr/scripts/watch_and_analyze.sh | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/n3_super_gb200_fi.sh b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/n3_super_gb200_fi.sh index 799e8c98..ff91debc 100644 --- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/n3_super_gb200_fi.sh +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/n3_super_gb200_fi.sh @@ -234,7 +234,7 @@ srun \ else TRITON_CACHE_WAS_WARM=0 fi - for r in 0 1 2 3; do + for ((r=0; r<GPUS_PER_NODE; r++)); do mkdir -p /tmp/triton_${r} /tmp/inductor_${r} [[ -d "${NFS_TRITON_CACHE}" ]] && rsync -a --ignore-existing "${NFS_TRITON_CACHE}/" "/tmp/triton_${r}/" 2>/dev/null || true [[ -d "${NFS_INDUCTOR_CACHE}" ]] && rsync -a --ignore-existing "${NFS_INDUCTOR_CACHE}/" "/tmp/inductor_${r}/" 2>/dev/null || true diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/watch_and_analyze.sh b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/watch_and_analyze.sh index 27685ee7..b3cc1df2 100755 --- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/watch_and_analyze.sh +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/watch_and_analyze.sh @@ -36,7 +36,7 @@ export PYTHONPATH="${NVRX_SRC_DIR}${PYTHONPATH:+:$PYTHONPATH}" strip_injection_markers() { local input_log="$1" local output_log="$2" - grep -v -E 'FAULT INJECTION|nvidia_resiliency_ext\.shared_utils\.inject_fault' \ + grep -a -v -E 'FAULT INJECTION|nvidia_resiliency_ext\.shared_utils\.inject_fault' \ "${input_log}" > "${output_log}" 2>/dev/null || true } @@ -100,7 +100,7 @@ while true; do STRIPPED_LOG="" if [[ -n "${LOG_FILE}" && -f "${LOG_FILE}" ]]; then echo " log: ${LOG_FILE}" - if grep -q "FAULT INJECTION"
"${LOG_FILE}" 2>/dev/null; then + if grep -a -q "FAULT INJECTION" "${LOG_FILE}" 2>/dev/null; then RUN_VALID="true" fi echo " run_valid: ${RUN_VALID}" From 1e23cb903a94fba3474628185b16c16ed78ce9a1 Mon Sep 17 00:00:00 2001 From: Seonmyeong Bak Date: Fri, 24 Apr 2026 15:24:40 -0700 Subject: [PATCH 12/21] fix(skills): repair fr-analysis wrapper symlink --- .../skills/nvrx-attr/fr-analysis/scripts/fr_attribution.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/fr-analysis/scripts/fr_attribution.py b/src/nvidia_resiliency_ext/skills/nvrx-attr/fr-analysis/scripts/fr_attribution.py index cfac8e34..d98699dd 120000 --- a/src/nvidia_resiliency_ext/skills/nvrx-attr/fr-analysis/scripts/fr_attribution.py +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/fr-analysis/scripts/fr_attribution.py @@ -1 +1 @@ -../../../trace_analyzer/fr_attribution.py \ No newline at end of file +../../../../attribution/trace_analyzer/fr_attribution.py \ No newline at end of file From 5e30a25247a62907e93b49adfe2c2b4620376087 Mon Sep 17 00:00:00 2001 From: Seonmyeong Bak Date: Fri, 24 Apr 2026 15:29:41 -0700 Subject: [PATCH 13/21] docs(skills): clarify local env configuration --- .../skills/nvrx-attr/fault-injection-loop/SKILL.md | 5 +++++ .../skills/nvrx-attr/scripts/user.env.example | 13 +++++++++++++ 2 files changed, 18 insertions(+) diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md b/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md index ce7736f6..ed908b05 100644 --- a/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md @@ -128,6 +128,11 @@ and the judge. Use per-run environment overrides for experiment-specific controls such as `POOL`, `WORKLOAD`, `BATCH_SIZE`, `FAULT_TYPE`, `FAULT_AT_ITER`, or `FAULT_DELAY`. 
+If you use local Triton/Inductor cache staging, set the cache variables in +`scripts/user.env`. See `scripts/user.env.example` for the supported +`ENABLE_NFS_CACHE_STAGING`, `NFS_TRITON_CACHE`, and `NFS_INDUCTOR_CACHE` +entries and workload-specific path examples. + Environment variables: | Variable | Default | Description | diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/user.env.example b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/user.env.example index a00999fb..72a2efca 100644 --- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/user.env.example +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/user.env.example @@ -24,8 +24,21 @@ CONTAINER_IMAGE="nvcr.io/nvidia/nemo:26.04" # CONTAINER_NAME= # CONTAINER_WORKDIR=/ +# Optional NFS-backed cache staging +# Set ENABLE_NFS_CACHE_STAGING=1 to pre-stage Triton/Inductor caches to local /tmp. +# Pick the workload-specific cache roots that match the workload you are running: +# NFS_TRITON_CACHE="/home/sbak/experiments/llama4-scout-gb200/triton_cache" +# NFS_INDUCTOR_CACHE="/home/sbak/experiments/llama4-scout-gb200/inductor_cache" +# NFS_TRITON_CACHE="/home/sbak/experiments/n3-super-gb200/triton_cache" +# NFS_INDUCTOR_CACHE="/home/sbak/experiments/n3-super-gb200/inductor_cache" +# ENABLE_NFS_CACHE_STAGING=1 +# NFS_TRITON_CACHE="/path/to/<workload>/triton_cache" +# NFS_INDUCTOR_CACHE="/path/to/<workload>/inductor_cache" + # Log-analysis / judge LLM settings # Keep these local. Prefer *_API_KEY_FILE over inline secrets. +# NVIDIA_API_KEY="..." +# JUDGE_API_KEY="..."
# NVIDIA_API_KEY_FILE="${HOME}/.nvidia_api_key" # JUDGE_API_KEY_FILE="${HOME}/.nvidia_api_key" # NVRX_LLM_MODEL="nvidia/nemotron-3-super-120b-a12b" From 810c534abed5b231b7a030fa10abb1159f1d05e3 Mon Sep 17 00:00:00 2001 From: Seonmyeong Bak Date: Mon, 27 Apr 2026 10:27:59 -0700 Subject: [PATCH 14/21] refactor(skills): simplify local config wiring --- .../attribution/log_analyzer/nvrx_logsage.py | 8 -------- .../skills/nvrx-attr/fault-injection-loop/SKILL.md | 4 ++-- .../skills/nvrx-attr/scripts/prepare_node_alloc.sh | 5 ----- .../skills/nvrx-attr/scripts/slurm.conf | 11 ----------- .../skills/nvrx-attr/scripts/user.env.example | 4 ---- 5 files changed, 2 insertions(+), 30 deletions(-) delete mode 100644 src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/slurm.conf diff --git a/src/nvidia_resiliency_ext/attribution/log_analyzer/nvrx_logsage.py b/src/nvidia_resiliency_ext/attribution/log_analyzer/nvrx_logsage.py index 0340a6b3..30d90e8c 100644 --- a/src/nvidia_resiliency_ext/attribution/log_analyzer/nvrx_logsage.py +++ b/src/nvidia_resiliency_ext/attribution/log_analyzer/nvrx_logsage.py @@ -194,14 +194,6 @@ def _with_exponential_backoff(llm_call, checkpoint_saved: bool) -> tuple[str, st backoff = _sleep_with_backoff(attempt, retries, backoff, max_backoff, jitter) - return ( - ATTR_LLM_FAILURE, - ATTR_LLM_FAILURE, - ATTR_LLM_FAILURE, - ATTR_LLM_FAILURE, - str(checkpoint_saved), - ) - class NVRxLogAnalyzer(NVRxAttribution): def __init__(self, args: Union[argparse.Namespace, Mapping[str, Any]]): diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md b/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md index ed908b05..3cb4a93b 100644 --- a/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md @@ -138,8 +138,8 @@ Environment variables: | Variable | Default | Description | |---|---|---| | `WORKLOAD` | `llama4_scout` | Select 
a registered workload by name (see `scripts/workloads.conf`) | -| `ACCOUNT` | _(cluster default or `scripts/slurm.conf`)_ | SLURM account | -| `PARTITION` | _(cluster default or `scripts/slurm.conf`)_ | SLURM partition | +| `ACCOUNT` | _(cluster default or `scripts/user.env`)_ | SLURM account | +| `PARTITION` | _(cluster default or `scripts/user.env`)_ | SLURM partition | | `GPUS_PER_NODE` | `4` | GPUs per node | | `TIME` | `00:30:00` | Per-job wall-clock limit | | `BATCH_SIZE` | `2` | Jobs submitted per round | diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/prepare_node_alloc.sh b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/prepare_node_alloc.sh index 67d80be1..9deac578 100755 --- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/prepare_node_alloc.sh +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/prepare_node_alloc.sh @@ -25,7 +25,6 @@ set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" WORKLOADS_CONF="${SCRIPT_DIR}/workloads.conf" -SLURM_DEFAULTS_CONF="${SCRIPT_DIR}/slurm.conf" USER_ENV_FILE="${SCRIPT_DIR}/user.env" ACCOUNT_FROM_ENV="${ACCOUNT-}" PARTITION_FROM_ENV="${PARTITION-}" @@ -35,10 +34,6 @@ CONTAINER_IMAGE_FROM_ENV="${CONTAINER_IMAGE-}" SHARED_TMP_BASE_DIR_FROM_ENV="${SHARED_TMP_BASE_DIR-}" WORKSPACE_HOST_PATH_FROM_ENV="${WORKSPACE_HOST_PATH-}" -if [[ -f "${SLURM_DEFAULTS_CONF}" ]]; then - # shellcheck disable=SC1090 - source "${SLURM_DEFAULTS_CONF}" -fi if [[ -f "${USER_ENV_FILE}" ]]; then # shellcheck disable=SC1090 source "${USER_ENV_FILE}" diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/slurm.conf b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/slurm.conf deleted file mode 100644 index 764003dc..00000000 --- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/slurm.conf +++ /dev/null @@ -1,11 +0,0 @@ -# Optional site-specific Slurm defaults for nvrx-attr scripts. -# -# This file is sourced by prepare_node_alloc.sh. 
Environment variables still -# take precedence, so you can override these per invocation: -# -# ACCOUNT=myacct PARTITION=gpu bash scripts/prepare_node_alloc.sh -# -# Leave values empty to rely on the cluster's default account / partition. - -ACCOUNT="" -PARTITION="" diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/user.env.example b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/user.env.example index 72a2efca..cf8f8f5e 100644 --- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/user.env.example +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/user.env.example @@ -27,10 +27,6 @@ CONTAINER_IMAGE="nvcr.io/nvidia/nemo:26.04" # Optional NFS-backed cache staging # Set ENABLE_NFS_CACHE_STAGING=1 to pre-stage Triton/Inductor caches to local /tmp. # Pick the workload-specific cache roots that match the workload you are running: -# NFS_TRITON_CACHE="/home/sbak/experiments/llama4-scout-gb200/triton_cache" -# NFS_INDUCTOR_CACHE="/home/sbak/experiments/llama4-scout-gb200/inductor_cache" -# NFS_TRITON_CACHE="/home/sbak/experiments/n3-super-gb200/triton_cache" -# NFS_INDUCTOR_CACHE="/home/sbak/experiments/n3-super-gb200/inductor_cache" # ENABLE_NFS_CACHE_STAGING=1 # NFS_TRITON_CACHE="/path/to//triton_cache" # NFS_INDUCTOR_CACHE="/path/to//inductor_cache" From d0fac001702801a510621bad168b6a0c9b7675e7 Mon Sep 17 00:00:00 2001 From: Seonmyeong Bak Date: Mon, 27 Apr 2026 10:30:34 -0700 Subject: [PATCH 15/21] fix(fr): restore logger level on analysis errors --- .../trace_analyzer/fr_attribution.py | 43 ++++++++++--------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/src/nvidia_resiliency_ext/attribution/trace_analyzer/fr_attribution.py b/src/nvidia_resiliency_ext/attribution/trace_analyzer/fr_attribution.py index f4584cb4..8fe6c134 100644 --- a/src/nvidia_resiliency_ext/attribution/trace_analyzer/fr_attribution.py +++ b/src/nvidia_resiliency_ext/attribution/trace_analyzer/fr_attribution.py @@ -279,31 +279,32 @@ def 
gather_head_nodes(grouped_pgs): original_level = logger.level if logger.getEffectiveLevel() > logging.INFO: logger.setLevel(logging.INFO) + try: + with capture_logs(logger.name) as output: - with capture_logs(logger.name) as output: - - def print_ranks_in_pgs(head_nodes, pg_dict, missing_or_completed="Missing"): - logger.info( - f"{'PGID':<6} | {'Process Group Desc':<25} | {'Op Type':<10} | {'Size':<8} \ - | {'Dtype':<8} | {missing_or_completed} Ranks" - ) - for pg_idx in head_nodes: - entry = list(pg_dict[pg_idx][0]) - entry.remove(entry[-2]) - if missing_or_completed == "Missing": - ranks_to_print = entry[6] - else: - ranks_to_print = entry[5] + def print_ranks_in_pgs(head_nodes, pg_dict, missing_or_completed="Missing"): logger.info( - f"{entry[0]:<6} | {entry[1]:<25} | {entry[2]:<10} | {entry[3]:<8} \ - | {entry[4]:<8} | {ranks_to_print}" + f"{'PGID':<6} | {'Process Group Desc':<25} | {'Op Type':<10} | {'Size':<8} \ + | {'Dtype':<8} | {missing_or_completed} Ranks" ) + for pg_idx in head_nodes: + entry = list(pg_dict[pg_idx][0]) + entry.remove(entry[-2]) + if missing_or_completed == "Missing": + ranks_to_print = entry[6] + else: + ranks_to_print = entry[5] + logger.info( + f"{entry[0]:<6} | {entry[1]:<25} | {entry[2]:<10} | {entry[3]:<8} \ + | {entry[4]:<8} | {ranks_to_print}" + ) - if head_nodes_missing: - logger.debug(f"head_nodes_missing: {head_nodes_missing}") - print_ranks_in_pgs(head_nodes_missing, missing_pg, "Missing") - analysis_output = output.getvalue() - logger.setLevel(original_level) + if head_nodes_missing: + logger.debug(f"head_nodes_missing: {head_nodes_missing}") + print_ranks_in_pgs(head_nodes_missing, missing_pg, "Missing") + analysis_output = output.getvalue() + finally: + logger.setLevel(original_level) return analysis_output async def collective_analysis(self, analysis_output: str) -> Optional[str]: From 950b97b2e9cecea53531d8c4c4563ad308bf4983 Mon Sep 17 00:00:00 2001 From: Seonmyeong Bak Date: Mon, 27 Apr 2026 10:44:04 -0700 Subject: 
[PATCH 16/21] refactor(skills): require local env for fault loop --- .../skills/nvrx-attr/fault-injection-loop/SKILL.md | 5 +++-- .../skills/nvrx-attr/scripts/l4_gb200_reduced.sh | 9 ++++++--- .../skills/nvrx-attr/scripts/n3_super_gb200_fi.sh | 9 ++++++--- .../skills/nvrx-attr/scripts/prepare_node_alloc.sh | 9 ++++++--- .../skills/nvrx-attr/scripts/run_session.sh | 9 ++++++--- .../skills/nvrx-attr/scripts/watch_and_analyze.sh | 9 ++++++--- 6 files changed, 33 insertions(+), 17 deletions(-) diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md b/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md index 3cb4a93b..1598f221 100644 --- a/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md @@ -101,8 +101,9 @@ cp scripts/user.env.example scripts/user.env ``` Then edit `scripts/user.env` with cluster-specific settings. This file is -sourced by `run_session.sh`, `prepare_node_alloc.sh`, `watch_and_analyze.sh`, and -`l4_gb200_reduced.sh`, and it is intended to stay local and untracked. +sourced by `run_session.sh`, `prepare_node_alloc.sh`, `watch_and_analyze.sh`, +`l4_gb200_reduced.sh`, and `n3_super_gb200_fi.sh`. It is required for this skill +to run and is intended to stay local and untracked. Recommended contents: diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/l4_gb200_reduced.sh b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/l4_gb200_reduced.sh index f91f99ce..319053fc 100644 --- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/l4_gb200_reduced.sh +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/l4_gb200_reduced.sh @@ -21,10 +21,13 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" USER_ENV_FILE="${SCRIPT_DIR}/user.env" NVRX_SRC_ROOT_DEFAULT="$(cd "${SCRIPT_DIR}/../../../.." && pwd)" NVRX_REPO_ROOT_DEFAULT="$(cd "${SCRIPT_DIR}/../../../../.." 
&& pwd)" -if [[ -f "${USER_ENV_FILE}" ]]; then - # shellcheck disable=SC1090 - source "${USER_ENV_FILE}" +if [[ ! -f "${USER_ENV_FILE}" ]]; then + echo "ERROR: required local config not found: ${USER_ENV_FILE}" >&2 + echo "Create it from ${SCRIPT_DIR}/user.env.example and fill in your local settings." >&2 + exit 1 fi +# shellcheck disable=SC1090 +source "${USER_ENV_FILE}" log_msg() { local msg="$1" diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/n3_super_gb200_fi.sh b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/n3_super_gb200_fi.sh index ff91debc..7f077575 100644 --- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/n3_super_gb200_fi.sh +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/n3_super_gb200_fi.sh @@ -19,10 +19,13 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" USER_ENV_FILE="${SCRIPT_DIR}/user.env" NVRX_SRC_ROOT_DEFAULT="$(cd "${SCRIPT_DIR}/../../../.." && pwd)" NVRX_REPO_ROOT_DEFAULT="$(cd "${SCRIPT_DIR}/../../../../.." && pwd)" -if [[ -f "${USER_ENV_FILE}" ]]; then - # shellcheck disable=SC1090 - source "${USER_ENV_FILE}" +if [[ ! -f "${USER_ENV_FILE}" ]]; then + echo "ERROR: required local config not found: ${USER_ENV_FILE}" >&2 + echo "Create it from ${SCRIPT_DIR}/user.env.example and fill in your local settings." 
>&2 + exit 1 fi +# shellcheck disable=SC1090 +source "${USER_ENV_FILE}" log_msg() { local msg="$1" diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/prepare_node_alloc.sh b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/prepare_node_alloc.sh index 9deac578..9ce92b67 100755 --- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/prepare_node_alloc.sh +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/prepare_node_alloc.sh @@ -34,10 +34,13 @@ CONTAINER_IMAGE_FROM_ENV="${CONTAINER_IMAGE-}" SHARED_TMP_BASE_DIR_FROM_ENV="${SHARED_TMP_BASE_DIR-}" WORKSPACE_HOST_PATH_FROM_ENV="${WORKSPACE_HOST_PATH-}" -if [[ -f "${USER_ENV_FILE}" ]]; then - # shellcheck disable=SC1090 - source "${USER_ENV_FILE}" +if [[ ! -f "${USER_ENV_FILE}" ]]; then + echo "ERROR: required local config not found: ${USER_ENV_FILE}" >&2 + echo "Create it from ${SCRIPT_DIR}/user.env.example and fill in your local settings." >&2 + exit 1 fi +# shellcheck disable=SC1090 +source "${USER_ENV_FILE}" if [[ -n "${ACCOUNT_FROM_ENV}" ]]; then ACCOUNT="${ACCOUNT_FROM_ENV}" fi diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/run_session.sh b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/run_session.sh index a8145d6c..df3b2c10 100755 --- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/run_session.sh +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/run_session.sh @@ -12,10 +12,13 @@ set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" USER_ENV_FILE="${SCRIPT_DIR}/user.env" -if [[ -f "${USER_ENV_FILE}" ]]; then - # shellcheck disable=SC1090 - source "${USER_ENV_FILE}" +if [[ ! -f "${USER_ENV_FILE}" ]]; then + echo "ERROR: required local config not found: ${USER_ENV_FILE}" >&2 + echo "Create it from ${SCRIPT_DIR}/user.env.example and fill in your local settings." 
>&2 + exit 1 fi +# shellcheck disable=SC1090 +source "${USER_ENV_FILE}" WORKLOAD="${WORKLOAD:-llama4_scout}" # ---- Phase 1: submit and wait for all experiments ---- diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/watch_and_analyze.sh b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/watch_and_analyze.sh index b3cc1df2..249e8606 100755 --- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/watch_and_analyze.sh +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/watch_and_analyze.sh @@ -14,10 +14,13 @@ POLL_INTERVAL=30 SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" USER_ENV_FILE="${SCRIPT_DIR}/user.env" -if [[ -f "${USER_ENV_FILE}" ]]; then - # shellcheck disable=SC1090 - source "${USER_ENV_FILE}" +if [[ ! -f "${USER_ENV_FILE}" ]]; then + echo "ERROR: required local config not found: ${USER_ENV_FILE}" >&2 + echo "Create it from ${SCRIPT_DIR}/user.env.example and fill in your local settings." >&2 + exit 1 fi +# shellcheck disable=SC1090 +source "${USER_ENV_FILE}" SKILL_DIR="$(dirname "${SCRIPT_DIR}")" NVRX_SRC_DIR="$(cd "${SKILL_DIR}/../../.." && pwd)" From 269a3e3f46be5e1ec22910e15d7f069a568b65db Mon Sep 17 00:00:00 2001 From: Seonmyeong Bak Date: Mon, 27 Apr 2026 14:26:37 -0700 Subject: [PATCH 17/21] fix(skills): normalize FR segment scoring --- .../skills/nvrx-attr/SKILL.md | 12 +++ .../nvrx-attr/fault-injection-loop/SKILL.md | 14 +-- .../nvrx-attr/scripts/score_attribution.py | 98 ++++++++++++++++--- .../skills/nvrx-attr/scripts/user.env.example | 3 +- 4 files changed, 108 insertions(+), 19 deletions(-) diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/SKILL.md b/src/nvidia_resiliency_ext/skills/nvrx-attr/SKILL.md index 6884f96f..1f018b2a 100644 --- a/src/nvidia_resiliency_ext/skills/nvrx-attr/SKILL.md +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/SKILL.md @@ -50,3 +50,15 @@ full coalescing stack. 
- `logsage` package installed (required by `log_analysis`) - Package installed: `pip install nvidia-resiliency-ext` or `pip install -e .` from repo root - The fault-injection loop has only been validated with Megatron-LM training scripts + +## Fault-Loop Local Setup + +Before using `fault-injection-loop/`, create the local config file from the tracked +template and fill in your site-specific values: + +```bash +cp scripts/user.env.example scripts/user.env +``` + +The feedback-loop scripts require `src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/user.env` +to exist at runtime. Keep `user.env` local and untracked. diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md b/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md index 1598f221..79b3637e 100644 --- a/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md @@ -120,7 +120,7 @@ NVRX_LLM_MODEL="nvidia/nemotron-3-super-120b-a12b" NVRX_LLM_BASE_URL="https://integrate.api.nvidia.com/v1" JUDGE_MODEL="qwen/qwen3.5-397b-a17b" JUDGE_BASE_URL="https://integrate.api.nvidia.com/v1" -FR_RACK_SIZE=32 +FR_SEGMENT_SIZE=32 ``` Use `user.env` for stable site defaults such as partition, container image, and @@ -156,7 +156,7 @@ Environment variables: | `NVRX_LLM_BASE_URL` | `https://integrate.api.nvidia.com/v1` | Base URL for log-analysis | | `JUDGE_MODEL` | `qwen/qwen3.5-397b-a17b` | Model for judge scoring | | `JUDGE_BASE_URL` | `https://integrate.api.nvidia.com/v1` | Base URL for judge scoring | -| `FR_RACK_SIZE` | `32` | Ranks per rack for coarse FR scoring | +| `FR_SEGMENT_SIZE` | `32` | Ranks per segment for coarse FR scoring | | `SBATCH_SCRIPT` | `scripts/l4_gb200_reduced.sh` | Job script to submit | | `POOL` | _(default pool above)_ | Space-separated experiment triplets | @@ -255,7 +255,7 @@ analysis output, then returns structured JSON scores with a reasoning note. 
| **rank_primary** | `true` / `false` / `partial` | Injected rank is the primary root-cause in attribution | | **rank_any** | `true` / `false` | Injected rank mentioned anywhere in attribution | | **fault_described** | `true` / `false` / `partial` | Fault nature (hang/crash/signal/exception) correctly described | -| **fr_rank_correct** | `rank` / `node` / `rack` / `false` / `no_dumps` | FR analysis narrows correctly to the injected rank, node, rack, or fails to narrow usefully | +| **fr_rank_correct** | `rank` / `node` / `segment` / `false` / `no_dumps` | FR analysis narrows correctly to the injected rank, exactly one `GPUS_PER_NODE` rank block containing that rank, the configured `FR_SEGMENT_SIZE` rank block containing the injected rank, or fails to narrow usefully | | **judge_notes** | string | One-sentence summary of the main gap or confirmation | The judge is given: @@ -264,10 +264,10 @@ The judge is given: 3. Filtered raw log (last 400 lines, same `exclude_nvrx_logs` filtering as logsage) 4. Raw logsage stdout (5-field text format) 5. Raw FR analysis table output from `fr_attribution.py --fr-path ... -p "_dump_*"` -6. `GPUS_PER_NODE` and `FR_RACK_SIZE` to map the injected rank to node and rack scopes for FR scoring +6. `GPUS_PER_NODE` and `FR_SEGMENT_SIZE` to map the injected rank to exact node-sized and segment-sized scopes for FR scoring Default judge model: `qwen/qwen3.5-397b-a17b`. Override with `--model` in `score_attribution.py`. -Default rack size for FR scope scoring: `32` ranks. Override with `FR_RACK_SIZE`. +Default segment size for FR scope scoring: `32` ranks. Override with `FR_SEGMENT_SIZE`. 
--- @@ -298,8 +298,8 @@ Common failure mode patterns and their meaning: | `fault_described=partial` for crash types | Crash keywords present but fault type not specifically named | | `restart_correct=false` for GPU_ERROR | LLM conflating hardware error with recoverable hang | | `fr_rank_correct=no_dumps` | NCCL watchdog did not fire before job ended — adjust `TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC` | -| `fr_rank_correct=node` | FR isolated the correct node but not the exact rank | -| `fr_rank_correct=rack` | FR isolated the correct rack-sized rank group but not the exact node/rank | +| `fr_rank_correct=node` | FR isolated exactly one `GPUS_PER_NODE` rank block containing the injected rank, but not the exact rank | +| `fr_rank_correct=segment` | FR isolated the configured `FR_SEGMENT_SIZE` rank block containing the injected rank, but not the exact node/rank | --- diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/score_attribution.py b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/score_attribution.py index b699096f..16c6faec 100644 --- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/score_attribution.py +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/score_attribution.py @@ -22,6 +22,7 @@ import json import logging import os +import re import sys from langchain_openai import ChatOpenAI @@ -40,7 +41,7 @@ # Default judge model — override with --model DEFAULT_JUDGE_MODEL = "qwen/qwen3.5-397b-a17b" DEFAULT_GPUS_PER_NODE = int(os.getenv("GPUS_PER_NODE", "4")) -DEFAULT_FR_RACK_SIZE = int(os.getenv("FR_RACK_SIZE", "32")) +DEFAULT_FR_SEGMENT_SIZE = int(os.getenv("FR_SEGMENT_SIZE", os.getenv("FR_RACK_SIZE", "32"))) # Expected restart decision and rationale per fault type _RESTART_TABLE = { @@ -95,6 +96,64 @@ def load_log_excerpt(log_path, max_lines=400): return f"(could not read log file: {exc})" +def parse_fr_missing_ranks(fr_output: str) -> set[int]: + if not fr_output or fr_output.strip() in ("", "no_dumps", "no results", "run_invalid"): + 
return set() + + ranks: set[int] = set() + for line in fr_output.splitlines(): + if "|" not in line or "Missing Ranks" in line: + continue + parts = [part.strip() for part in line.split("|")] + if len(parts) < 6: + continue + last_col = parts[-1] + for match in re.finditer(r"\d+", last_col): + ranks.add(int(match.group(0))) + return ranks + + +def normalize_fr_rank_correct( + raw_label: str, + fr_output: str, + rank: int, + total_ranks: int, + gpus_per_node: int, + segment_size: int, +) -> str: + label = (raw_label or "").strip().lower() + if label in {"n/a", ""}: + return raw_label + if label == "rack": + label = "segment" + if label == "no_dumps": + return "no_dumps" + + fr_ranks = parse_fr_missing_ranks(fr_output) + if not fr_ranks: + return "no_dumps" + + node_start = (rank // gpus_per_node) * gpus_per_node + node_end = min(node_start + gpus_per_node - 1, total_ranks - 1) + segment_start = (rank // segment_size) * segment_size + segment_end = min(segment_start + segment_size - 1, total_ranks - 1) + + in_node = all(node_start <= fr_rank <= node_end for fr_rank in fr_ranks) + in_segment = all(segment_start <= fr_rank <= segment_end for fr_rank in fr_ranks) + + if label == "node": + if in_node: + return "node" + if in_segment: + return "segment" + return "false" + + if label == "segment": + return "segment" if in_segment else "false" + + return label + + def build_judge_prompt( fault_type, rank, @@ -105,12 +164,12 @@ def build_judge_prompt( fr_output, log_excerpt, gpus_per_node, - rack_size, + segment_size, ): total_ranks = nodes * gpus_per_node node_index = rank // gpus_per_node - rack_start = (rank // rack_size) * rack_size - rack_end = min(rack_start + rack_size - 1, total_ranks - 1) + segment_start = (rank // segment_size) * segment_size + segment_end = min(segment_start + segment_size - 1, total_ranks - 1) expected_restart, restart_rationale = _RESTART_TABLE.get( fault_type, ("unknown", "unknown fault type") ) @@ -141,7 +200,7 @@ def build_judge_prompt( - 
Fault type : {fault_type} - Injected rank : {rank} (global rank index, 0-based; total ranks = {total_ranks}) - Injected node : {node_index} (using {gpus_per_node} GPUs per node) -- Injected rack : ranks {rack_start}-{rack_end} (using rack size {rack_size}) +- Injected segment : ranks {segment_start}-{segment_end} (using segment size {segment_size}) - Injected at iteration : {iter_} - Cluster : {nodes} nodes × {gpus_per_node} GPUs = {total_ranks} total ranks @@ -152,7 +211,7 @@ def build_judge_prompt( - FR scope scoring: - "rank" if FR points directly to rank {rank} - "node" if FR does not isolate rank {rank} but correctly narrows to node {node_index} - - "rack" if FR does not isolate rank {rank} or node {node_index} but correctly narrows to rack ranks {rack_start}-{rack_end} + - "segment" if FR does not isolate rank {rank} or node {node_index} but correctly narrows to segment ranks {segment_start}-{segment_end} - "false" if FR points elsewhere or is not useful - "no_dumps" if there is no actionable FR output @@ -182,10 +241,10 @@ def build_judge_prompt( Values: "true" | "false" | "partial" (category right but specifics wrong) 5. **fr_rank_correct** — How precise is the FR analysis output? - Values: "rank" | "node" | "rack" | "false" | "no_dumps" + Values: "rank" | "node" | "segment" | "false" | "no_dumps" Use "rank" only if rank {rank} is explicitly implicated. Use "node" only if the FR output narrows correctly to node {node_index} but not the exact rank. - Use "rack" only if the FR output narrows correctly to rack ranks {rack_start}-{rack_end} but not the exact node or rank. + Use "segment" only if the FR output narrows correctly to segment ranks {segment_start}-{segment_end} but not the exact node or rank. Use "false" if the FR output points somewhere else, is misleading, or does not narrow correctly. Use "no_dumps" if there is no actionable FR output. 
@@ -243,7 +302,7 @@ def score(args): fr_output=args.fr_output, log_excerpt=log_excerpt, gpus_per_node=args.gpus_per_node, - rack_size=args.rack_size, + segment_size=args.segment_size, ) # build_judge_prompt returns a dict directly for invalid runs (no LLM call needed) @@ -259,6 +318,15 @@ def score(args): text = "\n".join(line for line in lines if not line.startswith("```")).strip() result = json.loads(text) + total_ranks = args.nodes * args.gpus_per_node + result["fr_rank_correct"] = normalize_fr_rank_correct( + raw_label=result.get("fr_rank_correct", ""), + fr_output=args.fr_output, + rank=args.rank, + total_ranks=total_ranks, + gpus_per_node=args.gpus_per_node, + segment_size=args.segment_size, + ) return result @@ -284,13 +352,21 @@ def main(): default=DEFAULT_GPUS_PER_NODE, help="GPUs per node for rank-to-node mapping", ) + parser.add_argument( + "--segment-size", + type=int, + default=DEFAULT_FR_SEGMENT_SIZE, + help="Ranks per segment for coarse FR scope scoring", + ) parser.add_argument( "--rack-size", type=int, - default=DEFAULT_FR_RACK_SIZE, - help="Ranks per rack for coarse FR scope scoring", + default=None, + help="Deprecated alias for --segment-size", ) args = parser.parse_args() + if args.rack_size is not None: + args.segment_size = args.rack_size try: result = score(args) diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/user.env.example b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/user.env.example index cf8f8f5e..cf060f45 100644 --- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/user.env.example +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/user.env.example @@ -41,4 +41,5 @@ CONTAINER_IMAGE="nvcr.io/nvidia/nemo:26.04" # NVRX_LLM_BASE_URL="https://integrate.api.nvidia.com/v1" # JUDGE_MODEL="qwen/qwen3.5-397b-a17b" # JUDGE_BASE_URL="https://integrate.api.nvidia.com/v1" -# FR_RACK_SIZE=32 +# FR_SEGMENT_SIZE=32 +# FR_RACK_SIZE=32 # deprecated alias From 59ebefe855091d06c58c21c1660b38ce96c23d4a Mon Sep 17 00:00:00 
2001 From: Seonmyeong Bak Date: Tue, 28 Apr 2026 13:27:55 -0700 Subject: [PATCH 18/21] fix(log-analysis): handle zero LLM retries --- .../attribution/log_analyzer/nvrx_logsage.py | 23 ++++++++---- .../unit/test_nvrx_logsage_retry.py | 37 +++++++++++++++++++ 2 files changed, 53 insertions(+), 7 deletions(-) create mode 100644 tests/attribution/unit/test_nvrx_logsage_retry.py diff --git a/src/nvidia_resiliency_ext/attribution/log_analyzer/nvrx_logsage.py b/src/nvidia_resiliency_ext/attribution/log_analyzer/nvrx_logsage.py index 30d90e8c..39a5177c 100644 --- a/src/nvidia_resiliency_ext/attribution/log_analyzer/nvrx_logsage.py +++ b/src/nvidia_resiliency_ext/attribution/log_analyzer/nvrx_logsage.py @@ -167,6 +167,14 @@ def _retry_return_application_errors( def _with_exponential_backoff(llm_call, checkpoint_saved: bool) -> tuple[str, str, str, str, str]: retries, initial_backoff, max_backoff, jitter = _log_analysis_retry_config() backoff = initial_backoff + last_error = "no attempts made (retries=0)" + fallback = ( + ATTR_LLM_FAILURE, + ATTR_LLM_FAILURE, + ATTR_LLM_FAILURE, + ATTR_LLM_FAILURE, + str(checkpoint_saved), + ) for attempt in range(1, retries + 1): try: @@ -184,16 +192,17 @@ def _with_exponential_backoff(llm_call, checkpoint_saved: bool) -> tuple[str, st retries, last_error, ) - return ( - ATTR_LLM_FAILURE, - ATTR_LLM_FAILURE, - ATTR_LLM_FAILURE, - ATTR_LLM_FAILURE, - str(checkpoint_saved), - ) + return fallback backoff = _sleep_with_backoff(attempt, retries, backoff, max_backoff, jitter) + logger.error( + "Log-analysis LLM failed after %d attempts; last error: %s", + retries, + last_error, + ) + return fallback + class NVRxLogAnalyzer(NVRxAttribution): def __init__(self, args: Union[argparse.Namespace, Mapping[str, Any]]): diff --git a/tests/attribution/unit/test_nvrx_logsage_retry.py b/tests/attribution/unit/test_nvrx_logsage_retry.py new file mode 100644 index 00000000..7dd8ce94 --- /dev/null +++ b/tests/attribution/unit/test_nvrx_logsage_retry.py @@ -0,0 
+1,37 @@ +import importlib +import os +import unittest +from unittest.mock import patch + + +try: + nvrx_logsage = importlib.import_module( + "nvidia_resiliency_ext.attribution.log_analyzer.nvrx_logsage" + ) + IMPORT_ERROR = None +except ImportError as exc: + nvrx_logsage = None + IMPORT_ERROR = exc + + +@unittest.skipIf(nvrx_logsage is None, f"missing optional dependency: {IMPORT_ERROR}") +class TestNVRxLogSageRetry(unittest.TestCase): + def test_with_exponential_backoff_returns_failure_when_retries_zero(self): + def llm_call(): + raise AssertionError("llm_call should not run when retries=0") + + with patch.dict(os.environ, {"NVRX_LOG_ANALYSIS_LLM_RETRIES": "0"}): + self.assertEqual( + nvrx_logsage._with_exponential_backoff(llm_call, checkpoint_saved=True), + ( + nvrx_logsage.ATTR_LLM_FAILURE, + nvrx_logsage.ATTR_LLM_FAILURE, + nvrx_logsage.ATTR_LLM_FAILURE, + nvrx_logsage.ATTR_LLM_FAILURE, + "True", + ), + ) + + +if __name__ == "__main__": + unittest.main() From 0e1d6db6693c44b67702190581dd66572c3d89f7 Mon Sep 17 00:00:00 2001 From: Seonmyeong Bak Date: Tue, 28 Apr 2026 13:51:23 -0700 Subject: [PATCH 19/21] fix(skills): use LLM API key names --- src/nvidia_resiliency_ext/skills/nvrx-attr/SKILL.md | 4 ++-- .../skills/nvrx-attr/fault-injection-loop/SKILL.md | 8 ++++---- .../skills/nvrx-attr/fr-analysis/SKILL.md | 4 ++-- .../skills/nvrx-attr/log-analysis/SKILL.md | 4 ++-- .../skills/nvrx-attr/scripts/score_attribution.py | 6 +++--- .../skills/nvrx-attr/scripts/user.env.example | 6 +++--- 6 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/SKILL.md b/src/nvidia_resiliency_ext/skills/nvrx-attr/SKILL.md index 1f018b2a..5da1c9db 100644 --- a/src/nvidia_resiliency_ext/skills/nvrx-attr/SKILL.md +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/SKILL.md @@ -4,7 +4,7 @@ description: > Orchestration layer over nvidia_resiliency_ext attribution modules. 
Provides log-analysis, fr-analysis, and a Megatron-LM-oriented fault-injection feedback loop for benchmarking attribution quality on SLURM workloads. -compatibility: Requires Python 3.10+, nvidia-resiliency-ext installed, logsage, langchain-openai, and NVIDIA_API_KEY (env var, NVIDIA_API_KEY_FILE, or ~/.nvidia_api_key). The fault-injection loop has only been validated with Megatron-LM workloads. +compatibility: Requires Python 3.10+, nvidia-resiliency-ext installed, logsage, langchain-openai, and LLM_API_KEY (env var, LLM_API_KEY_FILE, or ~/.llm_api_key). The fault-injection loop has only been validated with Megatron-LM workloads. metadata: author: nvidia --- @@ -45,7 +45,7 @@ full coalescing stack. ## Common prerequisites -- `NVIDIA_API_KEY` environment variable, `NVIDIA_API_KEY_FILE`, or `~/.nvidia_api_key` +- `LLM_API_KEY` environment variable, `LLM_API_KEY_FILE`, or `~/.llm_api_key` - `langchain-openai` installed - `logsage` package installed (required by `log_analysis`) - Package installed: `pip install nvidia-resiliency-ext` or `pip install -e .` from repo root diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md b/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md index 79b3637e..063bb1cb 100644 --- a/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/fault-injection-loop/SKILL.md @@ -8,7 +8,7 @@ description: > After all jobs complete, runs /log-analysis and /fr-analysis on every experiment, scores attribution vs. ground truth, aggregates gaps, and iterates on attribution modules to close them. -compatibility: Requires SLURM cluster access, sbatch, NVIDIA_API_KEY, langchain-openai, logsage, and nvidia-resiliency-ext installed. This workflow has only been validated with Megatron-LM workloads. +compatibility: Requires SLURM cluster access, sbatch, LLM_API_KEY, langchain-openai, logsage, and nvidia-resiliency-ext installed. 
This workflow has only been validated with Megatron-LM workloads. metadata: author: nvidia sub-skills: [log-analysis, fr-analysis] @@ -114,8 +114,8 @@ MEGATRON_REPO_HOST_PATH="${HOME}/megatron-lm-main" SHARED_TMP_BASE_DIR="${HOME}/tmp" WORKSPACE_HOST_PATH="${HOME}/tmp" CONTAINER_IMAGE="nvcr.io/nvidia/nemo:26.04" -NVIDIA_API_KEY_FILE="${HOME}/.nvidia_api_key" -JUDGE_API_KEY_FILE="${HOME}/.nvidia_api_key" +LLM_API_KEY_FILE="${HOME}/.llm_api_key" +JUDGE_API_KEY_FILE="${HOME}/.llm_api_key" NVRX_LLM_MODEL="nvidia/nemotron-3-super-120b-a12b" NVRX_LLM_BASE_URL="https://integrate.api.nvidia.com/v1" JUDGE_MODEL="qwen/qwen3.5-397b-a17b" @@ -150,7 +150,7 @@ Environment variables: | `SHARED_TMP_BASE_DIR` | `${HOME}/tmp` | Shared filesystem path used for cross-step coordination | | `WORKSPACE_HOST_PATH` | `${HOME}/tmp` | Host path mounted at `/workspace` inside the container | | `CONTAINER_IMAGE` | `nvcr.io/nvidia/nemo:26.04` | Container image used by the workload script | -| `NVIDIA_API_KEY_FILE` | _unset_ | File containing the log-analysis API key | +| `LLM_API_KEY_FILE` | _unset_ | File containing the log-analysis API key | | `JUDGE_API_KEY_FILE` | _unset_ | File containing the judge API key | | `NVRX_LLM_MODEL` | `nvidia/nemotron-3-super-120b-a12b` | Model for log-analysis | | `NVRX_LLM_BASE_URL` | `https://integrate.api.nvidia.com/v1` | Base URL for log-analysis | diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/fr-analysis/SKILL.md b/src/nvidia_resiliency_ext/skills/nvrx-attr/fr-analysis/SKILL.md index 17cc7de5..8fcf559c 100644 --- a/src/nvidia_resiliency_ext/skills/nvrx-attr/fr-analysis/SKILL.md +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/fr-analysis/SKILL.md @@ -5,7 +5,7 @@ description: > isolate the responsible ranks using CollectiveAnalyzer. Use when a distributed training job hangs due to an NCCL collective timeout and FR dump files are available. Detects the wavefront process group where collectives diverge and returns the root-cause suspect ranks. 
-compatibility: Requires PyTorch NCCL FR dumps (TORCH_NCCL_TRACE_BUFFER_SIZE > 0 must be set during training). NVIDIA_API_KEY and langchain-openai are required only when using --llm-analyze. +compatibility: Requires PyTorch NCCL FR dumps (TORCH_NCCL_TRACE_BUFFER_SIZE > 0 must be set during training). LLM_API_KEY and langchain-openai are required only when using --llm-analyze. metadata: entry-point: CollectiveAnalyzer script: scripts/fr_attribution.py @@ -108,6 +108,6 @@ or triggered automatically on NCCL timeout. ## Prerequisites - FR dump files produced by PyTorch NCCL (set `TORCH_NCCL_TRACE_BUFFER_SIZE` > 0) -- `NVIDIA_API_KEY` required only when using `--llm-analyze` +- `LLM_API_KEY` required only when using `--llm-analyze` - `langchain-openai` required only when using `--llm-analyze` - `FR_DEBUG=1` env var enables verbose debug logging in the script diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/log-analysis/SKILL.md b/src/nvidia_resiliency_ext/skills/nvrx-attr/log-analysis/SKILL.md index e793d5de..a1199edc 100644 --- a/src/nvidia_resiliency_ext/skills/nvrx-attr/log-analysis/SKILL.md +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/log-analysis/SKILL.md @@ -5,7 +5,7 @@ description: > NVRxLogAnalyzer. Use when you have a SLURM training job log and need to determine why the job failed and whether it should be restarted. Performs per-cycle chunking, fast-path pattern matching, and LLM-based classification. -compatibility: Requires NVIDIA_API_KEY, langchain-openai, and logsage packages installed. nvidia-resiliency-ext must be installed. +compatibility: Requires LLM_API_KEY, langchain-openai, and logsage packages installed. nvidia-resiliency-ext must be installed. 
metadata: entry-point: NVRxLogAnalyzer script: scripts/nvrx_logsage.py @@ -108,5 +108,5 @@ fields joined by `\n`: ## Prerequisites -- `NVIDIA_API_KEY` set (env var, `NVIDIA_API_KEY_FILE`, or `~/.nvidia_api_key`) +- `LLM_API_KEY` set (env var, `LLM_API_KEY_FILE`, or `~/.llm_api_key`) - `langchain-openai` and `logsage` packages installed diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/score_attribution.py b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/score_attribution.py index 16c6faec..fcc5c3f3 100644 --- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/score_attribution.py +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/score_attribution.py @@ -28,7 +28,7 @@ from langchain_openai import ChatOpenAI sys.path.insert(0, str(__import__("pathlib").Path(__file__).resolve().parents[4])) -from nvidia_resiliency_ext.attribution.api_keys import load_nvidia_api_key +from nvidia_resiliency_ext.attribution.api_keys import load_llm_api_key from nvidia_resiliency_ext.attribution.svc.config import DEFAULT_LLM_BASE_URL logger = logging.getLogger(__name__) @@ -273,11 +273,11 @@ def score(args): except OSError: api_key = "" if not api_key: - api_key = load_nvidia_api_key() + api_key = load_llm_api_key() if not api_key: raise ValueError( "Judge API key not found. Set JUDGE_API_KEY/JUDGE_API_KEY_FILE, " - "or NVIDIA_API_KEY/NVIDIA_API_KEY_FILE, or create ~/.nvidia_api_key" + "or LLM_API_KEY/LLM_API_KEY_FILE, or create ~/.llm_api_key" ) base_url = os.getenv("JUDGE_BASE_URL", "").strip() or args.base_url diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/user.env.example b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/user.env.example index cf060f45..c4b05bc9 100644 --- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/user.env.example +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/user.env.example @@ -33,10 +33,10 @@ CONTAINER_IMAGE="nvcr.io/nvidia/nemo:26.04" # Log-analysis / judge LLM settings # Keep these local. 
Prefer *_API_KEY_FILE over inline secrets. -# NVIDIA_API_KEY="..." +# LLM_API_KEY="..." # JUDGE_API_KEY="..." -# NVIDIA_API_KEY_FILE="${HOME}/.nvidia_api_key" -# JUDGE_API_KEY_FILE="${HOME}/.nvidia_api_key" +# LLM_API_KEY_FILE="${HOME}/.llm_api_key" +# JUDGE_API_KEY_FILE="${HOME}/.llm_api_key" # NVRX_LLM_MODEL="nvidia/nemotron-3-super-120b-a12b" # NVRX_LLM_BASE_URL="https://integrate.api.nvidia.com/v1" # JUDGE_MODEL="qwen/qwen3.5-397b-a17b" From 26e642987a6bade9fb5b620bd2533621d7d2d5c3 Mon Sep 17 00:00:00 2001 From: Seonmyeong Bak Date: Tue, 28 Apr 2026 14:35:41 -0700 Subject: [PATCH 20/21] fix(skills): resolve user env for spooled jobs --- .../nvrx-attr/scripts/l4_gb200_reduced.sh | 21 ++++++++++++++++++- .../nvrx-attr/scripts/n3_super_gb200_fi.sh | 21 ++++++++++++++++++- .../nvrx-attr/scripts/prepare_node_alloc.sh | 2 +- 3 files changed, 41 insertions(+), 3 deletions(-) diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/l4_gb200_reduced.sh b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/l4_gb200_reduced.sh index 319053fc..4da69f2a 100644 --- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/l4_gb200_reduced.sh +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/l4_gb200_reduced.sh @@ -17,7 +17,26 @@ #SBATCH --exclusive #SBATCH --mem=0 -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +resolve_script_dir() { + local candidate + + for candidate in \ + "${NVRX_ATTR_SCRIPT_DIR:-}" \ + "$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)" \ + "${SLURM_SUBMIT_DIR:-}" \ + "${SLURM_SUBMIT_DIR:-}/scripts" \ + "${SLURM_SUBMIT_DIR:-}/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts" + do + if [[ -n "${candidate}" && -f "${candidate}/user.env" ]]; then + cd "${candidate}" && pwd + return + fi + done + + cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd +} + +SCRIPT_DIR="$(resolve_script_dir)" USER_ENV_FILE="${SCRIPT_DIR}/user.env" NVRX_SRC_ROOT_DEFAULT="$(cd "${SCRIPT_DIR}/../../../.." 
&& pwd)" NVRX_REPO_ROOT_DEFAULT="$(cd "${SCRIPT_DIR}/../../../../.." && pwd)" diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/n3_super_gb200_fi.sh b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/n3_super_gb200_fi.sh index 7f077575..f1e7d818 100644 --- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/n3_super_gb200_fi.sh +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/n3_super_gb200_fi.sh @@ -15,7 +15,26 @@ #SBATCH --exclusive #SBATCH --mem=0 -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +resolve_script_dir() { + local candidate + + for candidate in \ + "${NVRX_ATTR_SCRIPT_DIR:-}" \ + "$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)" \ + "${SLURM_SUBMIT_DIR:-}" \ + "${SLURM_SUBMIT_DIR:-}/scripts" \ + "${SLURM_SUBMIT_DIR:-}/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts" + do + if [[ -n "${candidate}" && -f "${candidate}/user.env" ]]; then + cd "${candidate}" && pwd + return + fi + done + + cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd +} + +SCRIPT_DIR="$(resolve_script_dir)" USER_ENV_FILE="${SCRIPT_DIR}/user.env" NVRX_SRC_ROOT_DEFAULT="$(cd "${SCRIPT_DIR}/../../../.." && pwd)" NVRX_REPO_ROOT_DEFAULT="$(cd "${SCRIPT_DIR}/../../../../.." 
&& pwd)" diff --git a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/prepare_node_alloc.sh b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/prepare_node_alloc.sh index 9ce92b67..c2d43752 100755 --- a/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/prepare_node_alloc.sh +++ b/src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/prepare_node_alloc.sh @@ -179,7 +179,7 @@ submit_one() { --mem=0 \ --output="${EXPERIMENT_DIR}/logs/slurm/%j.launch.out" \ --error="${EXPERIMENT_DIR}/logs/slurm/%j.launch.err" \ - --export=ALL,FAULT_TYPE="${FAULT_TYPE}",FAULT_RANK="${RANK}",FAULT_AT_ITER="${ITER}",GPUS_PER_NODE="${GPUS_PER_NODE}",EXPERIMENT_DIR="${EXPERIMENT_DIR}",BASE_EXPERIMENTS_DIR="${BASE_EXPERIMENTS_DIR}",MEGATRON_REPO_HOST_PATH="${MEGATRON_REPO_HOST_PATH:-}",CONTAINER_IMAGE="${CONTAINER_IMAGE:-}",SHARED_TMP_BASE_DIR="${SHARED_TMP_BASE_DIR:-}",WORKSPACE_HOST_PATH="${WORKSPACE_HOST_PATH:-}" \ + --export=ALL,FAULT_TYPE="${FAULT_TYPE}",FAULT_RANK="${RANK}",FAULT_AT_ITER="${ITER}",GPUS_PER_NODE="${GPUS_PER_NODE}",EXPERIMENT_DIR="${EXPERIMENT_DIR}",BASE_EXPERIMENTS_DIR="${BASE_EXPERIMENTS_DIR}",MEGATRON_REPO_HOST_PATH="${MEGATRON_REPO_HOST_PATH:-}",CONTAINER_IMAGE="${CONTAINER_IMAGE:-}",SHARED_TMP_BASE_DIR="${SHARED_TMP_BASE_DIR:-}",WORKSPACE_HOST_PATH="${WORKSPACE_HOST_PATH:-}",NVRX_ATTR_SCRIPT_DIR="${SCRIPT_DIR}" \ --parsable ) if [[ -n "${ACCOUNT}" ]]; then From d4a3999b80bf0e0a456bfbf73176f66b5a583de1 Mon Sep 17 00:00:00 2001 From: Seonmyeong Bak Date: Tue, 28 Apr 2026 15:11:39 -0700 Subject: [PATCH 21/21] style(tests): sort nvrx logsage retry imports --- tests/attribution/unit/test_nvrx_logsage_retry.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/attribution/unit/test_nvrx_logsage_retry.py b/tests/attribution/unit/test_nvrx_logsage_retry.py index 7dd8ce94..4c5e5890 100644 --- a/tests/attribution/unit/test_nvrx_logsage_retry.py +++ b/tests/attribution/unit/test_nvrx_logsage_retry.py @@ -3,7 +3,6 @@ import unittest from unittest.mock import patch - try: 
nvrx_logsage = importlib.import_module( "nvidia_resiliency_ext.attribution.log_analyzer.nvrx_logsage"