NVIDIA · sbak5 · Apr 29, 2026 · Apr 22, 2026 · Apr 22, 2026 · Apr 23, 2026
diff --git a/.gitignore b/.gitignore
@@ -11,3 +11,4 @@ ft_state.json
 *_pb2.pyi
 *_pb2_grpc.py
 .idea/
+src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/user.env
diff --git a/src/nvidia_resiliency_ext/attribution/log_analyzer/nvrx_logsage.py b/src/nvidia_resiliency_ext/attribution/log_analyzer/nvrx_logsage.py
@@ -1,7 +1,9 @@
 import argparse
 import logging
 import os
+import random
 import re
+import time
 from typing import Any, Dict, Mapping, Union
 
 from langchain_openai import ChatOpenAI
@@ -37,6 +39,7 @@
 ATTR_ERRORS_NOT_FOUND = "ERRORS NOT FOUND"
 ATTR_NO_LOGS = "NO LOGS"
 ATTR_SLURM_CANCELLED_DUE_TO_PREEMPTION = "SLURM CANCELLED DUE TO PREEMPTION"
+LOGSAGE_LLM_ENDPOINT_FAILED = "LLM ENDPOINT FAILED"
 
 
 MARKER_NEW_RUN_DIR_ADDED = "[sbatch_script]: New run dir added:"
@@ -108,6 +111,99 @@ def chunk_logs_strict(lines):
     return final_chunks
 
 
+def _log_analysis_retry_config() -> tuple[int, float, float, float]:
+    """Return (retries, initial backoff, max backoff, jitter); empty env values use defaults."""
+    retries = int(os.getenv("NVRX_LOG_ANALYSIS_LLM_RETRIES") or "3")
+    initial = float(os.getenv("NVRX_LOG_ANALYSIS_LLM_INITIAL_BACKOFF_SEC") or "1.0")  # seconds
+    cap = float(os.getenv("NVRX_LOG_ANALYSIS_LLM_MAX_BACKOFF_SEC") or "8.0")  # seconds
+    return retries, initial, cap, float(os.getenv("NVRX_LOG_ANALYSIS_LLM_JITTER_SEC") or "0.25")
+
+
+def _finished_status_name(status: Any) -> str:
+    return status.name if hasattr(status, "name") else status  # enum-like -> name, else unchanged
+
+
+def _sleep_with_backoff(
+    attempt: int, retries: int, backoff: float, max_backoff: float, jitter: float
+) -> float:
+    """Sleep for the capped backoff plus jitter; return the next backoff (doubled, capped).
+
+    The delay is clamped to zero because ``time.sleep`` raises ValueError on negative values.
+    """
+    sleep_for = max(0.0, min(backoff, max_backoff) + random.uniform(0.0, jitter))
+    message = "Retrying log-analysis LLM in %.2fs after attempt %d/%d"
+    logger.info(message, sleep_for, attempt, retries)
+    time.sleep(sleep_for)
+    return min(backoff * 2, max_backoff)
+
+
+def _retry_return_application_errors(
+    llm: ChatOpenAI, lines: list[str], cache_dict: LRUCache
+) -> ApplicationData:
+    """Run ``return_application_errors``, retrying with backoff while it reports an LLM failure.
+
+    All three arguments are forwarded unchanged to ``return_application_errors``.
+
+    Returns the first result whose status is not ``FINISHED_STATUS_LLM_FAILURE``, or the last
+    failed result once every attempt has been used.
+    """
+    retries, backoff, max_backoff, jitter = _log_analysis_retry_config()
+    # Callers always need an ApplicationData, so run at least once even when retries <= 0
+    # (an empty loop would leave ``app_data`` unbound at the final return).
+    attempts = max(1, retries)
+    for attempt in range(1, attempts + 1):
+        app_data = return_application_errors(llm, lines, cache_dict)
+        status_name = _finished_status_name(app_data.finished)
+        if status_name != FINISHED_STATUS_LLM_FAILURE:
+            return app_data
+        if attempt < attempts:
+            backoff = _sleep_with_backoff(attempt, attempts, backoff, max_backoff, jitter)
+    logger.error(
+        "Log-analysis extraction failed after %d attempts; last status: %s", attempts, status_name
+    )
+    return app_data
+
+
+def _with_exponential_backoff(llm_call, checkpoint_saved: bool) -> tuple[str, str, str, str, str]:
+    """Call ``llm_call`` until it yields a usable result, backing off between attempts.
+
+    A result is usable when it is truthy and none of its first four fields equals
+    ``LOGSAGE_LLM_ENDPOINT_FAILED``; an exception from ``llm_call`` counts as a failed attempt.
+
+    Args:
+        llm_call: Zero-argument callable whose result is returned on success.
+        checkpoint_saved: Stringified into the last field of the fallback tuple.
+
+    Returns:
+        The first usable result, or four ``ATTR_LLM_FAILURE`` fields followed by
+        ``str(checkpoint_saved)`` when every attempt fails or no attempt is configured.
+    """
+    retries, backoff, max_backoff, jitter = _log_analysis_retry_config()
+    failure_reason = "no attempts made (retries=0)"
+    fallback = (ATTR_LLM_FAILURE,) * 4 + (str(checkpoint_saved),)
+
+    for attempt in range(1, retries + 1):
+        try:
+            outcome = llm_call()
+            endpoint_failed = not outcome or any(
+                field == LOGSAGE_LLM_ENDPOINT_FAILED for field in outcome[:4]
+            )
+            if not endpoint_failed:
+                return outcome
+            failure_reason = LOGSAGE_LLM_ENDPOINT_FAILED
+        except Exception as exc:
+            failure_reason = str(exc)
+            logger.warning("Log-analysis LLM attempt %d/%d failed: %s", attempt, retries, exc)
+
+        if attempt < retries:
+            backoff = _sleep_with_backoff(attempt, retries, backoff, max_backoff, jitter)
+
+    logger.error(
+        "Log-analysis LLM failed after %d attempts; last error: %s", retries, failure_reason
+    )
+    return fallback
+
+
 class NVRxLogAnalyzer(NVRxAttribution):
     def __init__(self, args: Union[argparse.Namespace, Mapping[str, Any]]):
         from nvidia_resiliency_ext.attribution.api_keys import load_llm_api_key
@@ -213,7 +309,7 @@ async def analyze_logs(self) -> list[ApplicationData]:
                     current_chunk.append(line)
 
         output_list = [
-            return_application_errors(self.llm, lines, self.lru_cache)
+            _retry_return_application_errors(self.llm, lines, self.lru_cache)
             for cycle, lines in chunks.items()
         ]
         return output_list
@@ -248,7 +344,12 @@ async def llm_analyze(self, output_list: list[ApplicationData]) -> list[str]:
                 )
             else:
                 if len(output.application_errors_list_full):
-                    result.append(get_proposed_solution_cat(self.llm, output))
+                    result.append(
+                        _with_exponential_backoff(
+                            lambda: get_proposed_solution_cat(self.llm, output),
+                            checkpoint_saved=output.checkpoint_saved,
+                        )
+                    )
                 else:
                     if output.finished == FINISHED_STATUS_LLM_FAILURE:
                         result.append(
@@ -361,11 +462,24 @@ def main():
         action='store_true',
         help='Input is already per-cycle data (skip filtering and chunking)',
     )
+    parser.add_argument(
+        '--emit-stdout',
+        action='store_true',
+        help='Print final attribution payload to stdout for machine consumers',
+    )
 
     args = parser.parse_args()
 
     analyzer = NVRxLogAnalyzer(args)
-    analyzer.run_sync(args)
+    results = analyzer.run_sync(args)
+
+    if args.emit_stdout:
+        for result in results:
+            if not result:
+                continue
+            payload = result[0] if isinstance(result, tuple) else result
+            if payload:
+                print(payload)
 
 
 if __name__ == "__main__":

diff --git a/src/nvidia_resiliency_ext/attribution/trace_analyzer/fr_attribution.py b/src/nvidia_resiliency_ext/attribution/trace_analyzer/fr_attribution.py
@@ -34,6 +34,42 @@ def eprint(*args, **kwargs):
     print(*args, file=sys.stderr, **kwargs)
 
 
+def _parse_rank_list(rank_text: str) -> List[int]:
+    """Parse comma-separated rank ids, skipping blank or non-integer tokens."""
+    ranks = []
+    for token in rank_text.split(','):
+        # Drop enclosing brackets/braces/parentheses so "[0, 3]" parses like "0, 3".
+        token = token.strip().strip("[]{}()")
+        try:
+            ranks.append(int(token))
+        except ValueError:  # best effort: blank or non-numeric tokens are dropped
+            continue
+    return ranks
+
+
+def _extract_missing_ranks_from_table(text: str) -> List[int]:
+    """Collect sorted, unique rank ids from the last column of a "Missing Ranks" table.
+
+    Rows are only read after a header line that starts with "PGID" and contains
+    "Missing Ranks"; rows with fewer than six "|"-separated columns are ignored.
+    """
+    found = set()
+    header_seen = False
+
+    for raw_line in text.splitlines():
+        row = raw_line.strip()
+        if row.startswith("PGID") and "Missing Ranks" in row:
+            # Header line: rows that follow belong to the table.
+            header_seen = True
+        elif header_seen and "|" in row:
+            cells = row.split("|")
+            # A full row has at least six columns; the ranks are in the last one.
+            if len(cells) >= 6:
+                found.update(_parse_rank_list(cells[-1].strip()))
+
+    return sorted(found)
+
+
 @dataclass
 class Collective:
     """
@@ -134,12 +170,7 @@ async def print_output(self, attribution_result: Optional[str]):
                 hanging_ranks_str = hanging_ranks.group(1).strip()
                 hanging_ranks_list = list(map(int, hanging_ranks_str.split(',')))
         else:
-            for idx, line in enumerate(text.split('\n')):
-                line_list = line.split('|')
-                if len(line_list) >= 5:
-                    logger.info(line)
-                    if idx >= 1:
-                        hanging_ranks_list.append(line_list[5])
+            hanging_ranks_list = _extract_missing_ranks_from_table(text)
         hanging_ranks = f"hanging ranks: {hanging_ranks_list}"
         # Dict form preserves collective table text for MCP clients and FRAnalysisResult parity.
         return (
@@ -218,20 +249,18 @@ def build_collectives_to_order():
         # analyze collectives to find process groups with missing and completed ranks
         completed_pg, missing_pg = self.analyze_matches(verbose=bool(cfg.get("verbose")))
         grouped_missing_pgs = {}
-        grouped_completed_pgs = {}
 
         # if the dump file contains health check results, parse the health check results
         # and print them in a format
         if cfg.get("health_check"):
             self.print_node_health_status(verbose=bool(cfg.get("verbose")))
 
-        # group the process groups with missing and completed ranks
-        # by finding longest paths in the graph
+        # Group only process groups with missing ranks.
+        # Completed-rank summaries are not actionable for attribution and create
+        # misleading output in the feedback loop.
         grouped_missing_pgs = self.group_pgs(missing_pg)
-        if len(grouped_missing_pgs) == 0:
-            grouped_completed_pgs = self.group_pgs(completed_pg)
 
-        # gather the head node of each group with missing and completed ranks
+        # gather the head node of each group with missing ranks
         # the head node is the first node in the group
         # the missing ranks in the head node of the missing process groups
         # are considered to cause the other nodes in the group to hang
@@ -242,41 +271,40 @@ def gather_head_nodes(grouped_pgs):
             return head_nodes
 
         head_nodes_missing = None
-        head_nodes_completed = None
-        # Gather the head node of each group
+        # Gather the head node of each missing-rank group.
         if len(grouped_missing_pgs) > 0:
             head_nodes_missing = gather_head_nodes(grouped_missing_pgs)
             logger.debug(f"head_nodes of missing_pg: {head_nodes_missing}")
-        else:
-            head_nodes_completed = gather_head_nodes(grouped_completed_pgs)
-            logger.debug(f"head_nodes of completed_pg: {head_nodes_completed}")
         # Print the analysis output
-        with capture_logs() as output:
+        original_level = logger.level
+        if logger.getEffectiveLevel() > logging.INFO:
+            logger.setLevel(logging.INFO)
+        try:
+            with capture_logs(logger.name) as output:
 
-            def print_ranks_in_pgs(head_nodes, pg_dict, missing_or_completed="Missing"):
-                logger.info(
-                    f"{'PGID':<6} | {'Process Group Desc':<25} | {'Op Type':<10} | {'Size':<8} \
-                        | {'Dtype':<8} | {missing_or_completed} Ranks"
-                )
-                for pg_idx in head_nodes:
-                    entry = list(pg_dict[pg_idx][0])
-                    entry.remove(entry[-2])
-                    if missing_or_completed == "Missing":
-                        ranks_to_print = entry[6]
-                    else:
-                        ranks_to_print = entry[5]
+                def print_ranks_in_pgs(head_nodes, pg_dict, missing_or_completed="Missing"):
                     logger.info(
-                        f"{entry[0]:<6} | {entry[1]:<25} | {entry[2]:<10} | {entry[3]:<8} \
-                            | {entry[4]:<8} | {ranks_to_print}"
+                        f"{'PGID':<6} | {'Process Group Desc':<25} | {'Op Type':<10} | {'Size':<8} \
+                            | {'Dtype':<8} | {missing_or_completed} Ranks"
                     )
+                    for pg_idx in head_nodes:
+                        entry = list(pg_dict[pg_idx][0])
+                        entry.remove(entry[-2])
+                        if missing_or_completed == "Missing":
+                            ranks_to_print = entry[6]
+                        else:
+                            ranks_to_print = entry[5]
+                        logger.info(
+                            f"{entry[0]:<6} | {entry[1]:<25} | {entry[2]:<10} | {entry[3]:<8} \
+                                | {entry[4]:<8} | {ranks_to_print}"
+                        )
 
-            if head_nodes_missing:
-                logger.debug(f"head_nodes_missing: {head_nodes_missing}")
-                print_ranks_in_pgs(head_nodes_missing, missing_pg, "Missing")
-            # TODO: using this completed pg needs to be updated with new algorithm for isolation
-            if head_nodes_completed:
-                print_ranks_in_pgs(head_nodes_completed, completed_pg, "Completed")
-        analysis_output = output.getvalue()
+                if head_nodes_missing:
+                    logger.debug(f"head_nodes_missing: {head_nodes_missing}")
+                    print_ranks_in_pgs(head_nodes_missing, missing_pg, "Missing")
+            analysis_output = output.getvalue()
+        finally:
+            logger.setLevel(original_level)
         return analysis_output
 
     async def collective_analysis(self, analysis_output: str) -> Optional[str]:
@@ -1117,7 +1145,7 @@ def main():
         '--fr-path', type=str, help='Path to JSON files or directories containing JSON files'
     )
     parser.add_argument(
-        '-p', '--pattern', default="*.json", help='File pattern to match (default: *.json)'
+        '-p', '--pattern', default="_dump_*", help='File pattern to match (default: _dump_*)'
     )
     parser.add_argument('-v', '--verbose', action='store_true', help='verbose output')
     parser.add_argument(
@@ -1143,11 +1171,25 @@ def main():
         action='store_true',
         help='Convert the trace file to json file, if the trace is binary, for debugging',
     )
+    parser.add_argument(
+        '--emit-stdout',
+        action='store_true',
+        help='Print final FR summary table to stdout for machine consumers',
+    )
 
     args = parser.parse_args()
 
     analyzer = CollectiveAnalyzer(args)
-    analyzer.run_sync(args)
+    result = analyzer.run_sync(args)
+
+    if args.emit_stdout and isinstance(result, tuple) and result:
+        payload = result[0]
+        if isinstance(payload, dict):
+            text = payload.get("analysis_text", "")
+            if text:
+                print(text)
+        elif payload:
+            print(payload)
 
 
 if __name__ == "__main__":

diff --git a/src/nvidia_resiliency_ext/skills/__init__.py b/src/nvidia_resiliency_ext/skills/__init__.py
@@ -0,0 +1 @@
+"""Agent skills bundled with nvidia_resiliency_ext."""
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		"""Agent skills bundled with nvidia_resiliency_ext."""