-
Notifications
You must be signed in to change notification settings - Fork 54
feat(skills): add nvrx attribution workflow bundle #312
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
b2a035e
df5f5c7
a146d54
f73ff8b
2275c6a
3f2016d
43e39f7
251df4e
3b791bc
af9c325
9a12531
1e23cb9
5e30a25
810c534
d0fac00
950b97b
269a3e3
59ebefe
0e1d6db
26e6429
d4a3999
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -11,3 +11,4 @@ ft_state.json | |
| *_pb2.pyi | ||
| *_pb2_grpc.py | ||
| .idea/ | ||
| src/nvidia_resiliency_ext/skills/nvrx-attr/scripts/user.env | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -34,6 +34,42 @@ def eprint(*args, **kwargs): | |
| print(*args, file=sys.stderr, **kwargs) | ||
|
|
||
|
|
||
| def _parse_rank_list(rank_text: str) -> List[int]: | ||
| ranks = [] | ||
| for token in rank_text.split(','): | ||
| token = token.strip() | ||
| if not token: | ||
| continue | ||
| try: | ||
| ranks.append(int(token)) | ||
| except ValueError: | ||
| continue | ||
| return ranks | ||
|
|
||
|
|
||
def _extract_missing_ranks_from_table(text: str) -> List[int]:
    """Collect the ranks listed in the last column of "Missing Ranks" tables.

    A table starts at a header line that begins with ``PGID``. Rows are read
    only while the most recent header contains ``Missing Ranks``; a later
    ``PGID`` header for any other table stops collection, so its rows are not
    reported as hanging ranks.

    Args:
        text: Multi-line text that may contain pipe-separated tables.

    Returns:
        The sorted, de-duplicated ranks found in the last column of the
        captured rows. Empty when no "Missing Ranks" table is present.
    """
    hanging_ranks = set()
    capture = False

    for line in text.splitlines():
        stripped = line.strip()
        if not stripped:
            continue
        if stripped.startswith("PGID"):
            # Every table header re-decides whether the rows that follow
            # belong to a "Missing Ranks" table.
            capture = "Missing Ranks" in stripped
            continue
        if not capture or "|" not in stripped:
            continue

        columns = [col.strip() for col in stripped.split("|")]
        # Rows with fewer than six pipe-separated columns are skipped.
        if len(columns) < 6:
            continue
        hanging_ranks.update(_parse_rank_list(columns[-1]))

    return sorted(hanging_ranks)
|
|
||
|
|
||
| @dataclass | ||
| class Collective: | ||
| """ | ||
|
|
@@ -134,12 +170,7 @@ async def print_output(self, attribution_result: Optional[str]): | |
| hanging_ranks_str = hanging_ranks.group(1).strip() | ||
| hanging_ranks_list = list(map(int, hanging_ranks_str.split(','))) | ||
| else: | ||
| for idx, line in enumerate(text.split('\n')): | ||
| line_list = line.split('|') | ||
| if len(line_list) >= 5: | ||
| logger.info(line) | ||
| if idx >= 1: | ||
| hanging_ranks_list.append(line_list[5]) | ||
| hanging_ranks_list = _extract_missing_ranks_from_table(text) | ||
| hanging_ranks = f"hanging ranks: {hanging_ranks_list}" | ||
| # Dict form preserves collective table text for MCP clients and FRAnalysisResult parity. | ||
| return ( | ||
|
|
@@ -218,20 +249,18 @@ def build_collectives_to_order(): | |
| # analyze collectives to find process groups with missing and completed ranks | ||
| completed_pg, missing_pg = self.analyze_matches(verbose=bool(cfg.get("verbose"))) | ||
| grouped_missing_pgs = {} | ||
| grouped_completed_pgs = {} | ||
|
|
||
| # if the dump file contains health check results, parse the health check results | ||
| # and print them in a format | ||
| if cfg.get("health_check"): | ||
| self.print_node_health_status(verbose=bool(cfg.get("verbose"))) | ||
|
|
||
| # group the process groups with missing and completed ranks | ||
| # by finding longest paths in the graph | ||
| # Group only process groups with missing ranks. | ||
| # Completed-rank summaries are not actionable for attribution and create | ||
| # misleading output in the feedback loop. | ||
| grouped_missing_pgs = self.group_pgs(missing_pg) | ||
| if len(grouped_missing_pgs) == 0: | ||
| grouped_completed_pgs = self.group_pgs(completed_pg) | ||
|
|
||
| # gather the head node of each group with missing and completed ranks | ||
| # gather the head node of each group with missing ranks | ||
| # the head node is the first node in the group | ||
| # the missing ranks in the head node of the missing process groups | ||
| # are considered to cause the other nodes in the group to hang | ||
|
|
@@ -242,41 +271,40 @@ def gather_head_nodes(grouped_pgs): | |
| return head_nodes | ||
|
|
||
| head_nodes_missing = None | ||
| head_nodes_completed = None | ||
| # Gather the head node of each group | ||
| # Gather the head node of each missing-rank group. | ||
| if len(grouped_missing_pgs) > 0: | ||
| head_nodes_missing = gather_head_nodes(grouped_missing_pgs) | ||
| logger.debug(f"head_nodes of missing_pg: {head_nodes_missing}") | ||
| else: | ||
| head_nodes_completed = gather_head_nodes(grouped_completed_pgs) | ||
| logger.debug(f"head_nodes of completed_pg: {head_nodes_completed}") | ||
| # Print the analysis output | ||
| with capture_logs() as output: | ||
| original_level = logger.level | ||
| if logger.getEffectiveLevel() > logging.INFO: | ||
| logger.setLevel(logging.INFO) | ||
| try: | ||
| with capture_logs(logger.name) as output: | ||
|
|
||
| def print_ranks_in_pgs(head_nodes, pg_dict, missing_or_completed="Missing"): | ||
| logger.info( | ||
| f"{'PGID':<6} | {'Process Group Desc':<25} | {'Op Type':<10} | {'Size':<8} \ | ||
| | {'Dtype':<8} | {missing_or_completed} Ranks" | ||
| ) | ||
| for pg_idx in head_nodes: | ||
| entry = list(pg_dict[pg_idx][0]) | ||
| entry.remove(entry[-2]) | ||
| if missing_or_completed == "Missing": | ||
| ranks_to_print = entry[6] | ||
| else: | ||
| ranks_to_print = entry[5] | ||
| def print_ranks_in_pgs(head_nodes, pg_dict, missing_or_completed="Missing"): | ||
| logger.info( | ||
| f"{entry[0]:<6} | {entry[1]:<25} | {entry[2]:<10} | {entry[3]:<8} \ | ||
| | {entry[4]:<8} | {ranks_to_print}" | ||
| f"{'PGID':<6} | {'Process Group Desc':<25} | {'Op Type':<10} | {'Size':<8} \ | ||
| | {'Dtype':<8} | {missing_or_completed} Ranks" | ||
| ) | ||
| for pg_idx in head_nodes: | ||
| entry = list(pg_dict[pg_idx][0]) | ||
| entry.remove(entry[-2]) | ||
| if missing_or_completed == "Missing": | ||
| ranks_to_print = entry[6] | ||
| else: | ||
| ranks_to_print = entry[5] | ||
| logger.info( | ||
| f"{entry[0]:<6} | {entry[1]:<25} | {entry[2]:<10} | {entry[3]:<8} \ | ||
| | {entry[4]:<8} | {ranks_to_print}" | ||
| ) | ||
|
|
||
| if head_nodes_missing: | ||
| logger.debug(f"head_nodes_missing: {head_nodes_missing}") | ||
| print_ranks_in_pgs(head_nodes_missing, missing_pg, "Missing") | ||
| # TODO: using this completed pg needs to be updated with new algorithm for isolation | ||
| if head_nodes_completed: | ||
| print_ranks_in_pgs(head_nodes_completed, completed_pg, "Completed") | ||
| analysis_output = output.getvalue() | ||
| if head_nodes_missing: | ||
| logger.debug(f"head_nodes_missing: {head_nodes_missing}") | ||
| print_ranks_in_pgs(head_nodes_missing, missing_pg, "Missing") | ||
| analysis_output = output.getvalue() | ||
| finally: | ||
| logger.setLevel(original_level) | ||
| return analysis_output | ||
|
sbak5 marked this conversation as resolved.
|
||
|
|
||
| async def collective_analysis(self, analysis_output: str) -> Optional[str]: | ||
|
|
@@ -1117,7 +1145,7 @@ def main(): | |
| '--fr-path', type=str, help='Path to JSON files or directories containing JSON files' | ||
| ) | ||
| parser.add_argument( | ||
| '-p', '--pattern', default="*.json", help='File pattern to match (default: *.json)' | ||
| '-p', '--pattern', default="_dump_*", help='File pattern to match (default: _dump_*)' | ||
| ) | ||
| parser.add_argument('-v', '--verbose', action='store_true', help='verbose output') | ||
| parser.add_argument( | ||
|
|
@@ -1143,11 +1171,25 @@ def main(): | |
| action='store_true', | ||
| help='Convert the trace file to json file, if the trace is binary, for debugging', | ||
| ) | ||
| parser.add_argument( | ||
| '--emit-stdout', | ||
| action='store_true', | ||
| help='Print final FR summary table to stdout for machine consumers', | ||
| ) | ||
|
|
||
| args = parser.parse_args() | ||
|
|
||
| analyzer = CollectiveAnalyzer(args) | ||
| analyzer.run_sync(args) | ||
| result = analyzer.run_sync(args) | ||
|
|
||
| if args.emit_stdout and isinstance(result, tuple) and result: | ||
| payload = result[0] | ||
| if isinstance(payload, dict): | ||
| text = payload.get("analysis_text", "") | ||
| if text: | ||
| print(text) | ||
| elif payload: | ||
| print(payload) | ||
|
|
||
|
|
||
| if __name__ == "__main__": | ||
|
Comment on lines
1171
to
1195
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
The fix: unwrap the inner text from the first item in `result[0]`:

```python
if args.emit_stdout and isinstance(result, tuple) and result:
    output_list = result[0]
    if isinstance(output_list, list) and output_list:
        text = output_list[0][0] if isinstance(output_list[0], (list, tuple)) else output_list[0]
        if isinstance(text, dict):
            text = text.get("analysis_text", "")
        if text:
            print(text)
    elif isinstance(output_list, dict):
        text = output_list.get("analysis_text", "")
        if text:
            print(text)
    elif output_list:
        print(output_list)
```
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| """Agent skills bundled with nvidia_resiliency_ext.""" |
Uh oh!
There was an error while loading. Please reload this page.