jhinpan · jhinpan · Jun 16, 2026 · Jun 16, 2026 · Jun 16, 2026 · Jun 16, 2026
diff --git a/.claude/settings.json b/.claude/settings.json
@@ -0,0 +1,5 @@
+{
+  "permissions": {
+    "deny": ["AskUserQuestion"]
+  }
+}
diff --git a/.claude/skills/kernel-trace-analysis/scripts/hotspot_analyzer.py b/.claude/skills/kernel-trace-analysis/scripts/hotspot_analyzer.py
@@ -238,18 +238,33 @@ def print_source_detail(hotspot, source_cache, context=3):
         print(f"      stall={fmt_cycles(inst.stall_cycles):>7}  type={inst.stall_type:<12}  {inst.asm}")
 
 
-def read_kernel_metadata(dispatch_dir):
+def read_kernel_metadata(dispatch_dir, kernel_filter=""):
     """Read authoritative resource counts from ``out_kernel_trace.csv`` if present.
 
     The ATT ``code.json`` only contains the (possibly single-CU, possibly
     vgpr-form) disassembly, so it cannot reveal accum_vgpr / SGPR / LDS /
     workgroup size.  The kernel-trace CSV carries the real launch metadata.
     Searches the dispatch dir and its parent (staging often copies the CSV
     next to the ui_output_agent_* dir).  Returns {} if not found.
+
+    Row selection priority:
+      1. ``kernel_filter`` substring matched against Kernel_Name, optionally
+         narrowed by Dispatch_Id when the dir name encodes ``dispatch_<id>``
+         (rocprofv3 ``ui_output_agent_*_dispatch_<id>`` layout).  Dispatch_Id
+         matching avoids false matches when a PyTorch reference kernel shares
+         the same name substring.
+      2. Bidirectional name heuristic against the directory basename (legacy
+         path for timestamped dirs like ``20240101_120000_pa_decode_kernel``).
     """
     candidates = []
     for base in (dispatch_dir, os.path.dirname(os.path.abspath(dispatch_dir))):
         candidates += glob.glob(os.path.join(base, "*kernel_trace*.csv"))
+
+    dir_name = os.path.basename(os.path.abspath(dispatch_dir))
+    # Extract the dispatch id from rocprofv3's ui_output_agent_<N>_dispatch_<id> layout.
+    _dispatch_id_m = re.search(r"dispatch_(\d+)$", dir_name)
+    dispatch_id = _dispatch_id_m.group(1) if _dispatch_id_m else None
+
     for path in candidates:
         try:
             with open(path) as f:
@@ -258,24 +273,40 @@ def read_kernel_metadata(dispatch_dir):
             continue
         if not rows or "Accum_VGPR_Count" not in rows[0]:
             continue
-        # Pick the row whose kernel matches the dispatch dir name.  The dir is
-        # usually staged as "<timestamp>_<short_kernel_name>" while the CSV
-        # Kernel_Name has a trailing index (e.g. dir ".._pa_decode_ps_kernel"
-        # vs kernel "pa_decode_ps_kernel_0"), so match bidirectionally on the
-        # timestamp-stripped short name.
-        dir_name = os.path.basename(os.path.abspath(dispatch_dir))
-        short = re.sub(r"^\d{8}_\d{6}_", "", dir_name)  # strip YYYYMMDD_HHMMSS_
-
-        def _matches(kn):
-            if not kn:
-                return False
-            return kn in dir_name or short in kn or kn.startswith(short) or short.startswith(kn)
+
+        has_dispatch_col = "Dispatch_Id" in rows[0]
 
         chosen = None
-        for r in rows:
-            if _matches(r.get("Kernel_Name", "")):
-                chosen = r
-                break
+        if kernel_filter:
+            # Explicit filter: kernel name substring, narrowed by Dispatch_Id when available.
+            can_disambiguate = bool(dispatch_id and has_dispatch_col)
+            matches = [r for r in rows if kernel_filter in r.get("Kernel_Name", "")]
+            if can_disambiguate:
+                matches = [r for r in matches if str(r.get("Dispatch_Id", "")).strip() == dispatch_id]
+            if matches:
+                chosen = matches[0]
+                if not can_disambiguate and len(matches) > 1:
+                    # First-substring-wins: no dispatch id available to pick between same-named rows.
+                    print(
+                        f"  warning: --kernel '{kernel_filter}' matched {len(matches)} rows in "
+                        f"{os.path.basename(path)} with no dispatch id to disambiguate; using the "
+                        "first match (pass a more specific --kernel)"
+                    )
+        else:
+            # Legacy heuristic: bidirectional substring match against the dir basename.
+            # Works for timestamped dirs like ``20240101_120000_pa_decode_kernel``.
+            short = re.sub(r"^\d{8}_\d{6}_", "", dir_name)  # strip YYYYMMDD_HHMMSS_
+
+            def _matches(kn):
+                if not kn:
+                    return False
+                return kn in dir_name or short in kn or kn.startswith(short) or short.startswith(kn)
+
+            for r in rows:
+                if _matches(r.get("Kernel_Name", "")):
+                    chosen = r
+                    break
+
         if chosen is None:
             continue  # no matching row in this CSV — try the next candidate
 
@@ -457,7 +488,10 @@ def print_reg_pressure(reg_info):
     print_header("Register Pressure & Occupancy")
     print(f"  Architecture:   {reg_info['arch']}")
     if not reg_info["has_meta"]:
-        print("  (no kernel_trace CSV found — accum/LDS/SGPR estimated from ISA only)")
+        print(
+            "  (kernel_trace CSV not matched — accum/LDS/SGPR estimated from ISA only; "
+            "pass --kernel <name_substr> to enable CSV metadata lookup)"
+        )
     if reg_info["is_vgpr_form"]:
         print(f"  arch_vgpr:      {reg_info['arch_vgpr']}  (MFMA vgpr-form: accumulators in arch file, no AGPR)")
     else:
@@ -496,6 +530,17 @@ def main():
         "--detail", action="store_true", help="Show source snippet + instruction breakdown under each source hotspot"
     )
     parser.add_argument("--context", type=int, default=3, help="Source lines of context around hotspot (default: 3)")
+    parser.add_argument(
+        "--kernel",
+        default="",
+        metavar="SUBSTR",
+        help="Kernel name substring for CSV metadata lookup "
+        "(e.g. 'pa_mqa_logits_fp4_kernel_0'). "
+        "Required when the dispatch dir name does not encode the kernel name, "
+        "as with rocprofv3 ui_output_agent_*_dispatch_<id> directories. "
+        "Combined with the dispatch id from the dir name when a Dispatch_Id "
+        "column is present in the CSV.",
+    )
     args = parser.parse_args()
 
     if not os.path.isdir(args.dispatch_dir):
@@ -515,7 +560,7 @@ def main():
     print(f"  Total cycles:  {fmt_cycles(total_cycles)}")
     print(f"  Total stalls:  {fmt_cycles(total_stall)}  ({100*total_stall/total_cycles:.1f}% of total cycles)")
 
-    meta = read_kernel_metadata(args.dispatch_dir)
+    meta = read_kernel_metadata(args.dispatch_dir, kernel_filter=args.kernel)
     reg_info = detect_arch_and_reg_pressure(instructions, meta)
     print_reg_pressure(reg_info)
 

diff --git a/.github/workflows/flydsl.yaml b/.github/workflows/flydsl.yaml
@@ -453,10 +453,8 @@ jobs:
         timeout-minutes: 15
         run: |
           docker exec flydsl_test bash -c "
-            apt-get install -y libpci-dev libibverbs-dev &&
-            rm -rf /tmp/mori &&
-            git clone --depth 1 --recursive --shallow-submodules https://github.com/ROCm/mori.git /tmp/mori &&
-            cd /tmp/mori && python3 -m pip install . &&
+            apt-get install -y libpci-dev libibverbs-dev libgrpc++1.51 libgrpc29 &&
+            python3 -m pip install amd_mori &&
             MORI_PRECOMPILE=1 python3 -c 'import mori'
           "
 

diff --git a/.gitignore b/.gitignore
@@ -64,3 +64,5 @@ Thumbs.db
 # Sphinx documentation build
 docs/_build/
 python/flydsl/_mlir
+
+.humanize*
diff --git a/docs/a8w4_evidence.md b/docs/a8w4_evidence.md
@@ -0,0 +1,35 @@
+# a8w4 Strict-Path Correctness Evidence (locked ref 523ca1c7)
+
+All a8w4 (fp8 activation x fp4 weight) points run through the strict, model-correct
+aiter path (`scripts/aiter_strict_point.py`: true per-model activation/gate,
+`strict_accuracy=True`). Correctness is gated on `logits_diff <= 0.01`. a8w4 is
+correctness-BLOCKED in this environment (see the `kernels/moe_tuning_spec.py` quarantine
+note). Categories: `correctness` = strict accuracy assertion (logits ~0.98);
+`runtime` = kernel/runtime rejection (e.g. Unsupported scales/output dtype); `pass` = `logits_diff <= 0.01`.
+
+| model | total | correctness-fail | runtime-fail | pass |
+|---|---|---|---|---|
+| deepseek_v3 | 16 | 4 | 12 | 0 |
+| deepseek_v4 | 16 | 10 | 6 | 0 |
+| gpt_oss | 8 | 4 | 3 | 1 |
+| kimi_k2 | 16 | 9 | 7 | 0 |
+
+## Representative per-row errors (error text truncated)
+
+| model | token | category | error |
+|---|---|---|---|
+| deepseek_v3 | 1 | runtime | RuntimeError: Unsupported scales/output dtype! |
+| deepseek_v3 | 16 | correctness | AssertionError: accuracy check failed: checkAllclose err=0.9967564344406128, logits_diff=0 |
+| deepseek_v4 | 1 | runtime | RuntimeError: Unsupported scales/output dtype! |
+| deepseek_v4 | 16 | correctness | AssertionError: accuracy check failed: checkAllclose err=0.996712863445282, logits_diff=0. |
+| kimi_k2 | 1 | runtime | RuntimeError: Unsupported scales/output dtype! |
+| kimi_k2 | 16 | correctness | AssertionError: accuracy check failed: checkAllclose err=0.996957004070282, logits_diff=0. |
+| gpt_oss | 256 | pass |  |
+| gpt_oss | 512 | correctness | AssertionError: accuracy check failed: checkAllclose err=0.9967130422592163, logits_diff=0 |
+| gpt_oss | 4096 | runtime | TypeError: __init__(): incompatible function arguments. The following argument types are s |
+
+Source: `docs/baseline_523ca1c7_a8w4_strict.csv` (per-row strict_error, error_category,
+aot_status, flydsl_command, kernel-path metrics). `aot_status=no_aot` for all a8w4 rows: no
+aiter AOT cache entry exists for these a8w4 shapes, so the strict runner runs without the AOT
+gate. The kernels still compile and run; the failures come from the strict correctness gate or
+a runtime scale/output check -- a real correctness/runtime block, not merely a missing AOT artifact.