From 2c6c0cd710fe1c7598712fbbf3b01508e86c1503 Mon Sep 17 00:00:00 2001
From: Jin Pan <jpan236@wisc.edu>
Date: Thu, 25 Jun 2026 01:54:13 +0000
Subject: [PATCH 1/2] MXFP4 MoE tuning harness: legality filter, measurement,
 ledger, strict guardrail (#708)

Measurement + verification infrastructure for tuning the MXFP4 (per-1x32 fp4) MoE
2-stage GEMM on gfx950/MI350X, toward ROCm/FlyDSL#708 (low MFU at large shapes,
long latency at small tokens). Infrastructure only -- no production kernel logic
changes.

- kernels/moe_tuning.py: pre-compile legality filter for stage1/stage2 tile
  configs (LDS footprint, divisibility, MX-FP4 floors); mirrors builder LDS sizing
  (stage1 full lds_stride vs stage2 fp4-halved).
- kernels/moe_tuning_spec.py: locked spec constants + win/no-regression predicates
  (win margins, regime-aware band, token grid, MFU denominator, metric formula).
- scripts/moe_tuning_harness.py: provenance-complete measurement harness (verified
  clock pinning, idle check, faithful timed-loop median+p95) + fail-closed
  candidate sweep CLI (illegal/unmeasured configs recorded as rejections).
- scripts/moe_tuning_ledger.py: attempt ledger + full-coverage Pareto comparator
  with a single claimable_win gate (coverage + no-regression + win + AOT/correctness
  hard gate) and integrity scans (duplicate / replay / supersede-link).
- scripts/aiter_strict_point.py: strict AOT-checked model-correct aiter e2e +
  correctness guardrail (logits_diff <= 0.01).
- scripts/sync_aiter_flydsl_kernels.sh: overlay FlyDSL MoE kernels onto aiter's
  vendored copies for the e2e guardrail.
- docs/mxfp4_moe_tuning.md + docs/baseline_523ca1c7_validated.csv: docs + a
  validated locked a4w4 baseline reference table.
- Host-side unit tests (no GPU required): 94 passed, 4 skipped (committed-ledger
  scans skip without a ledger). black + ruff clean.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .gitignore                             |    2 +
 docs/baseline_523ca1c7_validated.csv   |   41 +
 docs/mxfp4_moe_tuning.md               |   58 +
 kernels/moe_tuning.py                  |  511 ++++++++
 kernels/moe_tuning_spec.py             |  254 ++++
 scripts/aiter_strict_point.py          |  196 +++
 scripts/moe_tuning_harness.py          | 1157 +++++++++++++++++
 scripts/moe_tuning_ledger.py           |  501 ++++++++
 scripts/sync_aiter_flydsl_kernels.sh   |   64 +
 tests/unit/test_moe_tuning_harness.py  | 1584 ++++++++++++++++++++++++
 tests/unit/test_moe_tuning_legality.py |  176 +++
 11 files changed, 4544 insertions(+)
 create mode 100644 docs/baseline_523ca1c7_validated.csv
 create mode 100644 docs/mxfp4_moe_tuning.md
 create mode 100644 kernels/moe_tuning.py
 create mode 100644 kernels/moe_tuning_spec.py
 create mode 100644 scripts/aiter_strict_point.py
 create mode 100644 scripts/moe_tuning_harness.py
 create mode 100644 scripts/moe_tuning_ledger.py
 create mode 100755 scripts/sync_aiter_flydsl_kernels.sh
 create mode 100644 tests/unit/test_moe_tuning_harness.py
 create mode 100644 tests/unit/test_moe_tuning_legality.py

diff --git a/.gitignore b/.gitignore
index 4a341beb1..35f6fcfd6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -64,3 +64,5 @@ Thumbs.db
 # Sphinx documentation build
 docs/_build/
 python/flydsl/_mlir
+
+.humanize*
diff --git a/docs/baseline_523ca1c7_validated.csv b/docs/baseline_523ca1c7_validated.csv
new file mode 100644
index 000000000..55692980f
--- /dev/null
+++ b/docs/baseline_523ca1c7_validated.csv
@@ -0,0 +1,41 @@
+gpu_id,gpu_model,branch,commit,command,warmup,iters,idle_gpu_verified,graph_capture,l2_flush_per_iter,clocks_pinned,metric_formula,model,model_dim,inter_dim,experts,topk,dtype,act,token,tile_m1,tile_n1,tile_k1,tile_m2,tile_n2,tile_k2,stage1_us,stage2_us,sorting_us,kernel_path_us,kernel_path_us_p95,effective_tflops,mfu,e2e_us,e2e_us_p95,logits_diff,correctness_pass,flydsl_command,strict_error,error_category,aot_status
+0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 1 -e 257 -k 9 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 7168 --inter-dim 256 -e 257 -k 9 -t 1 --aq fp4 --wq fp4 --act silu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,deepseek_v3,7168,256,257,9,a4w4,silu,1,64,256,256,64,256,256,82.1,48.2,0.0,130.2,157.60000000000002,0.7610632258064516,0.00016826513946638328,34.0650421052633,493.84400248527527,0.0019092379303825568,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 1 -e 257 -k 9 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked
+0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 2 -e 257 -k 9 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 7168 --inter-dim 256 -e 257 -k 9 -t 2 --aq fp4 --wq fp4 --act silu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,deepseek_v3,7168,256,257,9,a4w4,silu,2,64,256,256,64,256,256,81.0,46.5,0.0,127.4,140.6,1.55557978021978,0.0003439265487994207,41.40170370370407,493.5239851474762,0.001377186866181268,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 2 -e 257 -k 9 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked
+0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 4 -e 257 -k 9 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 7168 --inter-dim 256 -e 257 -k 9 -t 4 --aq fp4 --wq fp4 --act silu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,deepseek_v3,7168,256,257,9,a4w4,silu,4,64,256,256,64,256,256,82.2,47.2,0.0,129.4,141.8,3.063073632148377,0.0006772216741429089,56.274242424242736,658.486008644104,0.0029107140716863045,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 4 -e 257 -k 9 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked
+0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 8 -e 257 -k 9 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 7168 --inter-dim 256 -e 257 -k 9 -t 8 --aq fp4 --wq fp4 --act silu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,deepseek_v3,7168,256,257,9,a4w4,silu,8,64,256,256,64,256,256,85.8,52.4,0.0,138.2,145.6,5.73605973950796,0.0012681980410143622,60.24796703296714,375.7230043411255,1.1455316338060406e-05,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 8 -e 257 -k 9 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked
+0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 16 -e 257 -k 9 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 7168 --inter-dim 256 -e 257 -k 9 -t 16 --aq fp4 --wq fp4 --act silu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,deepseek_v3,7168,256,257,9,a4w4,silu,16,64,256,256,64,256,256,91.6,61.1,0.0,152.7,171.2,10.382756463654223,0.0022955464213252758,83.8479381443307,571.2850093841553,1.029067283353502e-05,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 16 -e 257 -k 9 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked
+0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 32 -e 257 -k 9 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 7168 --inter-dim 256 -e 257 -k 9 -t 32 --aq fp4 --wq fp4 --act silu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,deepseek_v3,7168,256,257,9,a4w4,silu,32,64,256,256,64,256,256,102.9,76.9,0.0,179.8,193.0,17.635671991101223,0.0038991094386692953,114.85418367346972,406.5229892730713,1.0363225636189632e-05,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 32 -e 257 -k 9 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked
+0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 64 -e 257 -k 9 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 7168 --inter-dim 256 -e 257 -k 9 -t 64 --aq fp4 --wq fp4 --act silu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,deepseek_v3,7168,256,257,9,a4w4,silu,64,64,256,256,64,256,256,113.4,89.6,0.0,203.0,216.3,31.24033324137931,0.00690699386278561,148.7931530612243,435.8829855918884,1.0139395224539882e-05,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 64 -e 257 -k 9 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked
+0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 128 -e 257 -k 9 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 7168 --inter-dim 256 -e 257 -k 9 -t 128 --aq fp4 --wq fp4 --act silu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,deepseek_v3,7168,256,257,9,a4w4,silu,128,64,256,256,64,256,256,119.2,94.9,0.0,214.39999999999998,225.5,59.15846686567165,0.013079475318521258,158.68847422680435,446.8429982662201,1.015027033468563e-05,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 128 -e 257 -k 9 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked
+0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 256 -e 257 -k 9 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 7168 --inter-dim 256 -e 257 -k 9 -t 256 --aq fp4 --wq fp4 --act silu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,deepseek_v3,7168,256,257,9,a4w4,silu,256,64,256,256,64,256,256,165.9,100.9,0.0,266.7,278.79999999999995,95.11492535433072,0.021029167666223904,171.77611111111284,628.6050081253052,1.0238226904046854e-05,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 256 -e 257 -k 9 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked
+0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 512 -e 257 -k 9 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 7168 --inter-dim 256 -e 257 -k 9 -t 512 --aq fp4 --wq fp4 --act silu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,deepseek_v3,7168,256,257,9,a4w4,silu,512,64,256,256,64,256,256,168.0,111.2,0.0,279.2,292.2,181.71311312320918,0.040175351121646954,191.89716161616175,633.204996585846,1.0047836131898968e-05,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 512 -e 257 -k 9 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked
+0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 1024 -e 257 -k 9 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 7168 --inter-dim 256 -e 257 -k 9 -t 1024 --aq fp4 --wq fp4 --act silu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,deepseek_v3,7168,256,257,9,a4w4,silu,1024,64,256,256,64,256,256,168.8,135.0,0.0,304.2,317.0,333.5588506508876,0.07374725860068265,248.54487878787992,718.5260057449341,3.4465752332124566e-06,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 1024 -e 257 -k 9 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked
+0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 2048 -e 257 -k 9 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 7168 --inter-dim 256 -e 257 -k 9 -t 2048 --aq fp4 --wq fp4 --act silu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,deepseek_v3,7168,256,257,9,a4w4,silu,2048,64,256,256,64,256,256,169.0,218.4,0.0,387.5,398.8,523.7089154477419,0.1157879538907234,365.92926262626133,782.4059724807739,3.4398751157516116e-06,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 2048 -e 257 -k 9 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked
+0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 4096 -e 257 -k 9 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 7168 --inter-dim 256 -e 257 -k 9 -t 4096 --aq fp4 --wq fp4 --act silu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,deepseek_v3,7168,256,257,9,a4w4,silu,4096,64,256,256,64,256,256,249.4,367.8,0.0,616.6,630.4,658.2458797794357,0.1455330267033906,568.4188444444443,734.9259853363037,3.437462961830562e-06,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 4096 -e 257 -k 9 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked
+0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 8192 -e 257 -k 9 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 7168 --inter-dim 256 -e 257 -k 9 -t 8192 --aq fp4 --wq fp4 --act silu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,deepseek_v3,7168,256,257,9,a4w4,silu,8192,64,256,256,64,256,256,428.3,650.8,0.0,1079.1999999999998,1103.5,752.1764445366939,0.1663003414850086,982.3065411764695,1264.5310163497925,3.436508721699205e-06,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 8192 -e 257 -k 9 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked
+0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 16384 -e 257 -k 9 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 7168 --inter-dim 256 -e 257 -k 9 -t 16384 --aq fp4 --wq fp4 --act silu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,deepseek_v3,7168,256,257,9,a4w4,silu,16384,64,256,256,64,256,256,669.1,1233.6,0.0,1902.6999999999998,1934.1999999999998,853.2599137478321,0.18864910761614684,1729.7136813186798,1922.6160049438477,3.433807777675213e-06,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 16384 -e 257 -k 9 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked
+0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 32768 -e 257 -k 9 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 7168 --inter-dim 256 -e 257 -k 9 -t 32768 --aq fp4 --wq fp4 --act silu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,deepseek_v3,7168,256,257,9,a4w4,silu,32768,64,256,256,64,256,256,1048.9,2375.2,0.0,3427.4,3500.7999999999997,947.3639714582482,0.20945478033567283,3223.8851157894737,3390.666961669922,3.435615013036575e-06,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 32768 -e 257 -k 9 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked
+0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 1 -e 384 -k 8 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 7168 --inter-dim 256 -e 384 -k 8 -t 1 --aq fp4 --wq fp4 --act silu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,kimi_k2,7168,256,384,8,a4w4,silu,1,64,256,256,64,256,256,81.7,47.3,0.0,129.0,147.1,0.6827936744186047,0.0001509603525135098,38.18579591836654,493.88399720191956,0.0013388059847474487,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 1 -e 384 -k 8 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked
+0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 2 -e 384 -k 8 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 7168 --inter-dim 256 -e 384 -k 8 -t 2 --aq fp4 --wq fp4 --act silu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,kimi_k2,7168,256,384,8,a4w4,silu,2,64,256,256,64,256,256,81.2,46.5,0.0,127.7,136.0,1.379489177760376,0.0003049942909043502,41.845000000001676,500.6440281867981,0.002241814551414034,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 2 -e 384 -k 8 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked
+0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 4 -e 384 -k 8 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 7168 --inter-dim 256 -e 384 -k 8 -t 4 --aq fp4 --wq fp4 --act silu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,kimi_k2,7168,256,384,8,a4w4,silu,4,64,256,256,64,256,256,82.9,47.8,0.0,130.6,141.8,2.6977146707503827,0.0005964436592417383,55.812939393940304,648.045003414154,0.0008894657763870439,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 4 -e 384 -k 8 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked
+0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 8 -e 384 -k 8 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 7168 --inter-dim 256 -e 384 -k 8 -t 8 --aq fp4 --wq fp4 --act silu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,kimi_k2,7168,256,384,8,a4w4,silu,8,64,256,256,64,256,256,84.2,50.5,0.0,134.6,142.2,5.235089687964339,0.0011574374724661373,65.84719101123677,552.5649785995483,9.760027172789343e-06,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 8 -e 384 -k 8 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked
+0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 16 -e 384 -k 8 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 7168 --inter-dim 256 -e 384 -k 8 -t 16 --aq fp4 --wq fp4 --act silu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,kimi_k2,7168,256,384,8,a4w4,silu,16,64,256,256,64,256,256,89.8,59.7,0.0,149.6,157.6,9.42036192513369,0.0020827684999190116,84.7373000000007,568.884015083313,9.839066007155672e-06,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 16 -e 384 -k 8 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked
+0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 32 -e 384 -k 8 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 7168 --inter-dim 256 -e 384 -k 8 -t 32 --aq fp4 --wq fp4 --act silu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,kimi_k2,7168,256,384,8,a4w4,silu,32,64,256,256,64,256,256,100.7,74.1,0.0,174.89999999999998,185.5,16.115336123499144,0.0035629750438866117,119.12603225806363,594.165027141571,9.410086305061682e-06,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 32 -e 384 -k 8 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked
+0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 64 -e 384 -k 8 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 7168 --inter-dim 256 -e 384 -k 8 -t 64 --aq fp4 --wq fp4 --act silu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,kimi_k2,7168,256,384,8,a4w4,silu,64,64,256,256,64,256,256,169.7,97.9,0.0,267.6,280.0,21.065562690582958,0.004657431503555816,168.3802087912095,609.9249720573425,9.403851483069658e-06,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 64 -e 384 -k 8 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked
+0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 128 -e 384 -k 8 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 7168 --inter-dim 256 -e 384 -k 8 -t 128 --aq fp4 --wq fp4 --act silu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,kimi_k2,7168,256,384,8,a4w4,silu,128,64,256,256,64,256,256,176.5,116.0,0.0,292.4,304.8,38.5577604377565,0.008524819906645258,196.7042395833342,596.405029296875,0.0005954914352407359,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 128 -e 384 -k 8 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked
+0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 256 -e 384 -k 8 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 7168 --inter-dim 256 -e 384 -k 8 -t 256 --aq fp4 --wq fp4 --act silu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,kimi_k2,7168,256,384,8,a4w4,silu,256,64,256,256,64,256,256,179.8,124.6,0.0,304.3,315.8,74.09983011501807,0.01638289412226798,208.04478260869544,596.563994884491,0.0006630390382372786,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 256 -e 384 -k 8 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked
+0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 512 -e 384 -k 8 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 7168 --inter-dim 256 -e 384 -k 8 -t 512 --aq fp4 --wq fp4 --act silu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,kimi_k2,7168,256,384,8,a4w4,silu,512,64,256,256,64,256,256,180.3,134.4,0.0,314.70000000000005,327.9,143.3020546806482,0.031682965881195714,224.1345567010319,620.8850145339966,0.0006180732459515337,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 512 -e 384 -k 8 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked
+0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 1024 -e 384 -k 8 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 7168 --inter-dim 256 -e 384 -k 8 -t 1024 --aq fp4 --wq fp4 --act silu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,kimi_k2,7168,256,384,8,a4w4,silu,1024,64,256,256,64,256,256,181.5,151.8,0.0,333.3,343.6,270.6100006480648,0.05982975915278903,258.40870103092766,657.2449803352356,0.0006517958301904825,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 1024 -e 384 -k 8 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked
+0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 2048 -e 384 -k 8 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 7168 --inter-dim 256 -e 384 -k 8 -t 2048 --aq fp4 --wq fp4 --act silu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,kimi_k2,7168,256,384,8,a4w4,silu,2048,64,256,256,64,256,256,184.5,194.4,0.0,379.4,391.0,475.4576342435425,0.10511997219622872,382.2209595959608,787.6060009002686,3.441369365364544e-06,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 2048 -e 384 -k 8 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked
+0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 4096 -e 384 -k 8 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 7168 --inter-dim 256 -e 384 -k 8 -t 4096 --aq fp4 --wq fp4 --act silu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,kimi_k2,7168,256,384,8,a4w4,silu,4096,64,256,256,64,256,256,252.1,348.4,0.0,600.5,615.2,600.7947591407161,0.13283103231057178,511.54278787878735,895.2869772911072,3.4482669706292768e-06,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 4096 -e 384 -k 8 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked
+0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 8192 -e 384 -k 8 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 7168 --inter-dim 256 -e 384 -k 8 -t 8192 --aq fp4 --wq fp4 --act silu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,kimi_k2,7168,256,384,8,a4w4,silu,8192,64,256,256,64,256,256,387.2,564.1,0.0,950.8,975.5,758.8919917206563,0.1677850965555287,899.7465434782592,1170.7290410995483,3.4450919983441963e-06,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 8192 -e 384 -k 8 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked
+0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 16384 -e 384 -k 8 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 7168 --inter-dim 256 -e 384 -k 8 -t 16384 --aq fp4 --wq fp4 --act silu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,kimi_k2,7168,256,384,8,a4w4,silu,16384,64,256,256,64,256,256,690.4,1095.2,0.0,1785.6,1822.1,808.1927707526881,0.17868511402889414,1597.2175056179763,1841.694951057434,3.443782965351083e-06,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 16384 -e 384 -k 8 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked
+0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 32768 -e 384 -k 8 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 7168 --inter-dim 256 -e 384 -k 8 -t 32768 --aq fp4 --wq fp4 --act silu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,kimi_k2,7168,256,384,8,a4w4,silu,32768,64,256,256,64,256,256,1062.3,2143.4,0.0,3205.2,3256.9,900.4798523998503,0.1990890675215234,2964.037744680856,3157.3050022125244,3.4440292647763826e-06,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 32768 -e 384 -k 8 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked
+0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 3072,3072 -t 256 -e 128 -k 4 --num_warmup 10 --num_iters 100 --tile_m 32 --tile_n 128 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 3072 --inter-dim 3072 -e 128 -k 4 -t 256 --aq fp4 --wq fp4 --act swiglu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,gpt_oss,3072,3072,128,4,a4w4,swiglu,256,32,128,256,32,256,256,246.5,143.5,0.0,390.0,415.5,148.67194486153846,0.03287020669058997,328.1737765957458,656.0050249099731,6.2468397891146665e-06,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 3072,3072 -t 256 -e 128 -k 4 --num_warmup 10 --num_iters 100 --tile_m 32 --tile_n 128 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked
+0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 3072,3072 -t 512 -e 128 -k 4 --num_warmup 10 --num_iters 100 --tile_m 32 --tile_n 128 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 3072 --inter-dim 3072 -e 128 -k 4 -t 512 --aq fp4 --wq fp4 --act swiglu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,gpt_oss,3072,3072,128,4,a4w4,swiglu,512,32,128,256,32,256,256,254.4,146.2,0.0,400.6,428.79999999999995,289.47607836245635,0.0640009016941093,336.36401041666727,666.8050289154053,6.183315036101256e-06,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 3072,3072 -t 512 -e 128 -k 4 --num_warmup 10 --num_iters 100 --tile_m 32 --tile_n 128 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked
+0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 3072,3072 -t 1024 -e 128 -k 4 --num_warmup 10 --num_iters 100 --tile_m 32 --tile_n 128 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 3072 --inter-dim 3072 -e 128 -k 4 -t 1024 --aq fp4 --wq fp4 --act swiglu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,gpt_oss,3072,3072,128,4,a4w4,swiglu,1024,32,128,256,32,256,256,271.3,236.8,0.0,507.6,525.4,456.9114144680851,0.10101954774885809,343.81028125000074,689.1649961471558,6.178899654263326e-06,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 3072,3072 -t 1024 -e 128 -k 4 --num_warmup 10 --num_iters 100 --tile_m 32 --tile_n 128 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked
+0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 3072,3072 -t 2048 -e 128 -k 4 --num_warmup 10 --num_iters 100 --tile_m 32 --tile_n 128 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 3072 --inter-dim 3072 -e 128 -k 4 -t 2048 --aq fp4 --wq fp4 --act swiglu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,gpt_oss,3072,3072,128,4,a4w4,swiglu,2048,32,128,256,32,256,256,332.6,317.6,0.0,650.2,670.2,713.4058258505075,0.1577284602808993,453.1531264367808,835.0859880447388,6.184897809680123e-06,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 3072,3072 -t 2048 -e 128 -k 4 --num_warmup 10 --num_iters 100 --tile_m 32 --tile_n 128 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked
+0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 3072,3072 -t 4096 -e 128 -k 4 --num_warmup 10 --num_iters 100 --tile_m 32 --tile_n 128 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 3072 --inter-dim 3072 -e 128 -k 4 -t 4096 --aq fp4 --wq fp4 --act swiglu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,gpt_oss,3072,3072,128,4,a4w4,swiglu,4096,32,128,256,32,256,256,464.2,534.1,0.0,998.4000000000001,1056.0,929.1996553846153,0.20543879181618732,703.0980000000009,1013.7679576873779,6.199037882903546e-06,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 3072,3072 -t 4096 -e 128 -k 4 --num_warmup 10 --num_iters 100 --tile_m 32 --tile_n 128 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked
+0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 3072,3072 -t 8192 -e 128 -k 4 --num_warmup 10 --num_iters 100 --tile_m 32 --tile_n 128 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 3072 --inter-dim 3072 -e 128 -k 4 -t 8192 --aq fp4 --wq fp4 --act swiglu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,gpt_oss,3072,3072,128,4,a4w4,swiglu,8192,32,128,256,32,256,256,742.9,935.7,0.0,1678.3,1805.3,1105.5388618673658,0.2444260141205761,1302.9729550561804,1645.4930305480957,6.18097885130009e-06,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 3072,3072 -t 8192 -e 128 -k 4 --num_warmup 10 --num_iters 100 --tile_m 32 --tile_n 128 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked
+0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 3072,3072 -t 16384 -e 128 -k 4 --num_warmup 10 --num_iters 100 --tile_m 32 --tile_n 128 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 3072 --inter-dim 3072 -e 128 -k 4 -t 16384 --aq fp4 --wq fp4 --act swiglu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,gpt_oss,3072,3072,128,4,a4w4,swiglu,16384,32,128,256,32,256,256,1321.3,1578.6,0.0,2899.8999999999996,3148.1,1279.6481753660473,0.2829202244895086,2158.8413440860227,2466.418981552124,6.177899778192497e-06,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 3072,3072 -t 16384 -e 128 -k 4 --num_warmup 10 --num_iters 100 --tile_m 32 --tile_n 128 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked
+0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 3072,3072 -t 32768 -e 128 -k 4 --num_warmup 10 --num_iters 100 --tile_m 32 --tile_n 128 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 3072 --inter-dim 3072 -e 128 -k 4 -t 32768 --aq fp4 --wq fp4 --act swiglu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,gpt_oss,3072,3072,128,4,a4w4,swiglu,32768,32,128,256,32,256,256,2509.3,2999.5,0.0,5508.8,5865.0,1347.245042021493,0.2978653641435978,4044.7997684210477,4329.154014587402,6.180992124016349e-06,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 3072,3072 -t 32768 -e 128 -k 4 --num_warmup 10 --num_iters 100 --tile_m 32 --tile_n 128 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked
diff --git a/docs/mxfp4_moe_tuning.md b/docs/mxfp4_moe_tuning.md
new file mode 100644
index 000000000..1b2a8a8fb
--- /dev/null
+++ b/docs/mxfp4_moe_tuning.md
@@ -0,0 +1,58 @@
+# MXFP4 MoE 2-Stage Tuning Harness (gfx950)
+
+Measurement + verification infrastructure for tuning the MXFP4 (per-1×32
+microscale fp4) MoE 2-stage GEMM pipeline on AMD gfx950 / MI350X, in support of
+[ROCm/FlyDSL#708](https://github.com/ROCm/FlyDSL/issues/708) ("MXFP4 MoE low MFU
+at large shapes and long latency at small tokens").
+
+> **Status: tuning *infrastructure* + a validated baseline. This does NOT yet
+> contain a performance change to any kernel** — it is the measurement, legality,
+> and bookkeeping foundation that a tuning campaign runs on top of. No production
+> kernel logic is modified by this change set.
+
+## Components
+
+- **`kernels/moe_tuning.py`** — pre-compile legality filter for stage1/stage2 tile
+  configs (LDS footprint, thread/divisibility constraints, MX-FP4 floors). Lets a
+  config search reject illegal tiles before spending GPU time; mirrors the
+  builders' real LDS sizing (stage1 vs stage2 fp4 asymmetry included).
+- **`kernels/moe_tuning_spec.py`** — locked spec constants + win/no-regression
+  predicates (win margins, regime-aware no-regression band, token grid, MFU
+  denominator, metric formula).
+- **`scripts/moe_tuning_harness.py`** — the measurement harness: full provenance
+  per point (GPU id+model, branch+commit, exact replayable command, warmup/iters,
+  idle-GPU check, verified clock pinning), median+p95 from a faithful timed loop,
+  and a fail-closed candidate sweep CLI (illegal/unmeasured configs are recorded
+  as machine-readable rejections, never silently skipped).
+- **`scripts/moe_tuning_ledger.py`** — attempt ledger + full-coverage Pareto
+  comparator. A candidate is promotable only via a single `claimable_win` gate
+  (full coverage + no kernel-path/e2e regression + a real win + a strict
+  AOT/correctness hard gate). Includes ledger-integrity scans (duplicate /
+  replayable-command / supersede-link).
+- **`scripts/aiter_strict_point.py`** — strict, AOT-checked, model-correct single
+  -case aiter fused-MoE e2e + correctness guardrail (`logits_diff <= 0.01`).
+- **`scripts/sync_aiter_flydsl_kernels.sh`** — overlay the current FlyDSL MoE
+  kernels onto aiter's vendored copies so the e2e guardrail runs against the same
+  sources being tuned.
+- **`docs/baseline_523ca1c7_validated.csv`** — a validated locked a4w4 baseline
+  table (reference every candidate is compared against).
+
+## Tests
+
+`tests/unit/test_moe_tuning_harness.py` and
+`tests/unit/test_moe_tuning_legality.py` cover the legality filter, provenance
+contracts, the Pareto comparator + win gate, and the integrity scans (host-side,
+no GPU required):
+
+```bash
+python3 -m pytest tests/unit/test_moe_tuning_harness.py \
+                  tests/unit/test_moe_tuning_legality.py -q
+```
+
+## Scope notes
+
+- This change set targets the a4w4 (fp4×fp4) path. a8w4 (fp8×fp4) correctness is
+  currently environment-blocked by an aiter non-fp4-activation wrapper/layout
+  contract mismatch (not a FlyDSL kernel bug); it is quarantined for win claims.
+- The actual tile/lever tuning that produces MFU/latency wins runs on top of this
+  harness and is tracked separately against #708.
diff --git a/kernels/moe_tuning.py b/kernels/moe_tuning.py
new file mode 100644
index 000000000..691d2617a
--- /dev/null
+++ b/kernels/moe_tuning.py
@@ -0,0 +1,511 @@
+# SPDX-License-Identifier: MIT
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
+
+"""Tuning support for the mixed (fp4/fp8 x fp4) MoE 2-stage GEMM kernels.
+
+This module holds host-side, pre-compile tooling for the MXFP4 MoE tuning
+campaign.  Nothing here changes kernel behavior; it mirrors the legality checks
+that ``compile_mixed_moe_gemm1`` / ``compile_mixed_moe_gemm2`` already enforce so
+that a tile-config search can reject illegal candidates *before* spending GPU
+time on a compile that the kernel would refuse.
+
+The single entry point is :func:`check_tile_config`, which returns a
+:class:`TileCheck` describing whether a ``(stage, tile_m, tile_n, tile_k, ...)``
+candidate is legal and, when it is not, a machine-readable reason.
+
+The constraints encoded here are a faithful copy of the ones in
+``kernels/mixed_moe_gemm_2stage.py`` (stage1: ``tile_k_bytes % 64``,
+``tile_m*tile_k*elem_bytes % total_threads``, split-K divisibility, the LDS
+sizing / arch limit; stage2: ``model_dim % tile_n``, ``inter_dim % tile_k``,
+``sort_block_m % tile_m``, ``tile_m*tile_k % 256``, the LDS sizing) plus the
+MX-FP4 layout requirements (``tile_m % 32``, ``tile_m >= 32``, ``tile_k >= 256``).
+Keep the two files in sync: if a constraint changes in the kernel builder, update
+the matching check below.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Optional
+
+# gfx -> total LDS bytes available to a single workgroup.  Matches the
+# ``_lds_limit`` dict in compile_mixed_moe_gemm1 / 2.
+LDS_LIMIT_BYTES = {"gfx950": 163840, "gfx942": 65536}
+
+# Element byte width of the activation operand, keyed by a_dtype.  fp4 and fp8
+# both occupy 1 byte in the kernel's sizing math (fp4 is vector-packed 2:1 via
+# a_elem_vec_pack, handled separately); fp16 is 2 bytes.
+_A_ELEM_BYTES = {"fp8": 1, "fp4": 1, "int8": 1, "fp16": 2}
+
+# Activation vector pack factor (fp4 packs two logical elements per byte).
+_A_ELEM_VEC_PACK = {"fp4": 2}
+
+
+@dataclass
+class TileCheck:
+    """Result of a legality check for one tile candidate.
+
+    ``legal`` is True iff the kernel builder would accept the candidate.  When
+    illegal, ``reason`` is a short machine-readable token (e.g.
+    ``"tile_k_bytes_not_div_64"``) and ``detail`` is a human-readable message.
+    ``lds_bytes`` is the computed LDS footprint when it could be evaluated.
+    """
+
+    legal: bool
+    stage: int
+    reason: str = ""
+    detail: str = ""
+    lds_bytes: Optional[int] = None
+    params: dict = field(default_factory=dict)
+
+    def as_record(self) -> dict:
+        """Flat dict suitable for JSONL/CSV logging of a rejected candidate."""
+        rec = {
+            "stage": self.stage,
+            "legal": self.legal,
+            "reason": self.reason,
+            "detail": self.detail,
+            "lds_bytes": self.lds_bytes,
+        }
+        rec.update(self.params)
+        return rec
+
+
+def _align(ptr: int, align: int) -> int:
+    """Round ``ptr`` up to a multiple of ``align`` (mirrors SmemAllocator._align)."""
+    if ptr % align == 0:
+        return ptr
+    return (ptr + align - 1) // align * align
+
+
+def _a_elem_bytes(a_dtype: str) -> int:
+    if a_dtype not in _A_ELEM_BYTES:
+        raise ValueError(f"a_dtype must be one of {sorted(_A_ELEM_BYTES)}, got {a_dtype!r}")
+    return _A_ELEM_BYTES[a_dtype]
+
+
+def stage1_lds_bytes(
+    *,
+    tile_m: int,
+    tile_n: int,
+    tile_k: int,
+    a_dtype: str,
+    out_dtype: str = "f16",
+    waves_per_eu: int = 4,
+    use_cshuffle_epilog: bool = True,
+    gpu_arch: str = "gfx950",
+) -> int:
+    """LDS bytes used by a stage1 config, mirroring compile_mixed_moe_gemm1.
+
+    Follows the ping/pong allocator walk: pong holds max(input, lds_out)+tid,
+    ping holds input, with the lds_out auto-split when the standard layout would
+    overflow the arch limit, plus the waves_per_eu minimum-LDS padding.
+    """
+    a_elem_bytes = _a_elem_bytes(a_dtype)
+    # FLIR_CK_LDS128 defaults on -> pad_k = 0.
+    lds_stride = tile_k
+    # NOTE: stage1 sizes the LDS A tile from the FULL lds_stride; unlike stage2 it
+    # does NOT divide by a_elem_vec_pack for fp4 here.  The fp4 vec-pack stride
+    # halving only applies, conditionally, to an inner async-copy buffer in the
+    # kernel body, not to this top-level ping/pong allocation.  See
+    # compile_mixed_moe_gemm1: ``_single_x_bytes = tile_m * lds_stride * a_elem_bytes``.
+
+    out_s = str(out_dtype).strip().lower()
+    out_is_f32 = out_s in ("f32", "fp32", "float")
+    need_quant = out_s in ("fp4", "fp8")
+    if need_quant:
+        use_cshuffle_epilog = True
+
+    single_x_bytes = tile_m * lds_stride * a_elem_bytes
+    cshuffle_elem_bytes = 4 if need_quant else (4 if out_is_f32 else 2)
+    lds_out_bytes = cshuffle_elem_bytes * tile_m * tile_n if use_cshuffle_epilog else 0
+    lds_tid_bytes = tile_m * 4
+    num_waves = min(4, tile_n // 32) if tile_n >= 32 else 0
+
+    global_align = 1024
+    std_pong = max(single_x_bytes, lds_out_bytes) + lds_tid_bytes
+    std_ping = single_x_bytes
+    std_pong_aligned = _align(std_pong, 128)
+    std_total = _align(std_pong_aligned, global_align) + _align(std_ping, 128)
+    lds_limit = LDS_LIMIT_BYTES.get(gpu_arch, 0)
+
+    split_lds_out = lds_limit > 0 and lds_out_bytes > 0 and std_total > lds_limit and num_waves >= 2
+
+    if split_lds_out:
+        half_out_bytes = cshuffle_elem_bytes * tile_m * (tile_n // 2)
+        pong_buffer_bytes = max(single_x_bytes, half_out_bytes)
+        ping_buffer_bytes = max(single_x_bytes, half_out_bytes)
+    else:
+        pong_buffer_bytes = max(single_x_bytes, lds_out_bytes)
+        ping_buffer_bytes = single_x_bytes
+
+    # Allocator walk: pong = align16(0)+pong_buf, then align4()+tid.
+    pong_ptr = _align(0, 16) + pong_buffer_bytes
+    pong_ptr = _align(pong_ptr, 4) + lds_tid_bytes
+    ping_ptr = _align(0, 16) + ping_buffer_bytes
+
+    if waves_per_eu is not None and waves_per_eu >= 1:
+        total_cu_lds = 160 * 1024
+        min_lds = total_cu_lds // (waves_per_eu + 1) + 1
+        pong_sz = _align(pong_ptr, 128)
+        ping_sz = _align(ping_ptr, 128)
+        cur_lds = pong_sz + ping_sz
+        if cur_lds < min_lds:
+            ping_ptr += min_lds - cur_lds
+
+    # Final footprint uses the same global/128 alignment as _std_total.
+    return _align(_align(pong_ptr, 128), global_align) + _align(ping_ptr, 128)
+
+
+def stage2_lds_bytes(
+    *,
+    tile_m: int,
+    tile_n: int,
+    tile_k: int,
+    a_dtype: str,
+    use_cshuffle_epilog: bool = True,
+) -> int:
+    """LDS bytes used by a stage2 config, mirroring compile_mixed_moe_gemm2.
+
+    Stage2 has no lds_out auto-split and no waves_per_eu padding.
+    """
+    a_elem_bytes = _a_elem_bytes(a_dtype)
+    vec_pack = _A_ELEM_VEC_PACK.get(a_dtype, 1)
+    lds_stride = tile_k  # pad_k = 0 with FLIR_CK_LDS128 default.
+    eff_lds_stride = lds_stride // vec_pack if vec_pack > 1 else lds_stride
+
+    single_x_bytes = tile_m * eff_lds_stride * a_elem_bytes
+    cshuffle_elem_bytes = 2  # stage2 f16/bf16
+    lds_out_bytes = cshuffle_elem_bytes * tile_m * tile_n if use_cshuffle_epilog else 0
+    lds_tid_bytes = tile_m * 4
+
+    pong_buffer_bytes = max(single_x_bytes, lds_out_bytes)
+    ping_buffer_bytes = single_x_bytes
+
+    pong_ptr = _align(0, 16) + pong_buffer_bytes
+    pong_ptr = _align(pong_ptr, 4) + lds_tid_bytes
+    ping_ptr = _align(0, 16) + ping_buffer_bytes
+    return pong_ptr + ping_ptr
+
+
+def _check_stage1(
+    *,
+    model_dim: int,
+    inter_dim: int,
+    tile_m: int,
+    tile_n: int,
+    tile_k: int,
+    a_dtype: str,
+    out_dtype: str,
+    k_batch: int,
+    waves_per_eu: int,
+    gpu_arch: str,
+    params: dict,
+) -> TileCheck:
+    a_elem_bytes = _a_elem_bytes(a_dtype)
+
+    # MX-FP4 layout requirements (fp4/fp8 weight path).
+    if tile_m < 32:
+        return TileCheck(
+            False, 1, "tile_m_lt_32", f"tile_m={tile_m} < 32 (MX-FP4 layout requires tile_m>=32)", params=params
+        )
+    if tile_m % 32 != 0:
+        return TileCheck(
+            False, 1, "tile_m_not_div_32", f"tile_m={tile_m} not divisible by 32 (MX-FP4 layout)", params=params
+        )
+    if tile_k < 256:
+        return TileCheck(
+            False, 1, "tile_k_lt_256", f"tile_k={tile_k} < 256 (MX-FP4 layout requires tile_k>=256)", params=params
+        )
+
+    if tile_n < 32 or tile_n % 32 != 0:
+        return TileCheck(
+            False, 1, "tile_n_not_mult_32", f"tile_n={tile_n} must be a positive multiple of 32", params=params
+        )
+
+    # tile_k_bytes % 64 (kernel raises otherwise).
+    tile_k_bytes = tile_k * a_elem_bytes
+    if tile_k_bytes % 64 != 0:
+        return TileCheck(
+            False, 1, "tile_k_bytes_not_div_64", f"tile_k_bytes={tile_k_bytes} not divisible by 64", params=params
+        )
+
+    # total_threads = min(4, tile_n // 32) * 64
+    num_waves = min(4, tile_n // 32)
+    total_threads = num_waves * 64
+    bytes_x_per_tile = tile_m * tile_k * a_elem_bytes
+    if bytes_x_per_tile % total_threads != 0:
+        return TileCheck(
+            False,
+            1,
+            "tile_load_not_div_total_threads",
+            f"tile_m*tile_k*elem_bytes={bytes_x_per_tile} not divisible by total_threads={total_threads}",
+            params=params,
+        )
+
+    # K-loop coverage: model_dim must be divisible by tile_k (implicit but required).
+    if model_dim % tile_k != 0:
+        return TileCheck(
+            False,
+            1,
+            "model_dim_not_div_tile_k",
+            f"model_dim={model_dim} not divisible by tile_k={tile_k}",
+            params=params,
+        )
+
+    # Split-K divisibility.
+    if k_batch > 1:
+        if model_dim % k_batch != 0:
+            return TileCheck(
+                False,
+                1,
+                "model_dim_not_div_k_batch",
+                f"model_dim={model_dim} not divisible by k_batch={k_batch}",
+                params=params,
+            )
+        k_per_batch = model_dim // k_batch
+        if k_per_batch % tile_k != 0:
+            return TileCheck(
+                False,
+                1,
+                "k_per_batch_not_div_tile_k",
+                f"(model_dim//k_batch)={k_per_batch} not divisible by tile_k={tile_k}",
+                params=params,
+            )
+
+    # LDS fits the arch limit.
+    lds = stage1_lds_bytes(
+        tile_m=tile_m,
+        tile_n=tile_n,
+        tile_k=tile_k,
+        a_dtype=a_dtype,
+        out_dtype=out_dtype,
+        waves_per_eu=waves_per_eu,
+        gpu_arch=gpu_arch,
+    )
+    limit = LDS_LIMIT_BYTES.get(gpu_arch, 0)
+    if limit and lds > limit:
+        return TileCheck(
+            False, 1, "lds_over_limit", f"stage1 LDS {lds} > {gpu_arch} limit {limit}", lds_bytes=lds, params=params
+        )
+
+    return TileCheck(True, 1, lds_bytes=lds, params=params)
+
+
+def _check_stage2(
+    *,
+    model_dim: int,
+    inter_dim: int,
+    tile_m: int,
+    tile_n: int,
+    tile_k: int,
+    a_dtype: str,
+    sort_block_m: int,
+    gpu_arch: str,
+    params: dict,
+) -> TileCheck:
+    a_elem_bytes = _a_elem_bytes(a_dtype)
+
+    # MX-FP4 layout requirements.
+    if tile_m < 32:
+        return TileCheck(
+            False, 2, "tile_m_lt_32", f"tile_m={tile_m} < 32 (MX-FP4 layout requires tile_m>=32)", params=params
+        )
+    if tile_m % 32 != 0:
+        return TileCheck(
+            False, 2, "tile_m_not_div_32", f"tile_m={tile_m} not divisible by 32 (MX-FP4 layout)", params=params
+        )
+    if tile_k < 256:
+        return TileCheck(
+            False, 2, "tile_k_lt_256", f"tile_k={tile_k} < 256 (MX-FP4 layout requires tile_k>=256)", params=params
+        )
+
+    # model_dim % 16 (kernel asserts) and the N-tile coverage model_dim % tile_n.
+    if model_dim % 16 != 0:
+        return TileCheck(False, 2, "model_dim_not_div_16", f"model_dim={model_dim} not divisible by 16", params=params)
+    if model_dim % tile_n != 0:
+        return TileCheck(
+            False,
+            2,
+            "model_dim_not_div_tile_n",
+            f"model_dim={model_dim} not divisible by tile_n={tile_n}",
+            params=params,
+        )
+
+    # inter_dim (= stage2 K) must be divisible by tile_k.
+    if inter_dim % tile_k != 0:
+        return TileCheck(
+            False,
+            2,
+            "inter_dim_not_div_tile_k",
+            f"inter_dim={inter_dim} not divisible by tile_k={tile_k}",
+            params=params,
+        )
+
+    # tile_k_bytes % 64.
+    tile_k_bytes = tile_k * a_elem_bytes
+    if tile_k_bytes % 64 != 0:
+        return TileCheck(
+            False, 2, "tile_k_bytes_not_div_64", f"tile_k_bytes={tile_k_bytes} not divisible by 64", params=params
+        )
+
+    # total_threads is a fixed 256 in stage2.
+    bytes_x_per_tile = tile_m * tile_k * a_elem_bytes
+    if bytes_x_per_tile % 256 != 0:
+        return TileCheck(
+            False,
+            2,
+            "tile_load_not_div_256",
+            f"tile_m*tile_k*elem_bytes={bytes_x_per_tile} not divisible by 256",
+            params=params,
+        )
+    # gmem load mapping: bytes_per_thread must be divisible by 4.
+    if (bytes_x_per_tile // 256) % 4 != 0:
+        return TileCheck(
+            False,
+            2,
+            "bytes_per_thread_not_div_4",
+            f"bytes_per_thread_x={bytes_x_per_tile // 256} not divisible by 4",
+            params=params,
+        )
+
+    # sort_block_m must be a multiple of tile_m (0 -> equals tile_m, always legal).
+    eff_sort_block_m = tile_m if sort_block_m <= 0 else sort_block_m
+    if eff_sort_block_m != tile_m and eff_sort_block_m % tile_m != 0:
+        return TileCheck(
+            False,
+            2,
+            "sort_block_m_not_mult_tile_m",
+            f"sort_block_m={eff_sort_block_m} not a multiple of tile_m={tile_m}",
+            params=params,
+        )
+
+    # LDS fits the arch limit.
+    lds = stage2_lds_bytes(tile_m=tile_m, tile_n=tile_n, tile_k=tile_k, a_dtype=a_dtype)
+    limit = LDS_LIMIT_BYTES.get(gpu_arch, 0)
+    if limit and lds > limit:
+        return TileCheck(
+            False, 2, "lds_over_limit", f"stage2 LDS {lds} > {gpu_arch} limit {limit}", lds_bytes=lds, params=params
+        )
+
+    return TileCheck(True, 2, lds_bytes=lds, params=params)
+
+
+def check_tile_config(
+    *,
+    stage: int,
+    model_dim: int,
+    inter_dim: int,
+    tile_m: int,
+    tile_n: int,
+    tile_k: int,
+    a_dtype: str = "fp4",
+    out_dtype: str = "f16",
+    k_batch: int = 1,
+    waves_per_eu: int = 4,
+    sort_block_m: int = 0,
+    gpu_arch: str = "gfx950",
+) -> TileCheck:
+    """Check whether a single tile candidate is legal for ``stage`` (1 or 2).
+
+    Mirrors the pre-compile constraints in ``compile_mixed_moe_gemm1`` /
+    ``compile_mixed_moe_gemm2`` so the candidate never reaches a compile the
+    kernel would reject.  ``a_dtype`` is ``"fp4"`` for a4w4 and ``"fp8"`` for
+    a8w4 (the activation operand); the weight operand is fp4 in both cases.
+
+    Returns a :class:`TileCheck`; ``.legal`` is the accept/reject decision and
+    ``.reason`` is a machine-readable token on rejection.
+    """
+    params = {
+        "model_dim": model_dim,
+        "inter_dim": inter_dim,
+        "tile_m": tile_m,
+        "tile_n": tile_n,
+        "tile_k": tile_k,
+        "a_dtype": a_dtype,
+        "out_dtype": out_dtype,
+        "k_batch": k_batch,
+        "waves_per_eu": waves_per_eu,
+        "sort_block_m": sort_block_m,
+        "gpu_arch": gpu_arch,
+    }
+    if a_dtype not in _A_ELEM_BYTES:
+        return TileCheck(False, stage, "bad_a_dtype", f"a_dtype={a_dtype!r} not supported", params=params)
+
+    if stage == 1:
+        return _check_stage1(
+            model_dim=model_dim,
+            inter_dim=inter_dim,
+            tile_m=tile_m,
+            tile_n=tile_n,
+            tile_k=tile_k,
+            a_dtype=a_dtype,
+            out_dtype=out_dtype,
+            k_batch=k_batch,
+            waves_per_eu=waves_per_eu,
+            gpu_arch=gpu_arch,
+            params=params,
+        )
+    if stage == 2:
+        return _check_stage2(
+            model_dim=model_dim,
+            inter_dim=inter_dim,
+            tile_m=tile_m,
+            tile_n=tile_n,
+            tile_k=tile_k,
+            a_dtype=a_dtype,
+            sort_block_m=sort_block_m,
+            gpu_arch=gpu_arch,
+            params=params,
+        )
+    return TileCheck(False, stage, "bad_stage", f"stage must be 1 or 2, got {stage}", params=params)
+
+
+def enumerate_legal_configs(
+    *,
+    stage: int,
+    model_dim: int,
+    inter_dim: int,
+    a_dtype: str,
+    tile_m_choices,
+    tile_n_choices,
+    tile_k_choices,
+    out_dtype: str = "f16",
+    k_batch_choices=(1,),
+    waves_per_eu_choices=(4,),
+    sort_block_m_choices=(0,),
+    gpu_arch: str = "gfx950",
+    rejected_log: Optional[list] = None,
+):
+    """Yield every legal tile candidate from the cross product of the choices.
+
+    Rejected candidates are appended (as ``TileCheck.as_record()`` dicts) to
+    ``rejected_log`` when provided, so the search never silently drops a
+    candidate without a machine-readable reason.
+    """
+    legal = []
+    for tile_m in tile_m_choices:
+        for tile_n in tile_n_choices:
+            for tile_k in tile_k_choices:
+                for k_batch in (k_batch_choices if stage == 1 else (1,)):
+                    for waves_per_eu in (waves_per_eu_choices if stage == 1 else (4,)):
+                        for sort_block_m in (sort_block_m_choices if stage == 2 else (0,)):
+                            res = check_tile_config(
+                                stage=stage,
+                                model_dim=model_dim,
+                                inter_dim=inter_dim,
+                                tile_m=tile_m,
+                                tile_n=tile_n,
+                                tile_k=tile_k,
+                                a_dtype=a_dtype,
+                                out_dtype=out_dtype,
+                                k_batch=k_batch,
+                                waves_per_eu=waves_per_eu,
+                                sort_block_m=sort_block_m,
+                                gpu_arch=gpu_arch,
+                            )
+                            if res.legal:
+                                legal.append(res)
+                            elif rejected_log is not None:
+                                rejected_log.append(res.as_record())
+    return legal
diff --git a/kernels/moe_tuning_spec.py b/kernels/moe_tuning_spec.py
new file mode 100644
index 000000000..910bf409b
--- /dev/null
+++ b/kernels/moe_tuning_spec.py
@@ -0,0 +1,254 @@
+# SPDX-License-Identifier: MIT
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
+
+"""Locked specification for the MXFP4 MoE 2-stage tuning campaign on gfx950.
+
+This is the single source of truth for the campaign's fixed parameters: the
+target model shapes, the token sweep grid, the measurement protocol, the
+win/no-regression predicates, the MFU denominator, and the routing-distribution
+set used in correctness checks.  The measurement harness and the (later)
+shape->config dispatch both import from here so the numbers live in exactly one
+place.
+
+All values are fixed inputs locked by the user before the campaign began; do not
+change them as part of tuning.  Tuning changes tile configs, not these gates.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Tuple
+
+# --- MFU denominator -------------------------------------------------------
+# Empirically measured fp4 GEMM ceiling on the target MI350X (gfx950, 256 CU,
+# sclk max 2200 MHz).  MFU = effective_TFLOPS / FP4_PEAK_TFLOPS.
+FP4_PEAK_TFLOPS = 4523.0
+
+# --- Win margins (the win-margin policy) ---------------------------------------------------
+WIN_MARGIN = 0.10  # 10% relative improvement required to claim a win.
+# Large-shape (tokens >= LARGE_TOKEN_MIN): tuned_MFU >= baseline_MFU * (1 + WIN_MARGIN).
+# Small-token (tokens <= SMALL_TOKEN_MAX): tuned_us <= baseline_us * (1 - WIN_MARGIN)
+#   AND (baseline_us - tuned_us) >= ABS_US_BAND.
+
+# --- No-regression tolerance + protocol (the no-regression policy) ----------------------------
+REGRESSION_REL = 0.02  # 2% relative.
+ABS_US_BAND = 2.0  # microseconds; default absolute floor (tokens >= 128).
+
+# Regime-aware absolute floor (user-approved amendment).  On this shared node the
+# small/low-token absolute latency is tiny (~30-300 us) and run-to-run jitter is
+# ~3-7 us even after the in-protocol controls are exhausted (faithful L2-flush
+# argument rotation, repeated measurement, AND harness-verified clock pinning).
+# This is irreducible measurement noise at tiny absolute latency, not a harness
+# defect: under the 8 us small-token floor the residual a4w4 repeatability
+# instability is confined to a single mid-token point (token 128, under the strict
+# 2 us tokens>=128 floor) plus the e2e guardrail outlier (token 64) -- i.e. the
+# small-token (<=64) kernel-path band is satisfied; tokens >= 128 keep the strict
+# 2 us floor.  8 us is still far below the small-token win threshold (>= 10% AND
+# >= 2 us; 10% of even the smallest ~127 us point is ~12.7 us), so widening the
+# band does NOT weaken win detection.  Floor is regime-aware: 8 us for
+# tokens <= SMALL_TOKEN_MAX, 2 us otherwise.
+SMALL_TOKEN_ABS_US_BAND = 8.0
+
+
+def abs_floor_us(token: int) -> float:
+    """Regime-aware absolute floor for the no-regression / repeatability band.
+
+    8 us for the small-token regime (tokens <= SMALL_TOKEN_MAX), 2 us otherwise.
+    Used together with the 2% relative term as ``max(2%, abs_floor_us(token))``.
+    """
+    return SMALL_TOKEN_ABS_US_BAND if token <= SMALL_TOKEN_MAX else ABS_US_BAND
+
+
+WARMUP_ITERS = 10
+BENCH_ITERS = 100
+# Reported statistics per point.
+REPORT_STATS = ("median", "p95")
+# Protocol flags (recorded with every measurement; runs under other settings are
+# non-comparable).
+GRAPH_CAPTURE = False
+L2_FLUSH_PER_ITER = True
+CLOCKS_PINNED = True
+
+# --- Token regimes (the win-margin policy / the target-bucket policy) -----------------------------------------
+LARGE_TOKEN_MIN = 4096  # MFU regime.
+SMALL_TOKEN_MAX = 64  # latency regime.
+# Predeclared MFU target buckets (the target-bucket policy): the two largest in-sweep tokens.
+MFU_TARGET_BUCKETS: Tuple[int, ...] = (16384, 32768)
+
+# --- Token grids (the token-grid policy) ---------------------------------------------------
+TOKEN_GRID_FULL: Tuple[int, ...] = (
+    1,
+    2,
+    4,
+    8,
+    16,
+    32,
+    64,
+    128,
+    256,
+    512,
+    1024,
+    2048,
+    4096,
+    8192,
+    16384,
+    32768,
+)
+TOKEN_GRID_GPTOSS: Tuple[int, ...] = (256, 512, 1024, 2048, 4096, 8192, 16384, 32768)
+
+# --- Routing distributions for correctness (the routing-distribution policy) -------------------------
+ROUTING_DISTRIBUTIONS: Tuple[str, ...] = (
+    "default",
+    "uniform",
+    "expert_skewed",
+    "few_active",
+    "all_active",
+    "sentinel_padding",
+)
+
+# --- Node environment (the node/shape policy) ----------------------------------------------
+TARGET_ARCH = "gfx950"
+
+
+@dataclass(frozen=True)
+class ModelShape:
+    """One target MoE model shape and its in-scope quant dtypes.
+
+    ``dtypes`` are the activation x weight quant aliases in scope for this loop:
+    ``"a4w4"`` (fp4 x fp4) and/or ``"a8w4"`` (fp8 x fp4).  ``i4`` is out of scope.
+    ``token_grid`` is the sweep used for this model (the token-grid policy).
+    """
+
+    name: str
+    model_dim: int
+    inter_dim: int
+    experts: int
+    topk: int
+    act: str  # "silu" or "swiglu"
+    dtypes: Tuple[str, ...]
+    token_grid: Tuple[int, ...]
+
+
+# The four target models (the node/shape policy + plan workload table).  DeepSeek V4 is a8w4
+# only; i4 (Kimi a16wi4) is excluded from this loop.
+MODELS: Tuple[ModelShape, ...] = (
+    ModelShape("deepseek_v3", 7168, 256, 257, 9, "silu", ("a4w4", "a8w4"), TOKEN_GRID_FULL),
+    ModelShape("deepseek_v4", 7168, 512, 385, 7, "silu", ("a8w4",), TOKEN_GRID_FULL),
+    ModelShape("kimi_k2", 7168, 256, 384, 8, "silu", ("a4w4", "a8w4"), TOKEN_GRID_FULL),
+    ModelShape("gpt_oss", 3072, 3072, 128, 4, "swiglu", ("a4w4", "a8w4"), TOKEN_GRID_GPTOSS),
+)
+
+# Map a quant alias to the activation operand dtype passed to the kernel builder
+# (the weight operand is fp4 in both in-scope cases).
+DTYPE_ALIAS_TO_A_DTYPE = {"a4w4": "fp4", "a8w4": "fp8"}
+
+# --- Correctness quarantine (non-fp4-activation e2e is environment-blocked) ---
+# Controlled evidence (direct aiter test_fmoe, each model's true activation, both
+# gate modes, token=16) shows the failing axis is the ACTIVATION operand being
+# non-fp4:
+#   a4w4  (fp4 activation):  logits_diff ~1e-5  -> PASS (all models, both gates)
+#   a8w4  (fp8 activation):  logits_diff ~0.98  -> FAIL (DS V3/V4, Kimi; both gates)
+#   a16w4 (bf16 activation): logits_diff ~0.98  -> FAIL (DS V3; both gates)
+#   GPT-OSS a8w4 Swiglu+INTERLEAVE: ~6e-6       -> PASS (lone non-fp4-act pass;
+#     aiter selects a different runtime q_dtype_a/fuse-quant path there)
+# fp8 AND bf16 activation both fail with fp4 weight; only fp4 activation passes.
+# Note: aiter test_fmoe passes the SAME activation/gate to BOTH its torch
+# reference and the kernel, so the activation choice alone cannot explain the
+# mismatch.
+#
+# Root cause is an activation-dtype-dependent wrapper/layout CONTRACT mismatch in
+# the aiter e2e path, NOT a proven FlyDSL kernel math bug -- this checkout's own
+# tests/kernels/test_moe_gemm.py --in_dtype a8w4 passes with --skip_ref false.
+# For non-fp4 activation aiter preps weights via shuffle_weight_a16w4 /
+# shuffle_scale_a16w4 and its reference sets a2_scale=None (no stage1->stage2 A2
+# requant), while the FlyDSL mixed stage2 kernel expects a pre-scattered A2 E8M0
+# scale (mixed_moe_gemm_2stage.py); this checkout's own 2-stage harness does
+# requantize A2 and passes.  Reconciling this is aiter-environment integration
+# work, outside the GEMM-tuning scope.
+#
+# All a8w4 (model, dtype) pairs are therefore QUARANTINED until the e2e a8w4
+# correctness path is validated.  Their rows are kept for provenance but excluded
+# from the validated baseline and from any win claim -- a genuine correctness
+# block, not a silent scope reduction.
+QUARANTINED_SHAPES: Tuple[Tuple[str, str], ...] = (
+    ("deepseek_v3", "a8w4"),
+    ("deepseek_v4", "a8w4"),
+    ("kimi_k2", "a8w4"),
+    ("gpt_oss", "a8w4"),
+)
+
+
+def is_quarantined(model: str, dtype: str) -> bool:
+    """True if (model, dtype) is correctness-quarantined (see QUARANTINED_SHAPES)."""
+    return (model, dtype) in QUARANTINED_SHAPES
+
+
+def validated_models():
+    """Yield (ModelShape, dtype) pairs that are NOT correctness-quarantined."""
+    for m in MODELS:
+        for dtype in m.dtypes:
+            if not is_quarantined(m.name, dtype):
+                yield m, dtype
+
+
+def validated_point_keys() -> set:
+    """(model, dtype, act, token) keys for the correctness-passing subset.
+
+    This is the workload the validated baseline must fully cover; the quarantined
+    a8w4 shapes are excluded until their correctness path is fixed.
+    """
+    keys = set()
+    for m, dtype in validated_models():
+        for token in m.token_grid:
+            keys.add((m.name, dtype, m.act, str(token)))
+    return keys
+
+
+def is_large_token(token: int) -> bool:
+    """True if ``token`` is in the large-shape MFU regime (tokens >= 4096)."""
+    return token >= LARGE_TOKEN_MIN
+
+
+def is_small_token(token: int) -> bool:
+    """True if ``token`` is in the small-token latency regime (tokens <= 64)."""
+    return token <= SMALL_TOKEN_MAX
+
+
+def is_regression(baseline_us: float, tuned_us: float, token: int = None) -> bool:
+    """No-regression gate (the no-regression policy): regression iff BOTH the
+    relative AND absolute bands are exceeded — ``tuned > baseline*1.02`` AND
+    ``tuned-baseline > abs_floor``.
+
+    The absolute floor is regime-aware (``abs_floor_us(token)``): 8 us for
+    tokens <= SMALL_TOKEN_MAX, 2 us otherwise.  When ``token`` is None the strict
+    2 us floor is used (back-compatible).  Applied per point on BOTH the
+    kernel-path and e2e metrics; a point is a regression if either metric regresses.
+    """
+    floor = ABS_US_BAND if token is None else abs_floor_us(token)
+    return (tuned_us > baseline_us * (1.0 + REGRESSION_REL)) and ((tuned_us - baseline_us) > floor)
+
+
+def is_large_shape_win(baseline_mfu: float, tuned_mfu: float) -> bool:
+    """Large-shape win gate (the win-margin policy): ``tuned_MFU >= baseline_MFU * 1.10``."""
+    return tuned_mfu >= baseline_mfu * (1.0 + WIN_MARGIN)
+
+
+def is_small_token_win(baseline_us: float, tuned_us: float) -> bool:
+    """Small-token win gate (the win-margin policy): both a relative and an absolute floor —
+    ``tuned_us <= baseline_us*0.90`` AND ``(baseline_us - tuned_us) >= 2us``.
+
+    The absolute floor rejects sub-microsecond percentage-only claims.
+    """
+    return (tuned_us <= baseline_us * (1.0 - WIN_MARGIN)) and ((baseline_us - tuned_us) >= ABS_US_BAND)
+
+
+def effective_tflops(token: int, model_dim: int, inter_dim: int, topk: int, combined_us: float) -> float:
+    """Combined effective TFLOPS per the aiter test_moe_2stage formula:
+    ``token*model_dim*inter_dim*3*topk*2 / us`` (us in microseconds).
+    """
+    return token * model_dim * inter_dim * 3 * topk * 2 / combined_us / 1e6
+
+
+def mfu(effective_tflops_value: float) -> float:
+    """MFU = effective TFLOPS / fp4 peak (4523 TFLOPS)."""
+    return effective_tflops_value / FP4_PEAK_TFLOPS
diff --git a/scripts/aiter_strict_point.py b/scripts/aiter_strict_point.py
new file mode 100644
index 000000000..324dcf94f
--- /dev/null
+++ b/scripts/aiter_strict_point.py
@@ -0,0 +1,196 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: MIT
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
+
+"""Run ONE aiter MoE point through the strict, AOT-checked, model-correct path.
+
+This replaces the aiter *legacy CLI* path (which sets ``strict_accuracy=False``,
+``check_aot_cache=False``, hardcodes ``ActivationType.Swiglu`` for the fp8/fp4
+case, and times with warmup=2/iters=5) with a direct call to aiter's
+``test_fmoe`` using:
+
+* the model's TRUE activation and gate mode (passed by the caller),
+* ``strict_accuracy=True`` and ``check_aot_cache=True`` (the AOT-cache-wrapped
+  variant ``test_fmoe_with_aot_cache_check`` — so an AOT-cache miss raises),
+* the locked e2e measurement protocol (warmup/iters injected by monkeypatching
+  the module's ``run_perftest`` reference).
+
+It prints one machine-readable ``STRICT_RESULT {json}`` line with e2e us,
+logits_diff, correctness pass/fail, and the strict/AOT/protocol flags actually
+used, which ``moe_tuning_harness.parse_strict_aiter_output`` consumes.
+
+Usage:
+  python3 scripts/aiter_strict_point.py \
+    --model-dim 7168 --inter-dim 256 -e 257 -k 9 -t 16 \
+    --aq fp4 --wq fp4 --act silu --gate separated \
+    [--warmup 10 --iters 100] [--no-aot] [--aiter-repo /sgl-workspace/aiter]
+"""
+
+from __future__ import annotations
+
+import argparse
+import importlib.util
+import json
+import sys
+
+
+def _load_aiter_module(aiter_repo: str):
+    """Import test_moe_2stage.py without running its default CLI sweep.
+
+    The module has no ``__main__`` guard, so executing it runs the bottom sweep;
+    we set argv to ``--no-legacy --no-flydsl-csv`` first to make that sweep empty.
+    """
+    sys.argv = ["test_moe_2stage.py", "--no-legacy", "--no-flydsl-csv"]
+    path = f"{aiter_repo}/op_tests/test_moe_2stage.py"
+    spec = importlib.util.spec_from_file_location("aiter_test_moe_2stage", path)
+    mod = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(mod)
+    return mod
+
+
+_DTYPES = {}
+
+
+def _resolve_dtypes():
+    from aiter import dtypes
+
+    return {
+        "fp4": dtypes.fp4x2,
+        "fp8": dtypes.fp8,
+        "bf16": dtypes.bf16,
+        "fp16": dtypes.fp16,
+    }
+
+
+def main(argv=None) -> int:
+    ap = argparse.ArgumentParser(description="strict single-case aiter MoE guardrail")
+    ap.add_argument("--model-dim", type=int, required=True)
+    ap.add_argument("--inter-dim", type=int, required=True)
+    ap.add_argument("-e", "--experts", type=int, required=True)
+    ap.add_argument("-k", "--topk", type=int, required=True)
+    ap.add_argument("-t", "--token", type=int, required=True)
+    ap.add_argument("--aq", required=True, help="activation quant dtype: fp4|fp8|bf16")
+    ap.add_argument("--wq", default="fp4", help="weight quant dtype (fp4)")
+    ap.add_argument("--act", required=True, help="silu|swiglu")
+    ap.add_argument("--gate", default="separated", help="separated|interleave")
+    ap.add_argument("--warmup", type=int, default=10)
+    ap.add_argument("--iters", type=int, default=100)
+    ap.add_argument("--no-aot", action="store_true", help="disable AOT-cache check (records it)")
+    ap.add_argument("--aiter-repo", default="/sgl-workspace/aiter")
+    args = ap.parse_args(argv)
+
+    mod = _load_aiter_module(args.aiter_repo)
+    import aiter
+
+    dts = _resolve_dtypes()
+    aq, wq = dts[args.aq], dts[args.wq]
+    act = getattr(aiter.ActivationType, args.act.capitalize())
+    check_aot = not args.no_aot
+
+    # Inject the locked e2e protocol by wrapping the module's run_perftest so the
+    # internal warmup=2/iters=5 are overridden with the locked values.
+    _orig_run_perftest = mod.run_perftest
+
+    # True timed-loop e2e distribution: after a warmup, time the fused_moe call per
+    # iteration (median + p95 over `iters`) IN ADDITION TO aiter's own rotated
+    # average.  We keep aiter's rotated average as the median e2e_us (it defeats L2
+    # via arg rotation, matching the L2-flush intent and staying comparable across
+    # runs) and use the per-iteration loop only for the e2e p95 dispersion.
+    e2e_dist = {"median": None, "p95": None}
+    # run_perftest's own control kwargs are NOT forwarded to the timed callable.
+    _PERF_CTRL_KW = ("num_iters", "num_warmup", "testGraph", "num_rotate_args", "needTrace")
+
+    def _locked_run_perftest(func, *a, **kw):
+        # aiter's rotated average (locked warmup/iters) -> the comparable median.
+        kw_avg = dict(kw)
+        kw_avg["num_iters"] = args.iters
+        kw_avg["num_warmup"] = args.warmup
+        data, avg = _orig_run_perftest(func, *a, **kw_avg)
+        e2e_dist["median"] = avg
+        # Per-iteration p95 dispersion (best-effort; does not change the median).
+        try:
+            import torch
+
+            call_kw = {k: v for k, v in kw.items() if k not in _PERF_CTRL_KW}
+            lat = []
+            ev0 = torch.cuda.Event(enable_timing=True)
+            ev1 = torch.cuda.Event(enable_timing=True)
+            for _ in range(max(1, args.iters)):
+                ev0.record()
+                func(*a, **call_kw)
+                ev1.record()
+                ev1.synchronize()
+                lat.append(ev0.elapsed_time(ev1) * 1000.0)  # ms -> us
+            ordered = sorted(lat)
+            idx = max(0, min(len(ordered) - 1, int(round(0.95 * (len(ordered) - 1)))))
+            e2e_dist["p95"] = ordered[idx]
+        except Exception:
+            e2e_dist["p95"] = None
+        return data, avg
+
+    mod.run_perftest = _locked_run_perftest
+
+    test_fn = mod.test_fmoe_with_aot_cache_check if check_aot else mod.test_fmoe
+
+    result = {
+        "strict_accuracy": True,
+        "check_aot_cache": check_aot,
+        "warmup": args.warmup,
+        "iters": args.iters,
+        "act": args.act,
+        "gate": args.gate,
+        "aq": args.aq,
+        "wq": args.wq,
+    }
+    try:
+        ret = test_fn(
+            aiter.dtypes.bf16,
+            args.token,
+            args.model_dim,
+            args.inter_dim,
+            args.experts,
+            args.topk,
+            act,
+            args.gate,
+            aiter.QuantType.per_1x32,
+            aq,
+            wq,
+            use_g1u1=True,
+            doweight_stage1=False,
+            strict_accuracy=True,
+            check_aot_cache=check_aot,
+        )
+        if ret is None:
+            result.update({"error": "skipped_or_none", "error_category": "skipped", "correctness_pass": False})
+        else:
+            ld = float(ret["logits_diff"])
+            result.update(
+                {
+                    "e2e_us": e2e_dist["median"] if e2e_dist["median"] is not None else float(ret["us"]),
+                    "e2e_us_p95": e2e_dist["p95"],
+                    "logits_diff": ld,
+                    "correctness_pass": ld <= 0.01,
+                    "error_category": "" if ld <= 0.01 else "correctness",
+                }
+            )
+    except Exception as e:  # AOT miss, strict assertion, or runtime error.
+        name = type(e).__name__
+        msg = str(e)
+        if "AOT cache miss" in msg:
+            cat = "aot_miss"
+        elif name == "AssertionError" or "accuracy check failed" in msg:
+            cat = "correctness"
+        elif "out of memory" in msg.lower() or "OOM" in msg:
+            cat = "oom"
+        else:
+            cat = "runtime"
+        result.update({"error": f"{name}: {msg[:200]}", "error_category": cat, "correctness_pass": False})
+    finally:
+        mod.run_perftest = _orig_run_perftest
+
+    print("STRICT_RESULT " + json.dumps(result), flush=True)
+    return 0 if result.get("correctness_pass") else 1
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/scripts/moe_tuning_harness.py b/scripts/moe_tuning_harness.py
new file mode 100644
index 000000000..8f9b3ba8d
--- /dev/null
+++ b/scripts/moe_tuning_harness.py
@@ -0,0 +1,1157 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: MIT
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
+
+"""Measurement harness for the MXFP4 MoE 2-stage tuning campaign on gfx950.
+
+The harness emits a per-point CSV that is the single reference table every
+candidate is compared against.  Two measurement paths feed it:
+
+* **Per-stage kernel-path us** comes from the FlyDSL ``tests/kernels/test_moe_gemm.py``
+  benchmark, which prints ``FlyDSL MoE stage1[..]`` / ``FlyDSL MoE stage2 [..]``
+  lines with per-stage us.  Combined kernel-path us = stage1 + stage2 + sorting.
+* **Strict correctness + full fused-MoE e2e us** comes from the aiter
+  ``op_tests/test_moe_2stage.py`` harness (``strict_accuracy``,
+  ``logits_diff <= 0.01``, ``fail_on_aot_cache_miss``).  That harness times the
+  whole ``fused_moe`` call as the e2e guardrail.
+
+Every row records full provenance (GPU id+model, branch+commit, exact command,
+shape, dtype+act, warmup/iters, idle-GPU check) and the resolved metric formula,
+under the locked protocol in :mod:`kernels.moe_tuning_spec`.
+
+This module keeps the parsing / metric / provenance / CSV logic as pure
+functions so they are unit-testable without a GPU.  The live sweep driver
+(:func:`run_point`) shells out to the two harnesses and is intended to run on the
+fixed idle gfx950 node.
+"""
+
+from __future__ import annotations
+
+import csv
+import json
+import os
+import re
+import shlex
+import statistics
+import subprocess
+import sys
+from dataclasses import dataclass
+from typing import Dict, List, Optional
+
+_REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+if _REPO_ROOT not in sys.path:
+    sys.path.insert(0, _REPO_ROOT)
+
+from kernels import moe_tuning_spec as spec  # noqa: E402
+
+# CSV columns: provenance first, then shape/config, then metrics.
+CSV_COLUMNS = [
+    # provenance
+    "gpu_id",
+    "gpu_model",
+    "branch",
+    "commit",
+    "command",
+    "warmup",
+    "iters",
+    "idle_gpu_verified",
+    "graph_capture",
+    "l2_flush_per_iter",
+    "clocks_pinned",
+    "metric_formula",
+    # shape / config
+    "model",
+    "model_dim",
+    "inter_dim",
+    "experts",
+    "topk",
+    "dtype",
+    "act",
+    "token",
+    "tile_m1",
+    "tile_n1",
+    "tile_k1",
+    "tile_m2",
+    "tile_n2",
+    "tile_k2",
+    # metrics (median + p95 over iters)
+    "stage1_us",
+    "stage2_us",
+    "sorting_us",
+    "kernel_path_us",
+    "kernel_path_us_p95",
+    "effective_tflops",
+    "mfu",
+    "e2e_us",
+    "e2e_us_p95",
+    "logits_diff",
+    "correctness_pass",
+    # failure provenance (auditable for quarantined / failing rows)
+    "flydsl_command",
+    "strict_error",
+    "error_category",
+    "aot_status",
+]
+
+METRIC_FORMULA = (
+    "effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523"
+)
+
+# Print formats from tests/kernels/test_moe_gemm.py (the first us is the median;
+# an optional " p95=<v> us" suffix appears when FLYDSL_PERF_DIST is set):
+#   "FlyDSL MoE stage1[fp4]: 1163.2 us, p95=1170.0 us 1654.24 TFLOPS(...), 0.377 TB/s (...)"
+#   "FlyDSL MoE stage2 [moe_gemm2] fp4 atomic | ... | 1163.2 us, p95=1170.0 us 1654.24 TFLOPS, 0.377 TB/s"
+_STAGE1_RE = re.compile(r"FlyDSL MoE stage1\[[^\]]+\]:\s*([0-9.]+)\s*us")
+_STAGE2_RE = re.compile(r"FlyDSL MoE stage2 \[[^\]]+\]\s+\S+\s+(atomic|reduce)\b.*?([0-9.]+)\s*us")
+# Optional per-stage p95 suffix.
+_STAGE1_P95_RE = re.compile(r"FlyDSL MoE stage1\[[^\]]+\]:\s*[0-9.]+\s*us,\s*p95=([0-9.]+)\s*us")
+_STAGE2_P95_RE = re.compile(
+    r"FlyDSL MoE stage2 \[[^\]]+\]\s+\S+\s+(?:atomic|reduce)\b.*?[0-9.]+\s*us,\s*p95=([0-9.]+)\s*us"
+)
+# Optional sorting print, if the FlyDSL benchmark emits one.
+_SORT_RE = re.compile(r"FlyDSL MoE sort(?:ing)?[^\d]*([0-9.]+)\s*us", re.IGNORECASE)
+
+# aiter op_tests/test_moe_2stage.py full fused_moe e2e print (line 363):
+#   "ck_moe_2stages:  123.45 us,  654.00 tflops......(quant:...)"
+_AITER_E2E_RE = re.compile(r"ck_moe_2stages:\s*([0-9.]+)\s*us")
+# aiter logits_diff warning line (only printed when logits_diff > 1e-3).
+_AITER_LOGITS_RE = re.compile(r"logits_diff[:=]\s*([0-9.eE+-]+)")
+# aiter summary markdown data row: the final two numeric cells are
+# ``... | <e2e us> | <logits_diff> | <model> |``.  This carries logits_diff even
+# when it is below the 1e-3 warning threshold (so no warning line is printed).
+_AITER_MD_ROW_RE = re.compile(r"\|\s*([0-9][0-9.eE+-]*)\s*\|\s*([0-9][0-9.eE+-]*)\s*\|\s*\w+\s*\|\s*$")
+# Real correctness-miss signals: the strict-accuracy assertion or a hard error.
+# NOTE: the bare ``checkAllclose ... failed!`` line is the LOOSE elementwise check
+# and is EXPECTED for fp4; correctness is gated on logits_diff <= 0.01 per the
+# locked contract, not on that line.
+_AITER_FAIL_RE = re.compile(r"accuracy check failed|AssertionError|Traceback|RuntimeError", re.IGNORECASE)
+
+# aiter -q quant index -> dtype alias used here (see l_quant in the harness).
+DTYPE_ALIAS_TO_AITER_Q = {"a4w4": 4, "a8w4": 7}
+
+
+@dataclass
+class Provenance:
+    """Run provenance recorded with every measured point."""
+
+    gpu_id: str = ""
+    gpu_model: str = ""
+    branch: str = ""
+    commit: str = ""
+    warmup: int = spec.WARMUP_ITERS
+    iters: int = spec.BENCH_ITERS
+    idle_gpu_verified: bool = False
+    graph_capture: bool = spec.GRAPH_CAPTURE
+    l2_flush_per_iter: bool = spec.L2_FLUSH_PER_ITER
+    # NOT proof until verified: defaults False so a row never claims pinned clocks
+    # unless the driver enabled performance determinism AND verified the state.
+    # (spec.CLOCKS_PINNED is the protocol's INTENT, not evidence.)
+    clocks_pinned: bool = False
+    metric_formula: str = METRIC_FORMULA
+
+    REQUIRED_FIELDS = ("gpu_id", "gpu_model", "branch", "commit", "warmup", "iters")
+
+    def missing_fields(self) -> List[str]:
+        """Required provenance fields that are empty/unset (the baseline contract negative gate)."""
+        missing = []
+        for f in self.REQUIRED_FIELDS:
+            v = getattr(self, f)
+            if v in ("", None):
+                missing.append(f)
+        return missing
+
+    def is_complete(self) -> bool:
+        return not self.missing_fields()
+
+
+@dataclass
+class PointRow:
+    """One per-point measurement row (provenance + shape/config + metrics)."""
+
+    provenance: Provenance
+    command: str
+    model: str
+    model_dim: int
+    inter_dim: int
+    experts: int
+    topk: int
+    dtype: str
+    act: str
+    token: int
+    tile_m1: int = 0
+    tile_n1: int = 0
+    tile_k1: int = 0
+    tile_m2: int = 0
+    tile_n2: int = 0
+    tile_k2: int = 0
+    stage1_us: Optional[float] = None
+    stage2_us: Optional[float] = None
+    sorting_us: Optional[float] = None
+    kernel_path_us: Optional[float] = None
+    kernel_path_us_p95: Optional[float] = None
+    effective_tflops: Optional[float] = None
+    mfu: Optional[float] = None
+    e2e_us: Optional[float] = None
+    e2e_us_p95: Optional[float] = None
+    logits_diff: Optional[float] = None
+    correctness_pass: Optional[bool] = None
+    flydsl_command: str = ""
+    strict_error: str = ""
+    error_category: str = ""
+    aot_status: str = ""
+
+    def to_csv_dict(self) -> dict:
+        p = self.provenance
+        row = {
+            "gpu_id": p.gpu_id,
+            "gpu_model": p.gpu_model,
+            "branch": p.branch,
+            "commit": p.commit,
+            "command": self.command,
+            "warmup": p.warmup,
+            "iters": p.iters,
+            "idle_gpu_verified": p.idle_gpu_verified,
+            "graph_capture": p.graph_capture,
+            "l2_flush_per_iter": p.l2_flush_per_iter,
+            "clocks_pinned": p.clocks_pinned,
+            "metric_formula": p.metric_formula,
+        }
+        for k in (
+            "model",
+            "model_dim",
+            "inter_dim",
+            "experts",
+            "topk",
+            "dtype",
+            "act",
+            "token",
+            "tile_m1",
+            "tile_n1",
+            "tile_k1",
+            "tile_m2",
+            "tile_n2",
+            "tile_k2",
+            "stage1_us",
+            "stage2_us",
+            "sorting_us",
+            "kernel_path_us",
+            "kernel_path_us_p95",
+            "effective_tflops",
+            "mfu",
+            "e2e_us",
+            "e2e_us_p95",
+            "logits_diff",
+            "correctness_pass",
+            "flydsl_command",
+            "strict_error",
+            "error_category",
+            "aot_status",
+        ):
+            row[k] = getattr(self, k)
+        return row
+
+
+# --- pure parsing / metric helpers (unit-testable, no GPU) -----------------
+
+
+def parse_flydsl_stage_us(stdout: str) -> dict:
+    """Extract stage1 / stage2 median us and optional p95 from FlyDSL stdout.
+
+    Returns ``{"stage1_us", "stage2_us", "stage1_p95", "stage2_p95"}`` using the
+    last matching line for each stage (the benchmarked, post-warmup print).  The
+    p95 fields are populated only when the FlyDSL benchmark was run with
+    FLYDSL_PERF_DIST (true timed-loop distribution); otherwise None.
+    """
+    s1 = _STAGE1_RE.findall(stdout)
+    s2 = _STAGE2_RE.findall(stdout)
+    s1p = _STAGE1_P95_RE.findall(stdout)
+    s2p = _STAGE2_P95_RE.findall(stdout)
+    return {
+        "stage1_us": float(s1[-1]) if s1 else None,
+        "stage2_us": float(s2[-1][1]) if s2 else None,
+        "stage1_p95": float(s1p[-1]) if s1p else None,
+        "stage2_p95": float(s2p[-1]) if s2p else None,
+    }
+
+
+def parse_flydsl_sorting_us(stdout: str) -> Optional[float]:
+    """Extract sorting us from FlyDSL stdout if present, else None (sorting is 0)."""
+    m = _SORT_RE.findall(stdout)
+    return float(m[-1]) if m else None
+
+
+def parse_aiter_output(stdout: str) -> dict:
+    """Extract e2e us, logits_diff, and correctness pass/fail from aiter stdout.
+
+    The aiter ``op_tests/test_moe_2stage.py`` harness times the whole fused_moe
+    call (the e2e guardrail) and logs ``ck_moe_2stages: <us> us``; the
+    per-case ``us`` and ``logits_diff`` also appear in the final summary markdown
+    row (which carries logits_diff even when it is below the 1e-3 warning
+    threshold).  Correctness is gated on ``logits_diff <= 0.01`` (the locked
+    contract) plus the absence of a hard assertion/error; the bare loose
+    ``checkAllclose ... failed!`` line is expected for fp4 and is NOT a miss.
+
+    ``correctness_pass`` requires an e2e number, a logits_diff, ``logits_diff <=
+    0.01``, and no hard failure.
+    """
+    md = _AITER_MD_ROW_RE.findall(stdout)
+    md_e2e = float(md[-1][0]) if md else None
+    md_logits = float(md[-1][1]) if md else None
+
+    e2e_line = _AITER_E2E_RE.findall(stdout)
+    logits_line = _AITER_LOGITS_RE.findall(stdout)
+    e2e_us = float(e2e_line[-1]) if e2e_line else md_e2e
+    # Prefer the markdown logits cell (always present); fall back to the warning line.
+    logits_diff = md_logits if md_logits is not None else (float(logits_line[-1]) if logits_line else None)
+
+    failed = bool(_AITER_FAIL_RE.search(stdout))
+    correctness_pass = (e2e_us is not None) and (logits_diff is not None) and (logits_diff <= 0.01) and (not failed)
+    return {"e2e_us": e2e_us, "logits_diff": logits_diff, "correctness_pass": correctness_pass}
+
+
+def parse_strict_aiter_output(stdout: str) -> dict:
+    """Parse the ``STRICT_RESULT {json}`` line from ``scripts/aiter_strict_point.py``.
+
+    Returns ``{"e2e_us", "logits_diff", "correctness_pass", "error"}``.  The strict
+    runner already applies ``strict_accuracy=True`` + ``logits_diff <= 0.01``, so
+    ``correctness_pass`` is authoritative; an AOT miss or strict assertion is
+    reported as ``error`` with ``correctness_pass=False``.
+    """
+    line = None
+    for ln in stdout.splitlines():
+        if ln.startswith("STRICT_RESULT "):
+            line = ln[len("STRICT_RESULT ") :]
+    empty = {
+        "e2e_us": None,
+        "e2e_us_p95": None,
+        "logits_diff": None,
+        "correctness_pass": False,
+        "error": "no_strict_result",
+        "error_category": "no_result",
+        "aot_status": "",
+    }
+    if line is None:
+        return empty
+    try:
+        d = json.loads(line)
+    except json.JSONDecodeError:
+        return {**empty, "error": "bad_strict_json", "error_category": "bad_json"}
+    return {
+        "e2e_us": d.get("e2e_us"),
+        "e2e_us_p95": d.get("e2e_us_p95"),
+        "logits_diff": d.get("logits_diff"),
+        "correctness_pass": bool(d.get("correctness_pass")),
+        "error": d.get("error", ""),
+        "error_category": d.get("error_category", ""),
+        "aot_status": "checked" if d.get("check_aot_cache") else "no_aot",
+    }
+
+
+def combined_kernel_path_us(stage1_us: float, stage2_us: float, sorting_us: float = 0.0) -> float:
+    """Combined kernel-path latency = stage1 + stage2 + sorting (microseconds)."""
+    return float(stage1_us) + float(stage2_us) + float(sorting_us)
+
+
+def summarize(samples: List[float]) -> dict:
+    """Median + p95 over a list of per-iter latencies (the locked statistics)."""
+    if not samples:
+        return {"median": None, "p95": None}
+    ordered = sorted(samples)
+    median = statistics.median(ordered)
+    # Nearest-rank p95.
+    idx = max(0, min(len(ordered) - 1, int(round(0.95 * (len(ordered) - 1)))))
+    return {"median": median, "p95": ordered[idx]}
+
+
+def compute_metrics(*, token: int, model_dim: int, inter_dim: int, topk: int, combined_us: float) -> dict:
+    """Effective TFLOPS + MFU for a combined kernel-path us, via the spec formula."""
+    tflops = spec.effective_tflops(token, model_dim, inter_dim, topk, combined_us)
+    return {"effective_tflops": tflops, "mfu": spec.mfu(tflops)}
+
+
+# --- provenance collection (uses the host; safe no-ops when tools absent) ---
+
+
+def _run(cmd: List[str]) -> str:
+    try:
+        return subprocess.check_output(cmd, stderr=subprocess.DEVNULL, text=True).strip()
+    except Exception:
+        return ""
+
+
+def git_provenance(repo_root: str = _REPO_ROOT) -> dict:
+    """Current branch + commit SHA of ``repo_root`` (empty strings on failure)."""
+    branch = _run(["git", "-C", repo_root, "rev-parse", "--abbrev-ref", "HEAD"])
+    commit = _run(["git", "-C", repo_root, "rev-parse", "HEAD"])
+    return {"branch": branch, "commit": commit}
+
+
+def gpu_provenance(gpu_id: str) -> dict:
+    """GPU model name from rocm-smi for ``gpu_id`` (empty string on failure)."""
+    out = _run(["rocm-smi", "--showproductname"])
+    model = ""
+    for line in out.splitlines():
+        if "Card Series" in line:
+            model = line.split(":")[-1].strip()
+            break
+    return {"gpu_id": str(gpu_id), "gpu_model": model}
+
+
+def write_csv(rows: List[PointRow], path: str) -> None:
+    """Write per-point rows to ``path`` using the fixed CSV schema."""
+    os.makedirs(os.path.dirname(os.path.abspath(path)), exist_ok=True)
+    with open(path, "w", newline="") as f:
+        writer = csv.DictWriter(f, fieldnames=CSV_COLUMNS)
+        writer.writeheader()
+        for r in rows:
+            writer.writerow(r.to_csv_dict())
+
+
+def read_csv(path: str) -> List[dict]:
+    """Read a per-point CSV back as a list of column dicts."""
+    with open(path, newline="") as f:
+        return list(csv.DictReader(f))
+
+
+# --- workload run list (full the token-grid policy coverage from the spec) ------------------
+
+
+@dataclass(frozen=True)
+class RunPoint:
+    """One (model, dtype, act, token) point in the campaign workload."""
+
+    model: str
+    model_dim: int
+    inter_dim: int
+    experts: int
+    topk: int
+    act: str
+    dtype: str  # "a4w4" | "a8w4"
+    token: int
+
+
+def build_run_list() -> List[RunPoint]:
+    """Every model x in-scope dtype x the token-grid policy token from ``moe_tuning_spec.MODELS``.
+
+    This is the authoritative campaign workload; the harness sweeps exactly these
+    points so coverage is the full the token-grid policy grid (not a partial manual table).
+    """
+    points: List[RunPoint] = []
+    for m in spec.MODELS:
+        for dtype in m.dtypes:
+            for token in m.token_grid:
+                points.append(RunPoint(m.name, m.model_dim, m.inter_dim, m.experts, m.topk, m.act, dtype, token))
+    return points
+
+
+def expected_point_keys() -> set:
+    """The set of (model, dtype, act, token) keys the full workload must cover."""
+    return {(p.model, p.dtype, p.act, str(p.token)) for p in build_run_list()}
+
+
+def select_run_points(model=None, dtype=None, tokens=None) -> List[RunPoint]:
+    """Filter the full run list by model / dtype / token set (for candidate sweeps).
+
+    ``model`` and ``dtype`` are exact-match strings (None = all); ``tokens`` is an
+    iterable of ints (None = the model's full grid).  Lets a reproducible candidate
+    sweep target e.g. one model+dtype over chosen tokens instead of the whole grid.
+    """
+    tok_set = set(int(t) for t in tokens) if tokens else None
+    out = []
+    for rp in build_run_list():
+        if model is not None and rp.model != model:
+            continue
+        if dtype is not None and rp.dtype != dtype:
+            continue
+        if tok_set is not None and rp.token not in tok_set:
+            continue
+        out.append(rp)
+    return out
+
+
+def candidate_tile_for(rp: RunPoint, overrides: dict) -> dict:
+    """Tile config for a candidate sweep: the shape's default tiles with explicit
+    per-key overrides applied (only keys present in ``overrides`` are changed).
+
+    Raises ValueError if the resulting (stage1, stage2) tiles are illegal for the
+    shape under the pre-compile legality filter, so a candidate sweep never spends
+    GPU time on a config the kernel would reject.
+    """
+    from kernels import moe_tuning as _mt
+
+    tile = dict(default_tile_for(rp))
+    for k in ("tile_m1", "tile_n1", "tile_k1", "tile_n2", "tile_k2"):
+        if overrides.get(k) is not None:
+            tile[k] = int(overrides[k])
+    a_dtype = spec.DTYPE_ALIAS_TO_A_DTYPE[rp.dtype]
+    r1 = _mt.check_tile_config(
+        stage=1,
+        model_dim=rp.model_dim,
+        inter_dim=rp.inter_dim,
+        tile_m=tile["tile_m1"],
+        tile_n=tile["tile_n1"],
+        tile_k=tile["tile_k1"],
+        a_dtype=a_dtype,
+    )
+    r2 = _mt.check_tile_config(
+        stage=2,
+        model_dim=rp.model_dim,
+        inter_dim=rp.inter_dim,
+        tile_m=tile["tile_m1"],
+        tile_n=tile["tile_n2"],
+        tile_k=tile["tile_k2"],
+        a_dtype=a_dtype,
+    )
+    if not (r1.legal and r2.legal):
+        raise ValueError(f"illegal candidate tiles for {rp.model}/{rp.dtype}: s1={r1.reason} s2={r2.reason}")
+    return tile
+
+
+def prepare_candidate_run(overrides: dict, model=None, dtype=None, tokens=None, prov=None, command=""):
+    """Resolve a fail-closed candidate run: (run_list, per-point tiles).
+
+    Requirements (raises ValueError, recording a machine-readable rejection for
+    illegal tiles, so the caller fails closed WITHOUT writing a partial CSV):
+    - at least one explicit tile override must be given (no silent default-tile
+      fallback for candidate mode);
+    - the selection must match at least one point;
+    - EVERY selected point's tiles must pass the legality filter — the first
+      illegal point aborts the whole run (a candidate run must be all-legal).
+
+    ``prov`` (a ``Provenance``) and ``command`` (the exact top-level invocation)
+    supply the run-provenance class carried by every rejected-candidate record so
+    a rejection is as auditable as a measured attempt.  When ``prov`` is None the
+    git branch/commit are still resolved (host-side path), so the record stays
+    complete; GPU identity is then left to the caller's monkeypatch/tests.
+    """
+    import moe_tuning_ledger as _ledger
+
+    if not any(v is not None for v in overrides.values()):
+        raise ValueError("candidate mode requires at least one explicit --tile-* override")
+    run_list = select_run_points(model=model, dtype=dtype, tokens=tokens)
+    if not run_list:
+        raise ValueError("candidate selection matched no points")
+    # Provenance shared by every rejection from this run (filled from prov + git).
+    git = git_provenance()
+    base_prov = {
+        "gpu_id": getattr(prov, "gpu_id", "") or "",
+        "gpu_model": getattr(prov, "gpu_model", "") or "",
+        "branch": getattr(prov, "branch", "") or git.get("branch", ""),
+        "commit": getattr(prov, "commit", "") or git.get("commit", ""),
+        "warmup": getattr(prov, "warmup", spec.WARMUP_ITERS),
+        "iters": getattr(prov, "iters", spec.BENCH_ITERS),
+        "command": command,
+        "selection": {"model": model, "dtype": dtype, "tokens": list(tokens) if tokens else None},
+    }
+    tiles = []
+    for rp in run_list:
+        try:
+            tiles.append(candidate_tile_for(rp, overrides))
+        except ValueError as e:
+            _ledger.append_rejected_candidate(
+                {
+                    **base_prov,
+                    "model": rp.model,
+                    "dtype": rp.dtype,
+                    "act": rp.act,
+                    "token": rp.token,
+                    "stage": 0,  # candidate-tile rejection spans both stages; reason names the stage
+                    "config": {k: overrides.get(k) for k in overrides},
+                    "reason": str(e),
+                    # No measured artifact exists for a pre-compile rejection, but
+                    # the keys must be present to match a measured attempt's schema.
+                    "csv_path": "",
+                    "profile_path": "",
+                }
+            )
+            raise ValueError(f"illegal candidate at {rp.model}/{rp.dtype} t={rp.token}: {e}") from e
+    return run_list, tiles
+
+
+# --- baseline validation gate (the baseline contract negative tests) ------------------------
+
+# The locked baseline must come from this exact commit (DEC scope).
+LOCKED_BASELINE_COMMIT = "523ca1c7"
+# Identity/provenance fields every baseline row must carry beyond the protocol.
+ROW_REQUIRED_FIELDS = ("command", "dtype", "act", "model", "token")
+# Numeric metric fields every baseline row must carry, parseable as float
+# (the baseline contract + the no-regression policy: per-stage, combined kernel-path median+p95, effective TFLOPS,
+# MFU, and the e2e guardrail median+p95, plus the correctness logits_diff).
+ROW_REQUIRED_METRIC_FIELDS = (
+    "stage1_us",
+    "stage2_us",
+    "sorting_us",
+    "kernel_path_us",
+    "kernel_path_us_p95",
+    "effective_tflops",
+    "mfu",
+    "e2e_us",
+    "e2e_us_p95",
+    "logits_diff",
+)
+
+
+def _is_float(v) -> bool:
+    if v in (None, "", "None"):
+        return False
+    try:
+        float(v)
+        return True
+    except (TypeError, ValueError):
+        return False
+
+
+def validate_baseline_row(row: dict) -> List[str]:
+    """Return reasons ``row`` is NOT an acceptable locked-baseline row (empty=OK).
+
+    Rejects rows that are not from the locked commit, not idle-GPU verified, miss
+    a required provenance/identity field, miss or non-numeric any the baseline contract/the no-regression policy metric
+    field (per-stage, kernel-path median+p95, effective TFLOPS, MFU, e2e
+    median+p95, logits_diff), are not correctness_pass=True, or use a non-locked
+    protocol (warmup/iters/graph/L2/clock).
+    """
+    reasons: List[str] = []
+
+    commit = str(row.get("commit", ""))
+    if not commit:
+        reasons.append("missing_commit")
+    elif not commit.startswith(LOCKED_BASELINE_COMMIT):
+        reasons.append(f"commit_not_{LOCKED_BASELINE_COMMIT}")
+
+    if str(row.get("idle_gpu_verified", "")).lower() not in ("true", "1"):
+        reasons.append("idle_gpu_not_verified")
+
+    for f in ("gpu_id", "gpu_model", "branch", *ROW_REQUIRED_FIELDS):
+        if str(row.get(f, "")).strip() in ("", "None"):
+            reasons.append(f"missing_{f}")
+
+    # Every the baseline contract/the no-regression policy metric must be present AND numeric.
+    for f in ROW_REQUIRED_METRIC_FIELDS:
+        if not _is_float(row.get(f)):
+            reasons.append(f"missing_{f}")
+
+    # Correctness gate must have passed for this point.
+    if str(row.get("correctness_pass", "")).lower() not in ("true", "1"):
+        reasons.append("correctness_not_passed")
+
+    # Locked protocol (the no-regression policy): warmup=10, iters=100, graph OFF, L2 flush on, clocks pinned.
+    if str(row.get("warmup", "")) != str(spec.WARMUP_ITERS):
+        reasons.append("warmup_mismatch")
+    if str(row.get("iters", "")) != str(spec.BENCH_ITERS):
+        reasons.append("iters_mismatch")
+    if str(row.get("graph_capture", "")).lower() not in ("false", "0"):
+        reasons.append("graph_capture_must_be_off")
+    if str(row.get("l2_flush_per_iter", "")).lower() not in ("true", "1"):
+        reasons.append("l2_flush_must_be_on")
+    if str(row.get("clocks_pinned", "")).lower() not in ("true", "1"):
+        reasons.append("clocks_must_be_pinned")
+    return reasons
+
+
+def validate_baseline_csv(path: str, expected_keys: Optional[set] = None) -> dict:
+    """Validate every row of a baseline CSV and that coverage equals the workload.
+
+    Returns ``{"valid": bool, "row_errors": {key: [reasons]}, "missing_points":
+    [...], "n_rows": int}``.  A baseline is valid only if every row that belongs
+    to ``expected_keys`` passes :func:`validate_baseline_row` AND all
+    ``expected_keys`` points are present.
+
+    ``expected_keys`` defaults to the full the token-grid policy workload
+    (:func:`expected_point_keys`).  Pass a subset (e.g.
+    ``moe_tuning_spec.validated_point_keys()``) to validate the correctness-passing
+    subset independently of the quarantined a8w4 shapes.  Rows outside
+    ``expected_keys`` are ignored (neither required nor cause errors).
+    """
+    if expected_keys is None:
+        expected_keys = expected_point_keys()
+    rows = read_csv(path)
+    row_errors: Dict[str, list] = {}
+    seen = set()
+    for row in rows:
+        key = (row.get("model"), row.get("dtype"), row.get("act"), row.get("token"))
+        if key not in expected_keys:
+            continue  # quarantined / out-of-subset row: not validated here.
+        seen.add(key)
+        errs = validate_baseline_row(row)
+        if errs:
+            row_errors[str(key)] = errs
+    missing = sorted(str(k) for k in (expected_keys - seen))
+    valid = not row_errors and not missing
+    return {"valid": valid, "row_errors": row_errors, "missing_points": missing, "n_rows": len(rows)}
+
+
+# --- live measurement (runs on the gfx950 node) ----------------------------
+
+
+def check_idle_gpu(gpu_id: str, busy_pct_threshold: int = 5) -> bool:
+    """True if the GPU's utilization is below ``busy_pct_threshold`` (idle check)."""
+    out = _run(["rocm-smi", "-d", str(gpu_id), "--showuse"])
+    for line in out.splitlines():
+        m = re.search(r"GPU use \(%\)\s*:?\s*([0-9]+)", line)
+        if m:
+            return int(m.group(1)) < busy_pct_threshold
+    # If utilization could not be read, do not claim idle.
+    return False
+
+
+# Locked sclk to pin for the measurement protocol (this node's max, MHz).
+PINNED_SCLK_MHZ = 2200
+
+
+def pin_clocks(gpu_id: str, sclk_mhz: int = PINNED_SCLK_MHZ) -> bool:
+    """Enable performance determinism (pin sclk) so the recorded
+    ``clocks_pinned`` flag is truthful, not aspirational.
+
+    Returns True if determinism was enabled (rocm-smi reports success), else
+    False (e.g. the container forbids it).  DVFS auto-scaling is the dominant
+    source of small-token run-to-run jitter; pinning is the in-protocol way to
+    reduce it without changing the no-regression band.
+    """
+    out = _run(["rocm-smi", "-d", str(gpu_id), "--setperfdeterminism", str(sclk_mhz)])
+    return "performance determinism" in out.lower() and "successfully" in out.lower()
+
+
+def clocks_pinned_state(gpu_id: str) -> bool:
+    """True if the GPU performance level is a pinned/deterministic mode (not auto)."""
+    out = _run(["rocm-smi", "-d", str(gpu_id), "--showperflevel"]).lower()
+    # "determinism" or "manual"/"high" indicate a pinned level; "auto" is DVFS.
+    return ("determinism" in out) or ("manual" in out) or ("high" in out)
+
+
+def setup_run_provenance(gpu_id: str, assume_idle: bool = False, repo_ref: str = _REPO_ROOT) -> Provenance:
+    """Build the run Provenance with VERIFIED idle + clock-pinned state.
+
+    Enables performance determinism (pins sclk) and verifies it via
+    ``clocks_pinned_state``; ``Provenance.clocks_pinned`` reflects only the
+    verified state (never the static intent default).  Used by the live sweep so
+    every emitted row's clock provenance is trustworthy.
+    """
+    idle = True if assume_idle else check_idle_gpu(gpu_id)
+    pin_clocks(gpu_id)  # best-effort enable
+    pinned = clocks_pinned_state(gpu_id)  # verify the actual state
+    prov = Provenance(idle_gpu_verified=idle, clocks_pinned=pinned)
+    prov.__dict__.update(git_provenance(repo_ref))
+    prov.__dict__.update(gpu_provenance(gpu_id))
+    return prov
+
+
+def _flydsl_cmd(rp: RunPoint, gpu_id: str, tile: dict) -> List[str]:
+    """FlyDSL per-stage benchmark command for one point under the locked protocol."""
+    in_dtype = "fp4" if rp.dtype == "a4w4" else "a8w4"
+    return [
+        "python3",
+        os.path.join(_REPO_ROOT, "tests", "kernels", "test_moe_gemm.py"),
+        "--in_dtype",
+        in_dtype,
+        "-dim",
+        f"{rp.model_dim},{rp.inter_dim}",
+        "-t",
+        str(rp.token),
+        "-e",
+        str(rp.experts),
+        "-k",
+        str(rp.topk),
+        "--num_warmup",
+        str(spec.WARMUP_ITERS),
+        "--num_iters",
+        str(spec.BENCH_ITERS),
+        "--tile_m",
+        str(tile["tile_m1"]),
+        "--tile_n",
+        str(tile["tile_n1"]),
+        "--tile_k",
+        str(tile["tile_k1"]),
+        "--tile_n2",
+        str(tile["tile_n2"]),
+        "--tile_k2",
+        str(tile["tile_k2"]),
+        "--skip_ref",
+        "true",
+        "--compare_aiter_ck",
+        "false",
+    ]
+
+
+AITER_REPO = "/sgl-workspace/aiter"
+# Default gate mode per quant alias for the strict aiter guardrail.  a4w4 uses
+# SEPARATED (validated correct); a8w4 is quarantined (see moe_tuning_spec) so its
+# gate choice is recorded but never gates a win.
+DTYPE_ALIAS_TO_GATE = {"a4w4": "separated", "a8w4": "interleave"}
+
+
+def _aiter_cmd(rp: RunPoint, check_aot: bool = True) -> List[str]:
+    """Strict, AOT-checked, model-correct single-case aiter guardrail command.
+
+    Invokes ``scripts/aiter_strict_point.py`` which calls aiter ``test_fmoe`` with
+    the model's TRUE activation and gate mode, ``strict_accuracy=True``, the
+    AOT-cache-wrapped variant (``check_aot`` -> ``fail_on_aot_cache_miss``), and
+    the locked e2e protocol (warmup=10/iters=100 injected over aiter's internal
+    2/5).  This is NOT the aiter legacy CLI (which is non-strict, non-AOT, and
+    hardcodes Swiglu/INTERLEAVE for the fp8xfp4 case).
+    """
+    aq = spec.DTYPE_ALIAS_TO_A_DTYPE[rp.dtype]  # a4w4->fp4, a8w4->fp8
+    gate = DTYPE_ALIAS_TO_GATE[rp.dtype]
+    cmd = [
+        "python3",
+        os.path.join(_REPO_ROOT, "scripts", "aiter_strict_point.py"),
+        "--model-dim",
+        str(rp.model_dim),
+        "--inter-dim",
+        str(rp.inter_dim),
+        "-e",
+        str(rp.experts),
+        "-k",
+        str(rp.topk),
+        "-t",
+        str(rp.token),
+        "--aq",
+        aq,
+        "--wq",
+        "fp4",
+        "--act",
+        rp.act,
+        "--gate",
+        gate,
+        "--warmup",
+        str(spec.WARMUP_ITERS),
+        "--iters",
+        str(spec.BENCH_ITERS),
+        "--aiter-repo",
+        AITER_REPO,
+    ]
+    if not check_aot:
+        cmd.append("--no-aot")
+    return cmd
+
+
+def _exec(cmd: List[str], gpu_id: str, extra_env: Optional[dict] = None) -> str:
+    env = dict(os.environ)
+    env["HIP_VISIBLE_DEVICES"] = str(gpu_id)
+    if extra_env:
+        env.update({k: str(v) for k, v in extra_env.items()})
+    try:
+        out = subprocess.run(cmd, env=env, capture_output=True, text=True, timeout=3600)
+        return (out.stdout or "") + "\n" + (out.stderr or "")
+    except Exception as e:  # pragma: no cover - live-run only
+        return f"HARNESS_EXEC_ERROR: {e}"
+
+
+def run_point(
+    rp: RunPoint,
+    tile: dict,
+    gpu_id: str,
+    provenance: Provenance,
+    measure_e2e: bool = True,
+    reps: int = 3,
+    check_aot: bool = True,
+) -> PointRow:  # pragma: no cover - exercised only on the gfx950 node
+    """Measure one workload point: FlyDSL per-stage us + aiter e2e/correctness.
+
+    ``tile`` carries tile_m1/n1/k1 and tile_n2/k2 (stage1 + stage2 tiles).  The
+    combined kernel-path us = stage1 + stage2 + sorting; the aiter run supplies
+    the e2e guardrail us, logits_diff, and correctness pass/fail.
+
+    Median + p95 come from the TRUE timed loop inside each subprocess: the FlyDSL
+    benchmark runs with ``FLYDSL_PERF_DIST=1`` (per-iteration median+p95 over
+    ``iters``) and the strict aiter runner times fused_moe per iteration.  ``reps``
+    here is just how many independent subprocess samples to take of the median; the
+    per-point p95 is the timed-loop p95 (median of the per-rep p95 values), NOT a
+    dispersion across reps.  ``flydsl_command``, ``strict_error``,
+    ``error_category``, and ``aot_status`` are recorded for auditability.
+
+    ``check_aot`` gates the strict aiter AOT-cache check; when False the e2e still
+    runs strict+correct but does not require a pre-populated AOT cache (recorded as
+    ``aot_status="no_aot"``).  ``command`` names ONLY the commands actually executed
+    for this row: the aiter command is appended only when ``measure_e2e`` is True.
+    """
+    flydsl_cmd = _flydsl_cmd(rp, gpu_id, tile)
+    aiter_cmd = _aiter_cmd(rp, check_aot=check_aot)
+    # The FlyDSL benchmark must emit its true per-iteration distribution; the env
+    # is part of the reproducible command provenance (a replay must set it too).
+    flydsl_env = {"FLYDSL_PERF_DIST": "1"}
+    env_prefix = f"HIP_VISIBLE_DEVICES={gpu_id} FLYDSL_PERF_DIST=1 "
+    flydsl_command_str = env_prefix + " ".join(flydsl_cmd)
+    # Only name commands that actually run for this row (truthful provenance).
+    command = flydsl_command_str
+    if measure_e2e:
+        command += " ; " + f"HIP_VISIBLE_DEVICES={gpu_id} " + " ".join(aiter_cmd)
+
+    s1_samples, s2_samples, sort_samples, combined_samples = [], [], [], []
+    s1_p95s, s2_p95s = [], []
+    for _ in range(max(1, reps)):
+        out = _exec(flydsl_cmd, gpu_id, extra_env=flydsl_env)
+        stages = parse_flydsl_stage_us(out)
+        if stages["stage1_us"] is None or stages["stage2_us"] is None:
+            continue
+        srt = parse_flydsl_sorting_us(out) or 0.0
+        s1_samples.append(stages["stage1_us"])
+        s2_samples.append(stages["stage2_us"])
+        sort_samples.append(srt)
+        combined_samples.append(combined_kernel_path_us(stages["stage1_us"], stages["stage2_us"], srt))
+        if stages["stage1_p95"] is not None:
+            s1_p95s.append(stages["stage1_p95"])
+        if stages["stage2_p95"] is not None:
+            s2_p95s.append(stages["stage2_p95"])
+
+    e2e_samples, e2e_p95s, logits_samples, correctness = [], [], [], None
+    strict_error, error_category, aot_status = "", "", ""
+    if measure_e2e:
+        for _ in range(max(1, reps)):
+            res = parse_strict_aiter_output(_exec(aiter_cmd, gpu_id))
+            if res["e2e_us"] is not None:
+                e2e_samples.append(res["e2e_us"])
+            if res.get("e2e_us_p95") is not None:
+                e2e_p95s.append(res["e2e_us_p95"])
+            if res["logits_diff"] is not None:
+                logits_samples.append(res["logits_diff"])
+            rep_ok = res["correctness_pass"]
+            correctness = rep_ok if correctness is None else (correctness and bool(rep_ok))
+            # keep the last rep's failure provenance (representative).
+            strict_error = res.get("error", "") or strict_error
+            error_category = res.get("error_category", "") or error_category
+            aot_status = res.get("aot_status", "") or aot_status
+
+    row = PointRow(
+        provenance=provenance,
+        command=command,
+        model=rp.model,
+        model_dim=rp.model_dim,
+        inter_dim=rp.inter_dim,
+        experts=rp.experts,
+        topk=rp.topk,
+        dtype=rp.dtype,
+        act=rp.act,
+        token=rp.token,
+        tile_m1=tile["tile_m1"],
+        tile_n1=tile["tile_n1"],
+        tile_k1=tile["tile_k1"],
+        tile_m2=tile["tile_m1"],
+        tile_n2=tile["tile_n2"],
+        tile_k2=tile["tile_k2"],
+        flydsl_command=flydsl_command_str,
+        strict_error=strict_error,
+        error_category=error_category,
+        aot_status=aot_status,
+    )
+    if combined_samples:
+        row.stage1_us = summarize(s1_samples)["median"]
+        row.stage2_us = summarize(s2_samples)["median"]
+        row.sorting_us = summarize(sort_samples)["median"]
+        row.kernel_path_us = summarize(combined_samples)["median"]
+        # p95 is the timed-loop p95 (median across the per-rep timed-loop p95s);
+        # fall back to the across-rep combined p95 only if the timed-loop p95 is
+        # unavailable.
+        if s1_p95s and s2_p95s:
+            row.kernel_path_us_p95 = (
+                summarize(s1_p95s)["median"] + summarize(s2_p95s)["median"] + summarize(sort_samples)["median"]
+            )
+        else:
+            row.kernel_path_us_p95 = summarize(combined_samples)["p95"]
+        m = compute_metrics(
+            token=rp.token, model_dim=rp.model_dim, inter_dim=rp.inter_dim, topk=rp.topk, combined_us=row.kernel_path_us
+        )
+        row.effective_tflops = m["effective_tflops"]
+        row.mfu = m["mfu"]
+    if e2e_samples:
+        row.e2e_us = summarize(e2e_samples)["median"]
+        row.e2e_us_p95 = summarize(e2e_p95s)["median"] if e2e_p95s else summarize(e2e_samples)["p95"]
+    if logits_samples:
+        row.logits_diff = max(logits_samples)  # worst-case correctness across reps
+    row.correctness_pass = correctness
+    return row
+
+
+def row_missing_kernel_path(row: "PointRow") -> bool:
+    """True if a measured row has no parseable kernel-path timing.
+
+    The FlyDSL benchmark emits no stage times for some tile shapes (e.g. the
+    tile_k1!=256 / tile_n1=512 harness limitation): the subprocess returns but
+    ``parse_flydsl_stage_us`` finds nothing, so the row's stage/kernel-path fields
+    stay ``None``.  Such a row is NOT a measurement and must never be recorded as a
+    ``loss`` -- candidate mode treats it as a fail-closed rejected measurement.
+    """
+    return row.stage1_us is None or row.stage2_us is None or row.kernel_path_us is None
+
+
+# Default (baseline) tile config per shape: matches scripts/run_benchmark.sh.
+def default_tile_for(rp: RunPoint) -> dict:  # pragma: no cover - simple table
+    if rp.model_dim == 3072:  # GPT-OSS
+        return {"tile_m1": 32, "tile_n1": 128, "tile_k1": 256, "tile_n2": 256, "tile_k2": 256}
+    return {"tile_m1": 64, "tile_n1": 256, "tile_k1": 256, "tile_n2": 256, "tile_k2": 256}
+
+
+def _main(argv: Optional[List[str]] = None) -> int:  # pragma: no cover - CLI/live
+    import argparse
+
+    ap = argparse.ArgumentParser(description="MXFP4 MoE tuning measurement harness (gfx950)")
+    ap.add_argument("mode", choices=["baseline", "candidate", "validate", "list"])
+    ap.add_argument("--gpu", default=os.environ.get("GPU", "0"), help="GPU id (HIP_VISIBLE_DEVICES)")
+    ap.add_argument("--out", default="", help="output CSV path")
+    ap.add_argument("--csv", default="", help="CSV to validate (validate mode)")
+    ap.add_argument("--no-e2e", action="store_true", help="skip the aiter e2e/correctness run")
+    ap.add_argument(
+        "--no-aot-check",
+        action="store_true",
+        help="run e2e strict+correct but do not require a pre-populated AOT cache (records aot_status=no_aot)",
+    )
+    ap.add_argument("--assume-idle", action="store_true", help="skip the live idle-GPU probe")
+    ap.add_argument(
+        "--allow-unpinned",
+        action="store_true",
+        help="proceed (recording clocks_pinned=False) even if clock pinning cannot be verified",
+    )
+    # Candidate-mode selection + explicit tile overrides (reproducible sweeps).
+    ap.add_argument("--model", default=None, help="restrict to one model (candidate mode)")
+    ap.add_argument("--dtype", default=None, help="restrict to one dtype alias, e.g. a4w4 (candidate mode)")
+    ap.add_argument("--tokens", default=None, help="comma/space-separated token list (candidate mode)")
+    ap.add_argument("--reps", type=int, default=3, help="independent subprocess reps per point")
+    for _k in ("tile-m1", "tile-n1", "tile-k1", "tile-n2", "tile-k2"):
+        ap.add_argument(f"--{_k}", type=int, default=None, help=f"candidate {_k.replace('-', '_')} override")
+    args = ap.parse_args(argv)
+
+    if args.mode == "list":
+        for rp in build_run_list():
+            print(rp)
+        return 0
+
+    if args.mode == "validate":
+        res = validate_baseline_csv(args.csv)
+        print(json.dumps(res, indent=2))
+        return 0 if res["valid"] else 1
+
+    prov = setup_run_provenance(args.gpu, assume_idle=args.assume_idle)
+    print(f"clocks_pinned (verified)={prov.clocks_pinned} idle_gpu_verified={prov.idle_gpu_verified}")
+    # The locked protocol requires fixed clocks: if verification failed, do not
+    # emit a baseline that falsely claims pinned clocks.
+    if spec.CLOCKS_PINNED and not prov.clocks_pinned and not args.allow_unpinned:
+        print(
+            "ERROR: locked protocol requires pinned clocks but verification failed; "
+            "the run would be non-comparable. Re-run with the GPU clocks pinnable, "
+            "or pass --allow-unpinned to record clocks_pinned=False explicitly.",
+            file=sys.stderr,
+        )
+        return 2
+
+    overrides = {
+        "tile_m1": args.tile_m1,
+        "tile_n1": args.tile_n1,
+        "tile_k1": args.tile_k1,
+        "tile_n2": args.tile_n2,
+        "tile_k2": args.tile_k2,
+    }
+
+    if args.mode == "candidate":
+        toks = [int(t) for t in args.tokens.replace(",", " ").split()] if args.tokens else None
+        top_command = "python3 " + shlex.join([os.path.relpath(__file__, _REPO_ROOT), *(argv or sys.argv[1:])])
+        try:
+            run_list, tiles = prepare_candidate_run(
+                overrides, model=args.model, dtype=args.dtype, tokens=toks, prov=prov, command=top_command
+            )
+        except ValueError as e:
+            # Fail closed: do not write a partial CSV; rejection already recorded.
+            print(f"ERROR: candidate run rejected: {e}", file=sys.stderr)
+            return 2
+        rows = [
+            run_point(
+                rp,
+                tiles[i],
+                args.gpu,
+                prov,
+                measure_e2e=not args.no_e2e,
+                reps=args.reps,
+                check_aot=not args.no_aot_check,
+            )
+            for i, rp in enumerate(run_list)
+        ]
+        # Fail closed on unmeasured rows: a missing kernel-path row is NOT a loss.
+        import moe_tuning_ledger as _ledger
+
+        bad = [(rp, tiles[i], r) for i, (rp, r) in enumerate(zip(run_list, rows)) if row_missing_kernel_path(r)]
+        if bad:
+            for rp, tile, r in bad:
+                _ledger.append_rejected_candidate(
+                    {
+                        "model": rp.model,
+                        "dtype": rp.dtype,
+                        "act": rp.act,
+                        "token": rp.token,
+                        "stage": 1,
+                        "config": {k: tile.get(k) for k in ("tile_m1", "tile_n1", "tile_k1", "tile_n2", "tile_k2")},
+                        "reason": "no parseable kernel-path stage times emitted (unmeasured shape; e.g. "
+                        "tile_k1!=256 / tile_n1=512 harness limitation)",
+                        "selection": {"model": args.model, "dtype": args.dtype, "tokens": toks},
+                        "gpu_id": prov.gpu_id,
+                        "gpu_model": prov.gpu_model,
+                        "branch": prov.branch,
+                        "commit": prov.commit,
+                        "command": top_command,
+                        "warmup": prov.warmup,
+                        "iters": prov.iters,
+                        "csv_path": "",
+                        "profile_path": "",
+                    }
+                )
+            print(
+                f"ERROR: {len(bad)} candidate point(s) produced no kernel-path measurement; "
+                "recorded as rejected measurements, no CSV written.",
+                file=sys.stderr,
+            )
+            return 2
+    else:  # baseline: full grid, default tiles
+        run_list = build_run_list()
+        rows = [
+            run_point(
+                rp,
+                default_tile_for(rp),
+                args.gpu,
+                prov,
+                measure_e2e=not args.no_e2e,
+                reps=args.reps,
+                check_aot=not args.no_aot_check,
+            )
+            for rp in run_list
+        ]
+
+    out = args.out or f"/tmp/moe_{args.mode}.csv"
+    write_csv(rows, out)
+    print(f"wrote {len(rows)} rows -> {out}")
+    return 0
+
+
+__all__ = [
+    "CSV_COLUMNS",
+    "METRIC_FORMULA",
+    "LOCKED_BASELINE_COMMIT",
+    "Provenance",
+    "PointRow",
+    "RunPoint",
+    "parse_flydsl_stage_us",
+    "parse_flydsl_sorting_us",
+    "parse_aiter_output",
+    "parse_strict_aiter_output",
+    "combined_kernel_path_us",
+    "summarize",
+    "compute_metrics",
+    "git_provenance",
+    "gpu_provenance",
+    "check_idle_gpu",
+    "pin_clocks",
+    "clocks_pinned_state",
+    "setup_run_provenance",
+    "build_run_list",
+    "expected_point_keys",
+    "select_run_points",
+    "candidate_tile_for",
+    "prepare_candidate_run",
+    "default_tile_for",
+    "validate_baseline_row",
+    "validate_baseline_csv",
+    "run_point",
+    "row_missing_kernel_path",
+    "write_csv",
+    "read_csv",
+]
+
+
+if __name__ == "__main__":  # pragma: no cover
+    raise SystemExit(_main())
diff --git a/scripts/moe_tuning_ledger.py b/scripts/moe_tuning_ledger.py
new file mode 100644
index 000000000..80629cf0b
--- /dev/null
+++ b/scripts/moe_tuning_ledger.py
@@ -0,0 +1,501 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: MIT
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
+
+"""Attempt ledger + Pareto comparison for the MXFP4 MoE tuning campaign.
+
+Every candidate attempt — win or loss — is appended to ``docs/attempts.jsonl``
+with full provenance (config, stage, model, dtype, act, GPU id+model,
+branch+commit, command, warmup/iters, CSV/profile path, result).  A human-facing
+running log lives in ``docs/optimization-ledger.md``.
+
+The Pareto comparison takes a baseline per-point CSV and a candidate per-point
+CSV (both emitted by ``scripts/moe_tuning_harness.py``) and reports, per point,
+whether the candidate is a win / regression / neutral under the locked the win-margin policy /
+the no-regression policy predicates.  A win is only claimable when no point regresses on either the
+kernel-path or e2e metric (no Pareto regression) and the re-run-stability rule
+holds.
+"""
+
+from __future__ import annotations
+
+import csv
+import json
+import os
+import sys
+import time
+from dataclasses import asdict, dataclass, field
+from typing import Dict, List, Optional, Tuple
+
+_REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+if _REPO_ROOT not in sys.path:
+    sys.path.insert(0, _REPO_ROOT)
+
+from kernels import moe_tuning_spec as spec  # noqa: E402
+
+ATTEMPTS_JSONL = os.path.join(_REPO_ROOT, "docs", "attempts.jsonl")
+LEDGER_MD = os.path.join(_REPO_ROOT, "docs", "optimization-ledger.md")
+
+# Required provenance keys for any ledger attempt (the ledger contract).
+REQUIRED_ATTEMPT_FIELDS = (
+    "config",
+    "stage",
+    "model",
+    "dtype",
+    "act",
+    "gpu_id",
+    "gpu_model",
+    "branch",
+    "commit",
+    "command",
+    "warmup",
+    "iters",
+    "result",
+)
+
+# A rejected search candidate never reaches compile/GPU, so it has no measured
+# metrics (csv_path/profile_path stay empty), but it MUST still carry the same
+# identity + run-provenance class as a measured attempt so the rejection is
+# auditable (the rejected-candidate ledger contract).  ``stage`` is 0 when the
+# rejection is at the candidate-tile level spanning both stages; the reason
+# string still names the offending stage.  ``selection`` records the run's
+# model/dtype/tokens filter so the rejection is reproducible.
+REQUIRED_REJECTED_FIELDS = (
+    "model",
+    "dtype",
+    "act",
+    "token",
+    "stage",
+    "config",
+    "reason",
+    "selection",
+    "gpu_id",
+    "gpu_model",
+    "branch",
+    "commit",
+    "command",
+    "warmup",
+    "iters",
+)
+
+# Keys that must be PRESENT on a rejected record but may legitimately be empty
+# strings: a pre-compile rejection produces no measured CSV/profile artifact, yet
+# the keys must exist so the record schema matches a measured attempt.
+REQUIRED_REJECTED_PRESENT_KEYS = (
+    "csv_path",
+    "profile_path",
+)
+
+
+@dataclass
+class Attempt:
+    """One tuning attempt record (win or loss)."""
+
+    config: dict
+    stage: int
+    model: str
+    dtype: str
+    act: str
+    gpu_id: str
+    gpu_model: str
+    branch: str
+    commit: str
+    command: str
+    warmup: int
+    iters: int
+    result: str  # "win" | "loss" | "rejected" | "neutral"
+    csv_path: str = ""
+    profile_path: str = ""
+    note: str = ""
+    timestamp: Optional[float] = None
+
+    def missing_fields(self) -> List[str]:
+        return [f for f in REQUIRED_ATTEMPT_FIELDS if getattr(self, f, None) in ("", None)]
+
+
+def append_attempt(attempt: Attempt, path: str = ATTEMPTS_JSONL, now: Optional[float] = None) -> dict:
+    """Append an attempt to the JSONL ledger.
+
+    Raises ``ValueError`` if any required provenance field is missing, so a win
+    can never be recorded without complete provenance (the ledger contract negative gate).
+    """
+    missing = attempt.missing_fields()
+    if missing:
+        raise ValueError(f"attempt missing required provenance fields: {missing}")
+    rec = asdict(attempt)
+    rec["timestamp"] = now if now is not None else time.time()
+    os.makedirs(os.path.dirname(os.path.abspath(path)), exist_ok=True)
+    with open(path, "a") as f:
+        f.write(json.dumps(rec, sort_keys=True) + "\n")
+    return rec
+
+
+def append_rejected_candidate(record: dict, path: str = ATTEMPTS_JSONL, now: float = None) -> dict:
+    """Append a machine-readable rejected-candidate record to the JSONL ledger.
+
+    ``record`` must carry the full provenance class (``REQUIRED_REJECTED_FIELDS``)
+    so a rejected search candidate is as auditable as a measured attempt — even
+    though it never reached compile/GPU.  The measured-artifact keys
+    (``REQUIRED_REJECTED_PRESENT_KEYS``: ``csv_path``/``profile_path``) must be
+    present but may be empty strings (no artifact exists pre-compile).  Raises
+    ``ValueError`` if any required field is missing, so an incomplete rejection can
+    never be recorded (the rejected-candidate contract negative gate).
+    """
+    # Treat only None / "" as missing — integer 0 (stage, warmup, iters) is valid.
+    missing = [k for k in REQUIRED_REJECTED_FIELDS if record.get(k) in (None, "")]
+    # Artifact keys must EXIST (empty string allowed); only a truly absent key fails.
+    missing += [k for k in REQUIRED_REJECTED_PRESENT_KEYS if k not in record]
+    if missing:
+        raise ValueError(f"rejected-candidate record missing fields: {missing}")
+    # selection must be a non-empty dict so the rejection's run filter is recorded.
+    sel = record.get("selection")
+    if not isinstance(sel, dict) or not sel:
+        raise ValueError("rejected-candidate record 'selection' must be a non-empty dict")
+    rec = {"result": "rejected_candidate", **record}
+    rec["timestamp"] = now if now is not None else time.time()
+    os.makedirs(os.path.dirname(os.path.abspath(path)), exist_ok=True)
+    with open(path, "a") as f:
+        f.write(json.dumps(rec, sort_keys=True) + "\n")
+    return rec
+
+
+def read_point_csv(path: str) -> Dict[Tuple, dict]:
+    """Read a per-point harness CSV keyed by (model, dtype, token, stage tiles).
+
+    The key is (model, dtype, act, token) — the comparison axis between baseline
+    and candidate at one shape/token point.
+    """
+    table: Dict[Tuple, dict] = {}
+    with open(path, newline="") as f:
+        for row in csv.DictReader(f):
+            key = (row.get("model"), row.get("dtype"), row.get("act"), row.get("token"))
+            table[key] = row
+    return table
+
+
+def _f(row: dict, col: str) -> Optional[float]:
+    v = row.get(col)
+    if v in (None, "", "None"):
+        return None
+    try:
+        return float(v)
+    except (TypeError, ValueError):
+        return None
+
+
+@dataclass
+class PointVerdict:
+    key: Tuple
+    token: int
+    kernel_path_regression: bool = False
+    e2e_regression: bool = False
+    large_shape_win: bool = False
+    small_token_win: bool = False
+    note: str = ""
+
+
+def compare_point(baseline: dict, candidate: dict) -> PointVerdict:
+    """Apply the win-margin policy / the no-regression policy predicates to one (baseline, candidate) point pair."""
+    token = int(float(candidate.get("token") or baseline.get("token") or 0))
+    key = (candidate.get("model"), candidate.get("dtype"), candidate.get("act"), candidate.get("token"))
+    v = PointVerdict(key=key, token=token)
+
+    b_kp, c_kp = _f(baseline, "kernel_path_us"), _f(candidate, "kernel_path_us")
+    b_e2e, c_e2e = _f(baseline, "e2e_us"), _f(candidate, "e2e_us")
+    b_mfu, c_mfu = _f(baseline, "mfu"), _f(candidate, "mfu")
+
+    if b_kp is not None and c_kp is not None:
+        v.kernel_path_regression = spec.is_regression(b_kp, c_kp, token=token)
+    if b_e2e is not None and c_e2e is not None:
+        v.e2e_regression = spec.is_regression(b_e2e, c_e2e, token=token)
+
+    if spec.is_large_token(token) and token in spec.MFU_TARGET_BUCKETS:
+        if b_mfu is not None and c_mfu is not None:
+            v.large_shape_win = spec.is_large_shape_win(b_mfu, c_mfu)
+    if spec.is_small_token(token):
+        if b_kp is not None and c_kp is not None:
+            v.small_token_win = spec.is_small_token_win(b_kp, c_kp)
+    return v
+
+
+def _required_fields_for_point(token: int) -> Tuple[str, ...]:
+    """Comparison fields a candidate row must carry for its token regime.
+
+    Every point needs both latency metrics; large target buckets additionally
+    need ``mfu`` (the large-shape win/regression axis).
+    """
+    fields = ["kernel_path_us", "e2e_us"]
+    if spec.is_large_token(token) and token in spec.MFU_TARGET_BUCKETS:
+        fields.append("mfu")
+    return tuple(fields)
+
+
+def _row_missing_fields(row: dict, fields: Tuple[str, ...]) -> List[str]:
+    return [f for f in fields if _f(row, f) is None]
+
+
+@dataclass
+class CampaignVerdict:
+    points: List[PointVerdict] = field(default_factory=list)
+    any_regression: bool = False
+    large_wins: List[Tuple] = field(default_factory=list)
+    small_wins: List[Tuple] = field(default_factory=list)
+    missing_candidate_points: List[Tuple] = field(default_factory=list)
+    incomplete_points: List[Tuple] = field(default_factory=list)
+    # Strict correctness + AOT-cache hard gate over the candidate CSV
+    # (``selected_candidate_gate`` output).  Populated by ``compare_csvs``; a
+    # candidate that fails this gate (e.g. ``aot_status=no_aot``) can never be a
+    # claimable win even if its metrics look winning.
+    gate: dict = field(default_factory=lambda: {"passed": False, "n_rows": 0, "violations": []})
+
+    @property
+    def coverage_complete(self) -> bool:
+        """True only if every baseline point has a candidate row with all the
+        regime-required comparison fields present (no cherry-picking)."""
+        return not self.missing_candidate_points and not self.incomplete_points
+
+    @property
+    def pareto_clean(self) -> bool:
+        """True only if coverage is complete AND no point regressed on kernel-path
+        or e2e.  Incomplete/cherry-picked candidate CSVs can never be clean."""
+        return self.coverage_complete and not self.any_regression
+
+    @property
+    def claimable_win(self) -> bool:
+        """The SINGLE source of truth for whether a candidate may be promoted to a
+        win.  True only when ALL hold:
+        - ``pareto_clean`` (full coverage + no kernel-path/e2e regression),
+        - at least one target-bucket or small-token win is present, and
+        - the strict correctness + AOT-cache hard gate passed
+          (``aot_status=checked`` + correctness + ``logits_diff<=0.01`` on every
+          row) -- so a ``no_aot`` / failed-correctness candidate is never claimable
+          regardless of how good its metrics look.
+        Re-run stability is enforced separately by re-running and re-comparing."""
+        return self.pareto_clean and bool(self.large_wins or self.small_wins) and bool(self.gate.get("passed"))
+
+
+def compare_csvs(baseline_csv: str, candidate_csv: str) -> CampaignVerdict:
+    """Full per-point Pareto comparison of a candidate vs the locked baseline.
+
+    Iterates the COMPLETE baseline key set so a candidate cannot pass by omitting
+    a regressing/uncovered point.  A point with a missing candidate row, or whose
+    candidate row lacks a regime-required field (kernel_path_us/e2e_us for every
+    point; mfu for large target buckets), makes ``coverage_complete`` False, which
+    forces ``pareto_clean`` False.
+
+    The candidate is run through ``selected_candidate_gate`` and the result is
+    stored on the verdict.  ``CampaignVerdict.claimable_win`` is the single source
+    of truth for promotability: it requires ``pareto_clean`` + at least one win +
+    the gate (``aot_status=checked`` + correctness + ``logits_diff<=0.01``).  Do
+    NOT promote a candidate from ``pareto_clean`` + win lists alone -- a ``no_aot``
+    candidate can be pareto_clean with wins yet must not be claimable.
+    """
+    base = read_point_csv(baseline_csv)
+    cand = read_point_csv(candidate_csv)
+    cv = CampaignVerdict()
+    cv.gate = selected_candidate_gate(candidate_csv)
+    for key, b_row in base.items():
+        token = int(float(b_row.get("token") or 0))
+        c_row = cand.get(key)
+        if c_row is None:
+            cv.missing_candidate_points.append(key)
+            cv.points.append(PointVerdict(key=key, token=token, note="missing_candidate_point"))
+            continue
+        missing = _row_missing_fields(c_row, _required_fields_for_point(token))
+        if missing:
+            cv.incomplete_points.append(key)
+            cv.points.append(PointVerdict(key=key, token=token, note="missing_fields:" + ",".join(missing)))
+            continue
+        pv = compare_point(b_row, c_row)
+        cv.points.append(pv)
+        if pv.kernel_path_regression or pv.e2e_regression:
+            cv.any_regression = True
+        if pv.large_shape_win:
+            cv.large_wins.append(key)
+        if pv.small_token_win:
+            cv.small_wins.append(key)
+    return cv
+
+
+def selected_candidate_gate(candidate_csv: str, max_logits_diff: float = 0.01) -> dict:
+    """Hard gate a candidate CSV before it can be promoted to a win.
+
+    A selected candidate must clear the strict correctness + AOT-cache hard gate on
+    EVERY row: ``aot_status == "checked"`` (the strict aiter run required a
+    pre-populated AOT cache, not the ``no_aot`` repeatability/diagnostic bypass),
+    ``correctness_pass`` is true, and ``logits_diff <= max_logits_diff``.  Rows
+    measured with ``--no-aot-check`` (``aot_status == "no_aot"``) are valid for
+    NEUTRAL repeatability/diagnostic artifacts but can never be promoted to a win,
+    so they fail this gate.
+
+    Returns ``{"passed": bool, "n_rows": int, "violations": [(key, reason), ...]}``.
+    ``passed`` is False if there are zero rows (nothing to promote) or any violation.
+    """
+    rows = read_point_csv(candidate_csv)
+    violations: List[Tuple] = []
+    for key, row in rows.items():
+        aot = (row.get("aot_status") or "").strip()
+        if aot != "checked":
+            violations.append((key, f"aot_status={aot or 'missing'} (need 'checked')"))
+        cp = (row.get("correctness_pass") or "").strip().lower()
+        if cp not in ("true", "1"):
+            violations.append((key, f"correctness_pass={row.get('correctness_pass')!r} (need True)"))
+        ld = _f(row, "logits_diff")
+        if ld is None:
+            violations.append((key, "logits_diff missing"))
+        elif ld > max_logits_diff:
+            violations.append((key, f"logits_diff={ld} > {max_logits_diff}"))
+    return {"passed": bool(rows) and not violations, "n_rows": len(rows), "violations": violations}
+
+
+def repeatability_check(csv_a: str, csv_b: str) -> dict:
+    """Compare two independent sweeps of the SAME config under the no-regression policy.
+
+    For each shared (model, dtype, act, token) point, a metric is "stable" if the
+    two runs agree within the no-regression noise band (NOT a regression in either
+    direction): ``|b - a| <= max(a*REGRESSION_REL, abs_floor_us(token))``, where
+    the absolute floor is regime-aware (8 us for tokens <= SMALL_TOKEN_MAX, 2 us
+    otherwise).  Returns the set of unstable points per metric; an empty unstable
+    set demonstrates the harness is repeatable (the measurement protocol).
+    """
+    a = read_point_csv(csv_a)
+    b = read_point_csv(csv_b)
+    shared = sorted(set(a) & set(b))
+    unstable = {"kernel_path_us": [], "e2e_us": []}
+
+    def band(x, token):
+        return max(abs(x) * spec.REGRESSION_REL, spec.abs_floor_us(token))
+
+    for key in shared:
+        token = int(float(a[key].get("token") or 0))
+        for metric in ("kernel_path_us", "e2e_us"):
+            va, vb = _f(a[key], metric), _f(b[key], metric)
+            if va is None or vb is None:
+                unstable[metric].append((key, "missing"))
+            elif abs(vb - va) > band(va, token):
+                unstable[metric].append((key, va, vb))
+    return {
+        "n_shared": len(shared),
+        "unstable": unstable,
+        "stable": not unstable["kernel_path_us"] and not unstable["e2e_us"],
+    }
+
+
+def scan_replay_consistency(path: str = ATTEMPTS_JSONL) -> List[Tuple]:
+    """Find committed attempts whose ``csv_path`` lists files the ``command`` cannot replay.
+
+    A multi-file attempt (``csv_path`` = ``a.csv;b.csv``) must name EVERY listed
+    file in its ``command`` string, so the attempt is replayable end-to-end from
+    the ledger alone (no brace shorthand like ``run{1,2}.csv``, no required step
+    hidden behind a ``#`` comment).  Superseded records are skipped.  Returns a
+    list of ``(timestamp, [missing files])`` for offending records (empty == clean).
+    """
+    if not os.path.exists(path):
+        return []
+    offenders: List[Tuple] = []
+    with open(path) as f:
+        for ln in f:
+            ln = ln.strip()
+            if not ln:
+                continue
+            rec = json.loads(ln)
+            if "superseded_by" in rec:
+                continue
+            csv_path = rec.get("csv_path") or ""
+            files = [p for p in csv_path.split(";") if p.strip()]
+            if len(files) < 2:
+                continue  # single/no file: nothing multi-file to reconcile
+            command = rec.get("command") or ""
+            # Strip anything after a '#' on each segment: a required step hidden in
+            # a comment is not actually replayed by a shell.
+            replayable = " ".join(seg.split("#", 1)[0] for seg in command.splitlines())
+            missing = [fp for fp in files if fp not in replayable]
+            if missing:
+                offenders.append((rec.get("timestamp"), missing))
+    return offenders
+
+
+def _rejected_key(rec: dict) -> Tuple:
+    """Identity of a rejected probe: model/dtype/token/act + the tile config.
+    Used to detect duplicate non-superseded rejection records for the same probe."""
+    cfg = rec.get("config") or {}
+    cfg_key = tuple(sorted((str(k), str(v)) for k, v in cfg.items()))
+    return (rec.get("model"), rec.get("dtype"), rec.get("act"), rec.get("token"), cfg_key)
+
+
+def scan_duplicate_rejected_candidates(path: str = ATTEMPTS_JSONL) -> List[Tuple]:
+    """Find probes with more than one ACTIVE (non-superseded) rejected record.
+
+    Two ledger entries that reject the same (model,dtype,act,token,config) probe
+    are a provenance defect -- there must be exactly one active reason per probe
+    (older duplicates must be marked ``superseded_by``).  Returns a list of
+    ``(key, [timestamps])`` for probes with >1 active record (empty == clean).
+    """
+    if not os.path.exists(path):
+        return []
+    seen: Dict[Tuple, List] = {}
+    with open(path) as f:
+        for ln in f:
+            ln = ln.strip()
+            if not ln:
+                continue
+            rec = json.loads(ln)
+            if rec.get("result") != "rejected_candidate" or "superseded_by" in rec:
+                continue
+            seen.setdefault(_rejected_key(rec), []).append(rec.get("timestamp"))
+    return [(k, ts) for k, ts in seen.items() if len(ts) > 1]
+
+
+def scan_superseded_rejected_candidates(path: str = ATTEMPTS_JSONL) -> List[Tuple]:
+    """Find superseded rejected records that do NOT link to a matching successor.
+
+    Every ``rejected_candidate`` carrying ``superseded_by`` must point at the
+    timestamp of an EXISTING active (non-superseded) rejected record for the SAME
+    rejected key ``(model,dtype,act,token,config)``.  A supersede link to a
+    different probe's record (or to no record) is an evidence-integrity defect:
+    ``scan_duplicate_rejected_candidates`` only proves one active record per key, it
+    does not prove the superseded chain points to the correct successor.  Returns a
+    list of ``(timestamp, reason)`` for offending records (empty == clean).
+    """
+    if not os.path.exists(path):
+        return []
+    active_ts_by_key: Dict[Tuple, set] = {}
+    superseded: List[dict] = []
+    with open(path) as f:
+        for ln in f:
+            ln = ln.strip()
+            if not ln:
+                continue
+            rec = json.loads(ln)
+            if rec.get("result") != "rejected_candidate":
+                continue
+            if "superseded_by" in rec:
+                superseded.append(rec)
+            else:
+                active_ts_by_key.setdefault(_rejected_key(rec), set()).add(rec.get("timestamp"))
+    offenders: List[Tuple] = []
+    for rec in superseded:
+        key = _rejected_key(rec)
+        target = rec.get("superseded_by")
+        if target not in active_ts_by_key.get(key, set()):
+            offenders.append((rec.get("timestamp"), f"superseded_by={target} is not an active record of the same key"))
+    return offenders
+
+
+__all__ = [
+    "ATTEMPTS_JSONL",
+    "LEDGER_MD",
+    "REQUIRED_ATTEMPT_FIELDS",
+    "Attempt",
+    "append_attempt",
+    "read_point_csv",
+    "compare_point",
+    "compare_csvs",
+    "selected_candidate_gate",
+    "scan_replay_consistency",
+    "scan_duplicate_rejected_candidates",
+    "scan_superseded_rejected_candidates",
+    "repeatability_check",
+    "PointVerdict",
+    "CampaignVerdict",
+]
diff --git a/scripts/sync_aiter_flydsl_kernels.sh b/scripts/sync_aiter_flydsl_kernels.sh
new file mode 100755
index 000000000..d6eedcc85
--- /dev/null
+++ b/scripts/sync_aiter_flydsl_kernels.sh
@@ -0,0 +1,64 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: MIT
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
+#
+# Sync aiter's vendored FlyDSL MoE kernels with this FlyDSL checkout so the aiter
+# fused-MoE e2e + strict-correctness guardrail (op_tests/test_moe_2stage.py) runs
+# against the SAME kernel sources we tune here.
+#
+# Why this is needed: aiter pins `flydsl==0.1.8` and ships its own (older) vendored
+# copies under aiter/ops/flydsl/kernels/.  Against the installed FlyDSL compiler
+# (0.2.x) those stale copies crash during MLIR emission BEFORE producing any number
+# (`'Int32' object has no attribute 'type'`, then `arith.extsi i64->i32 cast
+# incompatible`).  Overlaying the current FlyDSL kernel sources resolves the skew;
+# the e2e path then produces real us + logits_diff and the strict correctness gate
+# (`logits_diff <= 0.01`) can be applied.  This is an aiter-environment integration
+# step, not a change to the FlyDSL kernels themselves.
+#
+# Idempotent.  Backs up the originals once to <aiter>/ops/flydsl/kernels/.orig_bak/.
+# Usage:  bash scripts/sync_aiter_flydsl_kernels.sh [AITER_REPO]
+set -euo pipefail
+
+REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+AITER_REPO="${1:-/sgl-workspace/aiter}"
+SRC="${REPO_ROOT}/kernels"
+DST="${AITER_REPO}/aiter/ops/flydsl/kernels"
+BAK="${DST}/.orig_bak"
+
+if [[ ! -d "${DST}" ]]; then
+  echo "ERROR: aiter vendored kernel dir not found: ${DST}" >&2
+  exit 1
+fi
+
+# The MoE 2-stage kernel and its sibling deps imported via `from .<name>`.
+FILES=(
+  mixed_moe_gemm_2stage.py
+  moe_gemm_2stage.py
+  moe_common.py
+  mfma_epilogues.py
+  mfma_preshuffle_pipeline.py
+  layout_utils.py
+)
+
+mkdir -p "${BAK}"
+for f in "${FILES[@]}"; do
+  if [[ ! -f "${SRC}/${f}" ]]; then
+    echo "ERROR: missing FlyDSL source: ${SRC}/${f}" >&2
+    exit 1
+  fi
+  # Back up the original aiter copy once.
+  if [[ -f "${DST}/${f}" && ! -f "${BAK}/${f}" ]]; then
+    cp "${DST}/${f}" "${BAK}/${f}"
+  fi
+  cp "${SRC}/${f}" "${DST}/${f}"
+  echo "synced ${f}"
+done
+
+# Clear the aiter FlyDSL JIT cache so stale compiled artifacts are not reused.
+CACHE="${AITER_REPO}/aiter/jit/flydsl_cache"
+if [[ -d "${CACHE}" ]]; then
+  rm -rf "${CACHE:?}/"* 2>/dev/null || true
+  echo "cleared aiter flydsl JIT cache: ${CACHE}"
+fi
+
+echo "done: aiter vendored FlyDSL MoE kernels synced from ${SRC}"
diff --git a/tests/unit/test_moe_tuning_harness.py b/tests/unit/test_moe_tuning_harness.py
new file mode 100644
index 000000000..d16c0fca3
--- /dev/null
+++ b/tests/unit/test_moe_tuning_harness.py
@@ -0,0 +1,1584 @@
+# SPDX-License-Identifier: MIT
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
+
+"""Backend-agnostic tests for the MXFP4 MoE tuning harness, spec, and ledger.
+
+These exercise the pure host-side logic (decision predicates, stage-us parsing,
+metric computation, provenance gating, attempt-ledger validation, and per-point
+Pareto comparison) with no GPU and no compile.
+"""
+
+import os
+import sys
+
+import pytest
+
+_REPO_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+_SCRIPTS = os.path.join(_REPO_ROOT, "scripts")
+for p in (_REPO_ROOT, _SCRIPTS):
+    if p not in sys.path:
+        sys.path.insert(0, p)
+
+import moe_tuning_harness as harness  # noqa: E402
+import moe_tuning_ledger as ledger  # noqa: E402
+
+from kernels import moe_tuning_spec as spec  # noqa: E402
+
+pytestmark = pytest.mark.l0_backend_agnostic
+
+
+# --- spec: locked values + predicates --------------------------------------
+
+
+def test_locked_constants():
+    assert spec.FP4_PEAK_TFLOPS == 4523.0
+    assert spec.WIN_MARGIN == 0.10
+    assert spec.REGRESSION_REL == 0.02
+    assert spec.ABS_US_BAND == 2.0
+    assert spec.WARMUP_ITERS == 10
+    assert spec.BENCH_ITERS == 100
+    assert spec.MFU_TARGET_BUCKETS == (16384, 32768)
+    assert spec.LARGE_TOKEN_MIN == 4096
+    assert spec.SMALL_TOKEN_MAX == 64
+    assert spec.TARGET_ARCH == "gfx950"
+
+
+def test_token_grids():
+    assert spec.TOKEN_GRID_FULL[0] == 1 and spec.TOKEN_GRID_FULL[-1] == 32768
+    assert len(spec.TOKEN_GRID_FULL) == 16
+    assert spec.TOKEN_GRID_GPTOSS[0] == 256 and spec.TOKEN_GRID_GPTOSS[-1] == 32768
+
+
+def test_models_in_scope_dtypes():
+    by_name = {m.name: m for m in spec.MODELS}
+    assert set(by_name) == {"deepseek_v3", "deepseek_v4", "kimi_k2", "gpt_oss"}
+    # DeepSeek V4 is a8w4-only; i4 excluded everywhere.
+    assert by_name["deepseek_v4"].dtypes == ("a8w4",)
+    assert by_name["kimi_k2"].dtypes == ("a4w4", "a8w4")
+    assert all("i4" not in m.dtypes for m in spec.MODELS)
+    assert by_name["gpt_oss"].act == "swiglu"
+    assert by_name["deepseek_v4"].model_dim == 7168 and by_name["deepseek_v4"].inter_dim == 512
+
+
+def test_regression_predicate_requires_both_bands():
+    # 1.5% over but only +1.5us: relative under 2%? 1.5% < 2% -> not a regression.
+    assert not spec.is_regression(100.0, 101.5)
+    # 3% over but only +0.3us absolute (small base): abs band not exceeded -> not a regression.
+    assert not spec.is_regression(10.0, 10.3)
+    # 5% over AND +5us: both bands exceeded -> regression.
+    assert spec.is_regression(100.0, 105.0)
+    # exactly at boundaries (strict >): 102.0 and +2.0 -> not a regression.
+    assert not spec.is_regression(100.0, 102.0)
+
+
+def test_large_shape_win_predicate():
+    assert spec.is_large_shape_win(0.50, 0.55)  # exactly +10%
+    assert not spec.is_large_shape_win(0.50, 0.549)
+
+
+def test_small_token_win_predicate():
+    # 12% faster AND >= 2us absolute -> win.
+    assert spec.is_small_token_win(100.0, 88.0)
+    # 12% faster but only 0.6us absolute (tiny base) -> rejected (abs floor).
+    assert not spec.is_small_token_win(5.0, 4.4)
+    # 8% faster -> rejected (under 10%).
+    assert not spec.is_small_token_win(100.0, 92.0)
+
+
+def test_effective_tflops_and_mfu_formula():
+    # token*model_dim*inter_dim*3*topk*2 / us / 1e6
+    tflops = spec.effective_tflops(4096, 7168, 256, 9, combined_us=1000.0)
+    expected = 4096 * 7168 * 256 * 3 * 9 * 2 / 1000.0 / 1e6
+    assert abs(tflops - expected) < 1e-9
+    assert abs(spec.mfu(tflops) - tflops / 4523.0) < 1e-12
+
+
+# --- harness: parsing / metrics / provenance -------------------------------
+
+
+def test_parse_flydsl_stage_us():
+    stdout = (
+        "noise\n"
+        "FlyDSL MoE stage1[fp4]: 1163.2 us, 1654.24 TFLOPS(logical, M=4608), 0.377 TB/s (doweight_stage1=False)\n"
+        "FlyDSL MoE stage2 [moe_gemm2] fp4 atomic | 7168x2048, E=32, K=8, M_eff=4608 | 845.5 us, 1200.00 TFLOPS, 0.300 TB/s\n"
+        "FlyDSL MoE stage2 [moe_gemm2] fp4 reduce | 7168x2048, E=32, K=8, M_eff=4608 | 900.1 us, 1100.00 TFLOPS, 0.280 TB/s\n"
+    )
+    got = harness.parse_flydsl_stage_us(stdout)
+    assert got["stage1_us"] == 1163.2
+    # last matching stage2 line wins
+    assert got["stage2_us"] == 900.1
+
+
+def test_parse_flydsl_stage_us_missing():
+    got = harness.parse_flydsl_stage_us("nothing here")
+    assert got["stage1_us"] is None and got["stage2_us"] is None
+
+
+def test_parse_aiter_output_pass_warning_line():
+    out = (
+        "calling test_fmoe(...)\n"
+        "ck_moe_2stages:  234.56 us,   654.00 tflops......(quant:fp4x2)[checkAllclose passed~]\n"
+        "logits_diff: 0.0008\n"
+    )
+    res = harness.parse_aiter_output(out)
+    assert res["e2e_us"] == 234.56
+    assert res["logits_diff"] == 0.0008
+    assert res["correctness_pass"] is True
+
+
+def test_parse_aiter_output_pass_markdown_row():
+    # logits_diff below 1e-3 prints no warning line; it only appears in the
+    # summary markdown row.  The loose "checkAllclose ... failed!" line is the
+    # EXPECTED fp4 elementwise warning and must NOT fail correctness.
+    out = (
+        "ck_moe_2stages:   84.32 us,  18.80 tflops......(quant:fp4x2)[checkAllclose atol=0.01 rtol=0.01 failed!]\n"
+        "moe_2stage summary (markdown):\n"
+        "| dtype | token | ... |      us |   logits_diff | model   |\n"
+        "|:------|------:| ... |--------:|--------------:|:--------|\n"
+        "| torch.bfloat16 | 16 | ... | 87.195 |    9.6236e-06 | legacy  |\n"
+    )
+    res = harness.parse_aiter_output(out)
+    assert res["e2e_us"] == 84.32
+    assert res["logits_diff"] == 9.6236e-06
+    assert res["correctness_pass"] is True
+
+
+def test_parse_aiter_output_fail_cases():
+    # logits over 0.01 (markdown row) -> fail.
+    out_logits = "ck_moe_2stages:  100.00 us, 100.00 tflops\n" "| torch.bfloat16 | 16 | ... | 100.0 | 0.05 | legacy |\n"
+    assert harness.parse_aiter_output(out_logits)["correctness_pass"] is False
+    # hard assertion text -> fail even if a number was produced.
+    out_assert = "ck_moe_2stages:  100.00 us\naccuracy check failed: err=1, logits_diff=0.2\n"
+    assert harness.parse_aiter_output(out_assert)["correctness_pass"] is False
+    # no logits at all -> fail (cannot confirm correctness).
+    out_no_logits = "ck_moe_2stages:  100.00 us, 100.00 tflops\n"
+    assert harness.parse_aiter_output(out_no_logits)["correctness_pass"] is False
+    # no e2e number at all -> fail.
+    assert harness.parse_aiter_output("nothing")["correctness_pass"] is False
+
+
+def test_aiter_cmd_is_strict_aot_model_correct():
+    # The aiter guardrail must use the strict/AOT/model-correct runner
+    # (scripts/aiter_strict_point.py), NOT the non-strict legacy CLI, and must
+    # carry the model's true act/gate, locked warmup/iters, and AOT enabled.
+    rp = harness.RunPoint("kimi_k2", 7168, 256, 384, 8, "silu", "a4w4", 16)
+    cmd = harness._aiter_cmd(rp)
+    joined = " ".join(cmd)
+    assert "aiter_strict_point.py" in joined
+    # Must NOT be the legacy CLI path.
+    assert "test_moe_2stage.py" not in joined
+    assert "--no-flydsl-csv" not in cmd
+    assert cmd[cmd.index("--aq") + 1] == "fp4"  # a4w4 -> fp4 activation
+    assert cmd[cmd.index("--act") + 1] == "silu"
+    assert cmd[cmd.index("--gate") + 1] == "separated"
+    assert cmd[cmd.index("--warmup") + 1] == "10"
+    assert cmd[cmd.index("--iters") + 1] == "100"
+    assert "--no-aot" not in cmd  # AOT cache check ON by default
+    assert cmd[cmd.index("-t") + 1] == "16"
+    # a8w4 -> fp8 activation; swiglu model carries swiglu act.
+    rpg = harness.RunPoint("gpt_oss", 3072, 3072, 128, 4, "swiglu", "a8w4", 512)
+    cmdg = harness._aiter_cmd(rpg)
+    assert cmdg[cmdg.index("--aq") + 1] == "fp8"
+    assert cmdg[cmdg.index("--act") + 1] == "swiglu"
+    # --no-aot toggle is honored.
+    assert "--no-aot" in harness._aiter_cmd(rp, check_aot=False)
+
+
+def test_parse_strict_aiter_output():
+    ok = (
+        'noise\nSTRICT_RESULT {"e2e_us": 80.7, "e2e_us_p95": 84.0, "logits_diff": 1.0e-05, '
+        '"correctness_pass": true, "check_aot_cache": true, "error_category": ""}\n'
+    )
+    r = harness.parse_strict_aiter_output(ok)
+    assert r["e2e_us"] == 80.7 and r["e2e_us_p95"] == 84.0 and r["correctness_pass"] is True
+    assert r["aot_status"] == "checked"
+    fail = (
+        'STRICT_RESULT {"error": "AssertionError: accuracy check failed", '
+        '"error_category": "correctness", "correctness_pass": false, "check_aot_cache": false}\n'
+    )
+    rf = harness.parse_strict_aiter_output(fail)
+    assert rf["correctness_pass"] is False and "AssertionError" in rf["error"]
+    assert rf["error_category"] == "correctness" and rf["aot_status"] == "no_aot"
+    miss = harness.parse_strict_aiter_output("no result here")
+    assert miss["correctness_pass"] is False and miss["error"] == "no_strict_result"
+
+
+def test_parse_flydsl_stage_p95():
+    stdout = (
+        "FlyDSL MoE stage1[fp4]: 100.0 us, p95=105.0 us 1654.24 TFLOPS(logical, M=144), 4.0 TB/s (x)\n"
+        "FlyDSL MoE stage2 [moe_gemm2] fp4 atomic | 7168x256, ... | 50.0 us, p95=55.0 us 1200.0 TFLOPS, 3.0 TB/s\n"
+    )
+    g = harness.parse_flydsl_stage_us(stdout)
+    assert g["stage1_us"] == 100.0 and g["stage1_p95"] == 105.0
+    assert g["stage2_us"] == 50.0 and g["stage2_p95"] == 55.0
+    # Without the p95 suffix, the p95 fields are None but median us still parses.
+    g2 = harness.parse_flydsl_stage_us("FlyDSL MoE stage1[fp4]: 100.0 us, 1.0 TFLOPS(logical, M=1), 4.0 TB/s (x)\n")
+    assert g2["stage1_us"] == 100.0 and g2["stage1_p95"] is None
+
+
+# --- run-list coverage (full token grid from spec) -------------------------
+
+
+def test_run_list_covers_full_dec6_grid():
+    rl = harness.build_run_list()
+    # DS V3 (16 tok x 2 dtype) + DS V4 (16 x 1) + Kimi (16 x 2) + GPT-OSS (8 x 2)
+    assert len(rl) == 16 * 2 + 16 * 1 + 16 * 2 + 8 * 2 == 96
+    keys = harness.expected_point_keys()
+    # DeepSeek V4 is a8w4-only.
+    assert ("deepseek_v4", "a8w4", "silu", "1") in keys
+    assert ("deepseek_v4", "a4w4", "silu", "1") not in keys
+    # GPT-OSS has no tiny-token regime; starts at 256.
+    assert ("gpt_oss", "a4w4", "swiglu", "256") in keys
+    assert ("gpt_oss", "a4w4", "swiglu", "1") not in keys
+    # full small + large coverage for a skinny model.
+    for tok in (1, 16, 64, 4096, 16384, 32768):
+        assert ("kimi_k2", "a4w4", "silu", str(tok)) in keys
+
+
+# --- baseline validation gate (negative tests) ------------------------
+
+
+def _good_baseline_row(**over):
+    row = {
+        "gpu_id": "0",
+        "gpu_model": "MI350X",
+        "branch": "rlcr/mxfp4-moe",
+        "commit": "523ca1c7deadbeef",
+        "command": "python3 test_moe_gemm.py ... ; python3 test_moe_2stage.py ...",
+        "warmup": "10",
+        "iters": "100",
+        "idle_gpu_verified": "True",
+        "graph_capture": "False",
+        "l2_flush_per_iter": "True",
+        "clocks_pinned": "True",
+        "model": "kimi_k2",
+        "dtype": "a4w4",
+        "act": "silu",
+        "token": "16",
+        # All required metric fields present and numeric.
+        "stage1_us": "55.3",
+        "stage2_us": "21.8",
+        "sorting_us": "0.0",
+        "kernel_path_us": "77.1",
+        "kernel_path_us_p95": "79.0",
+        "effective_tflops": "12.3",
+        "mfu": "0.0027",
+        "e2e_us": "150.0",
+        "e2e_us_p95": "155.0",
+        "logits_diff": "0.0008",
+        "correctness_pass": "True",
+    }
+    row.update(over)
+    return row
+
+
+def test_validate_baseline_row_accepts_good_row():
+    assert harness.validate_baseline_row(_good_baseline_row()) == []
+
+
+@pytest.mark.parametrize(
+    "over,expect",
+    [
+        ({"commit": "abc123"}, "commit_not_523ca1c7"),
+        ({"commit": ""}, "missing_commit"),
+        ({"idle_gpu_verified": "False"}, "idle_gpu_not_verified"),
+        ({"command": ""}, "missing_command"),
+        ({"dtype": ""}, "missing_dtype"),
+        ({"act": ""}, "missing_act"),
+        ({"e2e_us": ""}, "missing_e2e_us"),
+        ({"logits_diff": ""}, "missing_logits_diff"),
+        # Hardened metric-field requirements.
+        ({"stage1_us": ""}, "missing_stage1_us"),
+        ({"stage2_us": ""}, "missing_stage2_us"),
+        ({"sorting_us": ""}, "missing_sorting_us"),
+        ({"kernel_path_us": ""}, "missing_kernel_path_us"),
+        ({"kernel_path_us_p95": ""}, "missing_kernel_path_us_p95"),
+        ({"effective_tflops": ""}, "missing_effective_tflops"),
+        ({"mfu": ""}, "missing_mfu"),
+        ({"e2e_us_p95": ""}, "missing_e2e_us_p95"),
+        ({"kernel_path_us": "not-a-number"}, "missing_kernel_path_us"),
+        ({"correctness_pass": "False"}, "correctness_not_passed"),
+        ({"correctness_pass": ""}, "correctness_not_passed"),
+        ({"warmup": "2"}, "warmup_mismatch"),
+        ({"iters": "5"}, "iters_mismatch"),
+        ({"graph_capture": "True"}, "graph_capture_must_be_off"),
+        ({"l2_flush_per_iter": "False"}, "l2_flush_must_be_on"),
+        ({"clocks_pinned": "False"}, "clocks_must_be_pinned"),
+    ],
+)
+def test_validate_baseline_row_rejections(over, expect):
+    reasons = harness.validate_baseline_row(_good_baseline_row(**over))
+    assert expect in reasons
+
+
+def test_validate_baseline_csv_missing_coverage(tmp_path):
+    # A single fully-valid row is not enough; the full workload must be covered.
+    out = tmp_path / "baseline.csv"
+    p = harness.Provenance(
+        gpu_id="0", gpu_model="MI350X", branch="b", commit="523ca1c7", idle_gpu_verified=True, clocks_pinned=True
+    )
+    row = harness.PointRow(
+        provenance=p,
+        command="cmd",
+        model="kimi_k2",
+        model_dim=7168,
+        inter_dim=256,
+        experts=384,
+        topk=8,
+        dtype="a4w4",
+        act="silu",
+        token=16,
+        stage1_us=55.3,
+        stage2_us=21.8,
+        sorting_us=0.0,
+        kernel_path_us=77.1,
+        kernel_path_us_p95=79.0,
+        effective_tflops=12.3,
+        mfu=0.0027,
+        e2e_us=150.0,
+        e2e_us_p95=155.0,
+        logits_diff=0.0008,
+        correctness_pass=True,
+    )
+    harness.write_csv([row], str(out))
+    res = harness.validate_baseline_csv(str(out))
+    assert res["valid"] is False
+    assert res["missing_points"]  # almost all points missing
+    assert res["row_errors"] == {}  # the one present row is itself fully valid
+
+
+def test_validate_baseline_csv_rejects_missing_kernel_metrics(tmp_path):
+    # Regression: a full-coverage CSV with e2e/logits present
+    # but kernel metrics empty must NOT validate.
+    out = tmp_path / "baseline.csv"
+    p = harness.Provenance(
+        gpu_id="0", gpu_model="MI350X", branch="b", commit="523ca1c7", idle_gpu_verified=True, clocks_pinned=True
+    )
+    rows = []
+    for rp in harness.build_run_list():
+        rows.append(
+            harness.PointRow(
+                provenance=p,
+                command="cmd",
+                model=rp.model,
+                model_dim=rp.model_dim,
+                inter_dim=rp.inter_dim,
+                experts=rp.experts,
+                topk=rp.topk,
+                dtype=rp.dtype,
+                act=rp.act,
+                token=rp.token,
+                # kernel metrics deliberately omitted
+                e2e_us=150.0,
+                e2e_us_p95=155.0,
+                logits_diff=0.0008,
+                correctness_pass=True,
+            )
+        )
+    harness.write_csv(rows, str(out))
+    res = harness.validate_baseline_csv(str(out))
+    assert res["valid"] is False
+    assert not res["missing_points"]  # coverage is complete...
+    assert res["row_errors"]  # ...but rows fail on missing kernel metrics
+    some = next(iter(res["row_errors"].values()))
+    assert "missing_kernel_path_us" in some and "missing_mfu" in some
+
+
+def test_combined_and_metrics():
+    combined = harness.combined_kernel_path_us(1000.0, 800.0, 50.0)
+    assert combined == 1850.0
+    m = harness.compute_metrics(token=4096, model_dim=7168, inter_dim=256, topk=9, combined_us=combined)
+    assert m["effective_tflops"] > 0 and 0 < m["mfu"] < 10
+
+
+def test_summarize_median_p95():
+    s = harness.summarize([10, 11, 12, 13, 100])
+    assert s["median"] == 12
+    assert s["p95"] == 100
+
+
+def test_provenance_missing_fields_gate():
+    p = harness.Provenance()  # gpu_id/gpu_model/branch/commit unset
+    missing = p.missing_fields()
+    assert "gpu_id" in missing and "commit" in missing
+    assert not p.is_complete()
+    p2 = harness.Provenance(gpu_id="0", gpu_model="MI350X", branch="rlcr/mxfp4-moe", commit="deadbeef")
+    assert p2.is_complete()
+
+
+def test_pointrow_csv_dict_has_all_columns():
+    p = harness.Provenance(gpu_id="0", gpu_model="MI350X", branch="b", commit="c")
+    row = harness.PointRow(
+        provenance=p,
+        command="cmd",
+        model="kimi_k2",
+        model_dim=7168,
+        inter_dim=256,
+        experts=384,
+        topk=8,
+        dtype="a4w4",
+        act="silu",
+        token=4096,
+    )
+    d = row.to_csv_dict()
+    assert set(d.keys()) == set(harness.CSV_COLUMNS)
+    assert d["metric_formula"] == harness.METRIC_FORMULA
+
+
+def test_write_csv_roundtrip(tmp_path):
+    p = harness.Provenance(gpu_id="0", gpu_model="MI350X", branch="b", commit="c")
+    rows = [
+        harness.PointRow(
+            provenance=p,
+            command="cmd",
+            model="kimi_k2",
+            model_dim=7168,
+            inter_dim=256,
+            experts=384,
+            topk=8,
+            dtype="a4w4",
+            act="silu",
+            token=4096,
+            kernel_path_us=1850.0,
+            e2e_us=2000.0,
+            mfu=0.5,
+        )
+    ]
+    out = tmp_path / "baseline.csv"
+    harness.write_csv(rows, str(out))
+    text = out.read_text()
+    assert "kernel_path_us" in text.splitlines()[0]
+    assert "kimi_k2" in text
+
+
+# --- ledger: attempt validation + comparison -------------------------------
+
+
+def _complete_attempt(**over):
+    base = dict(
+        config={"tile_m": 64},
+        stage=1,
+        model="kimi_k2",
+        dtype="a4w4",
+        act="silu",
+        gpu_id="0",
+        gpu_model="MI350X",
+        branch="b",
+        commit="c",
+        command="cmd",
+        warmup=10,
+        iters=100,
+        result="loss",
+    )
+    base.update(over)
+    return ledger.Attempt(**base)
+
+
+def test_attempt_missing_provenance_rejected(tmp_path):
+    bad = _complete_attempt(commit="")  # missing required field
+    assert "commit" in bad.missing_fields()
+    with pytest.raises(ValueError):
+        ledger.append_attempt(bad, path=str(tmp_path / "attempts.jsonl"))
+
+
+def test_attempt_append_roundtrip(tmp_path):
+    path = str(tmp_path / "attempts.jsonl")
+    rec = ledger.append_attempt(_complete_attempt(result="win"), path=path, now=123.0)
+    assert rec["timestamp"] == 123.0
+    lines = open(path).read().strip().splitlines()
+    assert len(lines) == 1 and '"result": "win"' in lines[0]
+
+
+def _complete_rejected(**over):
+    base = dict(
+        model="kimi_k2",
+        dtype="a4w4",
+        act="silu",
+        token=64,
+        stage=0,
+        config={"tile_m1": 16},
+        reason="illegal candidate tiles: s1=fp4 tile_m<32",
+        selection={"model": "kimi_k2", "dtype": "a4w4", "tokens": [64]},
+        gpu_id="0",
+        gpu_model="MI350X",
+        branch="b",
+        commit="c",
+        command="python3 scripts/moe_tuning_harness.py candidate --tile-m1 16",
+        warmup=10,
+        iters=100,
+        csv_path="",  # present-but-empty: no measured artifact pre-compile
+        profile_path="",
+    )
+    base.update(over)
+    return base
+
+
+def test_rejected_candidate_full_provenance_roundtrip(tmp_path):
+    path = str(tmp_path / "attempts.jsonl")
+    rec = ledger.append_rejected_candidate(_complete_rejected(), path=path, now=7.0)
+    assert rec["result"] == "rejected_candidate" and rec["timestamp"] == 7.0
+    # csv_path/profile_path are present (empty allowed); selection is a non-empty dict.
+    assert rec["csv_path"] == "" and rec["profile_path"] == "" and rec["selection"]
+    # stage 0 is a valid value (candidate-tile rejection spanning both stages).
+    rec0 = ledger.append_rejected_candidate(_complete_rejected(stage=0), path=path, now=8.0)
+    assert rec0["stage"] == 0
+    lines = open(path).read().strip().splitlines()
+    assert len(lines) == 2
+
+
+def test_rejected_candidate_missing_provenance_rejected(tmp_path):
+    path = str(tmp_path / "attempts.jsonl")
+    # Each required (non-empty) provenance field, when blanked, must be refused.
+    for field in ("act", "gpu_id", "gpu_model", "branch", "commit", "command", "warmup", "iters"):
+        bad = _complete_rejected(**{field: ""})
+        with pytest.raises(ValueError, match="missing fields"):
+            ledger.append_rejected_candidate(bad, path=path)
+    # csv_path/profile_path keys must EXIST even though empty is allowed: drop them.
+    for field in ("csv_path", "profile_path"):
+        bad = _complete_rejected()
+        del bad[field]
+        with pytest.raises(ValueError, match="missing fields"):
+            ledger.append_rejected_candidate(bad, path=path)
+    # selection None/"" trips the missing-fields gate; {} / non-dict trips the
+    # dedicated selection gate.
+    for sel in (None, ""):
+        with pytest.raises(ValueError, match="missing fields"):
+            ledger.append_rejected_candidate(_complete_rejected(selection=sel), path=path)
+    for sel in ({}, "a4w4"):
+        with pytest.raises(ValueError, match="selection"):
+            ledger.append_rejected_candidate(_complete_rejected(selection=sel), path=path)
+    # The minimal-only record (the old contract) is now rejected.
+    with pytest.raises(ValueError, match="missing fields"):
+        ledger.append_rejected_candidate(
+            {"model": "kimi_k2", "dtype": "a4w4", "token": 64, "config": {}, "reason": "x"}, path=path
+        )
+    # No partial file should have been written.
+    assert not os.path.exists(path)
+
+
+def test_committed_rejected_records_are_contract_complete():
+    """Every committed rejected_candidate record must carry full provenance, unless
+    it is an explicitly superseded pre-contract artifact (marked superseded_by)."""
+    import json as _json
+
+    attempts = os.path.join(_REPO_ROOT, "docs", "attempts.jsonl")
+    if not os.path.exists(attempts):
+        pytest.skip("no committed attempts ledger")
+    required = set(ledger.REQUIRED_REJECTED_FIELDS)
+    present_keys = set(ledger.REQUIRED_REJECTED_PRESENT_KEYS)
+    offenders = []
+    for ln in open(attempts):
+        ln = ln.strip()
+        if not ln:
+            continue
+        rec = _json.loads(ln)
+        if rec.get("result") != "rejected_candidate":
+            continue
+        if "superseded_by" in rec:  # incomplete historical record, explicitly invalidated
+            continue
+        missing = [k for k in required if rec.get(k) in (None, "")]
+        missing += [k for k in present_keys if k not in rec]
+        sel = rec.get("selection")
+        if not isinstance(sel, dict) or not sel:
+            missing.append("selection")
+        if missing:
+            offenders.append((rec.get("timestamp"), missing))
+    assert not offenders, f"incomplete committed rejected records: {offenders}"
+
+
+def _csv(path, rows):
+    import csv as _c
+
+    with open(path, "w", newline="") as f:
+        w = _c.DictWriter(f, fieldnames=["model", "dtype", "act", "token", "kernel_path_us", "e2e_us", "mfu"])
+        w.writeheader()
+        for r in rows:
+            w.writerow(r)
+
+
+def _gate_csv(path, rows):
+    import csv as _c
+
+    cols = [
+        "model",
+        "dtype",
+        "act",
+        "token",
+        "kernel_path_us",
+        "e2e_us",
+        "aot_status",
+        "correctness_pass",
+        "logits_diff",
+    ]
+    with open(path, "w", newline="") as f:
+        w = _c.DictWriter(f, fieldnames=cols)
+        w.writeheader()
+        for r in rows:
+            w.writerow(r)
+
+
+def _gate_row(**over):
+    base = dict(
+        model="kimi_k2",
+        dtype="a4w4",
+        act="silu",
+        token=16,
+        kernel_path_us=150.0,
+        e2e_us=80.0,
+        aot_status="checked",
+        correctness_pass=True,
+        logits_diff=0.001,
+    )
+    base.update(over)
+    return base
+
+
+def test_selected_candidate_gate_accepts_checked_correct(tmp_path):
+    path = str(tmp_path / "cand.csv")
+    _gate_csv(path, [_gate_row(token=16), _gate_row(token=16384, kernel_path_us=1700, e2e_us=1500)])
+    res = ledger.selected_candidate_gate(path)
+    assert res["passed"] is True and res["n_rows"] == 2 and res["violations"] == []
+
+
+def test_selected_candidate_gate_rejects_no_aot_and_bad_correctness(tmp_path):
+    # no_aot row (repeatability/diagnostic bypass) can never be promoted to a win.
+    p1 = str(tmp_path / "no_aot.csv")
+    _gate_csv(p1, [_gate_row(aot_status="no_aot")])
+    r1 = ledger.selected_candidate_gate(p1)
+    assert r1["passed"] is False and any("aot_status" in v[1] for v in r1["violations"])
+
+    # failed correctness rejected.
+    p2 = str(tmp_path / "bad_correct.csv")
+    _gate_csv(p2, [_gate_row(correctness_pass=False)])
+    r2 = ledger.selected_candidate_gate(p2)
+    assert r2["passed"] is False and any("correctness_pass" in v[1] for v in r2["violations"])
+
+    # logits over threshold rejected.
+    p3 = str(tmp_path / "bad_logits.csv")
+    _gate_csv(p3, [_gate_row(logits_diff=0.05)])
+    r3 = ledger.selected_candidate_gate(p3)
+    assert r3["passed"] is False and any("logits_diff" in v[1] for v in r3["violations"])
+
+    # empty CSV: nothing to promote -> not passed.
+    p4 = str(tmp_path / "empty.csv")
+    _gate_csv(p4, [])
+    assert ledger.selected_candidate_gate(p4)["passed"] is False
+
+
+def test_scan_replay_consistency(tmp_path):
+    path = str(tmp_path / "attempts.jsonl")
+    import json as _json
+
+    def _write(recs):
+        with open(path, "w") as f:
+            for r in recs:
+                f.write(_json.dumps(r) + "\n")
+
+    # multi-file attempt whose command replays BOTH files -> clean.
+    good = {
+        "result": "neutral",
+        "csv_path": "docs/a.csv;docs/b.csv",
+        "command": "h candidate --out docs/a.csv ; h candidate --out docs/b.csv ; repeatability_check",
+        "timestamp": 1.0,
+    }
+    _write([good])
+    assert ledger.scan_replay_consistency(path) == []
+
+    # command misses b.csv -> offender.
+    bad = dict(good, command="h candidate --out docs/a.csv", timestamp=2.0)
+    _write([bad])
+    off = ledger.scan_replay_consistency(path)
+    assert off and off[0][0] == 2.0 and "docs/b.csv" in off[0][1]
+
+    # brace shorthand does not literally contain either file -> offender.
+    brace = dict(good, command="h candidate --out docs/{a,b}.csv", timestamp=3.0)
+    _write([brace])
+    assert ledger.scan_replay_consistency(path)
+
+    # required file hidden behind a '#' comment -> offender.
+    commented = dict(good, command="h candidate --out docs/a.csv  # then docs/b.csv", timestamp=4.0)
+    _write([commented])
+    assert ledger.scan_replay_consistency(path)
+
+    # superseded records are skipped.
+    superseded = dict(bad, superseded_by=9.0, timestamp=5.0)
+    _write([superseded])
+    assert ledger.scan_replay_consistency(path) == []
+
+
+def test_committed_repeatability_attempts_replayable():
+    """Committed multi-file repeatability attempts must replay all their CSVs."""
+    attempts = os.path.join(_REPO_ROOT, "docs", "attempts.jsonl")
+    if not os.path.exists(attempts):
+        pytest.skip("no committed attempts ledger")
+    off = ledger.scan_replay_consistency(attempts)
+    assert off == [], f"non-replayable committed repeatability attempts: {off}"
+
+
+def test_scan_duplicate_rejected_candidates(tmp_path):
+    path = str(tmp_path / "attempts.jsonl")
+    import json as _json
+
+    def _probe(ts, sup=None):
+        r = {
+            "result": "rejected_candidate",
+            "model": "deepseek_v3",
+            "dtype": "a4w4",
+            "act": "silu",
+            "token": 32,
+            "config": {"tile_m1": 256, "tile_n1": 32},
+            "reason": "x",
+            "timestamp": ts,
+        }
+        if sup is not None:
+            r["superseded_by"] = sup
+        return r
+
+    # Two ACTIVE records for the same probe -> duplicate.
+    open(path, "w").write(_json.dumps(_probe(1.0)) + "\n" + _json.dumps(_probe(2.0)) + "\n")
+    dups = ledger.scan_duplicate_rejected_candidates(path)
+    assert dups and sorted(dups[0][1]) == [1.0, 2.0]
+
+    # Superseding the older one leaves exactly one active -> clean.
+    open(path, "w").write(_json.dumps(_probe(1.0, sup=2.0)) + "\n" + _json.dumps(_probe(2.0)) + "\n")
+    assert ledger.scan_duplicate_rejected_candidates(path) == []
+
+
+def test_committed_rejected_candidates_unique():
+    """Committed ledger must have exactly one active rejected record per probe."""
+    attempts = os.path.join(_REPO_ROOT, "docs", "attempts.jsonl")
+    if not os.path.exists(attempts):
+        pytest.skip("no committed attempts ledger")
+    dups = ledger.scan_duplicate_rejected_candidates(attempts)
+    assert dups == [], f"duplicate active rejected-candidate records: {dups}"
+
+
+def test_scan_superseded_rejected_candidates(tmp_path):
+    path = str(tmp_path / "attempts.jsonl")
+    import json as _json
+
+    def _probe(ts, n, sup=None):
+        r = {
+            "result": "rejected_candidate",
+            "model": "deepseek_v3",
+            "dtype": "a4w4",
+            "act": "silu",
+            "token": 32,
+            "config": {"tile_m1": 256, "tile_n1": n},
+            "reason": "x",
+            "timestamp": ts,
+        }
+        if sup is not None:
+            r["superseded_by"] = sup
+        return r
+
+    # superseded record links to the matching active record of the SAME key -> clean.
+    open(path, "w").write(_json.dumps(_probe(1.0, 32, sup=2.0)) + "\n" + _json.dumps(_probe(2.0, 32)) + "\n")
+    assert ledger.scan_superseded_rejected_candidates(path) == []
+
+    # superseded record links to a DIFFERENT probe's active record -> offender.
+    open(path, "w").write(
+        _json.dumps(_probe(1.0, 32, sup=3.0))  # links to the n=64 record, wrong key
+        + "\n"
+        + _json.dumps(_probe(2.0, 32))
+        + "\n"
+        + _json.dumps(_probe(3.0, 64))
+        + "\n"
+    )
+    off = ledger.scan_superseded_rejected_candidates(path)
+    assert off and off[0][0] == 1.0
+
+
+def test_committed_superseded_links_valid():
+    """Every committed superseded rejected record must link to an active record of the same key."""
+    attempts = os.path.join(_REPO_ROOT, "docs", "attempts.jsonl")
+    if not os.path.exists(attempts):
+        pytest.skip("no committed attempts ledger")
+    off = ledger.scan_superseded_rejected_candidates(attempts)
+    assert off == [], f"superseded records linking to the wrong/no successor: {off}"
+
+
+def test_row_missing_kernel_path():
+    rp = harness.RunPoint("deepseek_v3", 7168, 256, 257, 9, "silu", "a4w4", 32)
+    prov = harness.Provenance(gpu_id="0", gpu_model="MI350X", branch="b", commit="c")
+    # A row with no parsed stage times is "missing" (the tile_n=512 / tile_k!=256 case).
+    blank = harness.PointRow(
+        provenance=prov,
+        command="x",
+        model=rp.model,
+        model_dim=rp.model_dim,
+        inter_dim=rp.inter_dim,
+        experts=rp.experts,
+        topk=rp.topk,
+        dtype=rp.dtype,
+        act=rp.act,
+        token=rp.token,
+    )
+    assert harness.row_missing_kernel_path(blank) is True
+    # A row with kernel-path populated is not missing.
+    blank.stage1_us = 90.0
+    blank.stage2_us = 70.0
+    blank.kernel_path_us = 160.0
+    assert harness.row_missing_kernel_path(blank) is False
+
+
+def test_compare_csvs_detects_regression_and_wins(tmp_path):
+    base = str(tmp_path / "base.csv")
+    cand = str(tmp_path / "cand.csv")
+    _csv(
+        base,
+        [
+            {
+                "model": "kimi_k2",
+                "dtype": "a4w4",
+                "act": "silu",
+                "token": 16384,
+                "kernel_path_us": 1000,
+                "e2e_us": 1200,
+                "mfu": 0.50,
+            },
+            {
+                "model": "kimi_k2",
+                "dtype": "a4w4",
+                "act": "silu",
+                "token": 16,
+                "kernel_path_us": 100,
+                "e2e_us": 150,
+                "mfu": 0.05,
+            },
+            {
+                "model": "kimi_k2",
+                "dtype": "a4w4",
+                "act": "silu",
+                "token": 128,
+                "kernel_path_us": 500,
+                "e2e_us": 600,
+                "mfu": 0.30,
+            },
+        ],
+    )
+    _csv(
+        cand,
+        [
+            # large bucket: +10% MFU win, no kernel-path regression
+            {
+                "model": "kimi_k2",
+                "dtype": "a4w4",
+                "act": "silu",
+                "token": 16384,
+                "kernel_path_us": 950,
+                "e2e_us": 1180,
+                "mfu": 0.56,
+            },
+            # small token: 20% faster and >=2us -> win
+            {
+                "model": "kimi_k2",
+                "dtype": "a4w4",
+                "act": "silu",
+                "token": 16,
+                "kernel_path_us": 80,
+                "e2e_us": 150,
+                "mfu": 0.05,
+            },
+            # mid token: regression on kernel-path (+10% and +50us)
+            {
+                "model": "kimi_k2",
+                "dtype": "a4w4",
+                "act": "silu",
+                "token": 128,
+                "kernel_path_us": 550,
+                "e2e_us": 600,
+                "mfu": 0.30,
+            },
+        ],
+    )
+    cv = ledger.compare_csvs(base, cand)
+    assert cv.any_regression is True  # the 128-token point regressed
+    assert cv.coverage_complete  # candidate covers all 3 baseline points
+    assert not cv.pareto_clean
+    assert ("kimi_k2", "a4w4", "silu", "16384") in cv.large_wins
+    assert ("kimi_k2", "a4w4", "silu", "16") in cv.small_wins
+
+
+def _gated_compare_csv(path, rows):
+    """Write a candidate/baseline CSV that ALSO carries the gate columns."""
+    import csv as _c
+
+    cols = [
+        "model",
+        "dtype",
+        "act",
+        "token",
+        "kernel_path_us",
+        "e2e_us",
+        "mfu",
+        "aot_status",
+        "correctness_pass",
+        "logits_diff",
+    ]
+    with open(path, "w", newline="") as f:
+        w = _c.DictWriter(f, fieldnames=cols)
+        w.writeheader()
+        for r in rows:
+            w.writerow(r)
+
+
+def _two_point_baseline_and_candidate(tmp_path, aot_status):
+    """A fully-covered, non-regressing, otherwise-WINNING 2-point candidate whose
+    gate columns are parameterized by ``aot_status``."""
+    base = str(tmp_path / "base.csv")
+    cand = str(tmp_path / "cand.csv")
+    bl = [
+        dict(
+            model="kimi_k2",
+            dtype="a4w4",
+            act="silu",
+            token=16384,
+            kernel_path_us=1000,
+            e2e_us=1200,
+            mfu=0.50,
+            aot_status="checked",
+            correctness_pass=True,
+            logits_diff=0.001,
+        ),
+        dict(
+            model="kimi_k2",
+            dtype="a4w4",
+            act="silu",
+            token=16,
+            kernel_path_us=100,
+            e2e_us=150,
+            mfu=0.05,
+            aot_status="checked",
+            correctness_pass=True,
+            logits_diff=0.001,
+        ),
+    ]
+    # candidate: +12% MFU at 16384 (large win), 20% faster at 16 (small win), no regressions
+    cd = [
+        dict(
+            model="kimi_k2",
+            dtype="a4w4",
+            act="silu",
+            token=16384,
+            kernel_path_us=950,
+            e2e_us=1180,
+            mfu=0.56,
+            aot_status=aot_status,
+            correctness_pass=True,
+            logits_diff=0.001,
+        ),
+        dict(
+            model="kimi_k2",
+            dtype="a4w4",
+            act="silu",
+            token=16,
+            kernel_path_us=80,
+            e2e_us=150,
+            mfu=0.05,
+            aot_status=aot_status,
+            correctness_pass=True,
+            logits_diff=0.001,
+        ),
+    ]
+    _gated_compare_csv(base, bl)
+    _gated_compare_csv(cand, cd)
+    return base, cand
+
+
+def test_claimable_win_blocks_no_aot_winning_candidate(tmp_path):
+    # The leak Codex flagged: an otherwise-winning, fully-covered, non-regressing
+    # candidate measured with --no-aot-check must NOT be promotable.
+    base, cand = _two_point_baseline_and_candidate(tmp_path, aot_status="no_aot")
+    cv = ledger.compare_csvs(base, cand)
+    # metrics still look winning...
+    assert cv.pareto_clean is True
+    assert cv.large_wins and cv.small_wins
+    # ...but the hard gate fails, so the candidate is NOT claimable.
+    assert cv.gate["passed"] is False
+    assert cv.claimable_win is False
+    # and the standalone gate agrees.
+    assert ledger.selected_candidate_gate(cand)["passed"] is False
+
+
+def test_claimable_win_allows_checked_correct_candidate(tmp_path):
+    base, cand = _two_point_baseline_and_candidate(tmp_path, aot_status="checked")
+    cv = ledger.compare_csvs(base, cand)
+    assert cv.pareto_clean is True
+    assert cv.large_wins and cv.small_wins
+    assert cv.gate["passed"] is True
+    assert cv.claimable_win is True
+
+
+def test_compare_csvs_rejects_cherry_picked_candidate(tmp_path):
+    # Baseline has 3 points; candidate reports only the single winning large
+    # point and omits the others.  Coverage must be incomplete and the verdict
+    # must NOT be pareto_clean -- a cherry-picked win cannot pass.
+    base = str(tmp_path / "base.csv")
+    cand = str(tmp_path / "cand.csv")
+    _csv(
+        base,
+        [
+            {
+                "model": "kimi_k2",
+                "dtype": "a4w4",
+                "act": "silu",
+                "token": 16384,
+                "kernel_path_us": 1000,
+                "e2e_us": 1200,
+                "mfu": 0.50,
+            },
+            {
+                "model": "kimi_k2",
+                "dtype": "a4w4",
+                "act": "silu",
+                "token": 16,
+                "kernel_path_us": 100,
+                "e2e_us": 150,
+                "mfu": 0.05,
+            },
+            {
+                "model": "kimi_k2",
+                "dtype": "a4w4",
+                "act": "silu",
+                "token": 128,
+                "kernel_path_us": 500,
+                "e2e_us": 600,
+                "mfu": 0.30,
+            },
+        ],
+    )
+    _csv(
+        cand,
+        [
+            {
+                "model": "kimi_k2",
+                "dtype": "a4w4",
+                "act": "silu",
+                "token": 16384,
+                "kernel_path_us": 900,
+                "e2e_us": 1100,
+                "mfu": 0.56,
+            },
+        ],
+    )
+    cv = ledger.compare_csvs(base, cand)
+    assert not cv.coverage_complete
+    assert ("kimi_k2", "a4w4", "silu", "16") in cv.missing_candidate_points
+    assert ("kimi_k2", "a4w4", "silu", "128") in cv.missing_candidate_points
+    assert not cv.pareto_clean  # forced False by incomplete coverage
+
+
+def test_compare_csvs_rejects_missing_regime_fields(tmp_path):
+    # Candidate covers every point but the large target bucket lacks mfu, and a
+    # point lacks e2e.  Those points are incomplete -> not pareto_clean.
+    base = str(tmp_path / "base.csv")
+    cand = str(tmp_path / "cand.csv")
+    _csv(
+        base,
+        [
+            {
+                "model": "kimi_k2",
+                "dtype": "a4w4",
+                "act": "silu",
+                "token": 16384,
+                "kernel_path_us": 1000,
+                "e2e_us": 1200,
+                "mfu": 0.50,
+            },
+            {
+                "model": "kimi_k2",
+                "dtype": "a4w4",
+                "act": "silu",
+                "token": 128,
+                "kernel_path_us": 500,
+                "e2e_us": 600,
+                "mfu": 0.30,
+            },
+        ],
+    )
+    _csv(
+        cand,
+        [
+            # large bucket missing mfu
+            {
+                "model": "kimi_k2",
+                "dtype": "a4w4",
+                "act": "silu",
+                "token": 16384,
+                "kernel_path_us": 900,
+                "e2e_us": 1100,
+                "mfu": "",
+            },
+            # mid point missing e2e
+            {
+                "model": "kimi_k2",
+                "dtype": "a4w4",
+                "act": "silu",
+                "token": 128,
+                "kernel_path_us": 480,
+                "e2e_us": "",
+                "mfu": 0.30,
+            },
+        ],
+    )
+    cv = ledger.compare_csvs(base, cand)
+    assert not cv.coverage_complete
+    assert ("kimi_k2", "a4w4", "silu", "16384") in cv.incomplete_points
+    assert ("kimi_k2", "a4w4", "silu", "128") in cv.incomplete_points
+    assert not cv.pareto_clean
+
+
+def test_repeatability_check(tmp_path):
+    a = str(tmp_path / "a.csv")
+    b = str(tmp_path / "b.csv")
+    _csv(
+        a,
+        [
+            {
+                "model": "kimi_k2",
+                "dtype": "a4w4",
+                "act": "silu",
+                "token": 16384,
+                "kernel_path_us": 1000,
+                "e2e_us": 1200,
+                "mfu": 0.5,
+            },
+            {
+                "model": "kimi_k2",
+                "dtype": "a4w4",
+                "act": "silu",
+                "token": 16,
+                "kernel_path_us": 100,
+                "e2e_us": 150,
+                "mfu": 0.05,
+            },
+        ],
+    )
+    # b: first point within band (1.5% < 2% and +15us... wait 15us>2us, so need <=max(2%*1000=20us,2us)=20us -> 1015 ok),
+    # second point unstable (+10us on a 100us base -> band=max(2us,2us)=2us, 10>2 -> unstable).
+    _csv(
+        b,
+        [
+            {
+                "model": "kimi_k2",
+                "dtype": "a4w4",
+                "act": "silu",
+                "token": 16384,
+                "kernel_path_us": 1015,
+                "e2e_us": 1210,
+                "mfu": 0.5,
+            },
+            {
+                "model": "kimi_k2",
+                "dtype": "a4w4",
+                "act": "silu",
+                "token": 16,
+                "kernel_path_us": 110,
+                "e2e_us": 150,
+                "mfu": 0.05,
+            },
+        ],
+    )
+    res = ledger.repeatability_check(a, b)
+    assert res["n_shared"] == 2
+    assert not res["stable"]  # the 16-token kernel_path drifted > band
+    assert any(u[0] == ("kimi_k2", "a4w4", "silu", "16") for u in res["unstable"]["kernel_path_us"])
+    # 16384 kernel_path within band, e2e within band -> not flagged.
+    assert all(u[0] != ("kimi_k2", "a4w4", "silu", "16384") for u in res["unstable"]["kernel_path_us"])
+
+
+def test_quarantine_and_validated_keys():
+    from kernels import moe_tuning_spec as spec
+
+    # ALL a8w4 shapes are correctness-quarantined (the non-fp4-activation
+    # e2e path fails the aiter correctness gate for fp8 AND bf16 activation; only
+    # fp4 activation passes).  DS V3 a8w4 is included (its earlier legacy-path "pass" was the
+    # legacy-Swiglu artifact, not a real Silu a8w4 pass).
+    assert spec.is_quarantined("deepseek_v3", "a8w4")
+    assert spec.is_quarantined("deepseek_v4", "a8w4")
+    assert spec.is_quarantined("kimi_k2", "a8w4")
+    assert spec.is_quarantined("gpt_oss", "a8w4")
+    # a4w4 is NOT quarantined for any model.
+    assert not spec.is_quarantined("deepseek_v3", "a4w4")
+    assert not spec.is_quarantined("kimi_k2", "a4w4")
+
+    vkeys = spec.validated_point_keys()
+    # Validated = all a4w4: DS V3 (16) + Kimi (16) + GPT-OSS (8) = 40.
+    assert len(vkeys) == 40
+    assert ("deepseek_v3", "a4w4", "silu", "1") in vkeys
+    assert ("deepseek_v3", "a8w4", "silu", "1") not in vkeys  # quarantined
+    assert ("kimi_k2", "a8w4", "silu", "1") not in vkeys  # quarantined
+    assert ("gpt_oss", "a8w4", "swiglu", "256") not in vkeys  # quarantined
+    # validated subset is a strict subset of the full workload.
+    assert vkeys < harness.expected_point_keys()
+
+
+def test_validate_baseline_csv_subset_keys(tmp_path):
+    # A CSV covering only the validated subset validates against validated keys,
+    # but fails against the full workload (missing the quarantined points).
+    from kernels import moe_tuning_spec as spec
+
+    out = tmp_path / "sub.csv"
+    p = harness.Provenance(
+        gpu_id="0", gpu_model="MI350X", branch="b", commit="523ca1c7", idle_gpu_verified=True, clocks_pinned=True
+    )
+    rows = []
+    for key in spec.validated_point_keys():
+        model, dtype, act, token = key
+        rows.append(
+            harness.PointRow(
+                provenance=p,
+                command="cmd",
+                model=model,
+                model_dim=7168,
+                inter_dim=256,
+                experts=257,
+                topk=9,
+                dtype=dtype,
+                act=act,
+                token=int(token),
+                stage1_us=10.0,
+                stage2_us=5.0,
+                sorting_us=0.0,
+                kernel_path_us=15.0,
+                kernel_path_us_p95=15.5,
+                effective_tflops=1.0,
+                mfu=0.01,
+                e2e_us=12.0,
+                e2e_us_p95=12.5,
+                logits_diff=0.0001,
+                correctness_pass=True,
+            )
+        )
+    harness.write_csv(rows, str(out))
+    assert harness.validate_baseline_csv(str(out), expected_keys=spec.validated_point_keys())["valid"] is True
+    assert harness.validate_baseline_csv(str(out))["valid"] is False  # full workload not covered
+
+
+def test_perf_dist_percentile():
+    import importlib
+
+    tc = importlib.import_module("tests.test_common")
+    # nearest-rank p95 over 1..100: idx=round(0.95*99)=94 -> value 95 (0-based).
+    assert tc._percentile(list(range(1, 101)), 0.95) == 95
+    assert tc._percentile([], 0.95) is None
+    assert "n_rotate" in tc.LAST_PERF_DIST
+
+
+def test_timed_distribution_rotates_distinct_args():
+    # Branch-level regression for the FLYDSL_PERF_DIST timed loop: it must cycle
+    # the cache-sized rotated arg copies (iteration i -> rotate_args[i % n]) so
+    # DISTINCT working sets reach func (the L2-flush behavior), and compute
+    # median/p95 from the injected per-call timings.
+    import importlib
+
+    tc = importlib.import_module("tests.test_common")
+
+    # 3 distinct arg copies; record which args each call received.
+    rotate_args = [((tag,), {}) for tag in ("A", "B", "C")]
+    seen = []
+
+    def func(tag):
+        seen.append(tag)
+        return f"out-{tag}"
+
+    # Injected timer returns a deterministic latency per call so we can check
+    # median/p95 without a GPU.
+    timings = iter([10.0, 30.0, 20.0, 50.0, 40.0, 60.0, 70.0])
+
+    def time_call(fn, a_i, kw_i):
+        out = fn(*a_i, **kw_i)
+        return next(timings), out
+
+    data, median, p95, n_rot = tc._timed_distribution(func, rotate_args, num_iters=7, time_call=time_call)
+    # 7 iters over 3 copies -> A,B,C,A,B,C,A (distinct args actually reach func).
+    assert seen == ["A", "B", "C", "A", "B", "C", "A"]
+    assert n_rot == 3
+    assert data == "out-A"  # last call's output
+    # median of [10,30,20,50,40,60,70] sorted=[10,20,30,40,50,60,70] -> 40.
+    assert median == 40.0
+    # nearest-rank p95: idx=round(0.95*6)=6 -> 70.
+    assert p95 == 70.0
+
+
+def test_clock_pinning_helpers(monkeypatch):
+    # pin_clocks parses the rocm-smi determinism-success message; clocks_pinned_state
+    # treats determinism/manual/high as pinned and auto as DVFS (not pinned).
+    outs = {}
+
+    def fake_run(cmd):
+        if "--setperfdeterminism" in cmd:
+            return outs.get("set", "")
+        if "--showperflevel" in cmd:
+            return outs.get("level", "")
+        return ""
+
+    monkeypatch.setattr(harness, "_run", fake_run)
+    outs["set"] = "GPU[0]: Successfully enabled performance determinism and set GFX clock frequency: 2200"
+    assert harness.pin_clocks("0") is True
+    outs["set"] = "GPU[0]: set_perf_level, Not supported on the given system"
+    assert harness.pin_clocks("0") is False
+    outs["level"] = "GPU[0]: Performance Level: determinism"
+    assert harness.clocks_pinned_state("0") is True
+    outs["level"] = "GPU[0]: Performance Level: auto"
+    assert harness.clocks_pinned_state("0") is False
+
+
+def test_setup_run_provenance_reflects_verified_clock_state(monkeypatch):
+    # The live setup path must record the VERIFIED clock-pinned state, never the
+    # static spec intent default. Provenance.clocks_pinned defaults to False.
+    assert harness.Provenance().clocks_pinned is False
+
+    calls = {"pin": 0}
+
+    def fake_pin(gpu_id, *a, **k):
+        calls["pin"] += 1
+        return True
+
+    monkeypatch.setattr(harness, "check_idle_gpu", lambda g, **k: True)
+    monkeypatch.setattr(harness, "pin_clocks", fake_pin)
+    monkeypatch.setattr(harness, "git_provenance", lambda *a, **k: {"branch": "b", "commit": "523ca1c7"})
+    monkeypatch.setattr(harness, "gpu_provenance", lambda g: {"gpu_id": str(g), "gpu_model": "MI350X"})
+
+    # Verified pinned -> clocks_pinned True.
+    monkeypatch.setattr(harness, "clocks_pinned_state", lambda g: True)
+    prov = harness.setup_run_provenance("0")
+    assert calls["pin"] == 1  # the driver actually attempted to pin
+    assert prov.clocks_pinned is True
+    assert prov.idle_gpu_verified is True
+    assert prov.commit == "523ca1c7" and prov.gpu_model == "MI350X"
+
+    # Verification fails -> clocks_pinned MUST be False (not the intent default).
+    monkeypatch.setattr(harness, "clocks_pinned_state", lambda g: False)
+    prov2 = harness.setup_run_provenance("0")
+    assert prov2.clocks_pinned is False
+    # A row built from unverified provenance is rejected by the baseline validator.
+    row = {
+        "commit": "523ca1c7",
+        "idle_gpu_verified": "True",
+        "gpu_id": "0",
+        "gpu_model": "MI350X",
+        "branch": "b",
+        "command": "c",
+        "dtype": "a4w4",
+        "act": "silu",
+        "model": "kimi_k2",
+        "token": "16",
+        "stage1_us": "1",
+        "stage2_us": "1",
+        "sorting_us": "0",
+        "kernel_path_us": "2",
+        "kernel_path_us_p95": "2",
+        "effective_tflops": "1",
+        "mfu": "0.1",
+        "e2e_us": "1",
+        "e2e_us_p95": "1",
+        "logits_diff": "0.0001",
+        "correctness_pass": "True",
+        "warmup": "10",
+        "iters": "100",
+        "graph_capture": "False",
+        "l2_flush_per_iter": "True",
+        "clocks_pinned": str(prov2.clocks_pinned),
+    }
+    assert "clocks_must_be_pinned" in harness.validate_baseline_row(row)
+
+
+def test_main_clock_provenance_fail_closed(monkeypatch, tmp_path):
+    # Direct regression around the live _main() path: it must pin+verify clocks,
+    # write rows with the verified clocks_pinned, fail-closed (rc=2, no CSV) when
+    # pinning cannot be verified, and proceed under --allow-unpinned.
+    rp = harness.RunPoint("kimi_k2", 7168, 256, 384, 8, "silu", "a4w4", 16)
+    monkeypatch.setattr(harness, "build_run_list", lambda: [rp])
+    monkeypatch.setattr(harness, "check_idle_gpu", lambda g, **k: True)
+    monkeypatch.setattr(harness, "git_provenance", lambda *a, **k: {"branch": "b", "commit": "523ca1c7"})
+    monkeypatch.setattr(harness, "gpu_provenance", lambda g: {"gpu_id": str(g), "gpu_model": "MI350X"})
+
+    written = {}
+
+    def fake_write_csv(rows, path):
+        written["rows"] = rows
+        written["path"] = path
+
+    def fake_run_point(rp_, tile, gpu, prov, **k):
+        return harness.PointRow(
+            provenance=prov,
+            command="cmd",
+            model=rp_.model,
+            model_dim=rp_.model_dim,
+            inter_dim=rp_.inter_dim,
+            experts=rp_.experts,
+            topk=rp_.topk,
+            dtype=rp_.dtype,
+            act=rp_.act,
+            token=rp_.token,
+        )
+
+    monkeypatch.setattr(harness, "write_csv", fake_write_csv)
+    monkeypatch.setattr(harness, "run_point", fake_run_point)
+    monkeypatch.setattr(harness, "pin_clocks", lambda g, *a, **k: True)
+
+    out = str(tmp_path / "b.csv")
+
+    # (a) verified pinned -> rc 0, rows written with clocks_pinned True.
+    written.clear()
+    monkeypatch.setattr(harness, "clocks_pinned_state", lambda g: True)
+    rc = harness._main(["baseline", "--gpu", "0", "--assume-idle", "--no-e2e", "--out", out])
+    assert rc == 0
+    assert written["rows"][0].provenance.clocks_pinned is True
+
+    # (b) verification fails -> fail-closed: rc 2 and NO csv written.
+    written.clear()
+    monkeypatch.setattr(harness, "clocks_pinned_state", lambda g: False)
+    rc = harness._main(["baseline", "--gpu", "0", "--assume-idle", "--no-e2e", "--out", out])
+    assert rc == 2
+    assert "rows" not in written  # fail-closed: did not write a false-pinned CSV
+
+    # (c) --allow-unpinned proceeds, recording clocks_pinned False.
+    written.clear()
+    rc = harness._main(["baseline", "--gpu", "0", "--assume-idle", "--no-e2e", "--allow-unpinned", "--out", out])
+    assert rc == 0
+    assert written["rows"][0].provenance.clocks_pinned is False
+
+
+def test_regime_aware_abs_floor():
+    # Regime-aware floor: 8us for tokens<=64, 2us for tokens>=128.
+    assert spec.abs_floor_us(1) == 8.0
+    assert spec.abs_floor_us(64) == 8.0
+    assert spec.abs_floor_us(128) == 2.0
+    assert spec.abs_floor_us(32768) == 2.0
+
+
+def test_is_regression_regime_aware():
+    # Small token (16): a 5us drift on a 130us base is within the 8us floor -> NOT a regression.
+    assert spec.is_regression(130.0, 135.0, token=16) is False
+    # Small token: 9us drift on 130us base -> regression (exceeds 8us AND 2%).
+    assert spec.is_regression(130.0, 139.0, token=16) is True
+    # Large token (128): 5us drift on 130us base -> regression under the 2us floor.
+    assert spec.is_regression(130.0, 135.0, token=128) is True
+    # Back-compat: token=None keeps the strict 2us floor.
+    assert spec.is_regression(130.0, 135.0) is True
+
+
+def test_repeatability_check_regime_aware(tmp_path):
+    a = str(tmp_path / "a.csv")
+    b = str(tmp_path / "b.csv")
+    _csv(
+        a,
+        [
+            {
+                "model": "kimi_k2",
+                "dtype": "a4w4",
+                "act": "silu",
+                "token": 16,
+                "kernel_path_us": 130,
+                "e2e_us": 40,
+                "mfu": 0.05,
+            },
+            {
+                "model": "kimi_k2",
+                "dtype": "a4w4",
+                "act": "silu",
+                "token": 128,
+                "kernel_path_us": 290,
+                "e2e_us": 250,
+                "mfu": 0.3,
+            },
+        ],
+    )
+    _csv(
+        b,
+        [
+            # token 16: +5us kernel-path -> within 8us small-token floor -> stable.
+            {
+                "model": "kimi_k2",
+                "dtype": "a4w4",
+                "act": "silu",
+                "token": 16,
+                "kernel_path_us": 135,
+                "e2e_us": 40,
+                "mfu": 0.05,
+            },
+            # token 128: +7us -> exceeds 2us floor (and 2%) -> unstable.
+            {
+                "model": "kimi_k2",
+                "dtype": "a4w4",
+                "act": "silu",
+                "token": 128,
+                "kernel_path_us": 297,
+                "e2e_us": 250,
+                "mfu": 0.3,
+            },
+        ],
+    )
+    res = ledger.repeatability_check(a, b)
+    kp = res["unstable"]["kernel_path_us"]
+    assert any(u[0] == ("kimi_k2", "a4w4", "silu", "128") for u in kp)  # 128 unstable
+    assert all(u[0] != ("kimi_k2", "a4w4", "silu", "16") for u in kp)  # 16 stable under 8us
+
+
+def test_select_run_points_filters():
+    # Candidate selection filters the full grid by model/dtype/token.
+    pts = harness.select_run_points(model="deepseek_v3", dtype="a4w4", tokens=[16, 16384])
+    keys = {(p.model, p.dtype, p.token) for p in pts}
+    assert keys == {("deepseek_v3", "a4w4", 16), ("deepseek_v3", "a4w4", 16384)}
+    # dtype filter excludes a8w4.
+    assert all(p.dtype == "a4w4" for p in harness.select_run_points(model="kimi_k2", dtype="a4w4"))
+    # whole-grid when unfiltered equals build_run_list.
+    assert len(harness.select_run_points()) == len(harness.build_run_list())
+
+
+def test_candidate_tile_for_overrides_and_legality():
+    rp = harness.RunPoint("deepseek_v3", 7168, 256, 257, 9, "silu", "a4w4", 16)
+    # Legal override: stage1 tile_n -> 128 (the DS V3 lead).
+    t = harness.candidate_tile_for(rp, {"tile_n1": 128})
+    assert t["tile_n1"] == 128 and t["tile_m1"] == 64 and t["tile_k1"] == 256
+    # No overrides -> the shape's default tiles.
+    assert harness.candidate_tile_for(rp, {}) == harness.default_tile_for(rp)
+    # Illegal override is rejected before any compile (e.g. fp4 tile_m < 32).
+    import pytest as _pytest
+
+    with _pytest.raises(ValueError):
+        harness.candidate_tile_for(rp, {"tile_m1": 16})
+
+
+def test_prepare_candidate_run_fail_closed(tmp_path, monkeypatch):
+    # candidate run is fail-closed: requires explicit tiles, all-legal, non-empty.
+    import moe_tuning_ledger as _ledger
+    import pytest as _pytest
+
+    # Capture rejected-candidate records instead of writing to the real ledger.
+    captured = []
+    monkeypatch.setattr(_ledger, "append_rejected_candidate", lambda rec, **k: captured.append(rec) or rec)
+
+    no_override = {k: None for k in ("tile_m1", "tile_n1", "tile_k1", "tile_n2", "tile_k2")}
+    # (1) no explicit tile -> reject (no silent default-tile fallback).
+    with _pytest.raises(ValueError, match="at least one explicit"):
+        harness.prepare_candidate_run(no_override, model="deepseek_v3", dtype="a4w4", tokens=[16])
+
+    # (2) legal explicit tile -> returns (run_list, tiles) of equal length.
+    ov = dict(no_override, tile_n1=128)
+    rl, tiles = harness.prepare_candidate_run(ov, model="deepseek_v3", dtype="a4w4", tokens=[16, 64])
+    assert len(rl) == len(tiles) == 2 and all(t["tile_n1"] == 128 for t in tiles)
+
+    # (3) illegal explicit tile -> raise AND record a machine-readable rejection
+    #     carrying the full provenance class (act/stage/branch/commit/command/...).
+    bad = dict(no_override, tile_m1=16)  # fp4 tile_m<32 illegal
+    prov = harness.Provenance(gpu_id="0", gpu_model="MI350X", branch="b", commit="c")
+    with _pytest.raises(ValueError, match="illegal candidate"):
+        harness.prepare_candidate_run(
+            bad, model="deepseek_v3", dtype="a4w4", tokens=[16], prov=prov, command="python3 harness candidate ..."
+        )
+    rec = captured[-1]
+    assert rec and rec["reason"] and rec["model"] == "deepseek_v3"
+    # Every full-provenance field is present and non-empty (stage 0 is valid).
+    for k in ("act", "gpu_id", "gpu_model", "branch", "commit", "command", "warmup", "iters", "selection"):
+        assert rec.get(k) not in (None, ""), k
+    assert rec["stage"] == 0 and rec["act"] == "silu"
+    # The record satisfies the ledger's own rejected-candidate contract.
+    assert not [f for f in _ledger.REQUIRED_REJECTED_FIELDS if rec.get(f) in (None, "")]
+
+    # (4) empty selection -> reject.
+    with _pytest.raises(ValueError, match="matched no points"):
+        harness.prepare_candidate_run(ov, model="nonesuch", dtype="a4w4", tokens=[16])
diff --git a/tests/unit/test_moe_tuning_legality.py b/tests/unit/test_moe_tuning_legality.py
new file mode 100644
index 000000000..17a9eafe8
--- /dev/null
+++ b/tests/unit/test_moe_tuning_legality.py
@@ -0,0 +1,176 @@
+# SPDX-License-Identifier: MIT
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
+
+"""Backend-agnostic tests for the MoE tile-config legality filter.
+
+These tests exercise pure host-side math in ``kernels/moe_tuning.py`` and do not
+require a GPU, the FlyROCDL bindings, or a compile.  They lock in two properties:
+
+1. Every tile config currently used by ``scripts/run_benchmark.sh`` for the
+   in-scope MXFP4 / A8W4 MoE shapes is accepted.
+2. Each named illegal case is rejected with the expected machine-readable reason.
+"""
+
+import pytest
+
+from kernels.moe_tuning import (
+    LDS_LIMIT_BYTES,
+    check_tile_config,
+    enumerate_legal_configs,
+)
+
+pytestmark = pytest.mark.l0_backend_agnostic
+
+
+# (stage, model_dim, inter_dim, tile_m, tile_n, tile_k, a_dtype)
+# Derived from run_benchmark.sh MOE_FP4_SHAPES / MOE_A8W4_SHAPES.  Stage1 uses
+# (tile_m, tile_n, tile_k); stage2 uses (tile_m, tile_n2, tile_k2).  In the
+# benchmark tables tile_n2 == tile_k2 == 256 for all in-scope MoE rows.
+_RUN_BENCHMARK_CONFIGS = [
+    # MOE_FP4_SHAPES group A: 7168/256/257/9, tile 64/256/256, n2/k2 256/256
+    (1, 7168, 256, 64, 256, 256, "fp4"),
+    (2, 7168, 256, 64, 256, 256, "fp4"),
+    # MOE_FP4_SHAPES group B: 7168/2048/32/8, tile 64/256/256
+    (1, 7168, 2048, 64, 256, 256, "fp4"),
+    (2, 7168, 2048, 64, 256, 256, "fp4"),
+    # MOE_A8W4_SHAPES GPT-OSS: 3072/3072/128/4, stage1 tile 32/128/256
+    (1, 3072, 3072, 32, 128, 256, "fp8"),
+    # stage2 tile_n2=256, tile_k2=256
+    (2, 3072, 3072, 32, 256, 256, "fp8"),
+]
+
+
+@pytest.mark.parametrize("stage,model_dim,inter_dim,tile_m,tile_n,tile_k,a_dtype", _RUN_BENCHMARK_CONFIGS)
+def test_accepts_run_benchmark_configs(stage, model_dim, inter_dim, tile_m, tile_n, tile_k, a_dtype):
+    res = check_tile_config(
+        stage=stage,
+        model_dim=model_dim,
+        inter_dim=inter_dim,
+        tile_m=tile_m,
+        tile_n=tile_n,
+        tile_k=tile_k,
+        a_dtype=a_dtype,
+        gpu_arch="gfx950",
+    )
+    assert res.legal, f"expected legal, got reason={res.reason!r} ({res.detail})"
+    assert res.lds_bytes is not None and res.lds_bytes <= LDS_LIMIT_BYTES["gfx950"]
+
+
+def test_rejects_tile_k_bytes_not_div_64():
+    # fp4 a_elem_bytes=1 -> tile_k_bytes = tile_k; 288 % 64 != 0.  tile_k>=256 ok.
+    res = check_tile_config(stage=1, model_dim=7168, inter_dim=256, tile_m=64, tile_n=256, tile_k=288, a_dtype="fp4")
+    assert not res.legal
+    assert res.reason == "tile_k_bytes_not_div_64"
+
+
+def test_rejects_splitk_k_per_batch_not_div_tile_k():
+    # model_dim=7168, k_batch=56 -> k_per_batch=128; 128 % 256 != 0.
+    res = check_tile_config(
+        stage=1, model_dim=7168, inter_dim=256, tile_m=64, tile_n=256, tile_k=256, a_dtype="fp4", k_batch=56
+    )
+    assert not res.legal
+    assert res.reason == "k_per_batch_not_div_tile_k"
+
+
+def test_rejects_splitk_model_dim_not_div_k_batch():
+    res = check_tile_config(
+        stage=1, model_dim=7168, inter_dim=256, tile_m=64, tile_n=256, tile_k=256, a_dtype="fp4", k_batch=3
+    )
+    assert not res.legal
+    assert res.reason == "model_dim_not_div_k_batch"
+
+
+def test_rejects_stage2_model_dim_not_div_tile_n():
+    # 7168 % 384 != 0
+    res = check_tile_config(stage=2, model_dim=7168, inter_dim=256, tile_m=64, tile_n=384, tile_k=256, a_dtype="fp4")
+    assert not res.legal
+    assert res.reason == "model_dim_not_div_tile_n"
+
+
+def test_rejects_stage2_inter_dim_not_div_tile_k():
+    # inter_dim=2048, tile_k=768 -> 2048 % 768 != 0 (and 768 % 64 == 0, tile_k>=256)
+    res = check_tile_config(stage=2, model_dim=7168, inter_dim=2048, tile_m=64, tile_n=256, tile_k=768, a_dtype="fp4")
+    assert not res.legal
+    assert res.reason == "inter_dim_not_div_tile_k"
+
+
+def test_rejects_lds_over_limit():
+    # A very large tile pushes stage1 LDS past the gfx950 163840-byte limit.
+    res = check_tile_config(stage=1, model_dim=7168, inter_dim=256, tile_m=512, tile_n=512, tile_k=256, a_dtype="fp8")
+    assert not res.legal
+    assert res.reason == "lds_over_limit"
+    assert res.lds_bytes is not None and res.lds_bytes > LDS_LIMIT_BYTES["gfx950"]
+
+
+def test_stage1_fp4_lds_mirrors_builder_no_vec_pack_halving():
+    # Regression: stage1 sizes _single_x_bytes from the FULL lds_stride for fp4
+    # (no a_elem_vec_pack division), matching compile_mixed_moe_gemm1.  These
+    # large-tile_k fp4 configs overflow the gfx950 163840-byte limit and MUST be
+    # rejected -- an earlier version halved the fp4 stride and wrongly accepted
+    # them.  Source-faithful footprints: 230400 and 197632 bytes.
+    from kernels.moe_tuning import stage1_lds_bytes
+
+    r1 = check_tile_config(stage=1, model_dim=7168, inter_dim=256, tile_m=32, tile_n=32, tile_k=3584, a_dtype="fp4")
+    assert not r1.legal and r1.reason == "lds_over_limit"
+    assert stage1_lds_bytes(tile_m=32, tile_n=32, tile_k=3584, a_dtype="fp4") == 230400
+
+    r2 = check_tile_config(stage=1, model_dim=3072, inter_dim=3072, tile_m=32, tile_n=32, tile_k=3072, a_dtype="fp4")
+    assert not r2.legal and r2.reason == "lds_over_limit"
+    assert stage1_lds_bytes(tile_m=32, tile_n=32, tile_k=3072, a_dtype="fp4") == 197632
+
+    # fp4 and fp8 share the same single_x sizing at stage1 (a_elem_bytes==1, no
+    # vec-pack division), so equal tiles give equal LDS.
+    assert stage1_lds_bytes(tile_m=64, tile_n=256, tile_k=256, a_dtype="fp4") == stage1_lds_bytes(
+        tile_m=64, tile_n=256, tile_k=256, a_dtype="fp8"
+    )
+
+
+def test_rejects_fp4_tile_m_too_small():
+    res = check_tile_config(stage=1, model_dim=7168, inter_dim=256, tile_m=16, tile_n=256, tile_k=256, a_dtype="fp4")
+    assert not res.legal
+    assert res.reason == "tile_m_lt_32"
+
+
+def test_rejects_fp4_tile_k_too_small():
+    # tile_k=128 is < 256; still tile_k_bytes % 64 == 0, so the MX-FP4 floor must catch it.
+    res = check_tile_config(stage=1, model_dim=7168, inter_dim=256, tile_m=64, tile_n=256, tile_k=128, a_dtype="fp4")
+    assert not res.legal
+    assert res.reason == "tile_k_lt_256"
+
+
+def test_rejects_bad_stage_and_dtype():
+    assert (
+        check_tile_config(
+            stage=3, model_dim=7168, inter_dim=256, tile_m=64, tile_n=256, tile_k=256, a_dtype="fp4"
+        ).reason
+        == "bad_stage"
+    )
+    assert (
+        check_tile_config(
+            stage=1, model_dim=7168, inter_dim=256, tile_m=64, tile_n=256, tile_k=256, a_dtype="bogus"
+        ).reason
+        == "bad_a_dtype"
+    )
+
+
+def test_enumerate_logs_rejections_with_reasons():
+    rejected = []
+    legal = enumerate_legal_configs(
+        stage=1,
+        model_dim=7168,
+        inter_dim=256,
+        a_dtype="fp4",
+        tile_m_choices=(16, 32, 64),  # 16 is illegal (tile_m_lt_32)
+        tile_n_choices=(256,),
+        tile_k_choices=(128, 256),  # 128 is illegal (tile_k_lt_256)
+        rejected_log=rejected,
+    )
+    # At least one legal config (e.g. tile_m in {32,64}, tile_k=256).
+    assert legal, "expected some legal configs"
+    assert all(r.legal for r in legal)
+    # Every rejection carries a machine-readable reason.
+    assert rejected, "expected some rejected configs"
+    assert all(r["reason"] for r in rejected)
+    reasons = {r["reason"] for r in rejected}
+    assert "tile_m_lt_32" in reasons
+    assert "tile_k_lt_256" in reasons

From 6323615f922412e2069c93ba7e392a84a3865a1e Mon Sep 17 00:00:00 2001
From: Jin Pan <jpan236@wisc.edu>
Date: Thu, 25 Jun 2026 01:54:13 +0000
Subject: [PATCH 2/2] moe bench: timed-loop p95 observability + MXFP4 MoE
 target shapes (#708)

- tests/test_common.py + tests/kernels/test_moe_gemm.py: capture and print the
  per-iteration timed-loop p95 alongside the median for MoE stage1/stage2 (additive
  observability; no kernel logic change).
- scripts/run_benchmark.sh: add the #708 MXFP4 MoE target shapes (DeepSeek V3,
  Kimi K2, GPT-OSS a4w4; plus a8w4 rows) bracketing the small-token latency and
  large-shape MFU regimes; document the model->shape mapping.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 scripts/run_benchmark.sh       | 30 ++++++++++++++--
 tests/kernels/test_moe_gemm.py | 14 ++++++--
 tests/test_common.py           | 65 ++++++++++++++++++++++++++++++++++
 3 files changed, 104 insertions(+), 5 deletions(-)

diff --git a/scripts/run_benchmark.sh b/scripts/run_benchmark.sh
index d8ef10a0a..6dd1dbded 100755
--- a/scripts/run_benchmark.sh
+++ b/scripts/run_benchmark.sh
@@ -177,6 +177,9 @@ MOE_SHAPES='
 '
 
 # MoE FP4 shapes (requires --in_dtype fp4, gfx950 only): same format as MOE_SHAPES
+# Models: DeepSeek V3 (7168/256/257/9), Kimi K2 (7168/256/384/8), GPT-OSS
+# (3072/3072/128/4).  Token rows bracket the small-token latency regime
+# (tokens<=64) and the large-shape MFU regime (tokens>=4096; targets 16384/32768).
 MOE_FP4_SHAPES='
 16,7168,256,257,9,64,256,256,256,256
 128,7168,256,257,9,64,256,256,256,256
@@ -188,6 +191,13 @@ MOE_FP4_SHAPES='
 2048,7168,2048,32,8,64,256,256,256,256
 8192,7168,2048,32,8,64,256,256,256,256
 32768,7168,2048,32,8,64,256,256,256,256
+16,7168,256,384,8,64,256,256,256,256
+2048,7168,256,384,8,64,256,256,256,256
+16384,7168,256,384,8,64,256,256,256,256
+32768,7168,256,384,8,64,256,256,256,256
+2048,3072,3072,128,4,32,128,256,256,256
+16384,3072,3072,128,4,32,128,256,256,256
+32768,3072,3072,128,4,32,128,256,256,256
 '
 
 # MoE W4A16 groupwise shapes (int4_bf16, group_size=32): same format as MOE_SHAPES
@@ -199,14 +209,30 @@ MOE_W4A16_SHAPES='
 '
 
 # MoE A8W4 shapes (FP8 activation + MX-FP4 weight, gfx950 only): same format as MOE_SHAPES.
-# GPT-OSS inspired: model_dim=3072, inter_dim=3072, E=128, topk=4; sweep tokens from 512 to
-# bracket memory- and compute-bound regimes.  tile_m>=32 / tile_k>=256 are MX-FP4 layout requirements.
+# Models: GPT-OSS (3072/3072/128/4), DeepSeek V3 (7168/256/257/9), DeepSeek V4
+# (7168/512/385/7, a8w4 only), Kimi K2 (7168/256/384/8).  tile_m>=32 / tile_k>=256
+# are MX-FP4 layout requirements.  Token rows bracket the small-token latency
+# regime (tokens<=64) and the large-shape MFU regime (tokens>=4096; 16384/32768).
 MOE_A8W4_SHAPES='
 512,3072,3072,128,4,32,128,256,256,256
 1024,3072,3072,128,4,32,128,256,256,256
 2048,3072,3072,128,4,32,128,256,256,256
 4096,3072,3072,128,4,32,128,256,256,256
 8192,3072,3072,128,4,32,128,256,256,256
+16384,3072,3072,128,4,32,128,256,256,256
+32768,3072,3072,128,4,32,128,256,256,256
+16,7168,256,257,9,64,256,256,256,256
+2048,7168,256,257,9,64,256,256,256,256
+16384,7168,256,257,9,64,256,256,256,256
+32768,7168,256,257,9,64,256,256,256,256
+16,7168,512,385,7,64,256,256,256,256
+2048,7168,512,385,7,64,256,256,256,256
+16384,7168,512,385,7,64,256,256,256,256
+32768,7168,512,385,7,64,256,256,256,256
+16,7168,256,384,8,64,256,256,256,256
+2048,7168,256,384,8,64,256,256,256,256
+16384,7168,256,384,8,64,256,256,256,256
+32768,7168,256,384,8,64,256,256,256,256
 '
 
 # Memory bound threshold (M or tokens <= threshold => memory bound)
diff --git a/tests/kernels/test_moe_gemm.py b/tests/kernels/test_moe_gemm.py
index e956f83b5..4aaaba798 100644
--- a/tests/kernels/test_moe_gemm.py
+++ b/tests/kernels/test_moe_gemm.py
@@ -34,7 +34,15 @@
 
 from flydsl.runtime.device import get_rocm_arch  # noqa: E402
 from tests.kernels.test_ref import torch_moe_gemm1, torch_moe_gemm2  # noqa: E402
-from tests.test_common import run_perftest, verify_output  # noqa: E402
+from tests.test_common import LAST_PERF_DIST, run_perftest, verify_output  # noqa: E402
+
+
+def _perf_p95_suffix():
+    """Return ' p95=<v> us' when a timed-loop distribution was captured, else ''."""
+    p95 = LAST_PERF_DIST.get("p95")
+    return f" p95={p95:.1f} us" if p95 is not None else ""
+
+
 from tests.utils import pertoken_quant, shuffle_scale_for_int4, shuffle_weight  # noqa: E402
 
 ARCH = get_rocm_arch()
@@ -798,7 +806,7 @@ def launch(o, x, w, sx, sw, st, eids, sw_sorted):
 
     print(
         f"FlyDSL MoE stage1[{in_dtype}]: "
-        f"{us:.1f} us, "
+        f"{us:.1f} us,{_perf_p95_suffix()} "
         f"{tflops:.2f} TFLOPS(logical, M={tokens*topk}), "
         f"{tbps:.3f} TB/s (doweight_stage1={doweight_stage1})"
     )
@@ -1560,7 +1568,7 @@ def launch(o, x, w, sx, sw, st, eids, sw_sorted):
     print(
         f"FlyDSL MoE stage2 [{kernel_name}] {in_dtype} {'reduce' if use_reduce else 'atomic'} | "
         f"{model_dim}x{inter_dim}, E={experts}, K={topk}, M_eff={tokens*topk} | "
-        f"{us:.1f} us, {tflops:.2f} TFLOPS, {tbps:.3f} TB/s"
+        f"{us:.1f} us,{_perf_p95_suffix()} {tflops:.2f} TFLOPS, {tbps:.3f} TB/s"
     )
     # Optional compare vs aiter stage2.
     if compare_aiter_ck is None:
diff --git a/tests/test_common.py b/tests/test_common.py
index 28ac28691..1d061ec15 100644
--- a/tests/test_common.py
+++ b/tests/test_common.py
@@ -20,6 +20,42 @@
 # pd.set_option("display.expand_frame_repr", False)
 
 
+# Distribution (median + p95, microseconds) of the most recent perftest call,
+# populated only when FLYDSL_PERF_DIST is set.  Lets callers report a true
+# timed-loop median+p95 over num_iters without changing the (data, avg) return
+# signature shared by every other caller.
+LAST_PERF_DIST = {"median": None, "p95": None, "n_rotate": None}
+
+
+def _percentile(sorted_vals, q):
+    if not sorted_vals:
+        return None
+    idx = max(0, min(len(sorted_vals) - 1, int(round(q * (len(sorted_vals) - 1)))))
+    return sorted_vals[idx]
+
+
+def _timed_distribution(func, rotate_args, num_iters, time_call):
+    """Run ``func`` for ``num_iters``, CYCLING through ``rotate_args`` (the
+    cache-sized argument copies = L2-flush behavior), timing each call with
+    ``time_call(func, args, kwargs) -> microseconds``.
+
+    Returns ``(data, median_us, p95_us, n_rotate)``.  Pure/host-testable: the GPU
+    event timing is injected via ``time_call`` so the rotation contract (iteration
+    i uses ``rotate_args[i % n]``) can be unit-tested without a device.
+    """
+    n_rot = len(rotate_args)
+    latencies = []
+    data = None
+    for i in range(num_iters):
+        a_i, kw_i = rotate_args[i % n_rot]
+        us, data = time_call(func, a_i, kw_i)
+        latencies.append(us)
+    ordered = sorted(latencies)
+    n = len(ordered)
+    median = ordered[n // 2] if n % 2 else (ordered[n // 2 - 1] + ordered[n // 2]) / 2.0
+    return data, median, _percentile(ordered, 0.95), n_rot
+
+
 def perftest(num_iters=20, num_warmup=3, testGraph=False, num_rotate_args=0, needTrace=False):
     def decorator(func):
         def wrapper(*args, **kwargs):
@@ -46,6 +82,35 @@ def wrapper(*args, **kwargs):
             run_iters(num_warmup, func, *args, **kwargs)
             torch.cuda.synchronize()
 
+            # True per-iteration timed-loop distribution (median + p95) over
+            # num_iters, recorded in LAST_PERF_DIST.  Opt-in via FLYDSL_PERF_DIST so
+            # the default profiler/event path is unchanged.  Returns the MEDIAN as
+            # the central-tendency `avg` so the reported us is the median.
+            #
+            # Cycles through the SAME ``rotate_args`` set the default path uses
+            # (``num`` cache-sized argument copies), so each iteration touches a
+            # different working set -- this is the L2-flush behavior the recorded
+            # protocol claims (l2_flush_per_iter=True), not a hot-cache reuse of one
+            # tensor set.  LAST_PERF_DIST["n_rotate"] records how many copies cycled.
+            if int(os.environ.get("FLYDSL_PERF_DIST", 0)):
+                start_event = torch.cuda.Event(enable_timing=True)
+                end_event = torch.cuda.Event(enable_timing=True)
+
+                def _time_call(fn, a_i, kw_i):
+                    start_event.record()
+                    out = fn(*a_i, **kw_i)
+                    end_event.record()
+                    end_event.synchronize()
+                    return start_event.elapsed_time(end_event) * 1000.0, out  # ms -> us
+
+                data, median, p95, n_rot = _timed_distribution(func, rotate_args, num_iters, _time_call)
+                torch.cuda.synchronize()
+                LAST_PERF_DIST["median"] = median
+                LAST_PERF_DIST["p95"] = p95
+                LAST_PERF_DIST["n_rotate"] = n_rot
+                logger.info(f"perf_dist: median={median:.3f} us p95={p95:.3f} us over {num_iters} iters")
+                return data, median
+
             if int(os.environ.get("FLYDSL_LOG_MORE", 0)):
                 latencies = []
                 start_event = torch.cuda.Event(enable_timing=True)