From 2c6c0cd710fe1c7598712fbbf3b01508e86c1503 Mon Sep 17 00:00:00 2001 From: Jin Pan Date: Thu, 25 Jun 2026 01:54:13 +0000 Subject: [PATCH 1/2] MXFP4 MoE tuning harness: legality filter, measurement, ledger, strict guardrail (#708) Measurement + verification infrastructure for tuning the MXFP4 (per-1x32 fp4) MoE 2-stage GEMM on gfx950/MI350X, toward ROCm/FlyDSL#708 (low MFU at large shapes, long latency at small tokens). Infrastructure only -- no production kernel logic changes. - kernels/moe_tuning.py: pre-compile legality filter for stage1/stage2 tile configs (LDS footprint, divisibility, MX-FP4 floors); mirrors builder LDS sizing (stage1 full lds_stride vs stage2 fp4-halved). - kernels/moe_tuning_spec.py: locked spec constants + win/no-regression predicates (win margins, regime-aware band, token grid, MFU denominator, metric formula). - scripts/moe_tuning_harness.py: provenance-complete measurement harness (verified clock pinning, idle check, faithful timed-loop median+p95) + fail-closed candidate sweep CLI (illegal/unmeasured configs recorded as rejections). - scripts/moe_tuning_ledger.py: attempt ledger + full-coverage Pareto comparator with a single claimable_win gate (coverage + no-regression + win + AOT/correctness hard gate) and integrity scans (duplicate / replay / supersede-link). - scripts/aiter_strict_point.py: strict AOT-checked model-correct aiter e2e + correctness guardrail (logits_diff <= 0.01). - scripts/sync_aiter_flydsl_kernels.sh: overlay FlyDSL MoE kernels onto aiter's vendored copies for the e2e guardrail. - docs/mxfp4_moe_tuning.md + docs/baseline_523ca1c7_validated.csv: docs + a validated locked a4w4 baseline reference table. - Host-side unit tests (no GPU required): 94 passed, 4 skipped (committed-ledger scans skip without a ledger). black + ruff clean. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .gitignore | 2 + docs/baseline_523ca1c7_validated.csv | 41 + docs/mxfp4_moe_tuning.md | 58 + kernels/moe_tuning.py | 511 ++++++++ kernels/moe_tuning_spec.py | 254 ++++ scripts/aiter_strict_point.py | 196 +++ scripts/moe_tuning_harness.py | 1157 +++++++++++++++++ scripts/moe_tuning_ledger.py | 501 ++++++++ scripts/sync_aiter_flydsl_kernels.sh | 64 + tests/unit/test_moe_tuning_harness.py | 1584 ++++++++++++++++++++++++ tests/unit/test_moe_tuning_legality.py | 176 +++ 11 files changed, 4544 insertions(+) create mode 100644 docs/baseline_523ca1c7_validated.csv create mode 100644 docs/mxfp4_moe_tuning.md create mode 100644 kernels/moe_tuning.py create mode 100644 kernels/moe_tuning_spec.py create mode 100644 scripts/aiter_strict_point.py create mode 100644 scripts/moe_tuning_harness.py create mode 100644 scripts/moe_tuning_ledger.py create mode 100755 scripts/sync_aiter_flydsl_kernels.sh create mode 100644 tests/unit/test_moe_tuning_harness.py create mode 100644 tests/unit/test_moe_tuning_legality.py diff --git a/.gitignore b/.gitignore index 4a341beb1..35f6fcfd6 100644 --- a/.gitignore +++ b/.gitignore @@ -64,3 +64,5 @@ Thumbs.db # Sphinx documentation build docs/_build/ python/flydsl/_mlir + +.humanize* diff --git a/docs/baseline_523ca1c7_validated.csv b/docs/baseline_523ca1c7_validated.csv new file mode 100644 index 000000000..55692980f --- /dev/null +++ b/docs/baseline_523ca1c7_validated.csv @@ -0,0 +1,41 @@ +gpu_id,gpu_model,branch,commit,command,warmup,iters,idle_gpu_verified,graph_capture,l2_flush_per_iter,clocks_pinned,metric_formula,model,model_dim,inter_dim,experts,topk,dtype,act,token,tile_m1,tile_n1,tile_k1,tile_m2,tile_n2,tile_k2,stage1_us,stage2_us,sorting_us,kernel_path_us,kernel_path_us_p95,effective_tflops,mfu,e2e_us,e2e_us_p95,logits_diff,correctness_pass,flydsl_command,strict_error,error_category,aot_status +0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 
FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 1 -e 257 -k 9 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 7168 --inter-dim 256 -e 257 -k 9 -t 1 --aq fp4 --wq fp4 --act silu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,deepseek_v3,7168,256,257,9,a4w4,silu,1,64,256,256,64,256,256,82.1,48.2,0.0,130.2,157.60000000000002,0.7610632258064516,0.00016826513946638328,34.0650421052633,493.84400248527527,0.0019092379303825568,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 1 -e 257 -k 9 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked +0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 2 -e 257 -k 9 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 7168 --inter-dim 256 -e 257 -k 9 -t 2 --aq fp4 --wq fp4 --act silu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 
4523,deepseek_v3,7168,256,257,9,a4w4,silu,2,64,256,256,64,256,256,81.0,46.5,0.0,127.4,140.6,1.55557978021978,0.0003439265487994207,41.40170370370407,493.5239851474762,0.001377186866181268,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 2 -e 257 -k 9 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked +0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 4 -e 257 -k 9 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 7168 --inter-dim 256 -e 257 -k 9 -t 4 --aq fp4 --wq fp4 --act silu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,deepseek_v3,7168,256,257,9,a4w4,silu,4,64,256,256,64,256,256,82.2,47.2,0.0,129.4,141.8,3.063073632148377,0.0006772216741429089,56.274242424242736,658.486008644104,0.0029107140716863045,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 4 -e 257 -k 9 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked +0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 8 -e 257 -k 9 --num_warmup 10 
--num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 7168 --inter-dim 256 -e 257 -k 9 -t 8 --aq fp4 --wq fp4 --act silu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,deepseek_v3,7168,256,257,9,a4w4,silu,8,64,256,256,64,256,256,85.8,52.4,0.0,138.2,145.6,5.73605973950796,0.0012681980410143622,60.24796703296714,375.7230043411255,1.1455316338060406e-05,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 8 -e 257 -k 9 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked +0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 16 -e 257 -k 9 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 7168 --inter-dim 256 -e 257 -k 9 -t 16 --aq fp4 --wq fp4 --act silu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,deepseek_v3,7168,256,257,9,a4w4,silu,16,64,256,256,64,256,256,91.6,61.1,0.0,152.7,171.2,10.382756463654223,0.0022955464213252758,83.8479381443307,571.2850093841553,1.029067283353502e-05,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 
/sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 16 -e 257 -k 9 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked +0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 32 -e 257 -k 9 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 7168 --inter-dim 256 -e 257 -k 9 -t 32 --aq fp4 --wq fp4 --act silu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,deepseek_v3,7168,256,257,9,a4w4,silu,32,64,256,256,64,256,256,102.9,76.9,0.0,179.8,193.0,17.635671991101223,0.0038991094386692953,114.85418367346972,406.5229892730713,1.0363225636189632e-05,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 32 -e 257 -k 9 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked +0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 64 -e 257 -k 9 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 7168 
--inter-dim 256 -e 257 -k 9 -t 64 --aq fp4 --wq fp4 --act silu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,deepseek_v3,7168,256,257,9,a4w4,silu,64,64,256,256,64,256,256,113.4,89.6,0.0,203.0,216.3,31.24033324137931,0.00690699386278561,148.7931530612243,435.8829855918884,1.0139395224539882e-05,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 64 -e 257 -k 9 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked +0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 128 -e 257 -k 9 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 7168 --inter-dim 256 -e 257 -k 9 -t 128 --aq fp4 --wq fp4 --act silu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,deepseek_v3,7168,256,257,9,a4w4,silu,128,64,256,256,64,256,256,119.2,94.9,0.0,214.39999999999998,225.5,59.15846686567165,0.013079475318521258,158.68847422680435,446.8429982662201,1.015027033468563e-05,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 128 -e 257 -k 9 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 
--skip_ref true --compare_aiter_ck false",,,checked +0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 256 -e 257 -k 9 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 7168 --inter-dim 256 -e 257 -k 9 -t 256 --aq fp4 --wq fp4 --act silu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,deepseek_v3,7168,256,257,9,a4w4,silu,256,64,256,256,64,256,256,165.9,100.9,0.0,266.7,278.79999999999995,95.11492535433072,0.021029167666223904,171.77611111111284,628.6050081253052,1.0238226904046854e-05,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 256 -e 257 -k 9 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked +0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 512 -e 257 -k 9 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 7168 --inter-dim 256 -e 257 -k 9 -t 512 --aq fp4 --wq fp4 --act silu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = 
token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,deepseek_v3,7168,256,257,9,a4w4,silu,512,64,256,256,64,256,256,168.0,111.2,0.0,279.2,292.2,181.71311312320918,0.040175351121646954,191.89716161616175,633.204996585846,1.0047836131898968e-05,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 512 -e 257 -k 9 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked +0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 1024 -e 257 -k 9 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 7168 --inter-dim 256 -e 257 -k 9 -t 1024 --aq fp4 --wq fp4 --act silu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,deepseek_v3,7168,256,257,9,a4w4,silu,1024,64,256,256,64,256,256,168.8,135.0,0.0,304.2,317.0,333.5588506508876,0.07374725860068265,248.54487878787992,718.5260057449341,3.4465752332124566e-06,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 1024 -e 257 -k 9 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked +0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 
/sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 2048 -e 257 -k 9 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 7168 --inter-dim 256 -e 257 -k 9 -t 2048 --aq fp4 --wq fp4 --act silu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,deepseek_v3,7168,256,257,9,a4w4,silu,2048,64,256,256,64,256,256,169.0,218.4,0.0,387.5,398.8,523.7089154477419,0.1157879538907234,365.92926262626133,782.4059724807739,3.4398751157516116e-06,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 2048 -e 257 -k 9 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked +0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 4096 -e 257 -k 9 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 7168 --inter-dim 256 -e 257 -k 9 -t 4096 --aq fp4 --wq fp4 --act silu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 
4523,deepseek_v3,7168,256,257,9,a4w4,silu,4096,64,256,256,64,256,256,249.4,367.8,0.0,616.6,630.4,658.2458797794357,0.1455330267033906,568.4188444444443,734.9259853363037,3.437462961830562e-06,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 4096 -e 257 -k 9 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked +0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 8192 -e 257 -k 9 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 7168 --inter-dim 256 -e 257 -k 9 -t 8192 --aq fp4 --wq fp4 --act silu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,deepseek_v3,7168,256,257,9,a4w4,silu,8192,64,256,256,64,256,256,428.3,650.8,0.0,1079.1999999999998,1103.5,752.1764445366939,0.1663003414850086,982.3065411764695,1264.5310163497925,3.436508721699205e-06,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 8192 -e 257 -k 9 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked +0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 
16384 -e 257 -k 9 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 7168 --inter-dim 256 -e 257 -k 9 -t 16384 --aq fp4 --wq fp4 --act silu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,deepseek_v3,7168,256,257,9,a4w4,silu,16384,64,256,256,64,256,256,669.1,1233.6,0.0,1902.6999999999998,1934.1999999999998,853.2599137478321,0.18864910761614684,1729.7136813186798,1922.6160049438477,3.433807777675213e-06,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 16384 -e 257 -k 9 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked +0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 32768 -e 257 -k 9 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 7168 --inter-dim 256 -e 257 -k 9 -t 32768 --aq fp4 --wq fp4 --act silu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 
4523,deepseek_v3,7168,256,257,9,a4w4,silu,32768,64,256,256,64,256,256,1048.9,2375.2,0.0,3427.4,3500.7999999999997,947.3639714582482,0.20945478033567283,3223.8851157894737,3390.666961669922,3.435615013036575e-06,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 32768 -e 257 -k 9 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked +0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 1 -e 384 -k 8 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 7168 --inter-dim 256 -e 384 -k 8 -t 1 --aq fp4 --wq fp4 --act silu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,kimi_k2,7168,256,384,8,a4w4,silu,1,64,256,256,64,256,256,81.7,47.3,0.0,129.0,147.1,0.6827936744186047,0.0001509603525135098,38.18579591836654,493.88399720191956,0.0013388059847474487,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 1 -e 384 -k 8 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked +0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 2 -e 384 
-k 8 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 7168 --inter-dim 256 -e 384 -k 8 -t 2 --aq fp4 --wq fp4 --act silu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,kimi_k2,7168,256,384,8,a4w4,silu,2,64,256,256,64,256,256,81.2,46.5,0.0,127.7,136.0,1.379489177760376,0.0003049942909043502,41.845000000001676,500.6440281867981,0.002241814551414034,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 2 -e 384 -k 8 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked +0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 4 -e 384 -k 8 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 7168 --inter-dim 256 -e 384 -k 8 -t 4 --aq fp4 --wq fp4 --act silu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,kimi_k2,7168,256,384,8,a4w4,silu,4,64,256,256,64,256,256,82.9,47.8,0.0,130.6,141.8,2.6977146707503827,0.0005964436592417383,55.812939393940304,648.045003414154,0.0008894657763870439,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 
python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 4 -e 384 -k 8 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked +0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 8 -e 384 -k 8 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 7168 --inter-dim 256 -e 384 -k 8 -t 8 --aq fp4 --wq fp4 --act silu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,kimi_k2,7168,256,384,8,a4w4,silu,8,64,256,256,64,256,256,84.2,50.5,0.0,134.6,142.2,5.235089687964339,0.0011574374724661373,65.84719101123677,552.5649785995483,9.760027172789343e-06,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 8 -e 384 -k 8 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked +0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 16 -e 384 -k 8 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 7168 
--inter-dim 256 -e 384 -k 8 -t 16 --aq fp4 --wq fp4 --act silu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,kimi_k2,7168,256,384,8,a4w4,silu,16,64,256,256,64,256,256,89.8,59.7,0.0,149.6,157.6,9.42036192513369,0.0020827684999190116,84.7373000000007,568.884015083313,9.839066007155672e-06,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 16 -e 384 -k 8 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked +0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 32 -e 384 -k 8 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 7168 --inter-dim 256 -e 384 -k 8 -t 32 --aq fp4 --wq fp4 --act silu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,kimi_k2,7168,256,384,8,a4w4,silu,32,64,256,256,64,256,256,100.7,74.1,0.0,174.89999999999998,185.5,16.115336123499144,0.0035629750438866117,119.12603225806363,594.165027141571,9.410086305061682e-06,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 32 -e 384 -k 8 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true 
--compare_aiter_ck false",,,checked +0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 64 -e 384 -k 8 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 7168 --inter-dim 256 -e 384 -k 8 -t 64 --aq fp4 --wq fp4 --act silu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,kimi_k2,7168,256,384,8,a4w4,silu,64,64,256,256,64,256,256,169.7,97.9,0.0,267.6,280.0,21.065562690582958,0.004657431503555816,168.3802087912095,609.9249720573425,9.403851483069658e-06,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 64 -e 384 -k 8 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked +0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 128 -e 384 -k 8 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 7168 --inter-dim 256 -e 384 -k 8 -t 128 --aq fp4 --wq fp4 --act silu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / 
combined_us / 1e6; mfu = effective_tflops / 4523,kimi_k2,7168,256,384,8,a4w4,silu,128,64,256,256,64,256,256,176.5,116.0,0.0,292.4,304.8,38.5577604377565,0.008524819906645258,196.7042395833342,596.405029296875,0.0005954914352407359,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 128 -e 384 -k 8 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked +0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 256 -e 384 -k 8 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 7168 --inter-dim 256 -e 384 -k 8 -t 256 --aq fp4 --wq fp4 --act silu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,kimi_k2,7168,256,384,8,a4w4,silu,256,64,256,256,64,256,256,179.8,124.6,0.0,304.3,315.8,74.09983011501807,0.01638289412226798,208.04478260869544,596.563994884491,0.0006630390382372786,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 256 -e 384 -k 8 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked +0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 
-dim 7168,256 -t 512 -e 384 -k 8 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 7168 --inter-dim 256 -e 384 -k 8 -t 512 --aq fp4 --wq fp4 --act silu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,kimi_k2,7168,256,384,8,a4w4,silu,512,64,256,256,64,256,256,180.3,134.4,0.0,314.70000000000005,327.9,143.3020546806482,0.031682965881195714,224.1345567010319,620.8850145339966,0.0006180732459515337,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 512 -e 384 -k 8 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked +0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 1024 -e 384 -k 8 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 7168 --inter-dim 256 -e 384 -k 8 -t 1024 --aq fp4 --wq fp4 --act silu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 
4523,kimi_k2,7168,256,384,8,a4w4,silu,1024,64,256,256,64,256,256,181.5,151.8,0.0,333.3,343.6,270.6100006480648,0.05982975915278903,258.40870103092766,657.2449803352356,0.0006517958301904825,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 1024 -e 384 -k 8 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked +0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 2048 -e 384 -k 8 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 7168 --inter-dim 256 -e 384 -k 8 -t 2048 --aq fp4 --wq fp4 --act silu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,kimi_k2,7168,256,384,8,a4w4,silu,2048,64,256,256,64,256,256,184.5,194.4,0.0,379.4,391.0,475.4576342435425,0.10511997219622872,382.2209595959608,787.6060009002686,3.441369365364544e-06,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 2048 -e 384 -k 8 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked +0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 4096 -e 384 -k 8 
--num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 7168 --inter-dim 256 -e 384 -k 8 -t 4096 --aq fp4 --wq fp4 --act silu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,kimi_k2,7168,256,384,8,a4w4,silu,4096,64,256,256,64,256,256,252.1,348.4,0.0,600.5,615.2,600.7947591407161,0.13283103231057178,511.54278787878735,895.2869772911072,3.4482669706292768e-06,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 4096 -e 384 -k 8 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked +0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 8192 -e 384 -k 8 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 7168 --inter-dim 256 -e 384 -k 8 -t 8192 --aq fp4 --wq fp4 --act silu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,kimi_k2,7168,256,384,8,a4w4,silu,8192,64,256,256,64,256,256,387.2,564.1,0.0,950.8,975.5,758.8919917206563,0.1677850965555287,899.7465434782592,1170.7290410995483,3.4450919983441963e-06,True,"HIP_VISIBLE_DEVICES=0 
FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 8192 -e 384 -k 8 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked +0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 16384 -e 384 -k 8 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 7168 --inter-dim 256 -e 384 -k 8 -t 16384 --aq fp4 --wq fp4 --act silu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,kimi_k2,7168,256,384,8,a4w4,silu,16384,64,256,256,64,256,256,690.4,1095.2,0.0,1785.6,1822.1,808.1927707526881,0.17868511402889414,1597.2175056179763,1841.694951057434,3.443782965351083e-06,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 16384 -e 384 -k 8 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked +0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 32768 -e 384 -k 8 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 
/sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 7168 --inter-dim 256 -e 384 -k 8 -t 32768 --aq fp4 --wq fp4 --act silu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,kimi_k2,7168,256,384,8,a4w4,silu,32768,64,256,256,64,256,256,1062.3,2143.4,0.0,3205.2,3256.9,900.4798523998503,0.1990890675215234,2964.037744680856,3157.3050022125244,3.4440292647763826e-06,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 7168,256 -t 32768 -e 384 -k 8 --num_warmup 10 --num_iters 100 --tile_m 64 --tile_n 256 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked +0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 3072,3072 -t 256 -e 128 -k 4 --num_warmup 10 --num_iters 100 --tile_m 32 --tile_n 128 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 3072 --inter-dim 3072 -e 128 -k 4 -t 256 --aq fp4 --wq fp4 --act swiglu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,gpt_oss,3072,3072,128,4,a4w4,swiglu,256,32,128,256,32,256,256,246.5,143.5,0.0,390.0,415.5,148.67194486153846,0.03287020669058997,328.1737765957458,656.0050249099731,6.2468397891146665e-06,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 3072,3072 -t 256 -e 128 -k 4 --num_warmup 10 
--num_iters 100 --tile_m 32 --tile_n 128 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked +0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 3072,3072 -t 512 -e 128 -k 4 --num_warmup 10 --num_iters 100 --tile_m 32 --tile_n 128 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 3072 --inter-dim 3072 -e 128 -k 4 -t 512 --aq fp4 --wq fp4 --act swiglu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,gpt_oss,3072,3072,128,4,a4w4,swiglu,512,32,128,256,32,256,256,254.4,146.2,0.0,400.6,428.79999999999995,289.47607836245635,0.0640009016941093,336.36401041666727,666.8050289154053,6.183315036101256e-06,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 3072,3072 -t 512 -e 128 -k 4 --num_warmup 10 --num_iters 100 --tile_m 32 --tile_n 128 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked +0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 3072,3072 -t 1024 -e 128 -k 4 --num_warmup 10 --num_iters 100 --tile_m 32 --tile_n 128 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 3072 --inter-dim 3072 -e 128 -k 4 -t 1024 --aq fp4 --wq fp4 --act swiglu --gate separated --warmup 10 --iters 100 
--aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,gpt_oss,3072,3072,128,4,a4w4,swiglu,1024,32,128,256,32,256,256,271.3,236.8,0.0,507.6,525.4,456.9114144680851,0.10101954774885809,343.81028125000074,689.1649961471558,6.178899654263326e-06,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 3072,3072 -t 1024 -e 128 -k 4 --num_warmup 10 --num_iters 100 --tile_m 32 --tile_n 128 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked +0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 3072,3072 -t 2048 -e 128 -k 4 --num_warmup 10 --num_iters 100 --tile_m 32 --tile_n 128 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 3072 --inter-dim 3072 -e 128 -k 4 -t 2048 --aq fp4 --wq fp4 --act swiglu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,gpt_oss,3072,3072,128,4,a4w4,swiglu,2048,32,128,256,32,256,256,332.6,317.6,0.0,650.2,670.2,713.4058258505075,0.1577284602808993,453.1531264367808,835.0859880447388,6.184897809680123e-06,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 3072,3072 -t 2048 -e 128 -k 4 --num_warmup 10 --num_iters 100 --tile_m 32 --tile_n 128 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked +0,AMD Instinct 
MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 3072,3072 -t 4096 -e 128 -k 4 --num_warmup 10 --num_iters 100 --tile_m 32 --tile_n 128 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 3072 --inter-dim 3072 -e 128 -k 4 -t 4096 --aq fp4 --wq fp4 --act swiglu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,gpt_oss,3072,3072,128,4,a4w4,swiglu,4096,32,128,256,32,256,256,464.2,534.1,0.0,998.4000000000001,1056.0,929.1996553846153,0.20543879181618732,703.0980000000009,1013.7679576873779,6.199037882903546e-06,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 3072,3072 -t 4096 -e 128 -k 4 --num_warmup 10 --num_iters 100 --tile_m 32 --tile_n 128 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked +0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 3072,3072 -t 8192 -e 128 -k 4 --num_warmup 10 --num_iters 100 --tile_m 32 --tile_n 128 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 3072 --inter-dim 3072 -e 128 -k 4 -t 8192 --aq fp4 --wq fp4 --act swiglu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = 
effective_tflops / 4523,gpt_oss,3072,3072,128,4,a4w4,swiglu,8192,32,128,256,32,256,256,742.9,935.7,0.0,1678.3,1805.3,1105.5388618673658,0.2444260141205761,1302.9729550561804,1645.4930305480957,6.18097885130009e-06,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 3072,3072 -t 8192 -e 128 -k 4 --num_warmup 10 --num_iters 100 --tile_m 32 --tile_n 128 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked +0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 3072,3072 -t 16384 -e 128 -k 4 --num_warmup 10 --num_iters 100 --tile_m 32 --tile_n 128 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 3072 --inter-dim 3072 -e 128 -k 4 -t 16384 --aq fp4 --wq fp4 --act swiglu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,gpt_oss,3072,3072,128,4,a4w4,swiglu,16384,32,128,256,32,256,256,1321.3,1578.6,0.0,2899.8999999999996,3148.1,1279.6481753660473,0.2829202244895086,2158.8413440860227,2466.418981552124,6.177899778192497e-06,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 3072,3072 -t 16384 -e 128 -k 4 --num_warmup 10 --num_iters 100 --tile_m 32 --tile_n 128 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked +0,AMD Instinct MI350X,HEAD,523ca1c7e224ee62d5e3a4c0f52a18b9cec5e727,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py 
--in_dtype fp4 -dim 3072,3072 -t 32768 -e 128 -k 4 --num_warmup 10 --num_iters 100 --tile_m 32 --tile_n 128 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false ; HIP_VISIBLE_DEVICES=0 python3 /sgl-workspace/FlyDSL-mxfp4-moe/scripts/aiter_strict_point.py --model-dim 3072 --inter-dim 3072 -e 128 -k 4 -t 32768 --aq fp4 --wq fp4 --act swiglu --gate separated --warmup 10 --iters 100 --aiter-repo /sgl-workspace/aiter",10,100,True,False,True,True,effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523,gpt_oss,3072,3072,128,4,a4w4,swiglu,32768,32,128,256,32,256,256,2509.3,2999.5,0.0,5508.8,5865.0,1347.245042021493,0.2978653641435978,4044.7997684210477,4329.154014587402,6.180992124016349e-06,True,"HIP_VISIBLE_DEVICES=0 FLYDSL_PERF_DIST=1 python3 /sgl-workspace/FlyDSL-mxfp4-moe/tests/kernels/test_moe_gemm.py --in_dtype fp4 -dim 3072,3072 -t 32768 -e 128 -k 4 --num_warmup 10 --num_iters 100 --tile_m 32 --tile_n 128 --tile_k 256 --tile_n2 256 --tile_k2 256 --skip_ref true --compare_aiter_ck false",,,checked diff --git a/docs/mxfp4_moe_tuning.md b/docs/mxfp4_moe_tuning.md new file mode 100644 index 000000000..1b2a8a8fb --- /dev/null +++ b/docs/mxfp4_moe_tuning.md @@ -0,0 +1,58 @@ +# MXFP4 MoE 2-Stage Tuning Harness (gfx950) + +Measurement + verification infrastructure for tuning the MXFP4 (per-1×32 +microscale fp4) MoE 2-stage GEMM pipeline on AMD gfx950 / MI350X, in support of +[ROCm/FlyDSL#708](https://github.com/ROCm/FlyDSL/issues/708) ("MXFP4 MoE low MFU +at large shapes and long latency at small tokens"). + +> **Status: tuning *infrastructure* + a validated baseline. This does NOT yet +> contain a performance change to any kernel** — it is the measurement, legality, +> and bookkeeping foundation that a tuning campaign runs on top of. No production +> kernel logic is modified by this change set. 
+ +## Components + +- **`kernels/moe_tuning.py`** — pre-compile legality filter for stage1/stage2 tile + configs (LDS footprint, thread/divisibility constraints, MX-FP4 floors). Lets a + config search reject illegal tiles before spending GPU time; mirrors the + builders' real LDS sizing (stage1 vs stage2 fp4 asymmetry included). +- **`kernels/moe_tuning_spec.py`** — locked spec constants + win/no-regression + predicates (win margins, regime-aware no-regression band, token grid, MFU + denominator, metric formula). +- **`scripts/moe_tuning_harness.py`** — the measurement harness: full provenance + per point (GPU id+model, branch+commit, exact replayable command, warmup/iters, + idle-GPU check, verified clock pinning), median+p95 from a faithful timed loop, + and a fail-closed candidate sweep CLI (illegal/unmeasured configs are recorded + as machine-readable rejections, never silently skipped). +- **`scripts/moe_tuning_ledger.py`** — attempt ledger + full-coverage Pareto + comparator. A candidate is promotable only via a single `claimable_win` gate + (full coverage + no kernel-path/e2e regression + a real win + a strict + AOT/correctness hard gate). Includes ledger-integrity scans (duplicate / + replayable-command / supersede-link). +- **`scripts/aiter_strict_point.py`** — strict, AOT-checked, model-correct + single-case aiter fused-MoE e2e + correctness guardrail (`logits_diff <= 0.01`). +- **`scripts/sync_aiter_flydsl_kernels.sh`** — overlay the current FlyDSL MoE + kernels onto aiter's vendored copies so the e2e guardrail runs against the same + sources being tuned. +- **`docs/baseline_523ca1c7_validated.csv`** — a validated locked a4w4 baseline + table (the reference every candidate is compared against). 
+ +## Tests + +`tests/unit/test_moe_tuning_harness.py` and +`tests/unit/test_moe_tuning_legality.py` cover the legality filter, provenance +contracts, the Pareto comparator + win gate, and the integrity scans (host-side, +no GPU required): + +```bash +python3 -m pytest tests/unit/test_moe_tuning_harness.py \ + tests/unit/test_moe_tuning_legality.py -q +``` + +## Scope notes + +- This change set targets the a4w4 (fp4×fp4) path. a8w4 (fp8×fp4) correctness is + currently environment-blocked by an aiter non-fp4-activation wrapper/layout + contract mismatch (not a FlyDSL kernel bug); it is quarantined for win claims. +- The actual tile/lever tuning that produces MFU/latency wins runs on top of this + harness and is tracked separately against #708. diff --git a/kernels/moe_tuning.py b/kernels/moe_tuning.py new file mode 100644 index 000000000..691d2617a --- /dev/null +++ b/kernels/moe_tuning.py @@ -0,0 +1,511 @@ +# SPDX-License-Identifier: MIT +# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved. + +"""Tuning support for the mixed (fp4/fp8 x fp4) MoE 2-stage GEMM kernels. + +This module holds host-side, pre-compile tooling for the MXFP4 MoE tuning +campaign. Nothing here changes kernel behavior; it mirrors the legality checks +that ``compile_mixed_moe_gemm1`` / ``compile_mixed_moe_gemm2`` already enforce so +that a tile-config search can reject illegal candidates *before* spending GPU +time on a compile that the kernel would refuse. + +The single entry point is :func:`check_tile_config`, which returns a +:class:`TileCheck` describing whether a ``(stage, tile_m, tile_n, tile_k, ...)`` +candidate is legal and, when it is not, a machine-readable reason. 
+ +The constraints encoded here are a faithful copy of the ones in +``kernels/mixed_moe_gemm_2stage.py`` (stage1: ``tile_k_bytes % 64``, +``tile_m*tile_k*elem_bytes % total_threads``, split-K divisibility, the LDS +sizing / arch limit; stage2: ``model_dim % tile_n``, ``inter_dim % tile_k``, +``sort_block_m % tile_m``, ``tile_m*tile_k % 256``, the LDS sizing) plus the +MX-FP4 layout requirements (``tile_m % 32``, ``tile_m >= 32``, ``tile_k >= 256``). +Keep the two files in sync: if a constraint changes in the kernel builder, update +the matching check below. +""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Optional + +# gfx -> total LDS bytes available to a single workgroup. Matches the +# ``_lds_limit`` dict in compile_mixed_moe_gemm1 / 2. +LDS_LIMIT_BYTES = {"gfx950": 163840, "gfx942": 65536} + +# Element byte width of the activation operand, keyed by a_dtype. fp4 and fp8 +# both occupy 1 byte in the kernel's sizing math (fp4 is vector-packed 2:1 via +# a_elem_vec_pack, handled separately); fp16 is 2 bytes. +_A_ELEM_BYTES = {"fp8": 1, "fp4": 1, "int8": 1, "fp16": 2} + +# Activation vector pack factor (fp4 packs two logical elements per byte). +_A_ELEM_VEC_PACK = {"fp4": 2} + + +@dataclass +class TileCheck: + """Result of a legality check for one tile candidate. + + ``legal`` is True iff the kernel builder would accept the candidate. When + illegal, ``reason`` is a short machine-readable token (e.g. + ``"tile_k_bytes_not_div_64"``) and ``detail`` is a human-readable message. + ``lds_bytes`` is the computed LDS footprint when it could be evaluated. 
+ """ + + legal: bool + stage: int + reason: str = "" + detail: str = "" + lds_bytes: Optional[int] = None + params: dict = field(default_factory=dict) + + def as_record(self) -> dict: + """Flat dict suitable for JSONL/CSV logging of a rejected candidate.""" + rec = { + "stage": self.stage, + "legal": self.legal, + "reason": self.reason, + "detail": self.detail, + "lds_bytes": self.lds_bytes, + } + rec.update(self.params) + return rec + + +def _align(ptr: int, align: int) -> int: + """Round ``ptr`` up to a multiple of ``align`` (mirrors SmemAllocator._align).""" + if ptr % align == 0: + return ptr + return (ptr + align - 1) // align * align + + +def _a_elem_bytes(a_dtype: str) -> int: + if a_dtype not in _A_ELEM_BYTES: + raise ValueError(f"a_dtype must be one of {sorted(_A_ELEM_BYTES)}, got {a_dtype!r}") + return _A_ELEM_BYTES[a_dtype] + + +def stage1_lds_bytes( + *, + tile_m: int, + tile_n: int, + tile_k: int, + a_dtype: str, + out_dtype: str = "f16", + waves_per_eu: int = 4, + use_cshuffle_epilog: bool = True, + gpu_arch: str = "gfx950", +) -> int: + """LDS bytes used by a stage1 config, mirroring compile_mixed_moe_gemm1. + + Follows the ping/pong allocator walk: pong holds max(input, lds_out)+tid, + ping holds input, with the lds_out auto-split when the standard layout would + overflow the arch limit, plus the waves_per_eu minimum-LDS padding. + """ + a_elem_bytes = _a_elem_bytes(a_dtype) + # FLIR_CK_LDS128 defaults on -> pad_k = 0. + lds_stride = tile_k + # NOTE: stage1 sizes the LDS A tile from the FULL lds_stride; unlike stage2 it + # does NOT divide by a_elem_vec_pack for fp4 here. The fp4 vec-pack stride + # halving only applies, conditionally, to an inner async-copy buffer in the + # kernel body, not to this top-level ping/pong allocation. See + # compile_mixed_moe_gemm1: ``_single_x_bytes = tile_m * lds_stride * a_elem_bytes``. 
+ + out_s = str(out_dtype).strip().lower() + out_is_f32 = out_s in ("f32", "fp32", "float") + need_quant = out_s in ("fp4", "fp8") + if need_quant: + use_cshuffle_epilog = True + + single_x_bytes = tile_m * lds_stride * a_elem_bytes + cshuffle_elem_bytes = 4 if need_quant else (4 if out_is_f32 else 2) + lds_out_bytes = cshuffle_elem_bytes * tile_m * tile_n if use_cshuffle_epilog else 0 + lds_tid_bytes = tile_m * 4 + num_waves = min(4, tile_n // 32) if tile_n >= 32 else 0 + + global_align = 1024 + std_pong = max(single_x_bytes, lds_out_bytes) + lds_tid_bytes + std_ping = single_x_bytes + std_pong_aligned = _align(std_pong, 128) + std_total = _align(std_pong_aligned, global_align) + _align(std_ping, 128) + lds_limit = LDS_LIMIT_BYTES.get(gpu_arch, 0) + + split_lds_out = lds_limit > 0 and lds_out_bytes > 0 and std_total > lds_limit and num_waves >= 2 + + if split_lds_out: + half_out_bytes = cshuffle_elem_bytes * tile_m * (tile_n // 2) + pong_buffer_bytes = max(single_x_bytes, half_out_bytes) + ping_buffer_bytes = max(single_x_bytes, half_out_bytes) + else: + pong_buffer_bytes = max(single_x_bytes, lds_out_bytes) + ping_buffer_bytes = single_x_bytes + + # Allocator walk: pong = align16(0)+pong_buf, then align4()+tid. + pong_ptr = _align(0, 16) + pong_buffer_bytes + pong_ptr = _align(pong_ptr, 4) + lds_tid_bytes + ping_ptr = _align(0, 16) + ping_buffer_bytes + + if waves_per_eu is not None and waves_per_eu >= 1: + total_cu_lds = 160 * 1024 + min_lds = total_cu_lds // (waves_per_eu + 1) + 1 + pong_sz = _align(pong_ptr, 128) + ping_sz = _align(ping_ptr, 128) + cur_lds = pong_sz + ping_sz + if cur_lds < min_lds: + ping_ptr += min_lds - cur_lds + + # Final footprint uses the same global/128 alignment as _std_total. 
+ return _align(_align(pong_ptr, 128), global_align) + _align(ping_ptr, 128) + + +def stage2_lds_bytes( + *, + tile_m: int, + tile_n: int, + tile_k: int, + a_dtype: str, + use_cshuffle_epilog: bool = True, +) -> int: + """LDS bytes used by a stage2 config, mirroring compile_mixed_moe_gemm2. + + Stage2 has no lds_out auto-split and no waves_per_eu padding. + """ + a_elem_bytes = _a_elem_bytes(a_dtype) + vec_pack = _A_ELEM_VEC_PACK.get(a_dtype, 1) + lds_stride = tile_k # pad_k = 0 with FLIR_CK_LDS128 default. + eff_lds_stride = lds_stride // vec_pack if vec_pack > 1 else lds_stride + + single_x_bytes = tile_m * eff_lds_stride * a_elem_bytes + cshuffle_elem_bytes = 2 # stage2 f16/bf16 + lds_out_bytes = cshuffle_elem_bytes * tile_m * tile_n if use_cshuffle_epilog else 0 + lds_tid_bytes = tile_m * 4 + + pong_buffer_bytes = max(single_x_bytes, lds_out_bytes) + ping_buffer_bytes = single_x_bytes + + pong_ptr = _align(0, 16) + pong_buffer_bytes + pong_ptr = _align(pong_ptr, 4) + lds_tid_bytes + ping_ptr = _align(0, 16) + ping_buffer_bytes + return pong_ptr + ping_ptr + + +def _check_stage1( + *, + model_dim: int, + inter_dim: int, + tile_m: int, + tile_n: int, + tile_k: int, + a_dtype: str, + out_dtype: str, + k_batch: int, + waves_per_eu: int, + gpu_arch: str, + params: dict, +) -> TileCheck: + a_elem_bytes = _a_elem_bytes(a_dtype) + + # MX-FP4 layout requirements (fp4/fp8 weight path). 
+ if tile_m < 32: + return TileCheck( + False, 1, "tile_m_lt_32", f"tile_m={tile_m} < 32 (MX-FP4 layout requires tile_m>=32)", params=params + ) + if tile_m % 32 != 0: + return TileCheck( + False, 1, "tile_m_not_div_32", f"tile_m={tile_m} not divisible by 32 (MX-FP4 layout)", params=params + ) + if tile_k < 256: + return TileCheck( + False, 1, "tile_k_lt_256", f"tile_k={tile_k} < 256 (MX-FP4 layout requires tile_k>=256)", params=params + ) + + if tile_n < 32 or tile_n % 32 != 0: + return TileCheck( + False, 1, "tile_n_not_mult_32", f"tile_n={tile_n} must be a positive multiple of 32", params=params + ) + + # tile_k_bytes % 64 (kernel raises otherwise). + tile_k_bytes = tile_k * a_elem_bytes + if tile_k_bytes % 64 != 0: + return TileCheck( + False, 1, "tile_k_bytes_not_div_64", f"tile_k_bytes={tile_k_bytes} not divisible by 64", params=params + ) + + # total_threads = min(4, tile_n // 32) * 64 + num_waves = min(4, tile_n // 32) + total_threads = num_waves * 64 + bytes_x_per_tile = tile_m * tile_k * a_elem_bytes + if bytes_x_per_tile % total_threads != 0: + return TileCheck( + False, + 1, + "tile_load_not_div_total_threads", + f"tile_m*tile_k*elem_bytes={bytes_x_per_tile} not divisible by total_threads={total_threads}", + params=params, + ) + + # K-loop coverage: model_dim must be divisible by tile_k (implicit but required). + if model_dim % tile_k != 0: + return TileCheck( + False, + 1, + "model_dim_not_div_tile_k", + f"model_dim={model_dim} not divisible by tile_k={tile_k}", + params=params, + ) + + # Split-K divisibility. 
def _check_stage2(
    *,
    model_dim: int,
    inter_dim: int,
    tile_m: int,
    tile_n: int,
    tile_k: int,
    a_dtype: str,
    sort_block_m: int,
    gpu_arch: str,
    params: dict,
) -> TileCheck:
    """Check one stage2 tile candidate against the pre-compile constraints.

    The checks run in a fixed order and the first violated constraint decides
    the rejection reason, so the reason token reported for a given candidate is
    stable.

    Args:
        model_dim: Must be divisible by 16 and by ``tile_n``.
        inter_dim: The stage2 K extent; must be divisible by ``tile_k``.
        tile_m: Must be >= 32 and a multiple of 32.
        tile_n: Must be positive and divide ``model_dim``.
        tile_k: Must be >= 256.
        a_dtype: Activation dtype key, resolved to an element size through
            ``_a_elem_bytes``.
        sort_block_m: ``<= 0`` means "use ``tile_m``"; otherwise it must be a
            multiple of ``tile_m``.
        gpu_arch: Key into ``LDS_LIMIT_BYTES``; an unknown arch (or a limit of
            0) disables the LDS check.
        params: The candidate's parameters, attached verbatim to the result.

    Returns:
        A ``TileCheck`` for stage 2: accepted with ``lds_bytes`` set, or
        rejected with a machine-readable reason token and a detail message.
    """
    a_elem_bytes = _a_elem_bytes(a_dtype)

    def reject(token: str, message: str, **extra) -> TileCheck:
        # Every rejection in this function is a stage-2 TileCheck carrying params.
        return TileCheck(False, 2, token, message, params=params, **extra)

    # MX-FP4 layout requirements.
    if tile_m < 32:
        return reject("tile_m_lt_32", f"tile_m={tile_m} < 32 (MX-FP4 layout requires tile_m>=32)")
    if tile_m % 32 != 0:
        return reject("tile_m_not_div_32", f"tile_m={tile_m} not divisible by 32 (MX-FP4 layout)")
    if tile_k < 256:
        return reject("tile_k_lt_256", f"tile_k={tile_k} < 256 (MX-FP4 layout requires tile_k>=256)")

    # model_dim % 16 (kernel asserts) and the N-tile coverage model_dim % tile_n.
    if model_dim % 16 != 0:
        return reject("model_dim_not_div_16", f"model_dim={model_dim} not divisible by 16")
    # Guard the modulo below: tile_n == 0 would raise ZeroDivisionError and a
    # negative tile_n could pass it, so reject both with an explicit reason
    # instead of crashing or accepting a nonsensical tile.
    if tile_n <= 0:
        return reject("tile_n_not_positive", f"tile_n={tile_n} must be a positive integer")
    if model_dim % tile_n != 0:
        return reject("model_dim_not_div_tile_n", f"model_dim={model_dim} not divisible by tile_n={tile_n}")

    # inter_dim (= stage2 K) must be divisible by tile_k.
    if inter_dim % tile_k != 0:
        return reject("inter_dim_not_div_tile_k", f"inter_dim={inter_dim} not divisible by tile_k={tile_k}")

    # tile_k_bytes % 64.
    tile_k_bytes = tile_k * a_elem_bytes
    if tile_k_bytes % 64 != 0:
        return reject("tile_k_bytes_not_div_64", f"tile_k_bytes={tile_k_bytes} not divisible by 64")

    # total_threads is a fixed 256 in stage2.
    bytes_x_per_tile = tile_m * tile_k * a_elem_bytes
    if bytes_x_per_tile % 256 != 0:
        return reject(
            "tile_load_not_div_256",
            f"tile_m*tile_k*elem_bytes={bytes_x_per_tile} not divisible by 256",
        )
    # gmem load mapping: bytes_per_thread must be divisible by 4.
    if (bytes_x_per_tile // 256) % 4 != 0:
        return reject(
            "bytes_per_thread_not_div_4",
            f"bytes_per_thread_x={bytes_x_per_tile // 256} not divisible by 4",
        )

    # sort_block_m must be a multiple of tile_m (0 -> equals tile_m, always legal).
    eff_sort_block_m = tile_m if sort_block_m <= 0 else sort_block_m
    if eff_sort_block_m != tile_m and eff_sort_block_m % tile_m != 0:
        return reject(
            "sort_block_m_not_mult_tile_m",
            f"sort_block_m={eff_sort_block_m} not a multiple of tile_m={tile_m}",
        )

    # LDS fits the arch limit.
    lds = stage2_lds_bytes(tile_m=tile_m, tile_n=tile_n, tile_k=tile_k, a_dtype=a_dtype)
    limit = LDS_LIMIT_BYTES.get(gpu_arch, 0)
    if limit and lds > limit:
        return reject("lds_over_limit", f"stage2 LDS {lds} > {gpu_arch} limit {limit}", lds_bytes=lds)

    return TileCheck(True, 2, lds_bytes=lds, params=params)
def enumerate_legal_configs(
    *,
    stage: int,
    model_dim: int,
    inter_dim: int,
    a_dtype: str,
    tile_m_choices,
    tile_n_choices,
    tile_k_choices,
    out_dtype: str = "f16",
    k_batch_choices=(1,),
    waves_per_eu_choices=(4,),
    sort_block_m_choices=(0,),
    gpu_arch: str = "gfx950",
    rejected_log: Optional[list] = None,
) -> list:
    """Return the legal tile candidates from the cross product of the choices.

    Every combination is passed to ``check_tile_config``. Candidates are
    visited with ``tile_m`` varying slowest and ``sort_block_m`` fastest.

    ``k_batch_choices`` and ``waves_per_eu_choices`` are only swept for
    ``stage == 1`` and ``sort_block_m_choices`` only for ``stage == 2``; for
    the other stage each collapses to its single default (1, 4 and 0
    respectively).

    Args:
        stage: Forwarded to ``check_tile_config``.
        model_dim, inter_dim, a_dtype, out_dtype, gpu_arch: Forwarded unchanged
            to ``check_tile_config`` for every candidate.
        tile_m_choices, tile_n_choices, tile_k_choices: Iterables of tile sizes.
            Any iterable works, including one-shot iterators, because each is
            materialised once before the sweep.
        k_batch_choices, waves_per_eu_choices, sort_block_m_choices: Iterables
            of the stage-specific knobs (see above).
        rejected_log: When provided, every rejected candidate is appended to it
            as a ``TileCheck.as_record()`` dict, so no candidate is dropped
            without a machine-readable reason.

    Returns:
        A list of the accepted ``check_tile_config`` results, in sweep order.
    """
    import itertools

    # Knobs that do not apply to this stage collapse to their single default.
    k_batches = k_batch_choices if stage == 1 else (1,)
    waves_options = waves_per_eu_choices if stage == 1 else (4,)
    sort_blocks = sort_block_m_choices if stage == 2 else (0,)

    legal = []
    # product() advances the right-most iterable fastest, matching a nested
    # loop with tile_m outermost, and consumes each input exactly once.
    for tile_m, tile_n, tile_k, k_batch, waves_per_eu, sort_block_m in itertools.product(
        tile_m_choices, tile_n_choices, tile_k_choices, k_batches, waves_options, sort_blocks
    ):
        res = check_tile_config(
            stage=stage,
            model_dim=model_dim,
            inter_dim=inter_dim,
            tile_m=tile_m,
            tile_n=tile_n,
            tile_k=tile_k,
            a_dtype=a_dtype,
            out_dtype=out_dtype,
            k_batch=k_batch,
            waves_per_eu=waves_per_eu,
            sort_block_m=sort_block_m,
            gpu_arch=gpu_arch,
        )
        if res.legal:
            legal.append(res)
        elif rejected_log is not None:
            rejected_log.append(res.as_record())
    return legal
FP4_PEAK_TFLOPS = 4523.0

# --- Win margins (the win-margin policy) ---------------------------------------------------
WIN_MARGIN = 0.10  # 10% relative improvement required to claim a win.
# Large-shape (tokens >= LARGE_TOKEN_MIN): tuned_MFU >= baseline_MFU * (1 + WIN_MARGIN).
# Small-token (tokens <= SMALL_TOKEN_MAX): tuned_us <= baseline_us * (1 - WIN_MARGIN)
#   AND (baseline_us - tuned_us) >= ABS_US_BAND.

# --- No-regression tolerance + protocol (the no-regression policy) ----------------------------
REGRESSION_REL = 0.02  # 2% relative.
# Microseconds; the default absolute floor. abs_floor_us() returns it for every
# token above SMALL_TOKEN_MAX, i.e. tokens >= 128 on the sweep grids below.
ABS_US_BAND = 2.0

# Regime-aware absolute floor (user-approved amendment). On this shared node the
# small/low-token absolute latency is tiny (~30-300 us) and run-to-run jitter is
# ~3-7 us even after the in-protocol controls are exhausted (faithful L2-flush
# argument rotation, repeated measurement, AND harness-verified clock pinning).
# This is irreducible measurement noise at tiny absolute latency, not a harness
# defect: under the 8 us small-token floor the residual a4w4 repeatability
# instability is confined to a single mid-token point (token 128, under the strict
# 2 us tokens>=128 floor) plus the e2e guardrail outlier (token 64) -- i.e. the
# small-token (<=64) kernel-path band is satisfied; tokens >= 128 keep the strict
# 2 us floor. 8 us is still far below the small-token win threshold (>= 10% AND
# >= 2 us; 10% of even the smallest ~127 us point is ~12.7 us), so widening the
# band does NOT weaken win detection. Floor is regime-aware: 8 us for
# tokens <= SMALL_TOKEN_MAX, 2 us otherwise.
SMALL_TOKEN_ABS_US_BAND = 8.0


def abs_floor_us(token: int) -> float:
    """Regime-aware absolute floor for the no-regression / repeatability band.

    Returns ``SMALL_TOKEN_ABS_US_BAND`` (8 us) when ``token <= SMALL_TOKEN_MAX``
    and ``ABS_US_BAND`` (2 us) otherwise.
    Used together with the 2% relative term as ``max(2%, abs_floor_us(token))``.
    """
    # SMALL_TOKEN_MAX is assigned further down the module; that is fine because
    # the name is only looked up when the function is called.
    return SMALL_TOKEN_ABS_US_BAND if token <= SMALL_TOKEN_MAX else ABS_US_BAND


WARMUP_ITERS = 10
BENCH_ITERS = 100
# Reported statistics per point.
REPORT_STATS = ("median", "p95")
# Protocol flags (recorded with every measurement; runs under other settings are
# non-comparable).
GRAPH_CAPTURE = False
L2_FLUSH_PER_ITER = True
CLOCKS_PINNED = True

# --- Token regimes (the win-margin policy / the target-bucket policy) -----------------------------------------
LARGE_TOKEN_MIN = 4096  # MFU regime: tokens >= this value.
SMALL_TOKEN_MAX = 64  # latency regime: tokens <= this value.
# Predeclared MFU target buckets (the target-bucket policy): the two largest in-sweep tokens.
MFU_TARGET_BUCKETS: Tuple[int, ...] = (16384, 32768)

# --- Token grids (the token-grid policy) ---------------------------------------------------
# Full sweep: every power of two from 1 to 32768.
TOKEN_GRID_FULL: Tuple[int, ...] = (
    1,
    2,
    4,
    8,
    16,
    32,
    64,
    128,
    256,
    512,
    1024,
    2048,
    4096,
    8192,
    16384,
    32768,
)
# Reduced sweep: the TOKEN_GRID_FULL entries from 256 upward.
TOKEN_GRID_GPTOSS: Tuple[int, ...] = (256, 512, 1024, 2048, 4096, 8192, 16384, 32768)

# --- Routing distributions for correctness (the routing-distribution policy) -------------------------
ROUTING_DISTRIBUTIONS: Tuple[str, ...] = (
    "default",
    "uniform",
    "expert_skewed",
    "few_active",
    "all_active",
    "sentinel_padding",
)

# --- Node environment (the node/shape policy) ----------------------------------------------
TARGET_ARCH = "gfx950"


@dataclass(frozen=True)
class ModelShape:
    """One target MoE model shape and its in-scope quant dtypes.

    ``dtypes`` are the activation x weight quant aliases in scope for this loop:
    ``"a4w4"`` (fp4 x fp4) and/or ``"a8w4"`` (fp8 x fp4). ``i4`` is out of scope.
    ``token_grid`` is the sweep used for this model (the token-grid policy).

    Instances are frozen (immutable) and therefore hashable.
    """

    name: str  # model identifier string
    model_dim: int
    inter_dim: int
    experts: int  # presumably the expert count of the MoE layer -- confirm against callers
    topk: int  # presumably experts routed per token -- confirm against callers
    act: str  # "silu" or "swiglu"
    dtypes: Tuple[str, ...]  # quant aliases in scope, e.g. ("a4w4", "a8w4")
    token_grid: Tuple[int, ...]  # one of the TOKEN_GRID_* tuples
+ + +# The four target models (the node/shape policy + plan workload table). DeepSeek V4 is a8w4 +# only; i4 (Kimi a16wi4) is excluded from this loop. +MODELS: Tuple[ModelShape, ...] = ( + ModelShape("deepseek_v3", 7168, 256, 257, 9, "silu", ("a4w4", "a8w4"), TOKEN_GRID_FULL), + ModelShape("deepseek_v4", 7168, 512, 385, 7, "silu", ("a8w4",), TOKEN_GRID_FULL), + ModelShape("kimi_k2", 7168, 256, 384, 8, "silu", ("a4w4", "a8w4"), TOKEN_GRID_FULL), + ModelShape("gpt_oss", 3072, 3072, 128, 4, "swiglu", ("a4w4", "a8w4"), TOKEN_GRID_GPTOSS), +) + +# Map a quant alias to the activation operand dtype passed to the kernel builder +# (the weight operand is fp4 in both in-scope cases). +DTYPE_ALIAS_TO_A_DTYPE = {"a4w4": "fp4", "a8w4": "fp8"} + +# --- Correctness quarantine (non-fp4-activation e2e is environment-blocked) --- +# Controlled evidence (direct aiter test_fmoe, each model's true activation, both +# gate modes, token=16) shows the failing axis is the ACTIVATION operand being +# non-fp4: +# a4w4 (fp4 activation): logits_diff ~1e-5 -> PASS (all models, both gates) +# a8w4 (fp8 activation): logits_diff ~0.98 -> FAIL (DS V3/V4, Kimi; both gates) +# a16w4 (bf16 activation): logits_diff ~0.98 -> FAIL (DS V3; both gates) +# GPT-OSS a8w4 Swiglu+INTERLEAVE: ~6e-6 -> PASS (lone non-fp4-act pass; +# aiter selects a different runtime q_dtype_a/fuse-quant path there) +# fp8 AND bf16 activation both fail with fp4 weight; only fp4 activation passes. +# Note: aiter test_fmoe passes the SAME activation/gate to BOTH its torch +# reference and the kernel, so the activation choice alone cannot explain the +# mismatch. +# +# Root cause is an activation-dtype-dependent wrapper/layout CONTRACT mismatch in +# the aiter e2e path, NOT a proven FlyDSL kernel math bug -- this checkout's own +# tests/kernels/test_moe_gemm.py --in_dtype a8w4 passes with --skip_ref false. 
+# For non-fp4 activation aiter preps weights via shuffle_weight_a16w4 / +# shuffle_scale_a16w4 and its reference sets a2_scale=None (no stage1->stage2 A2 +# requant), while the FlyDSL mixed stage2 kernel expects a pre-scattered A2 E8M0 +# scale (mixed_moe_gemm_2stage.py); this checkout's own 2-stage harness does +# requantize A2 and passes. Reconciling this is aiter-environment integration +# work, outside the GEMM-tuning scope. +# +# All a8w4 (model, dtype) pairs are therefore QUARANTINED until the e2e a8w4 +# correctness path is validated. Their rows are kept for provenance but excluded +# from the validated baseline and from any win claim -- a genuine correctness +# block, not a silent scope reduction. +QUARANTINED_SHAPES: Tuple[Tuple[str, str], ...] = ( + ("deepseek_v3", "a8w4"), + ("deepseek_v4", "a8w4"), + ("kimi_k2", "a8w4"), + ("gpt_oss", "a8w4"), +) + + +def is_quarantined(model: str, dtype: str) -> bool: + """True if (model, dtype) is correctness-quarantined (see QUARANTINED_SHAPES).""" + return (model, dtype) in QUARANTINED_SHAPES + + +def validated_models(): + """Yield (ModelShape, dtype) pairs that are NOT correctness-quarantined.""" + for m in MODELS: + for dtype in m.dtypes: + if not is_quarantined(m.name, dtype): + yield m, dtype + + +def validated_point_keys() -> set: + """(model, dtype, act, token) keys for the correctness-passing subset. + + This is the workload the validated baseline must fully cover; the quarantined + a8w4 shapes are excluded until their correctness path is fixed. 
def is_large_token(token: int) -> bool:
    """True if ``token`` is in the large-shape MFU regime (``token >= LARGE_TOKEN_MIN``)."""
    return token >= LARGE_TOKEN_MIN


def is_small_token(token: int) -> bool:
    """True if ``token`` is in the small-token latency regime (``token <= SMALL_TOKEN_MAX``)."""
    return token <= SMALL_TOKEN_MAX


def is_regression(baseline_us: float, tuned_us: float, token: int | None = None) -> bool:
    """No-regression gate (the no-regression policy).

    A point is a regression iff BOTH bands are exceeded:
    ``tuned_us > baseline_us * (1 + REGRESSION_REL)`` AND
    ``tuned_us - baseline_us > floor``.

    Args:
        baseline_us: Baseline latency in microseconds.
        tuned_us: Candidate latency in microseconds.
        token: Token count used to select the absolute floor through
            ``abs_floor_us(token)``. When ``None`` the strict ``ABS_US_BAND``
            floor is used (back-compatible with callers that pass no token).

    Returns:
        True when the candidate is slower than the baseline by more than both
        the relative and the absolute tolerance.

    Applied per point on BOTH the kernel-path and e2e metrics; a point is a
    regression if either metric regresses.
    """
    floor = ABS_US_BAND if token is None else abs_floor_us(token)
    over_relative_band = tuned_us > baseline_us * (1.0 + REGRESSION_REL)
    over_absolute_band = (tuned_us - baseline_us) > floor
    return over_relative_band and over_absolute_band


def is_large_shape_win(baseline_mfu: float, tuned_mfu: float) -> bool:
    """Large-shape win gate (the win-margin policy).

    True iff ``tuned_mfu >= baseline_mfu * (1 + WIN_MARGIN)``.
    """
    return tuned_mfu >= baseline_mfu * (1.0 + WIN_MARGIN)
def effective_tflops(token: int, model_dim: int, inter_dim: int, topk: int, combined_us: float) -> float:
    """Combined effective TFLOPS, following the aiter test_moe_2stage formula.

    Computes ``token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6`` where
    ``combined_us`` is a latency in microseconds.
    """
    flop_count = token * model_dim * inter_dim * 3 * topk * 2
    flops_per_us = flop_count / combined_us
    return flops_per_us / 1e6


def mfu(effective_tflops_value: float) -> float:
    """Return MFU: effective TFLOPS divided by the fp4 peak (``FP4_PEAK_TFLOPS``)."""
    utilisation = effective_tflops_value / FP4_PEAK_TFLOPS
    return utilisation
+ +Usage: + python3 scripts/aiter_strict_point.py \ + --model-dim 7168 --inter-dim 256 -e 257 -k 9 -t 16 \ + --aq fp4 --wq fp4 --act silu --gate separated \ + [--warmup 10 --iters 100] [--no-aot] [--aiter-repo /sgl-workspace/aiter] +""" + +from __future__ import annotations + +import argparse +import importlib.util +import json +import sys + + +def _load_aiter_module(aiter_repo: str): + """Import test_moe_2stage.py without running its default CLI sweep. + + The module has no ``__main__`` guard, so executing it runs the bottom sweep; + we set argv to ``--no-legacy --no-flydsl-csv`` first to make that sweep empty. + """ + sys.argv = ["test_moe_2stage.py", "--no-legacy", "--no-flydsl-csv"] + path = f"{aiter_repo}/op_tests/test_moe_2stage.py" + spec = importlib.util.spec_from_file_location("aiter_test_moe_2stage", path) + mod = importlib.util.module_from_spec(spec) + spec.loader.exec_module(mod) + return mod + + +_DTYPES = {} + + +def _resolve_dtypes(): + from aiter import dtypes + + return { + "fp4": dtypes.fp4x2, + "fp8": dtypes.fp8, + "bf16": dtypes.bf16, + "fp16": dtypes.fp16, + } + + +def main(argv=None) -> int: + ap = argparse.ArgumentParser(description="strict single-case aiter MoE guardrail") + ap.add_argument("--model-dim", type=int, required=True) + ap.add_argument("--inter-dim", type=int, required=True) + ap.add_argument("-e", "--experts", type=int, required=True) + ap.add_argument("-k", "--topk", type=int, required=True) + ap.add_argument("-t", "--token", type=int, required=True) + ap.add_argument("--aq", required=True, help="activation quant dtype: fp4|fp8|bf16") + ap.add_argument("--wq", default="fp4", help="weight quant dtype (fp4)") + ap.add_argument("--act", required=True, help="silu|swiglu") + ap.add_argument("--gate", default="separated", help="separated|interleave") + ap.add_argument("--warmup", type=int, default=10) + ap.add_argument("--iters", type=int, default=100) + ap.add_argument("--no-aot", action="store_true", help="disable AOT-cache check 
(records it)") + ap.add_argument("--aiter-repo", default="/sgl-workspace/aiter") + args = ap.parse_args(argv) + + mod = _load_aiter_module(args.aiter_repo) + import aiter + + dts = _resolve_dtypes() + aq, wq = dts[args.aq], dts[args.wq] + act = getattr(aiter.ActivationType, args.act.capitalize()) + check_aot = not args.no_aot + + # Inject the locked e2e protocol by wrapping the module's run_perftest so the + # internal warmup=2/iters=5 are overridden with the locked values. + _orig_run_perftest = mod.run_perftest + + # True timed-loop e2e distribution: after a warmup, time the fused_moe call per + # iteration (median + p95 over `iters`) IN ADDITION TO aiter's own rotated + # average. We keep aiter's rotated average as the median e2e_us (it defeats L2 + # via arg rotation, matching the L2-flush intent and staying comparable across + # runs) and use the per-iteration loop only for the e2e p95 dispersion. + e2e_dist = {"median": None, "p95": None} + # run_perftest's own control kwargs are NOT forwarded to the timed callable. + _PERF_CTRL_KW = ("num_iters", "num_warmup", "testGraph", "num_rotate_args", "needTrace") + + def _locked_run_perftest(func, *a, **kw): + # aiter's rotated average (locked warmup/iters) -> the comparable median. + kw_avg = dict(kw) + kw_avg["num_iters"] = args.iters + kw_avg["num_warmup"] = args.warmup + data, avg = _orig_run_perftest(func, *a, **kw_avg) + e2e_dist["median"] = avg + # Per-iteration p95 dispersion (best-effort; does not change the median). 
+ try: + import torch + + call_kw = {k: v for k, v in kw.items() if k not in _PERF_CTRL_KW} + lat = [] + ev0 = torch.cuda.Event(enable_timing=True) + ev1 = torch.cuda.Event(enable_timing=True) + for _ in range(max(1, args.iters)): + ev0.record() + func(*a, **call_kw) + ev1.record() + ev1.synchronize() + lat.append(ev0.elapsed_time(ev1) * 1000.0) # ms -> us + ordered = sorted(lat) + idx = max(0, min(len(ordered) - 1, int(round(0.95 * (len(ordered) - 1))))) + e2e_dist["p95"] = ordered[idx] + except Exception: + e2e_dist["p95"] = None + return data, avg + + mod.run_perftest = _locked_run_perftest + + test_fn = mod.test_fmoe_with_aot_cache_check if check_aot else mod.test_fmoe + + result = { + "strict_accuracy": True, + "check_aot_cache": check_aot, + "warmup": args.warmup, + "iters": args.iters, + "act": args.act, + "gate": args.gate, + "aq": args.aq, + "wq": args.wq, + } + try: + ret = test_fn( + aiter.dtypes.bf16, + args.token, + args.model_dim, + args.inter_dim, + args.experts, + args.topk, + act, + args.gate, + aiter.QuantType.per_1x32, + aq, + wq, + use_g1u1=True, + doweight_stage1=False, + strict_accuracy=True, + check_aot_cache=check_aot, + ) + if ret is None: + result.update({"error": "skipped_or_none", "error_category": "skipped", "correctness_pass": False}) + else: + ld = float(ret["logits_diff"]) + result.update( + { + "e2e_us": e2e_dist["median"] if e2e_dist["median"] is not None else float(ret["us"]), + "e2e_us_p95": e2e_dist["p95"], + "logits_diff": ld, + "correctness_pass": ld <= 0.01, + "error_category": "" if ld <= 0.01 else "correctness", + } + ) + except Exception as e: # AOT miss, strict assertion, or runtime error. 
+ name = type(e).__name__ + msg = str(e) + if "AOT cache miss" in msg: + cat = "aot_miss" + elif name == "AssertionError" or "accuracy check failed" in msg: + cat = "correctness" + elif "out of memory" in msg.lower() or "OOM" in msg: + cat = "oom" + else: + cat = "runtime" + result.update({"error": f"{name}: {msg[:200]}", "error_category": cat, "correctness_pass": False}) + finally: + mod.run_perftest = _orig_run_perftest + + print("STRICT_RESULT " + json.dumps(result), flush=True) + return 0 if result.get("correctness_pass") else 1 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/moe_tuning_harness.py b/scripts/moe_tuning_harness.py new file mode 100644 index 000000000..8f9b3ba8d --- /dev/null +++ b/scripts/moe_tuning_harness.py @@ -0,0 +1,1157 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: MIT +# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved. + +"""Measurement harness for the MXFP4 MoE 2-stage tuning campaign on gfx950. + +The harness emits a per-point CSV that is the single reference table every +candidate is compared against. Two measurement paths feed it: + +* **Per-stage kernel-path us** comes from the FlyDSL ``tests/kernels/test_moe_gemm.py`` + benchmark, which prints ``FlyDSL MoE stage1[..]`` / ``FlyDSL MoE stage2 [..]`` + lines with per-stage us. Combined kernel-path us = stage1 + stage2 + sorting. +* **Strict correctness + full fused-MoE e2e us** comes from the aiter + ``op_tests/test_moe_2stage.py`` harness (``strict_accuracy``, + ``logits_diff <= 0.01``, ``fail_on_aot_cache_miss``). That harness times the + whole ``fused_moe`` call as the e2e guardrail. + +Every row records full provenance (GPU id+model, branch+commit, exact command, +shape, dtype+act, warmup/iters, idle-GPU check) and the resolved metric formula, +under the locked protocol in :mod:`kernels.moe_tuning_spec`. 
+ +This module keeps the parsing / metric / provenance / CSV logic as pure +functions so they are unit-testable without a GPU. The live sweep driver +(:func:`run_point`) shells out to the two harnesses and is intended to run on the +fixed idle gfx950 node. +""" + +from __future__ import annotations + +import csv +import json +import os +import re +import shlex +import statistics +import subprocess +import sys +from dataclasses import dataclass +from typing import Dict, List, Optional + +_REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +if _REPO_ROOT not in sys.path: + sys.path.insert(0, _REPO_ROOT) + +from kernels import moe_tuning_spec as spec # noqa: E402 + +# CSV columns: provenance first, then shape/config, then metrics. +CSV_COLUMNS = [ + # provenance + "gpu_id", + "gpu_model", + "branch", + "commit", + "command", + "warmup", + "iters", + "idle_gpu_verified", + "graph_capture", + "l2_flush_per_iter", + "clocks_pinned", + "metric_formula", + # shape / config + "model", + "model_dim", + "inter_dim", + "experts", + "topk", + "dtype", + "act", + "token", + "tile_m1", + "tile_n1", + "tile_k1", + "tile_m2", + "tile_n2", + "tile_k2", + # metrics (median + p95 over iters) + "stage1_us", + "stage2_us", + "sorting_us", + "kernel_path_us", + "kernel_path_us_p95", + "effective_tflops", + "mfu", + "e2e_us", + "e2e_us_p95", + "logits_diff", + "correctness_pass", + # failure provenance (auditable for quarantined / failing rows) + "flydsl_command", + "strict_error", + "error_category", + "aot_status", +] + +METRIC_FORMULA = ( + "effective_tflops = token*model_dim*inter_dim*3*topk*2 / combined_us / 1e6; mfu = effective_tflops / 4523" +) + +# Print formats from tests/kernels/test_moe_gemm.py (the first us is the median; +# an optional " p95= us" suffix appears when FLYDSL_PERF_DIST is set): +# "FlyDSL MoE stage1[fp4]: 1163.2 us, p95=1170.0 us 1654.24 TFLOPS(...), 0.377 TB/s (...)" +# "FlyDSL MoE stage2 [moe_gemm2] fp4 atomic | ... 
| 1163.2 us, p95=1170.0 us 1654.24 TFLOPS, 0.377 TB/s" +_STAGE1_RE = re.compile(r"FlyDSL MoE stage1\[[^\]]+\]:\s*([0-9.]+)\s*us") +_STAGE2_RE = re.compile(r"FlyDSL MoE stage2 \[[^\]]+\]\s+\S+\s+(atomic|reduce)\b.*?([0-9.]+)\s*us") +# Optional per-stage p95 suffix. +_STAGE1_P95_RE = re.compile(r"FlyDSL MoE stage1\[[^\]]+\]:\s*[0-9.]+\s*us,\s*p95=([0-9.]+)\s*us") +_STAGE2_P95_RE = re.compile( + r"FlyDSL MoE stage2 \[[^\]]+\]\s+\S+\s+(?:atomic|reduce)\b.*?[0-9.]+\s*us,\s*p95=([0-9.]+)\s*us" +) +# Optional sorting print, if the FlyDSL benchmark emits one. +_SORT_RE = re.compile(r"FlyDSL MoE sort(?:ing)?[^\d]*([0-9.]+)\s*us", re.IGNORECASE) + +# aiter op_tests/test_moe_2stage.py full fused_moe e2e print (line 363): +# "ck_moe_2stages: 123.45 us, 654.00 tflops......(quant:...)" +_AITER_E2E_RE = re.compile(r"ck_moe_2stages:\s*([0-9.]+)\s*us") +# aiter logits_diff warning line (only printed when logits_diff > 1e-3). +_AITER_LOGITS_RE = re.compile(r"logits_diff[:=]\s*([0-9.eE+-]+)") +# aiter summary markdown data row: the final two numeric cells are +# ``... | | | |``. This carries logits_diff even +# when it is below the 1e-3 warning threshold (so no warning line is printed). +_AITER_MD_ROW_RE = re.compile(r"\|\s*([0-9][0-9.eE+-]*)\s*\|\s*([0-9][0-9.eE+-]*)\s*\|\s*\w+\s*\|\s*$") +# Real correctness-miss signals: the strict-accuracy assertion or a hard error. +# NOTE: the bare ``checkAllclose ... failed!`` line is the LOOSE elementwise check +# and is EXPECTED for fp4; correctness is gated on logits_diff <= 0.01 per the +# locked contract, not on that line. +_AITER_FAIL_RE = re.compile(r"accuracy check failed|AssertionError|Traceback|RuntimeError", re.IGNORECASE) + +# aiter -q quant index -> dtype alias used here (see l_quant in the harness). 
+DTYPE_ALIAS_TO_AITER_Q = {"a4w4": 4, "a8w4": 7} + + +@dataclass +class Provenance: + """Run provenance recorded with every measured point.""" + + gpu_id: str = "" + gpu_model: str = "" + branch: str = "" + commit: str = "" + warmup: int = spec.WARMUP_ITERS + iters: int = spec.BENCH_ITERS + idle_gpu_verified: bool = False + graph_capture: bool = spec.GRAPH_CAPTURE + l2_flush_per_iter: bool = spec.L2_FLUSH_PER_ITER + # NOT proof until verified: defaults False so a row never claims pinned clocks + # unless the driver enabled performance determinism AND verified the state. + # (spec.CLOCKS_PINNED is the protocol's INTENT, not evidence.) + clocks_pinned: bool = False + metric_formula: str = METRIC_FORMULA + + REQUIRED_FIELDS = ("gpu_id", "gpu_model", "branch", "commit", "warmup", "iters") + + def missing_fields(self) -> List[str]: + """Required provenance fields that are empty/unset (the baseline contract negative gate).""" + missing = [] + for f in self.REQUIRED_FIELDS: + v = getattr(self, f) + if v in ("", None): + missing.append(f) + return missing + + def is_complete(self) -> bool: + return not self.missing_fields() + + +@dataclass +class PointRow: + """One per-point measurement row (provenance + shape/config + metrics).""" + + provenance: Provenance + command: str + model: str + model_dim: int + inter_dim: int + experts: int + topk: int + dtype: str + act: str + token: int + tile_m1: int = 0 + tile_n1: int = 0 + tile_k1: int = 0 + tile_m2: int = 0 + tile_n2: int = 0 + tile_k2: int = 0 + stage1_us: Optional[float] = None + stage2_us: Optional[float] = None + sorting_us: Optional[float] = None + kernel_path_us: Optional[float] = None + kernel_path_us_p95: Optional[float] = None + effective_tflops: Optional[float] = None + mfu: Optional[float] = None + e2e_us: Optional[float] = None + e2e_us_p95: Optional[float] = None + logits_diff: Optional[float] = None + correctness_pass: Optional[bool] = None + flydsl_command: str = "" + strict_error: str = "" + error_category: 
def parse_flydsl_stage_us(stdout: str) -> dict:
    """Pull the stage1/stage2 timings (and optional p95 values) out of FlyDSL stdout.

    The result always has the keys ``stage1_us``, ``stage2_us``, ``stage1_p95``
    and ``stage2_p95``. For each key the LAST match in ``stdout`` is used; a
    key whose pattern never matched is ``None``.
    """
    stage1_hits = _STAGE1_RE.findall(stdout)
    stage2_hits = _STAGE2_RE.findall(stdout)
    stage1_p95_hits = _STAGE1_P95_RE.findall(stdout)
    stage2_p95_hits = _STAGE2_P95_RE.findall(stdout)

    parsed = {"stage1_us": None, "stage2_us": None, "stage1_p95": None, "stage2_p95": None}
    if stage1_hits:
        parsed["stage1_us"] = float(stage1_hits[-1])
    if stage2_hits:
        # NOTE(review): element [1] of the last stage2 match is converted, which
        # presumes _STAGE2_RE has (at least) two capture groups so that findall
        # yields tuples -- confirm against the pattern definition.
        parsed["stage2_us"] = float(stage2_hits[-1][1])
    if stage1_p95_hits:
        parsed["stage1_p95"] = float(stage1_p95_hits[-1])
    if stage2_p95_hits:
        parsed["stage2_p95"] = float(stage2_p95_hits[-1])
    return parsed
def parse_strict_aiter_output(stdout: str) -> dict:
    """Parse the ``STRICT_RESULT {json}`` line found in ``stdout``.

    Only lines that START with ``"STRICT_RESULT "`` are considered, and the last
    such line wins. The returned dict always has the keys ``e2e_us``,
    ``e2e_us_p95``, ``logits_diff``, ``correctness_pass``, ``error``,
    ``error_category`` and ``aot_status``.

    Failure modes (all with ``correctness_pass=False`` and ``None`` metrics):
      - no ``STRICT_RESULT`` line: ``error="no_strict_result"``,
        ``error_category="no_result"``;
      - payload is not valid JSON, or is valid JSON that is not an object
        (e.g. a list, number or ``null``): ``error="bad_strict_json"``,
        ``error_category="bad_json"``.

    On success ``aot_status`` is ``"checked"`` when the payload's
    ``check_aot_cache`` is truthy, else ``"no_aot"``.
    """
    marker = "STRICT_RESULT "
    payload = None
    for ln in stdout.splitlines():
        if ln.startswith(marker):
            payload = ln[len(marker) :]  # no break: the last marker line wins

    empty = {
        "e2e_us": None,
        "e2e_us_p95": None,
        "logits_diff": None,
        "correctness_pass": False,
        "error": "no_strict_result",
        "error_category": "no_result",
        "aot_status": "",
    }
    if payload is None:
        return empty

    bad_json = {**empty, "error": "bad_strict_json", "error_category": "bad_json"}
    try:
        d = json.loads(payload)
    except json.JSONDecodeError:
        return bad_json
    # A syntactically valid but non-object payload has no ``.get``; treat it as
    # malformed output instead of crashing the whole sweep.
    if not isinstance(d, dict):
        return bad_json

    return {
        "e2e_us": d.get("e2e_us"),
        "e2e_us_p95": d.get("e2e_us_p95"),
        "logits_diff": d.get("logits_diff"),
        "correctness_pass": bool(d.get("correctness_pass")),
        "error": d.get("error", ""),
        "error_category": d.get("error_category", ""),
        "aot_status": "checked" if d.get("check_aot_cache") else "no_aot",
    }
def gpu_provenance(gpu_id: str) -> dict:
    """Look up the product name of GPU ``gpu_id`` via ``rocm-smi``.

    The query is scoped to the requested device with ``-d`` (the same way the
    idle and clock probes in this module address a device), so the first
    ``Card Series`` line in the output belongs to ``gpu_id`` rather than to
    whichever card happens to be listed first.

    Returns ``{"gpu_id": str(gpu_id), "gpu_model": <name>}``; ``gpu_model`` is
    the empty string when ``rocm-smi`` is unavailable or reports no
    ``Card Series`` line for the device.
    """
    device = str(gpu_id)
    listing = _run(["rocm-smi", "-d", device, "--showproductname"])
    # The value is whatever follows the last ':' on the first matching line.
    model = next(
        (line.split(":")[-1].strip() for line in listing.splitlines() if "Card Series" in line),
        "",
    )
    return {"gpu_id": device, "gpu_model": model}
def select_run_points(model=None, dtype=None, tokens=None) -> List[RunPoint]:
    """Return the points of the full run list that pass the given filters.

    ``model`` / ``dtype`` are compared for exact equality and ``None`` disables
    the respective filter. ``tokens`` is an iterable whose items are converted
    with ``int``; a falsy value (``None`` or empty) disables the token filter.
    The relative order of ``build_run_list()`` is preserved.
    """
    wanted_tokens = {int(t) for t in tokens} if tokens else None

    def _selected(point) -> bool:
        # Each active filter can veto the point; inactive filters are skipped.
        if model is not None and point.model != model:
            return False
        if dtype is not None and point.dtype != dtype:
            return False
        if wanted_tokens is not None and point.token not in wanted_tokens:
            return False
        return True

    return [point for point in build_run_list() if _selected(point)]
+ + Requirements (raises ValueError, recording a machine-readable rejection for + illegal tiles, so the caller fails closed WITHOUT writing a partial CSV): + - at least one explicit tile override must be given (no silent default-tile + fallback for candidate mode); + - the selection must match at least one point; + - EVERY selected point's tiles must pass the legality filter — the first + illegal point aborts the whole run (a candidate run must be all-legal). + + ``prov`` (a ``Provenance``) and ``command`` (the exact top-level invocation) + supply the run-provenance class carried by every rejected-candidate record so + a rejection is as auditable as a measured attempt. When ``prov`` is None the + git branch/commit are still resolved (host-side path), so the record stays + complete; GPU identity is then left to the caller's monkeypatch/tests. + """ + import moe_tuning_ledger as _ledger + + if not any(v is not None for v in overrides.values()): + raise ValueError("candidate mode requires at least one explicit --tile-* override") + run_list = select_run_points(model=model, dtype=dtype, tokens=tokens) + if not run_list: + raise ValueError("candidate selection matched no points") + # Provenance shared by every rejection from this run (filled from prov + git). 
+ git = git_provenance() + base_prov = { + "gpu_id": getattr(prov, "gpu_id", "") or "", + "gpu_model": getattr(prov, "gpu_model", "") or "", + "branch": getattr(prov, "branch", "") or git.get("branch", ""), + "commit": getattr(prov, "commit", "") or git.get("commit", ""), + "warmup": getattr(prov, "warmup", spec.WARMUP_ITERS), + "iters": getattr(prov, "iters", spec.BENCH_ITERS), + "command": command, + "selection": {"model": model, "dtype": dtype, "tokens": list(tokens) if tokens else None}, + } + tiles = [] + for rp in run_list: + try: + tiles.append(candidate_tile_for(rp, overrides)) + except ValueError as e: + _ledger.append_rejected_candidate( + { + **base_prov, + "model": rp.model, + "dtype": rp.dtype, + "act": rp.act, + "token": rp.token, + "stage": 0, # candidate-tile rejection spans both stages; reason names the stage + "config": {k: overrides.get(k) for k in overrides}, + "reason": str(e), + # No measured artifact exists for a pre-compile rejection, but + # the keys must be present to match a measured attempt's schema. + "csv_path": "", + "profile_path": "", + } + ) + raise ValueError(f"illegal candidate at {rp.model}/{rp.dtype} t={rp.token}: {e}") from e + return run_list, tiles + + +# --- baseline validation gate (the baseline contract negative tests) ------------------------ + +# The locked baseline must come from this exact commit (DEC scope). +LOCKED_BASELINE_COMMIT = "523ca1c7" +# Identity/provenance fields every baseline row must carry beyond the protocol. +ROW_REQUIRED_FIELDS = ("command", "dtype", "act", "model", "token") +# Numeric metric fields every baseline row must carry, parseable as float +# (the baseline contract + the no-regression policy: per-stage, combined kernel-path median+p95, effective TFLOPS, +# MFU, and the e2e guardrail median+p95, plus the correctness logits_diff). 
+ROW_REQUIRED_METRIC_FIELDS = ( + "stage1_us", + "stage2_us", + "sorting_us", + "kernel_path_us", + "kernel_path_us_p95", + "effective_tflops", + "mfu", + "e2e_us", + "e2e_us_p95", + "logits_diff", +) + + +def _is_float(v) -> bool: + if v in (None, "", "None"): + return False + try: + float(v) + return True + except (TypeError, ValueError): + return False + + +def validate_baseline_row(row: dict) -> List[str]: + """Return reasons ``row`` is NOT an acceptable locked-baseline row (empty=OK). + + Rejects rows that are not from the locked commit, not idle-GPU verified, miss + a required provenance/identity field, miss or non-numeric any the baseline contract/the no-regression policy metric + field (per-stage, kernel-path median+p95, effective TFLOPS, MFU, e2e + median+p95, logits_diff), are not correctness_pass=True, or use a non-locked + protocol (warmup/iters/graph/L2/clock). + """ + reasons: List[str] = [] + + commit = str(row.get("commit", "")) + if not commit: + reasons.append("missing_commit") + elif not commit.startswith(LOCKED_BASELINE_COMMIT): + reasons.append(f"commit_not_{LOCKED_BASELINE_COMMIT}") + + if str(row.get("idle_gpu_verified", "")).lower() not in ("true", "1"): + reasons.append("idle_gpu_not_verified") + + for f in ("gpu_id", "gpu_model", "branch", *ROW_REQUIRED_FIELDS): + if str(row.get(f, "")).strip() in ("", "None"): + reasons.append(f"missing_{f}") + + # Every the baseline contract/the no-regression policy metric must be present AND numeric. + for f in ROW_REQUIRED_METRIC_FIELDS: + if not _is_float(row.get(f)): + reasons.append(f"missing_{f}") + + # Correctness gate must have passed for this point. + if str(row.get("correctness_pass", "")).lower() not in ("true", "1"): + reasons.append("correctness_not_passed") + + # Locked protocol (the no-regression policy): warmup=10, iters=100, graph OFF, L2 flush on, clocks pinned. 
def validate_baseline_csv(path: str, expected_keys: Optional[set] = None) -> dict:
    """Check a baseline CSV row by row and verify it covers ``expected_keys``.

    A row is identified by its ``(model, dtype, act, token)`` column values.
    Rows whose key is not in ``expected_keys`` are skipped entirely. Every
    other row is passed to :func:`validate_baseline_row`; any reasons it
    returns are stored under ``str(key)``.

    ``expected_keys`` defaults to :func:`expected_point_keys` when ``None``.

    Returns ``{"valid", "row_errors", "missing_points", "n_rows"}`` where
    ``valid`` is True only if no row produced errors and no expected key is
    absent, ``missing_points`` is the sorted ``str`` form of the absent keys,
    and ``n_rows`` counts all rows read (including skipped ones).
    """
    wanted = expected_point_keys() if expected_keys is None else expected_keys
    all_rows = read_csv(path)

    key_columns = ("model", "dtype", "act", "token")
    failures = {}
    covered = set()
    for record in all_rows:
        point_key = tuple(record.get(col) for col in key_columns)
        if point_key in wanted:
            covered.add(point_key)
            problems = validate_baseline_row(record)
            if problems:
                failures[str(point_key)] = problems
        # else: quarantined / out-of-subset row, not validated here.

    absent = sorted(str(k) for k in (wanted - covered))
    return {
        "valid": not (failures or absent),
        "row_errors": failures,
        "missing_points": absent,
        "n_rows": len(all_rows),
    }
def setup_run_provenance(gpu_id: str, assume_idle: bool = False, repo_ref: str = _REPO_ROOT) -> Provenance:
    """Assemble the ``Provenance`` for a live run from probed host state.

    Steps, in order: determine idleness (``assume_idle`` short-circuits the
    probe to True), ask ``pin_clocks`` to pin the clocks (its return value is
    ignored), then read the state back with ``clocks_pinned_state`` -- only
    that read-back is recorded as ``clocks_pinned``. Finally the git and GPU
    identity dicts are merged into the record's attributes.
    """
    if assume_idle:
        gpu_is_idle = True
    else:
        gpu_is_idle = check_idle_gpu(gpu_id)

    pin_clocks(gpu_id)  # best effort; success is judged by the read-back below
    verified_pinned = clocks_pinned_state(gpu_id)

    record = Provenance(idle_gpu_verified=gpu_is_idle, clocks_pinned=verified_pinned)
    attributes = vars(record)
    attributes.update(git_provenance(repo_ref))
    attributes.update(gpu_provenance(gpu_id))
    return record
+DTYPE_ALIAS_TO_GATE = {"a4w4": "separated", "a8w4": "interleave"} + + +def _aiter_cmd(rp: RunPoint, check_aot: bool = True) -> List[str]: + """Strict, AOT-checked, model-correct single-case aiter guardrail command. + + Invokes ``scripts/aiter_strict_point.py`` which calls aiter ``test_fmoe`` with + the model's TRUE activation and gate mode, ``strict_accuracy=True``, the + AOT-cache-wrapped variant (``check_aot`` -> ``fail_on_aot_cache_miss``), and + the locked e2e protocol (warmup=10/iters=100 injected over aiter's internal + 2/5). This is NOT the aiter legacy CLI (which is non-strict, non-AOT, and + hardcodes Swiglu/INTERLEAVE for the fp8xfp4 case). + """ + aq = spec.DTYPE_ALIAS_TO_A_DTYPE[rp.dtype] # a4w4->fp4, a8w4->fp8 + gate = DTYPE_ALIAS_TO_GATE[rp.dtype] + cmd = [ + "python3", + os.path.join(_REPO_ROOT, "scripts", "aiter_strict_point.py"), + "--model-dim", + str(rp.model_dim), + "--inter-dim", + str(rp.inter_dim), + "-e", + str(rp.experts), + "-k", + str(rp.topk), + "-t", + str(rp.token), + "--aq", + aq, + "--wq", + "fp4", + "--act", + rp.act, + "--gate", + gate, + "--warmup", + str(spec.WARMUP_ITERS), + "--iters", + str(spec.BENCH_ITERS), + "--aiter-repo", + AITER_REPO, + ] + if not check_aot: + cmd.append("--no-aot") + return cmd + + +def _exec(cmd: List[str], gpu_id: str, extra_env: Optional[dict] = None) -> str: + env = dict(os.environ) + env["HIP_VISIBLE_DEVICES"] = str(gpu_id) + if extra_env: + env.update({k: str(v) for k, v in extra_env.items()}) + try: + out = subprocess.run(cmd, env=env, capture_output=True, text=True, timeout=3600) + return (out.stdout or "") + "\n" + (out.stderr or "") + except Exception as e: # pragma: no cover - live-run only + return f"HARNESS_EXEC_ERROR: {e}" + + +def run_point( + rp: RunPoint, + tile: dict, + gpu_id: str, + provenance: Provenance, + measure_e2e: bool = True, + reps: int = 3, + check_aot: bool = True, +) -> PointRow: # pragma: no cover - exercised only on the gfx950 node + """Measure one workload point: 
FlyDSL per-stage us + aiter e2e/correctness. + + ``tile`` carries tile_m1/n1/k1 and tile_n2/k2 (stage1 + stage2 tiles). The + combined kernel-path us = stage1 + stage2 + sorting; the aiter run supplies + the e2e guardrail us, logits_diff, and correctness pass/fail. + + Median + p95 come from the TRUE timed loop inside each subprocess: the FlyDSL + benchmark runs with ``FLYDSL_PERF_DIST=1`` (per-iteration median+p95 over + ``iters``) and the strict aiter runner times fused_moe per iteration. ``reps`` + here is just how many independent subprocess samples to take of the median; the + per-point p95 is the timed-loop p95 (median of the per-rep p95 values), NOT a + dispersion across reps. ``flydsl_command``, ``strict_error``, + ``error_category``, and ``aot_status`` are recorded for auditability. + + ``check_aot`` gates the strict aiter AOT-cache check; when False the e2e still + runs strict+correct but does not require a pre-populated AOT cache (recorded as + ``aot_status="no_aot"``). ``command`` names ONLY the commands actually executed + for this row: the aiter command is appended only when ``measure_e2e`` is True. + """ + flydsl_cmd = _flydsl_cmd(rp, gpu_id, tile) + aiter_cmd = _aiter_cmd(rp, check_aot=check_aot) + # The FlyDSL benchmark must emit its true per-iteration distribution; the env + # is part of the reproducible command provenance (a replay must set it too). + flydsl_env = {"FLYDSL_PERF_DIST": "1"} + env_prefix = f"HIP_VISIBLE_DEVICES={gpu_id} FLYDSL_PERF_DIST=1 " + flydsl_command_str = env_prefix + " ".join(flydsl_cmd) + # Only name commands that actually run for this row (truthful provenance). 
+ command = flydsl_command_str + if measure_e2e: + command += " ; " + f"HIP_VISIBLE_DEVICES={gpu_id} " + " ".join(aiter_cmd) + + s1_samples, s2_samples, sort_samples, combined_samples = [], [], [], [] + s1_p95s, s2_p95s = [], [] + for _ in range(max(1, reps)): + out = _exec(flydsl_cmd, gpu_id, extra_env=flydsl_env) + stages = parse_flydsl_stage_us(out) + if stages["stage1_us"] is None or stages["stage2_us"] is None: + continue + srt = parse_flydsl_sorting_us(out) or 0.0 + s1_samples.append(stages["stage1_us"]) + s2_samples.append(stages["stage2_us"]) + sort_samples.append(srt) + combined_samples.append(combined_kernel_path_us(stages["stage1_us"], stages["stage2_us"], srt)) + if stages["stage1_p95"] is not None: + s1_p95s.append(stages["stage1_p95"]) + if stages["stage2_p95"] is not None: + s2_p95s.append(stages["stage2_p95"]) + + e2e_samples, e2e_p95s, logits_samples, correctness = [], [], [], None + strict_error, error_category, aot_status = "", "", "" + if measure_e2e: + for _ in range(max(1, reps)): + res = parse_strict_aiter_output(_exec(aiter_cmd, gpu_id)) + if res["e2e_us"] is not None: + e2e_samples.append(res["e2e_us"]) + if res.get("e2e_us_p95") is not None: + e2e_p95s.append(res["e2e_us_p95"]) + if res["logits_diff"] is not None: + logits_samples.append(res["logits_diff"]) + rep_ok = res["correctness_pass"] + correctness = rep_ok if correctness is None else (correctness and bool(rep_ok)) + # keep the last rep's failure provenance (representative). 
def row_missing_kernel_path(row: "PointRow") -> bool:
    """Tell whether ``row`` lacks a usable kernel-path timing.

    A row counts as unmeasured when any of ``stage1_us``, ``stage2_us`` or
    ``kernel_path_us`` is ``None`` (a value of ``0.0`` is still a measurement).
    Callers use this to treat such a row as a rejected measurement rather than
    as a performance loss.
    """
    timing_fields = ("stage1_us", "stage2_us", "kernel_path_us")
    return any(getattr(row, name) is None for name in timing_fields)
+ ap.add_argument("--model", default=None, help="restrict to one model (candidate mode)") + ap.add_argument("--dtype", default=None, help="restrict to one dtype alias, e.g. a4w4 (candidate mode)") + ap.add_argument("--tokens", default=None, help="comma/space-separated token list (candidate mode)") + ap.add_argument("--reps", type=int, default=3, help="independent subprocess reps per point") + for _k in ("tile-m1", "tile-n1", "tile-k1", "tile-n2", "tile-k2"): + ap.add_argument(f"--{_k}", type=int, default=None, help=f"candidate {_k.replace('-', '_')} override") + args = ap.parse_args(argv) + + if args.mode == "list": + for rp in build_run_list(): + print(rp) + return 0 + + if args.mode == "validate": + res = validate_baseline_csv(args.csv) + print(json.dumps(res, indent=2)) + return 0 if res["valid"] else 1 + + prov = setup_run_provenance(args.gpu, assume_idle=args.assume_idle) + print(f"clocks_pinned (verified)={prov.clocks_pinned} idle_gpu_verified={prov.idle_gpu_verified}") + # The locked protocol requires fixed clocks: if verification failed, do not + # emit a baseline that falsely claims pinned clocks. + if spec.CLOCKS_PINNED and not prov.clocks_pinned and not args.allow_unpinned: + print( + "ERROR: locked protocol requires pinned clocks but verification failed; " + "the run would be non-comparable. 
Re-run with the GPU clocks pinnable, " + "or pass --allow-unpinned to record clocks_pinned=False explicitly.", + file=sys.stderr, + ) + return 2 + + overrides = { + "tile_m1": args.tile_m1, + "tile_n1": args.tile_n1, + "tile_k1": args.tile_k1, + "tile_n2": args.tile_n2, + "tile_k2": args.tile_k2, + } + + if args.mode == "candidate": + toks = [int(t) for t in args.tokens.replace(",", " ").split()] if args.tokens else None + top_command = "python3 " + shlex.join([os.path.relpath(__file__, _REPO_ROOT), *(argv or sys.argv[1:])]) + try: + run_list, tiles = prepare_candidate_run( + overrides, model=args.model, dtype=args.dtype, tokens=toks, prov=prov, command=top_command + ) + except ValueError as e: + # Fail closed: do not write a partial CSV; rejection already recorded. + print(f"ERROR: candidate run rejected: {e}", file=sys.stderr) + return 2 + rows = [ + run_point( + rp, + tiles[i], + args.gpu, + prov, + measure_e2e=not args.no_e2e, + reps=args.reps, + check_aot=not args.no_aot_check, + ) + for i, rp in enumerate(run_list) + ] + # Fail closed on unmeasured rows: a missing kernel-path row is NOT a loss. + import moe_tuning_ledger as _ledger + + bad = [(rp, tiles[i], r) for i, (rp, r) in enumerate(zip(run_list, rows)) if row_missing_kernel_path(r)] + if bad: + for rp, tile, r in bad: + _ledger.append_rejected_candidate( + { + "model": rp.model, + "dtype": rp.dtype, + "act": rp.act, + "token": rp.token, + "stage": 1, + "config": {k: tile.get(k) for k in ("tile_m1", "tile_n1", "tile_k1", "tile_n2", "tile_k2")}, + "reason": "no parseable kernel-path stage times emitted (unmeasured shape; e.g. 
" + "tile_k1!=256 / tile_n1=512 harness limitation)", + "selection": {"model": args.model, "dtype": args.dtype, "tokens": toks}, + "gpu_id": prov.gpu_id, + "gpu_model": prov.gpu_model, + "branch": prov.branch, + "commit": prov.commit, + "command": top_command, + "warmup": prov.warmup, + "iters": prov.iters, + "csv_path": "", + "profile_path": "", + } + ) + print( + f"ERROR: {len(bad)} candidate point(s) produced no kernel-path measurement; " + "recorded as rejected measurements, no CSV written.", + file=sys.stderr, + ) + return 2 + else: # baseline: full grid, default tiles + run_list = build_run_list() + rows = [ + run_point( + rp, + default_tile_for(rp), + args.gpu, + prov, + measure_e2e=not args.no_e2e, + reps=args.reps, + check_aot=not args.no_aot_check, + ) + for rp in run_list + ] + + out = args.out or f"/tmp/moe_{args.mode}.csv" + write_csv(rows, out) + print(f"wrote {len(rows)} rows -> {out}") + return 0 + + +__all__ = [ + "CSV_COLUMNS", + "METRIC_FORMULA", + "LOCKED_BASELINE_COMMIT", + "Provenance", + "PointRow", + "RunPoint", + "parse_flydsl_stage_us", + "parse_flydsl_sorting_us", + "parse_aiter_output", + "parse_strict_aiter_output", + "combined_kernel_path_us", + "summarize", + "compute_metrics", + "git_provenance", + "gpu_provenance", + "check_idle_gpu", + "pin_clocks", + "clocks_pinned_state", + "setup_run_provenance", + "build_run_list", + "expected_point_keys", + "select_run_points", + "candidate_tile_for", + "prepare_candidate_run", + "default_tile_for", + "validate_baseline_row", + "validate_baseline_csv", + "run_point", + "row_missing_kernel_path", + "write_csv", + "read_csv", +] + + +if __name__ == "__main__": # pragma: no cover + raise SystemExit(_main()) diff --git a/scripts/moe_tuning_ledger.py b/scripts/moe_tuning_ledger.py new file mode 100644 index 000000000..80629cf0b --- /dev/null +++ b/scripts/moe_tuning_ledger.py @@ -0,0 +1,501 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: MIT +# Copyright (C) 2024-2026, Advanced Micro 
Devices, Inc. All rights reserved. + +"""Attempt ledger + Pareto comparison for the MXFP4 MoE tuning campaign. + +Every candidate attempt — win or loss — is appended to ``docs/attempts.jsonl`` +with full provenance (config, stage, model, dtype, act, GPU id+model, +branch+commit, command, warmup/iters, CSV/profile path, result). A human-facing +running log lives in ``docs/optimization-ledger.md``. + +The Pareto comparison takes a baseline per-point CSV and a candidate per-point +CSV (both emitted by ``scripts/moe_tuning_harness.py``) and reports, per point, +whether the candidate is a win / regression / neutral under the locked the win-margin policy / +the no-regression policy predicates. A win is only claimable when no point regresses on either the +kernel-path or e2e metric (no Pareto regression) and the re-run-stability rule +holds. +""" + +from __future__ import annotations + +import csv +import json +import os +import sys +import time +from dataclasses import asdict, dataclass, field +from typing import Dict, List, Optional, Tuple + +_REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +if _REPO_ROOT not in sys.path: + sys.path.insert(0, _REPO_ROOT) + +from kernels import moe_tuning_spec as spec # noqa: E402 + +ATTEMPTS_JSONL = os.path.join(_REPO_ROOT, "docs", "attempts.jsonl") +LEDGER_MD = os.path.join(_REPO_ROOT, "docs", "optimization-ledger.md") + +# Required provenance keys for any ledger attempt (the ledger contract). +REQUIRED_ATTEMPT_FIELDS = ( + "config", + "stage", + "model", + "dtype", + "act", + "gpu_id", + "gpu_model", + "branch", + "commit", + "command", + "warmup", + "iters", + "result", +) + +# A rejected search candidate never reaches compile/GPU, so it has no measured +# metrics (csv_path/profile_path stay empty), but it MUST still carry the same +# identity + run-provenance class as a measured attempt so the rejection is +# auditable (the rejected-candidate ledger contract). 
@dataclass
class Attempt:
    """A single ledger entry describing one tuning attempt, whatever its outcome.

    The positional field order is part of the constructor interface. Every
    field up to and including ``result`` is mandatory; the artifact paths,
    ``note`` and ``timestamp`` default to "empty".
    """

    # Configuration under test and the stage it is recorded against.
    config: dict
    stage: int
    # Workload identity.
    model: str
    dtype: str
    act: str
    # Run provenance.
    gpu_id: str
    gpu_model: str
    branch: str
    commit: str
    command: str
    warmup: int
    iters: int
    result: str  # "win" | "loss" | "rejected" | "neutral"
    # Optional artifacts / annotations.
    csv_path: str = ""
    profile_path: str = ""
    note: str = ""
    timestamp: Optional[float] = None

    def missing_fields(self) -> List[str]:
        """Return the names in ``REQUIRED_ATTEMPT_FIELDS`` that are unset.

        A field counts as unset when the attribute is absent, ``None`` or the
        empty string; falsy-but-meaningful values such as ``0`` are accepted.
        """
        unset: List[str] = []
        for name in REQUIRED_ATTEMPT_FIELDS:
            if getattr(self, name, None) in ("", None):
                unset.append(name)
        return unset
+ """ + missing = attempt.missing_fields() + if missing: + raise ValueError(f"attempt missing required provenance fields: {missing}") + rec = asdict(attempt) + rec["timestamp"] = now if now is not None else time.time() + os.makedirs(os.path.dirname(os.path.abspath(path)), exist_ok=True) + with open(path, "a") as f: + f.write(json.dumps(rec, sort_keys=True) + "\n") + return rec + + +def append_rejected_candidate(record: dict, path: str = ATTEMPTS_JSONL, now: float = None) -> dict: + """Append a machine-readable rejected-candidate record to the JSONL ledger. + + ``record`` must carry the full provenance class (``REQUIRED_REJECTED_FIELDS``) + so a rejected search candidate is as auditable as a measured attempt — even + though it never reached compile/GPU. The measured-artifact keys + (``REQUIRED_REJECTED_PRESENT_KEYS``: ``csv_path``/``profile_path``) must be + present but may be empty strings (no artifact exists pre-compile). Raises + ``ValueError`` if any required field is missing, so an incomplete rejection can + never be recorded (the rejected-candidate contract negative gate). + """ + # Treat only None / "" as missing — integer 0 (stage, warmup, iters) is valid. + missing = [k for k in REQUIRED_REJECTED_FIELDS if record.get(k) in (None, "")] + # Artifact keys must EXIST (empty string allowed); only a truly absent key fails. + missing += [k for k in REQUIRED_REJECTED_PRESENT_KEYS if k not in record] + if missing: + raise ValueError(f"rejected-candidate record missing fields: {missing}") + # selection must be a non-empty dict so the rejection's run filter is recorded. 
+ sel = record.get("selection") + if not isinstance(sel, dict) or not sel: + raise ValueError("rejected-candidate record 'selection' must be a non-empty dict") + rec = {"result": "rejected_candidate", **record} + rec["timestamp"] = now if now is not None else time.time() + os.makedirs(os.path.dirname(os.path.abspath(path)), exist_ok=True) + with open(path, "a") as f: + f.write(json.dumps(rec, sort_keys=True) + "\n") + return rec + + +def read_point_csv(path: str) -> Dict[Tuple, dict]: + """Read a per-point harness CSV keyed by (model, dtype, token, stage tiles). + + The key is (model, dtype, act, token) — the comparison axis between baseline + and candidate at one shape/token point. + """ + table: Dict[Tuple, dict] = {} + with open(path, newline="") as f: + for row in csv.DictReader(f): + key = (row.get("model"), row.get("dtype"), row.get("act"), row.get("token")) + table[key] = row + return table + + +def _f(row: dict, col: str) -> Optional[float]: + v = row.get(col) + if v in (None, "", "None"): + return None + try: + return float(v) + except (TypeError, ValueError): + return None + + +@dataclass +class PointVerdict: + key: Tuple + token: int + kernel_path_regression: bool = False + e2e_regression: bool = False + large_shape_win: bool = False + small_token_win: bool = False + note: str = "" + + +def compare_point(baseline: dict, candidate: dict) -> PointVerdict: + """Apply the win-margin policy / the no-regression policy predicates to one (baseline, candidate) point pair.""" + token = int(float(candidate.get("token") or baseline.get("token") or 0)) + key = (candidate.get("model"), candidate.get("dtype"), candidate.get("act"), candidate.get("token")) + v = PointVerdict(key=key, token=token) + + b_kp, c_kp = _f(baseline, "kernel_path_us"), _f(candidate, "kernel_path_us") + b_e2e, c_e2e = _f(baseline, "e2e_us"), _f(candidate, "e2e_us") + b_mfu, c_mfu = _f(baseline, "mfu"), _f(candidate, "mfu") + + if b_kp is not None and c_kp is not None: + v.kernel_path_regression 
@dataclass
class CampaignVerdict:
    """Aggregate outcome of comparing a candidate CSV against a baseline CSV.

    The list fields collect per-point results; the three properties derive the
    promotion decision from them and from ``gate``.
    """

    points: List[PointVerdict] = field(default_factory=list)
    any_regression: bool = False
    large_wins: List[Tuple] = field(default_factory=list)
    small_wins: List[Tuple] = field(default_factory=list)
    missing_candidate_points: List[Tuple] = field(default_factory=list)
    incomplete_points: List[Tuple] = field(default_factory=list)
    # Result of the strict correctness + AOT-cache hard gate over the candidate
    # CSV (the ``selected_candidate_gate`` output, filled in by
    # ``compare_csvs``). It defaults to "not passed", so a verdict that never
    # had the gate evaluated can never be a claimable win, however good its
    # metrics look.
    gate: dict = field(default_factory=lambda: {"passed": False, "n_rows": 0, "violations": []})

    @property
    def coverage_complete(self) -> bool:
        """Whether every baseline point has a complete candidate row.

        False as soon as any point is missing from the candidate or lacks a
        regime-required comparison field (no cherry-picking).
        """
        if self.missing_candidate_points:
            return False
        return not self.incomplete_points

    @property
    def pareto_clean(self) -> bool:
        """Whether coverage is complete AND no point regressed.

        An incomplete or cherry-picked candidate CSV can never be clean.
        """
        if not self.coverage_complete:
            return False
        return not self.any_regression

    @property
    def claimable_win(self) -> bool:
        """The SINGLE source of truth for whether a candidate may be promoted.

        True only when ALL of the following hold:
          - ``pareto_clean`` (full coverage + no kernel-path/e2e regression),
          - at least one large-shape or small-token win was recorded, and
          - ``gate["passed"]`` is truthy, i.e. the strict correctness +
            AOT-cache hard gate passed, so a ``no_aot`` / failed-correctness
            candidate is never claimable regardless of its metrics.
        Re-run stability is enforced separately by re-running and re-comparing.
        """
        if not self.pareto_clean:
            return False
        if not (self.large_wins or self.small_wins):
            return False
        return bool(self.gate.get("passed"))
def selected_candidate_gate(candidate_csv: str, max_logits_diff: float = 0.01) -> dict:
    """Hard gate a candidate CSV before it can be promoted to a win.

    A selected candidate must clear the strict correctness + AOT-cache hard gate on
    EVERY row: ``aot_status == "checked"`` (the strict aiter run required a
    pre-populated AOT cache, not the ``no_aot`` repeatability/diagnostic bypass),
    ``correctness_pass`` is true, and ``logits_diff <= max_logits_diff``. Rows
    measured with ``--no-aot-check`` (``aot_status == "no_aot"``) are valid for
    NEUTRAL repeatability/diagnostic artifacts but can never be promoted to a win,
    so they fail this gate.

    A ``logits_diff`` that parses to NaN is a violation: NaN is neither greater
    nor smaller than any bound, so a plain ``>`` check would let it through.

    Returns ``{"passed": bool, "n_rows": int, "violations": [(key, reason), ...]}``.
    ``passed`` is False if there are zero rows (nothing to promote) or any violation.
    """
    rows = read_point_csv(candidate_csv)
    violations: List[Tuple] = []
    for key, row in rows.items():
        aot = (row.get("aot_status") or "").strip()
        if aot != "checked":
            violations.append((key, f"aot_status={aot or 'missing'} (need 'checked')"))

        cp = (row.get("correctness_pass") or "").strip().lower()
        if cp not in ("true", "1"):
            violations.append((key, f"correctness_pass={row.get('correctness_pass')!r} (need True)"))

        ld = _f(row, "logits_diff")
        if ld is None:
            violations.append((key, "logits_diff missing"))
        elif ld != ld:
            # NaN is the only float that is unequal to itself; fail closed on it
            # because the ordering comparison below would evaluate to False.
            violations.append((key, "logits_diff=nan (not a number)"))
        elif ld > max_logits_diff:
            violations.append((key, f"logits_diff={ld} > {max_logits_diff}"))
    return {"passed": bool(rows) and not violations, "n_rows": len(rows), "violations": violations}
def scan_replay_consistency(path: str = ATTEMPTS_JSONL) -> List[Tuple]:
    """Find committed attempts whose ``csv_path`` lists files the ``command`` cannot replay.

    A multi-file attempt (``csv_path`` = ``a.csv;b.csv``) must name EVERY listed
    file in its ``command`` string, so the attempt is replayable end-to-end from
    the ledger alone (no brace shorthand like ``run{1,2}.csv``, no required step
    hidden behind a ``#`` comment). Superseded records are skipped. Returns a
    list of ``(timestamp, [missing files])`` for offending records (empty == clean).

    Entries of ``csv_path`` are whitespace-stripped before they are looked up, so
    a list written as ``a.csv; b.csv`` is matched on the bare file names rather
    than on names carrying the separator's padding.
    """
    if not os.path.exists(path):
        return []
    offenders: List[Tuple] = []
    with open(path) as f:
        for ln in f:
            ln = ln.strip()
            if not ln:
                continue
            rec = json.loads(ln)
            if "superseded_by" in rec:
                continue
            csv_path = rec.get("csv_path") or ""
            # Normalize each entry: padding around ';' is not part of the file name.
            files = [name for name in (p.strip() for p in csv_path.split(";")) if name]
            if len(files) < 2:
                continue  # single/no file: nothing multi-file to reconcile
            command = rec.get("command") or ""
            # Strip anything after a '#' on each segment: a required step hidden in
            # a comment is not actually replayed by a shell.
            replayable = " ".join(seg.split("#", 1)[0] for seg in command.splitlines())
            missing = [fp for fp in files if fp not in replayable]
            if missing:
                offenders.append((rec.get("timestamp"), missing))
    return offenders
# Public API of the ledger module. The rejected-candidate writer and its
# schema constants are listed alongside the measured-attempt ones because they
# are defined in this module as part of the same ledger contract.
__all__ = [
    "ATTEMPTS_JSONL",
    "LEDGER_MD",
    "REQUIRED_ATTEMPT_FIELDS",
    "REQUIRED_REJECTED_FIELDS",
    "REQUIRED_REJECTED_PRESENT_KEYS",
    "Attempt",
    "append_attempt",
    "append_rejected_candidate",
    "read_point_csv",
    "compare_point",
    "compare_csvs",
    "selected_candidate_gate",
    "scan_replay_consistency",
    "scan_duplicate_rejected_candidates",
    "scan_superseded_rejected_candidates",
    "repeatability_check",
    "PointVerdict",
    "CampaignVerdict",
]
#
# Sync aiter's vendored FlyDSL MoE kernels with this FlyDSL checkout so the aiter
# fused-MoE e2e + strict-correctness guardrail (op_tests/test_moe_2stage.py) runs
# against the SAME kernel sources we tune here.
#
# Why this is needed: aiter pins `flydsl==0.1.8` and ships its own (older) vendored
# copies under aiter/ops/flydsl/kernels/. Against the installed FlyDSL compiler
# (0.2.x) those stale copies crash during MLIR emission BEFORE producing any number
# (`'Int32' object has no attribute 'type'`, then `arith.extsi i64->i32 cast
# incompatible`). Overlaying the current FlyDSL kernel sources resolves the skew;
# the e2e path then produces real us + logits_diff and the strict correctness gate
# (`logits_diff <= 0.01`) can be applied. This is an aiter-environment integration
# step, not a change to the FlyDSL kernels themselves.
#
# Idempotent. Backs up the originals once to
# ${AITER_REPO}/aiter/ops/flydsl/kernels/.orig_bak/ (an existing backup is never
# overwritten, so re-running keeps the pristine aiter copies).
# Usage: bash scripts/sync_aiter_flydsl_kernels.sh [AITER_REPO]
#   AITER_REPO defaults to /sgl-workspace/aiter when omitted.
set -euo pipefail

# Repository root is the parent of the directory holding this script.
REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
AITER_REPO="${1:-/sgl-workspace/aiter}"
SRC="${REPO_ROOT}/kernels"
DST="${AITER_REPO}/aiter/ops/flydsl/kernels"
BAK="${DST}/.orig_bak"

# Refuse to run against a tree that does not look like an aiter checkout.
if [[ ! -d "${DST}" ]]; then
    echo "ERROR: aiter vendored kernel dir not found: ${DST}" >&2
    exit 1
fi

# The MoE 2-stage kernel and its sibling deps imported via `from .`.
FILES=(
    mixed_moe_gemm_2stage.py
    moe_gemm_2stage.py
    moe_common.py
    mfma_epilogues.py
    mfma_preshuffle_pipeline.py
    layout_utils.py
)

mkdir -p "${BAK}"
for f in "${FILES[@]}"; do
    # Abort on the first missing source; files synced earlier in the loop stay synced.
    if [[ ! -f "${SRC}/${f}" ]]; then
        echo "ERROR: missing FlyDSL source: ${SRC}/${f}" >&2
        exit 1
    fi
    # Back up the original aiter copy once.
    if [[ -f "${DST}/${f}" && ! -f "${BAK}/${f}" ]]; then
        cp "${DST}/${f}" "${BAK}/${f}"
    fi
    cp "${SRC}/${f}" "${DST}/${f}"
    echo "synced ${f}"
done

# Clear the aiter FlyDSL JIT cache so stale compiled artifacts are not reused.
# Best-effort: removal errors are discarded and do not fail the script.
CACHE="${AITER_REPO}/aiter/jit/flydsl_cache"
if [[ -d "${CACHE}" ]]; then
    rm -rf "${CACHE:?}/"* 2>/dev/null || true
    echo "cleared aiter flydsl JIT cache: ${CACHE}"
fi

echo "done: aiter vendored FlyDSL MoE kernels synced from ${SRC}"
def test_models_in_scope_dtypes():
    """The spec's model table lists exactly the in-scope models and dtypes."""
    models = {}
    for entry in spec.MODELS:
        models[entry.name] = entry
    assert set(models) == {"deepseek_v3", "deepseek_v4", "kimi_k2", "gpt_oss"}

    # DeepSeek V4 is a8w4-only; i4 excluded everywhere.
    ds_v4 = models["deepseek_v4"]
    assert ds_v4.dtypes == ("a8w4",)
    assert models["kimi_k2"].dtypes == ("a4w4", "a8w4")
    for entry in spec.MODELS:
        assert "i4" not in entry.dtypes

    assert models["gpt_oss"].act == "swiglu"
    assert ds_v4.model_dim == 7168
    assert ds_v4.inter_dim == 512
+ assert not spec.is_small_token_win(100.0, 92.0) + + +def test_effective_tflops_and_mfu_formula(): + # token*model_dim*inter_dim*3*topk*2 / us / 1e6 + tflops = spec.effective_tflops(4096, 7168, 256, 9, combined_us=1000.0) + expected = 4096 * 7168 * 256 * 3 * 9 * 2 / 1000.0 / 1e6 + assert abs(tflops - expected) < 1e-9 + assert abs(spec.mfu(tflops) - tflops / 4523.0) < 1e-12 + + +# --- harness: parsing / metrics / provenance ------------------------------- + + +def test_parse_flydsl_stage_us(): + stdout = ( + "noise\n" + "FlyDSL MoE stage1[fp4]: 1163.2 us, 1654.24 TFLOPS(logical, M=4608), 0.377 TB/s (doweight_stage1=False)\n" + "FlyDSL MoE stage2 [moe_gemm2] fp4 atomic | 7168x2048, E=32, K=8, M_eff=4608 | 845.5 us, 1200.00 TFLOPS, 0.300 TB/s\n" + "FlyDSL MoE stage2 [moe_gemm2] fp4 reduce | 7168x2048, E=32, K=8, M_eff=4608 | 900.1 us, 1100.00 TFLOPS, 0.280 TB/s\n" + ) + got = harness.parse_flydsl_stage_us(stdout) + assert got["stage1_us"] == 1163.2 + # last matching stage2 line wins + assert got["stage2_us"] == 900.1 + + +def test_parse_flydsl_stage_us_missing(): + got = harness.parse_flydsl_stage_us("nothing here") + assert got["stage1_us"] is None and got["stage2_us"] is None + + +def test_parse_aiter_output_pass_warning_line(): + out = ( + "calling test_fmoe(...)\n" + "ck_moe_2stages: 234.56 us, 654.00 tflops......(quant:fp4x2)[checkAllclose passed~]\n" + "logits_diff: 0.0008\n" + ) + res = harness.parse_aiter_output(out) + assert res["e2e_us"] == 234.56 + assert res["logits_diff"] == 0.0008 + assert res["correctness_pass"] is True + + +def test_parse_aiter_output_pass_markdown_row(): + # logits_diff below 1e-3 prints no warning line; it only appears in the + # summary markdown row. The loose "checkAllclose ... failed!" line is the + # EXPECTED fp4 elementwise warning and must NOT fail correctness. 
def test_parse_aiter_output_fail_cases():
    """Every output below must be reported with ``correctness_pass`` False."""
    failing_outputs = [
        # logits over 0.01 (markdown row) -> fail.
        "ck_moe_2stages: 100.00 us, 100.00 tflops\n| torch.bfloat16 | 16 | ... | 100.0 | 0.05 | legacy |\n",
        # hard assertion text -> fail even if a number was produced.
        "ck_moe_2stages: 100.00 us\naccuracy check failed: err=1, logits_diff=0.2\n",
        # no logits at all -> fail (cannot confirm correctness).
        "ck_moe_2stages: 100.00 us, 100.00 tflops\n",
        # no e2e number at all -> fail.
        "nothing",
    ]
    for text in failing_outputs:
        parsed = harness.parse_aiter_output(text)
        assert parsed["correctness_pass"] is False
def test_parse_strict_aiter_output():
    """``STRICT_RESULT`` lines are parsed for success, failure and absence."""
    # Successful strict run with the AOT cache check enabled.
    ok_line = (
        'noise\nSTRICT_RESULT {"e2e_us": 80.7, "e2e_us_p95": 84.0, "logits_diff": 1.0e-05, '
        '"correctness_pass": true, "check_aot_cache": true, "error_category": ""}\n'
    )
    passed = harness.parse_strict_aiter_output(ok_line)
    assert passed["e2e_us"] == 80.7
    assert passed["e2e_us_p95"] == 84.0
    assert passed["correctness_pass"] is True
    assert passed["aot_status"] == "checked"

    # Failed strict run: the error text and category are carried through.
    fail_line = (
        'STRICT_RESULT {"error": "AssertionError: accuracy check failed", '
        '"error_category": "correctness", "correctness_pass": false, "check_aot_cache": false}\n'
    )
    failed = harness.parse_strict_aiter_output(fail_line)
    assert failed["correctness_pass"] is False
    assert "AssertionError" in failed["error"]
    assert failed["error_category"] == "correctness"
    assert failed["aot_status"] == "no_aot"

    # No STRICT_RESULT line at all.
    absent = harness.parse_strict_aiter_output("no result here")
    assert absent["correctness_pass"] is False
    assert absent["error"] == "no_strict_result"
def test_run_list_covers_full_dec6_grid():
    """The baseline run list spans the whole model x dtype x token grid."""
    run_list = harness.build_run_list()
    # DS V3 (16 tok x 2 dtype) + DS V4 (16 x 1) + Kimi (16 x 2) + GPT-OSS (8 x 2)
    expected_points = 16 * 2 + 16 * 1 + 16 * 2 + 8 * 2
    assert expected_points == 96
    assert len(run_list) == expected_points

    keys = harness.expected_point_keys()
    must_contain = [
        # DeepSeek V4 is a8w4-only.
        ("deepseek_v4", "a8w4", "silu", "1"),
        # GPT-OSS has no tiny-token regime; starts at 256.
        ("gpt_oss", "a4w4", "swiglu", "256"),
    ]
    must_not_contain = [
        ("deepseek_v4", "a4w4", "silu", "1"),
        ("gpt_oss", "a4w4", "swiglu", "1"),
    ]
    # full small + large coverage for a skinny model.
    must_contain.extend(("kimi_k2", "a4w4", "silu", str(tok)) for tok in (1, 16, 64, 4096, 16384, 32768))

    for key in must_contain:
        assert key in keys
    for key in must_not_contain:
        assert key not in keys
+ "stage1_us": "55.3", + "stage2_us": "21.8", + "sorting_us": "0.0", + "kernel_path_us": "77.1", + "kernel_path_us_p95": "79.0", + "effective_tflops": "12.3", + "mfu": "0.0027", + "e2e_us": "150.0", + "e2e_us_p95": "155.0", + "logits_diff": "0.0008", + "correctness_pass": "True", + } + row.update(over) + return row + + +def test_validate_baseline_row_accepts_good_row(): + assert harness.validate_baseline_row(_good_baseline_row()) == [] + + +@pytest.mark.parametrize( + "over,expect", + [ + ({"commit": "abc123"}, "commit_not_523ca1c7"), + ({"commit": ""}, "missing_commit"), + ({"idle_gpu_verified": "False"}, "idle_gpu_not_verified"), + ({"command": ""}, "missing_command"), + ({"dtype": ""}, "missing_dtype"), + ({"act": ""}, "missing_act"), + ({"e2e_us": ""}, "missing_e2e_us"), + ({"logits_diff": ""}, "missing_logits_diff"), + # Hardened metric-field requirements. + ({"stage1_us": ""}, "missing_stage1_us"), + ({"stage2_us": ""}, "missing_stage2_us"), + ({"sorting_us": ""}, "missing_sorting_us"), + ({"kernel_path_us": ""}, "missing_kernel_path_us"), + ({"kernel_path_us_p95": ""}, "missing_kernel_path_us_p95"), + ({"effective_tflops": ""}, "missing_effective_tflops"), + ({"mfu": ""}, "missing_mfu"), + ({"e2e_us_p95": ""}, "missing_e2e_us_p95"), + ({"kernel_path_us": "not-a-number"}, "missing_kernel_path_us"), + ({"correctness_pass": "False"}, "correctness_not_passed"), + ({"correctness_pass": ""}, "correctness_not_passed"), + ({"warmup": "2"}, "warmup_mismatch"), + ({"iters": "5"}, "iters_mismatch"), + ({"graph_capture": "True"}, "graph_capture_must_be_off"), + ({"l2_flush_per_iter": "False"}, "l2_flush_must_be_on"), + ({"clocks_pinned": "False"}, "clocks_must_be_pinned"), + ], +) +def test_validate_baseline_row_rejections(over, expect): + reasons = harness.validate_baseline_row(_good_baseline_row(**over)) + assert expect in reasons + + +def test_validate_baseline_csv_missing_coverage(tmp_path): + # A single fully-valid row is not enough; the full workload must be 
covered. + out = tmp_path / "baseline.csv" + p = harness.Provenance( + gpu_id="0", gpu_model="MI350X", branch="b", commit="523ca1c7", idle_gpu_verified=True, clocks_pinned=True + ) + row = harness.PointRow( + provenance=p, + command="cmd", + model="kimi_k2", + model_dim=7168, + inter_dim=256, + experts=384, + topk=8, + dtype="a4w4", + act="silu", + token=16, + stage1_us=55.3, + stage2_us=21.8, + sorting_us=0.0, + kernel_path_us=77.1, + kernel_path_us_p95=79.0, + effective_tflops=12.3, + mfu=0.0027, + e2e_us=150.0, + e2e_us_p95=155.0, + logits_diff=0.0008, + correctness_pass=True, + ) + harness.write_csv([row], str(out)) + res = harness.validate_baseline_csv(str(out)) + assert res["valid"] is False + assert res["missing_points"] # almost all points missing + assert res["row_errors"] == {} # the one present row is itself fully valid + + +def test_validate_baseline_csv_rejects_missing_kernel_metrics(tmp_path): + # Regression: a full-coverage CSV with e2e/logits present + # but kernel metrics empty must NOT validate. + out = tmp_path / "baseline.csv" + p = harness.Provenance( + gpu_id="0", gpu_model="MI350X", branch="b", commit="523ca1c7", idle_gpu_verified=True, clocks_pinned=True + ) + rows = [] + for rp in harness.build_run_list(): + rows.append( + harness.PointRow( + provenance=p, + command="cmd", + model=rp.model, + model_dim=rp.model_dim, + inter_dim=rp.inter_dim, + experts=rp.experts, + topk=rp.topk, + dtype=rp.dtype, + act=rp.act, + token=rp.token, + # kernel metrics deliberately omitted + e2e_us=150.0, + e2e_us_p95=155.0, + logits_diff=0.0008, + correctness_pass=True, + ) + ) + harness.write_csv(rows, str(out)) + res = harness.validate_baseline_csv(str(out)) + assert res["valid"] is False + assert not res["missing_points"] # coverage is complete... 
+ assert res["row_errors"] # ...but rows fail on missing kernel metrics + some = next(iter(res["row_errors"].values())) + assert "missing_kernel_path_us" in some and "missing_mfu" in some + + +def test_combined_and_metrics(): + combined = harness.combined_kernel_path_us(1000.0, 800.0, 50.0) + assert combined == 1850.0 + m = harness.compute_metrics(token=4096, model_dim=7168, inter_dim=256, topk=9, combined_us=combined) + assert m["effective_tflops"] > 0 and 0 < m["mfu"] < 10 + + +def test_summarize_median_p95(): + s = harness.summarize([10, 11, 12, 13, 100]) + assert s["median"] == 12 + assert s["p95"] == 100 + + +def test_provenance_missing_fields_gate(): + p = harness.Provenance() # gpu_id/gpu_model/branch/commit unset + missing = p.missing_fields() + assert "gpu_id" in missing and "commit" in missing + assert not p.is_complete() + p2 = harness.Provenance(gpu_id="0", gpu_model="MI350X", branch="rlcr/mxfp4-moe", commit="deadbeef") + assert p2.is_complete() + + +def test_pointrow_csv_dict_has_all_columns(): + p = harness.Provenance(gpu_id="0", gpu_model="MI350X", branch="b", commit="c") + row = harness.PointRow( + provenance=p, + command="cmd", + model="kimi_k2", + model_dim=7168, + inter_dim=256, + experts=384, + topk=8, + dtype="a4w4", + act="silu", + token=4096, + ) + d = row.to_csv_dict() + assert set(d.keys()) == set(harness.CSV_COLUMNS) + assert d["metric_formula"] == harness.METRIC_FORMULA + + +def test_write_csv_roundtrip(tmp_path): + p = harness.Provenance(gpu_id="0", gpu_model="MI350X", branch="b", commit="c") + rows = [ + harness.PointRow( + provenance=p, + command="cmd", + model="kimi_k2", + model_dim=7168, + inter_dim=256, + experts=384, + topk=8, + dtype="a4w4", + act="silu", + token=4096, + kernel_path_us=1850.0, + e2e_us=2000.0, + mfu=0.5, + ) + ] + out = tmp_path / "baseline.csv" + harness.write_csv(rows, str(out)) + text = out.read_text() + assert "kernel_path_us" in text.splitlines()[0] + assert "kimi_k2" in text + + +# --- ledger: attempt 
validation + comparison -------------------------------
+
+
+def _complete_attempt(**over):
+    """Build a ``ledger.Attempt`` from fixed field values, with keyword overrides."""
+    base = dict(
+        config={"tile_m": 64},
+        stage=1,
+        model="kimi_k2",
+        dtype="a4w4",
+        act="silu",
+        gpu_id="0",
+        gpu_model="MI350X",
+        branch="b",
+        commit="c",
+        command="cmd",
+        warmup=10,
+        iters=100,
+        result="loss",
+    )
+    base.update(over)
+    return ledger.Attempt(**base)
+
+
+def test_attempt_missing_provenance_rejected(tmp_path):
+    """An attempt with a blank ``commit`` reports it missing and cannot be appended."""
+    bad = _complete_attempt(commit="")  # missing required field
+    assert "commit" in bad.missing_fields()
+    with pytest.raises(ValueError):
+        ledger.append_attempt(bad, path=str(tmp_path / "attempts.jsonl"))
+
+
+def test_attempt_append_roundtrip(tmp_path):
+    """Appending one attempt writes exactly one line carrying its result."""
+    path = str(tmp_path / "attempts.jsonl")
+    rec = ledger.append_attempt(_complete_attempt(result="win"), path=path, now=123.0)
+    assert rec["timestamp"] == 123.0
+    # Close the handle deterministically instead of relying on garbage collection.
+    with open(path) as f:
+        lines = f.read().strip().splitlines()
+    assert len(lines) == 1 and '"result": "win"' in lines[0]
+
+
+def _complete_rejected(**over):
+    """Return the rejected-candidate record dict used by the tests below, with keyword overrides."""
+    base = dict(
+        model="kimi_k2",
+        dtype="a4w4",
+        act="silu",
+        token=64,
+        stage=0,
+        config={"tile_m1": 16},
+        reason="illegal candidate tiles: s1=fp4 tile_m<32",
+        selection={"model": "kimi_k2", "dtype": "a4w4", "tokens": [64]},
+        gpu_id="0",
+        gpu_model="MI350X",
+        branch="b",
+        commit="c",
+        command="python3 scripts/moe_tuning_harness.py candidate --tile-m1 16",
+        warmup=10,
+        iters=100,
+        csv_path="",  # present-but-empty: no measured artifact pre-compile
+        profile_path="",
+    )
+    base.update(over)
+    return base
+
+
+def test_rejected_candidate_full_provenance_roundtrip(tmp_path):
+    """Two complete rejected-candidate records append as two ledger lines."""
+    path = str(tmp_path / "attempts.jsonl")
+    rec = ledger.append_rejected_candidate(_complete_rejected(), path=path, now=7.0)
+    assert rec["result"] == "rejected_candidate" and rec["timestamp"] == 7.0
+    # csv_path/profile_path are present (empty allowed); selection is a non-empty dict.
+    assert rec["csv_path"] == "" and rec["profile_path"] == "" and rec["selection"]
+    # stage 0 is a valid value (candidate-tile rejection spanning both stages).
+    rec0 = ledger.append_rejected_candidate(_complete_rejected(stage=0), path=path, now=8.0)
+    assert rec0["stage"] == 0
+    # Close the handle deterministically instead of relying on garbage collection.
+    with open(path) as f:
+        lines = f.read().strip().splitlines()
+    assert len(lines) == 2
+
+
+def test_rejected_candidate_missing_provenance_rejected(tmp_path):
+    """Every incomplete rejected-candidate record raises and leaves no ledger file behind."""
+    path = str(tmp_path / "attempts.jsonl")
+    # Each required (non-empty) provenance field, when blanked, must be refused.
+    for field in ("act", "gpu_id", "gpu_model", "branch", "commit", "command", "warmup", "iters"):
+        bad = _complete_rejected(**{field: ""})
+        with pytest.raises(ValueError, match="missing fields"):
+            ledger.append_rejected_candidate(bad, path=path)
+    # csv_path/profile_path keys must EXIST even though empty is allowed: drop them.
+    for field in ("csv_path", "profile_path"):
+        bad = _complete_rejected()
+        del bad[field]
+        with pytest.raises(ValueError, match="missing fields"):
+            ledger.append_rejected_candidate(bad, path=path)
+    # selection None/"" trips the missing-fields gate; {} / non-dict trips the
+    # dedicated selection gate.
+    for sel in (None, ""):
+        with pytest.raises(ValueError, match="missing fields"):
+            ledger.append_rejected_candidate(_complete_rejected(selection=sel), path=path)
+    for sel in ({}, "a4w4"):
+        with pytest.raises(ValueError, match="selection"):
+            ledger.append_rejected_candidate(_complete_rejected(selection=sel), path=path)
+    # The minimal-only record (the old contract) is now rejected.
+    with pytest.raises(ValueError, match="missing fields"):
+        ledger.append_rejected_candidate(
+            {"model": "kimi_k2", "dtype": "a4w4", "token": 64, "config": {}, "reason": "x"}, path=path
+        )
+    # No partial file should have been written.
+    assert not os.path.exists(path)
+
+
+def test_committed_rejected_records_are_contract_complete():
+    """Every committed rejected_candidate record must carry full provenance, unless
+    it is an explicitly superseded pre-contract artifact (marked superseded_by)."""
+    import json as _json
+
+    attempts = os.path.join(_REPO_ROOT, "docs", "attempts.jsonl")
+    if not os.path.exists(attempts):
+        pytest.skip("no committed attempts ledger")
+    required = set(ledger.REQUIRED_REJECTED_FIELDS)
+    present_keys = set(ledger.REQUIRED_REJECTED_PRESENT_KEYS)
+    offenders = []
+    # Scan inside a context manager so the ledger handle is closed even if a
+    # line fails to parse.
+    with open(attempts) as f:
+        for ln in f:
+            ln = ln.strip()
+            if not ln:
+                continue
+            rec = _json.loads(ln)
+            if rec.get("result") != "rejected_candidate":
+                continue
+            if "superseded_by" in rec:  # incomplete historical record, explicitly invalidated
+                continue
+            missing = [k for k in required if rec.get(k) in (None, "")]
+            missing += [k for k in present_keys if k not in rec]
+            sel = rec.get("selection")
+            if not isinstance(sel, dict) or not sel:
+                missing.append("selection")
+            if missing:
+                offenders.append((rec.get("timestamp"), missing))
+    assert not offenders, f"incomplete committed rejected records: {offenders}"
+
+
+def _csv(path, rows):
+    """Write ``rows`` (dicts) to ``path`` as a CSV with the comparison columns."""
+    import csv as _c
+
+    with open(path, "w", newline="") as f:
+        w = _c.DictWriter(f, fieldnames=["model", "dtype", "act", "token", "kernel_path_us", "e2e_us", "mfu"])
+        w.writeheader()
+        w.writerows(rows)
+
+
+def _gate_csv(path, rows):
+    """Write ``rows`` (dicts) to ``path`` as a CSV with the gate columns."""
+    import csv as _c
+
+    cols = [
+        "model",
+        "dtype",
+        "act",
+        "token",
+        "kernel_path_us",
+        "e2e_us",
+        "aot_status",
+        "correctness_pass",
+        "logits_diff",
+    ]
+    with open(path, "w", newline="") as f:
+        w = _c.DictWriter(f, fieldnames=cols)
+        w.writeheader()
+        w.writerows(rows)
+
+
+def _gate_row(**over):
+    """Return one gate-CSV row dict with fixed default values, with keyword overrides."""
+    base = dict(
+        model="kimi_k2",
+        dtype="a4w4",
+        act="silu",
+        token=16,
+        kernel_path_us=150.0,
+        e2e_us=80.0,
+        aot_status="checked",
+        correctness_pass=True,
+        logits_diff=0.001,
+    )
+    base.update(over)
+    return base
+
+ +def test_selected_candidate_gate_accepts_checked_correct(tmp_path): + path = str(tmp_path / "cand.csv") + _gate_csv(path, [_gate_row(token=16), _gate_row(token=16384, kernel_path_us=1700, e2e_us=1500)]) + res = ledger.selected_candidate_gate(path) + assert res["passed"] is True and res["n_rows"] == 2 and res["violations"] == [] + + +def test_selected_candidate_gate_rejects_no_aot_and_bad_correctness(tmp_path): + # no_aot row (repeatability/diagnostic bypass) can never be promoted to a win. + p1 = str(tmp_path / "no_aot.csv") + _gate_csv(p1, [_gate_row(aot_status="no_aot")]) + r1 = ledger.selected_candidate_gate(p1) + assert r1["passed"] is False and any("aot_status" in v[1] for v in r1["violations"]) + + # failed correctness rejected. + p2 = str(tmp_path / "bad_correct.csv") + _gate_csv(p2, [_gate_row(correctness_pass=False)]) + r2 = ledger.selected_candidate_gate(p2) + assert r2["passed"] is False and any("correctness_pass" in v[1] for v in r2["violations"]) + + # logits over threshold rejected. + p3 = str(tmp_path / "bad_logits.csv") + _gate_csv(p3, [_gate_row(logits_diff=0.05)]) + r3 = ledger.selected_candidate_gate(p3) + assert r3["passed"] is False and any("logits_diff" in v[1] for v in r3["violations"]) + + # empty CSV: nothing to promote -> not passed. + p4 = str(tmp_path / "empty.csv") + _gate_csv(p4, []) + assert ledger.selected_candidate_gate(p4)["passed"] is False + + +def test_scan_replay_consistency(tmp_path): + path = str(tmp_path / "attempts.jsonl") + import json as _json + + def _write(recs): + with open(path, "w") as f: + for r in recs: + f.write(_json.dumps(r) + "\n") + + # multi-file attempt whose command replays BOTH files -> clean. + good = { + "result": "neutral", + "csv_path": "docs/a.csv;docs/b.csv", + "command": "h candidate --out docs/a.csv ; h candidate --out docs/b.csv ; repeatability_check", + "timestamp": 1.0, + } + _write([good]) + assert ledger.scan_replay_consistency(path) == [] + + # command misses b.csv -> offender. 
+ bad = dict(good, command="h candidate --out docs/a.csv", timestamp=2.0) + _write([bad]) + off = ledger.scan_replay_consistency(path) + assert off and off[0][0] == 2.0 and "docs/b.csv" in off[0][1] + + # brace shorthand does not literally contain either file -> offender. + brace = dict(good, command="h candidate --out docs/{a,b}.csv", timestamp=3.0) + _write([brace]) + assert ledger.scan_replay_consistency(path) + + # required file hidden behind a '#' comment -> offender. + commented = dict(good, command="h candidate --out docs/a.csv # then docs/b.csv", timestamp=4.0) + _write([commented]) + assert ledger.scan_replay_consistency(path) + + # superseded records are skipped. + superseded = dict(bad, superseded_by=9.0, timestamp=5.0) + _write([superseded]) + assert ledger.scan_replay_consistency(path) == [] + + +def test_committed_repeatability_attempts_replayable(): + """Committed multi-file repeatability attempts must replay all their CSVs.""" + attempts = os.path.join(_REPO_ROOT, "docs", "attempts.jsonl") + if not os.path.exists(attempts): + pytest.skip("no committed attempts ledger") + off = ledger.scan_replay_consistency(attempts) + assert off == [], f"non-replayable committed repeatability attempts: {off}" + + +def test_scan_duplicate_rejected_candidates(tmp_path): + path = str(tmp_path / "attempts.jsonl") + import json as _json + + def _probe(ts, sup=None): + r = { + "result": "rejected_candidate", + "model": "deepseek_v3", + "dtype": "a4w4", + "act": "silu", + "token": 32, + "config": {"tile_m1": 256, "tile_n1": 32}, + "reason": "x", + "timestamp": ts, + } + if sup is not None: + r["superseded_by"] = sup + return r + + # Two ACTIVE records for the same probe -> duplicate. + open(path, "w").write(_json.dumps(_probe(1.0)) + "\n" + _json.dumps(_probe(2.0)) + "\n") + dups = ledger.scan_duplicate_rejected_candidates(path) + assert dups and sorted(dups[0][1]) == [1.0, 2.0] + + # Superseding the older one leaves exactly one active -> clean. 
+ open(path, "w").write(_json.dumps(_probe(1.0, sup=2.0)) + "\n" + _json.dumps(_probe(2.0)) + "\n") + assert ledger.scan_duplicate_rejected_candidates(path) == [] + + +def test_committed_rejected_candidates_unique(): + """Committed ledger must have exactly one active rejected record per probe.""" + attempts = os.path.join(_REPO_ROOT, "docs", "attempts.jsonl") + if not os.path.exists(attempts): + pytest.skip("no committed attempts ledger") + dups = ledger.scan_duplicate_rejected_candidates(attempts) + assert dups == [], f"duplicate active rejected-candidate records: {dups}" + + +def test_scan_superseded_rejected_candidates(tmp_path): + path = str(tmp_path / "attempts.jsonl") + import json as _json + + def _probe(ts, n, sup=None): + r = { + "result": "rejected_candidate", + "model": "deepseek_v3", + "dtype": "a4w4", + "act": "silu", + "token": 32, + "config": {"tile_m1": 256, "tile_n1": n}, + "reason": "x", + "timestamp": ts, + } + if sup is not None: + r["superseded_by"] = sup + return r + + # superseded record links to the matching active record of the SAME key -> clean. + open(path, "w").write(_json.dumps(_probe(1.0, 32, sup=2.0)) + "\n" + _json.dumps(_probe(2.0, 32)) + "\n") + assert ledger.scan_superseded_rejected_candidates(path) == [] + + # superseded record links to a DIFFERENT probe's active record -> offender. 
+ open(path, "w").write( + _json.dumps(_probe(1.0, 32, sup=3.0)) # links to the n=64 record, wrong key + + "\n" + + _json.dumps(_probe(2.0, 32)) + + "\n" + + _json.dumps(_probe(3.0, 64)) + + "\n" + ) + off = ledger.scan_superseded_rejected_candidates(path) + assert off and off[0][0] == 1.0 + + +def test_committed_superseded_links_valid(): + """Every committed superseded rejected record must link to an active record of the same key.""" + attempts = os.path.join(_REPO_ROOT, "docs", "attempts.jsonl") + if not os.path.exists(attempts): + pytest.skip("no committed attempts ledger") + off = ledger.scan_superseded_rejected_candidates(attempts) + assert off == [], f"superseded records linking to the wrong/no successor: {off}" + + +def test_row_missing_kernel_path(): + rp = harness.RunPoint("deepseek_v3", 7168, 256, 257, 9, "silu", "a4w4", 32) + prov = harness.Provenance(gpu_id="0", gpu_model="MI350X", branch="b", commit="c") + # A row with no parsed stage times is "missing" (the tile_n=512 / tile_k!=256 case). + blank = harness.PointRow( + provenance=prov, + command="x", + model=rp.model, + model_dim=rp.model_dim, + inter_dim=rp.inter_dim, + experts=rp.experts, + topk=rp.topk, + dtype=rp.dtype, + act=rp.act, + token=rp.token, + ) + assert harness.row_missing_kernel_path(blank) is True + # A row with kernel-path populated is not missing. 
+ blank.stage1_us = 90.0 + blank.stage2_us = 70.0 + blank.kernel_path_us = 160.0 + assert harness.row_missing_kernel_path(blank) is False + + +def test_compare_csvs_detects_regression_and_wins(tmp_path): + base = str(tmp_path / "base.csv") + cand = str(tmp_path / "cand.csv") + _csv( + base, + [ + { + "model": "kimi_k2", + "dtype": "a4w4", + "act": "silu", + "token": 16384, + "kernel_path_us": 1000, + "e2e_us": 1200, + "mfu": 0.50, + }, + { + "model": "kimi_k2", + "dtype": "a4w4", + "act": "silu", + "token": 16, + "kernel_path_us": 100, + "e2e_us": 150, + "mfu": 0.05, + }, + { + "model": "kimi_k2", + "dtype": "a4w4", + "act": "silu", + "token": 128, + "kernel_path_us": 500, + "e2e_us": 600, + "mfu": 0.30, + }, + ], + ) + _csv( + cand, + [ + # large bucket: +10% MFU win, no kernel-path regression + { + "model": "kimi_k2", + "dtype": "a4w4", + "act": "silu", + "token": 16384, + "kernel_path_us": 950, + "e2e_us": 1180, + "mfu": 0.56, + }, + # small token: 20% faster and >=2us -> win + { + "model": "kimi_k2", + "dtype": "a4w4", + "act": "silu", + "token": 16, + "kernel_path_us": 80, + "e2e_us": 150, + "mfu": 0.05, + }, + # mid token: regression on kernel-path (+10% and +50us) + { + "model": "kimi_k2", + "dtype": "a4w4", + "act": "silu", + "token": 128, + "kernel_path_us": 550, + "e2e_us": 600, + "mfu": 0.30, + }, + ], + ) + cv = ledger.compare_csvs(base, cand) + assert cv.any_regression is True # the 128-token point regressed + assert cv.coverage_complete # candidate covers all 3 baseline points + assert not cv.pareto_clean + assert ("kimi_k2", "a4w4", "silu", "16384") in cv.large_wins + assert ("kimi_k2", "a4w4", "silu", "16") in cv.small_wins + + +def _gated_compare_csv(path, rows): + """Write a candidate/baseline CSV that ALSO carries the gate columns.""" + import csv as _c + + cols = [ + "model", + "dtype", + "act", + "token", + "kernel_path_us", + "e2e_us", + "mfu", + "aot_status", + "correctness_pass", + "logits_diff", + ] + with open(path, "w", newline="") as f: 
+ w = _c.DictWriter(f, fieldnames=cols) + w.writeheader() + for r in rows: + w.writerow(r) + + +def _two_point_baseline_and_candidate(tmp_path, aot_status): + """A fully-covered, non-regressing, otherwise-WINNING 2-point candidate whose + gate columns are parameterized by ``aot_status``.""" + base = str(tmp_path / "base.csv") + cand = str(tmp_path / "cand.csv") + bl = [ + dict( + model="kimi_k2", + dtype="a4w4", + act="silu", + token=16384, + kernel_path_us=1000, + e2e_us=1200, + mfu=0.50, + aot_status="checked", + correctness_pass=True, + logits_diff=0.001, + ), + dict( + model="kimi_k2", + dtype="a4w4", + act="silu", + token=16, + kernel_path_us=100, + e2e_us=150, + mfu=0.05, + aot_status="checked", + correctness_pass=True, + logits_diff=0.001, + ), + ] + # candidate: +12% MFU at 16384 (large win), 20% faster at 16 (small win), no regressions + cd = [ + dict( + model="kimi_k2", + dtype="a4w4", + act="silu", + token=16384, + kernel_path_us=950, + e2e_us=1180, + mfu=0.56, + aot_status=aot_status, + correctness_pass=True, + logits_diff=0.001, + ), + dict( + model="kimi_k2", + dtype="a4w4", + act="silu", + token=16, + kernel_path_us=80, + e2e_us=150, + mfu=0.05, + aot_status=aot_status, + correctness_pass=True, + logits_diff=0.001, + ), + ] + _gated_compare_csv(base, bl) + _gated_compare_csv(cand, cd) + return base, cand + + +def test_claimable_win_blocks_no_aot_winning_candidate(tmp_path): + # The leak Codex flagged: an otherwise-winning, fully-covered, non-regressing + # candidate measured with --no-aot-check must NOT be promotable. + base, cand = _two_point_baseline_and_candidate(tmp_path, aot_status="no_aot") + cv = ledger.compare_csvs(base, cand) + # metrics still look winning... + assert cv.pareto_clean is True + assert cv.large_wins and cv.small_wins + # ...but the hard gate fails, so the candidate is NOT claimable. + assert cv.gate["passed"] is False + assert cv.claimable_win is False + # and the standalone gate agrees. 
+ assert ledger.selected_candidate_gate(cand)["passed"] is False + + +def test_claimable_win_allows_checked_correct_candidate(tmp_path): + base, cand = _two_point_baseline_and_candidate(tmp_path, aot_status="checked") + cv = ledger.compare_csvs(base, cand) + assert cv.pareto_clean is True + assert cv.large_wins and cv.small_wins + assert cv.gate["passed"] is True + assert cv.claimable_win is True + + +def test_compare_csvs_rejects_cherry_picked_candidate(tmp_path): + # Baseline has 3 points; candidate reports only the single winning large + # point and omits the others. Coverage must be incomplete and the verdict + # must NOT be pareto_clean -- a cherry-picked win cannot pass. + base = str(tmp_path / "base.csv") + cand = str(tmp_path / "cand.csv") + _csv( + base, + [ + { + "model": "kimi_k2", + "dtype": "a4w4", + "act": "silu", + "token": 16384, + "kernel_path_us": 1000, + "e2e_us": 1200, + "mfu": 0.50, + }, + { + "model": "kimi_k2", + "dtype": "a4w4", + "act": "silu", + "token": 16, + "kernel_path_us": 100, + "e2e_us": 150, + "mfu": 0.05, + }, + { + "model": "kimi_k2", + "dtype": "a4w4", + "act": "silu", + "token": 128, + "kernel_path_us": 500, + "e2e_us": 600, + "mfu": 0.30, + }, + ], + ) + _csv( + cand, + [ + { + "model": "kimi_k2", + "dtype": "a4w4", + "act": "silu", + "token": 16384, + "kernel_path_us": 900, + "e2e_us": 1100, + "mfu": 0.56, + }, + ], + ) + cv = ledger.compare_csvs(base, cand) + assert not cv.coverage_complete + assert ("kimi_k2", "a4w4", "silu", "16") in cv.missing_candidate_points + assert ("kimi_k2", "a4w4", "silu", "128") in cv.missing_candidate_points + assert not cv.pareto_clean # forced False by incomplete coverage + + +def test_compare_csvs_rejects_missing_regime_fields(tmp_path): + # Candidate covers every point but the large target bucket lacks mfu, and a + # point lacks e2e. Those points are incomplete -> not pareto_clean. 
+ base = str(tmp_path / "base.csv") + cand = str(tmp_path / "cand.csv") + _csv( + base, + [ + { + "model": "kimi_k2", + "dtype": "a4w4", + "act": "silu", + "token": 16384, + "kernel_path_us": 1000, + "e2e_us": 1200, + "mfu": 0.50, + }, + { + "model": "kimi_k2", + "dtype": "a4w4", + "act": "silu", + "token": 128, + "kernel_path_us": 500, + "e2e_us": 600, + "mfu": 0.30, + }, + ], + ) + _csv( + cand, + [ + # large bucket missing mfu + { + "model": "kimi_k2", + "dtype": "a4w4", + "act": "silu", + "token": 16384, + "kernel_path_us": 900, + "e2e_us": 1100, + "mfu": "", + }, + # mid point missing e2e + { + "model": "kimi_k2", + "dtype": "a4w4", + "act": "silu", + "token": 128, + "kernel_path_us": 480, + "e2e_us": "", + "mfu": 0.30, + }, + ], + ) + cv = ledger.compare_csvs(base, cand) + assert not cv.coverage_complete + assert ("kimi_k2", "a4w4", "silu", "16384") in cv.incomplete_points + assert ("kimi_k2", "a4w4", "silu", "128") in cv.incomplete_points + assert not cv.pareto_clean + + +def test_repeatability_check(tmp_path): + a = str(tmp_path / "a.csv") + b = str(tmp_path / "b.csv") + _csv( + a, + [ + { + "model": "kimi_k2", + "dtype": "a4w4", + "act": "silu", + "token": 16384, + "kernel_path_us": 1000, + "e2e_us": 1200, + "mfu": 0.5, + }, + { + "model": "kimi_k2", + "dtype": "a4w4", + "act": "silu", + "token": 16, + "kernel_path_us": 100, + "e2e_us": 150, + "mfu": 0.05, + }, + ], + ) + # b: first point within band (1.5% < 2% and +15us... wait 15us>2us, so need <=max(2%*1000=20us,2us)=20us -> 1015 ok), + # second point unstable (+10us on a 100us base -> band=max(2us,2us)=2us, 10>2 -> unstable). 
+ _csv( + b, + [ + { + "model": "kimi_k2", + "dtype": "a4w4", + "act": "silu", + "token": 16384, + "kernel_path_us": 1015, + "e2e_us": 1210, + "mfu": 0.5, + }, + { + "model": "kimi_k2", + "dtype": "a4w4", + "act": "silu", + "token": 16, + "kernel_path_us": 110, + "e2e_us": 150, + "mfu": 0.05, + }, + ], + ) + res = ledger.repeatability_check(a, b) + assert res["n_shared"] == 2 + assert not res["stable"] # the 16-token kernel_path drifted > band + assert any(u[0] == ("kimi_k2", "a4w4", "silu", "16") for u in res["unstable"]["kernel_path_us"]) + # 16384 kernel_path within band, e2e within band -> not flagged. + assert all(u[0] != ("kimi_k2", "a4w4", "silu", "16384") for u in res["unstable"]["kernel_path_us"]) + + +def test_quarantine_and_validated_keys(): + from kernels import moe_tuning_spec as spec + + # ALL a8w4 shapes are correctness-quarantined (the non-fp4-activation + # e2e path fails the aiter correctness gate for fp8 AND bf16 activation; only + # fp4 activation passes). DS V3 a8w4 is included (its earlier legacy-path "pass" was the + # legacy-Swiglu artifact, not a real Silu a8w4 pass). + assert spec.is_quarantined("deepseek_v3", "a8w4") + assert spec.is_quarantined("deepseek_v4", "a8w4") + assert spec.is_quarantined("kimi_k2", "a8w4") + assert spec.is_quarantined("gpt_oss", "a8w4") + # a4w4 is NOT quarantined for any model. + assert not spec.is_quarantined("deepseek_v3", "a4w4") + assert not spec.is_quarantined("kimi_k2", "a4w4") + + vkeys = spec.validated_point_keys() + # Validated = all a4w4: DS V3 (16) + Kimi (16) + GPT-OSS (8) = 40. + assert len(vkeys) == 40 + assert ("deepseek_v3", "a4w4", "silu", "1") in vkeys + assert ("deepseek_v3", "a8w4", "silu", "1") not in vkeys # quarantined + assert ("kimi_k2", "a8w4", "silu", "1") not in vkeys # quarantined + assert ("gpt_oss", "a8w4", "swiglu", "256") not in vkeys # quarantined + # validated subset is a strict subset of the full workload. 
+ assert vkeys < harness.expected_point_keys() + + +def test_validate_baseline_csv_subset_keys(tmp_path): + # A CSV covering only the validated subset validates against validated keys, + # but fails against the full workload (missing the quarantined points). + from kernels import moe_tuning_spec as spec + + out = tmp_path / "sub.csv" + p = harness.Provenance( + gpu_id="0", gpu_model="MI350X", branch="b", commit="523ca1c7", idle_gpu_verified=True, clocks_pinned=True + ) + rows = [] + for key in spec.validated_point_keys(): + model, dtype, act, token = key + rows.append( + harness.PointRow( + provenance=p, + command="cmd", + model=model, + model_dim=7168, + inter_dim=256, + experts=257, + topk=9, + dtype=dtype, + act=act, + token=int(token), + stage1_us=10.0, + stage2_us=5.0, + sorting_us=0.0, + kernel_path_us=15.0, + kernel_path_us_p95=15.5, + effective_tflops=1.0, + mfu=0.01, + e2e_us=12.0, + e2e_us_p95=12.5, + logits_diff=0.0001, + correctness_pass=True, + ) + ) + harness.write_csv(rows, str(out)) + assert harness.validate_baseline_csv(str(out), expected_keys=spec.validated_point_keys())["valid"] is True + assert harness.validate_baseline_csv(str(out))["valid"] is False # full workload not covered + + +def test_perf_dist_percentile(): + import importlib + + tc = importlib.import_module("tests.test_common") + # nearest-rank p95 over 1..100: idx=round(0.95*99)=94 -> value 95 (0-based). + assert tc._percentile(list(range(1, 101)), 0.95) == 95 + assert tc._percentile([], 0.95) is None + assert "n_rotate" in tc.LAST_PERF_DIST + + +def test_timed_distribution_rotates_distinct_args(): + # Branch-level regression for the FLYDSL_PERF_DIST timed loop: it must cycle + # the cache-sized rotated arg copies (iteration i -> rotate_args[i % n]) so + # DISTINCT working sets reach func (the L2-flush behavior), and compute + # median/p95 from the injected per-call timings. 
+ import importlib + + tc = importlib.import_module("tests.test_common") + + # 3 distinct arg copies; record which args each call received. + rotate_args = [((tag,), {}) for tag in ("A", "B", "C")] + seen = [] + + def func(tag): + seen.append(tag) + return f"out-{tag}" + + # Injected timer returns a deterministic latency per call so we can check + # median/p95 without a GPU. + timings = iter([10.0, 30.0, 20.0, 50.0, 40.0, 60.0, 70.0]) + + def time_call(fn, a_i, kw_i): + out = fn(*a_i, **kw_i) + return next(timings), out + + data, median, p95, n_rot = tc._timed_distribution(func, rotate_args, num_iters=7, time_call=time_call) + # 7 iters over 3 copies -> A,B,C,A,B,C,A (distinct args actually reach func). + assert seen == ["A", "B", "C", "A", "B", "C", "A"] + assert n_rot == 3 + assert data == "out-A" # last call's output + # median of [10,30,20,50,40,60,70] sorted=[10,20,30,40,50,60,70] -> 40. + assert median == 40.0 + # nearest-rank p95: idx=round(0.95*6)=6 -> 70. + assert p95 == 70.0 + + +def test_clock_pinning_helpers(monkeypatch): + # pin_clocks parses the rocm-smi determinism-success message; clocks_pinned_state + # treats determinism/manual/high as pinned and auto as DVFS (not pinned). 
+ outs = {} + + def fake_run(cmd): + if "--setperfdeterminism" in cmd: + return outs.get("set", "") + if "--showperflevel" in cmd: + return outs.get("level", "") + return "" + + monkeypatch.setattr(harness, "_run", fake_run) + outs["set"] = "GPU[0]: Successfully enabled performance determinism and set GFX clock frequency: 2200" + assert harness.pin_clocks("0") is True + outs["set"] = "GPU[0]: set_perf_level, Not supported on the given system" + assert harness.pin_clocks("0") is False + outs["level"] = "GPU[0]: Performance Level: determinism" + assert harness.clocks_pinned_state("0") is True + outs["level"] = "GPU[0]: Performance Level: auto" + assert harness.clocks_pinned_state("0") is False + + +def test_setup_run_provenance_reflects_verified_clock_state(monkeypatch): + # The live setup path must record the VERIFIED clock-pinned state, never the + # static spec intent default. Provenance.clocks_pinned defaults to False. + assert harness.Provenance().clocks_pinned is False + + calls = {"pin": 0} + + def fake_pin(gpu_id, *a, **k): + calls["pin"] += 1 + return True + + monkeypatch.setattr(harness, "check_idle_gpu", lambda g, **k: True) + monkeypatch.setattr(harness, "pin_clocks", fake_pin) + monkeypatch.setattr(harness, "git_provenance", lambda *a, **k: {"branch": "b", "commit": "523ca1c7"}) + monkeypatch.setattr(harness, "gpu_provenance", lambda g: {"gpu_id": str(g), "gpu_model": "MI350X"}) + + # Verified pinned -> clocks_pinned True. + monkeypatch.setattr(harness, "clocks_pinned_state", lambda g: True) + prov = harness.setup_run_provenance("0") + assert calls["pin"] == 1 # the driver actually attempted to pin + assert prov.clocks_pinned is True + assert prov.idle_gpu_verified is True + assert prov.commit == "523ca1c7" and prov.gpu_model == "MI350X" + + # Verification fails -> clocks_pinned MUST be False (not the intent default). 
+ monkeypatch.setattr(harness, "clocks_pinned_state", lambda g: False) + prov2 = harness.setup_run_provenance("0") + assert prov2.clocks_pinned is False + # A row built from unverified provenance is rejected by the baseline validator. + row = { + "commit": "523ca1c7", + "idle_gpu_verified": "True", + "gpu_id": "0", + "gpu_model": "MI350X", + "branch": "b", + "command": "c", + "dtype": "a4w4", + "act": "silu", + "model": "kimi_k2", + "token": "16", + "stage1_us": "1", + "stage2_us": "1", + "sorting_us": "0", + "kernel_path_us": "2", + "kernel_path_us_p95": "2", + "effective_tflops": "1", + "mfu": "0.1", + "e2e_us": "1", + "e2e_us_p95": "1", + "logits_diff": "0.0001", + "correctness_pass": "True", + "warmup": "10", + "iters": "100", + "graph_capture": "False", + "l2_flush_per_iter": "True", + "clocks_pinned": str(prov2.clocks_pinned), + } + assert "clocks_must_be_pinned" in harness.validate_baseline_row(row) + + +def test_main_clock_provenance_fail_closed(monkeypatch, tmp_path): + # Direct regression around the live _main() path: it must pin+verify clocks, + # write rows with the verified clocks_pinned, fail-closed (rc=2, no CSV) when + # pinning cannot be verified, and proceed under --allow-unpinned. 
+ rp = harness.RunPoint("kimi_k2", 7168, 256, 384, 8, "silu", "a4w4", 16) + monkeypatch.setattr(harness, "build_run_list", lambda: [rp]) + monkeypatch.setattr(harness, "check_idle_gpu", lambda g, **k: True) + monkeypatch.setattr(harness, "git_provenance", lambda *a, **k: {"branch": "b", "commit": "523ca1c7"}) + monkeypatch.setattr(harness, "gpu_provenance", lambda g: {"gpu_id": str(g), "gpu_model": "MI350X"}) + + written = {} + + def fake_write_csv(rows, path): + written["rows"] = rows + written["path"] = path + + def fake_run_point(rp_, tile, gpu, prov, **k): + return harness.PointRow( + provenance=prov, + command="cmd", + model=rp_.model, + model_dim=rp_.model_dim, + inter_dim=rp_.inter_dim, + experts=rp_.experts, + topk=rp_.topk, + dtype=rp_.dtype, + act=rp_.act, + token=rp_.token, + ) + + monkeypatch.setattr(harness, "write_csv", fake_write_csv) + monkeypatch.setattr(harness, "run_point", fake_run_point) + monkeypatch.setattr(harness, "pin_clocks", lambda g, *a, **k: True) + + out = str(tmp_path / "b.csv") + + # (a) verified pinned -> rc 0, rows written with clocks_pinned True. + written.clear() + monkeypatch.setattr(harness, "clocks_pinned_state", lambda g: True) + rc = harness._main(["baseline", "--gpu", "0", "--assume-idle", "--no-e2e", "--out", out]) + assert rc == 0 + assert written["rows"][0].provenance.clocks_pinned is True + + # (b) verification fails -> fail-closed: rc 2 and NO csv written. + written.clear() + monkeypatch.setattr(harness, "clocks_pinned_state", lambda g: False) + rc = harness._main(["baseline", "--gpu", "0", "--assume-idle", "--no-e2e", "--out", out]) + assert rc == 2 + assert "rows" not in written # fail-closed: did not write a false-pinned CSV + + # (c) --allow-unpinned proceeds, recording clocks_pinned False. 
+ written.clear() + rc = harness._main(["baseline", "--gpu", "0", "--assume-idle", "--no-e2e", "--allow-unpinned", "--out", out]) + assert rc == 0 + assert written["rows"][0].provenance.clocks_pinned is False + + +def test_regime_aware_abs_floor(): + # Regime-aware floor: 8us for tokens<=64, 2us for tokens>=128. + assert spec.abs_floor_us(1) == 8.0 + assert spec.abs_floor_us(64) == 8.0 + assert spec.abs_floor_us(128) == 2.0 + assert spec.abs_floor_us(32768) == 2.0 + + +def test_is_regression_regime_aware(): + # Small token (16): a 5us drift on a 130us base is within the 8us floor -> NOT a regression. + assert spec.is_regression(130.0, 135.0, token=16) is False + # Small token: 9us drift on 130us base -> regression (exceeds 8us AND 2%). + assert spec.is_regression(130.0, 139.0, token=16) is True + # Large token (128): 5us drift on 130us base -> regression under the 2us floor. + assert spec.is_regression(130.0, 135.0, token=128) is True + # Back-compat: token=None keeps the strict 2us floor. + assert spec.is_regression(130.0, 135.0) is True + + +def test_repeatability_check_regime_aware(tmp_path): + a = str(tmp_path / "a.csv") + b = str(tmp_path / "b.csv") + _csv( + a, + [ + { + "model": "kimi_k2", + "dtype": "a4w4", + "act": "silu", + "token": 16, + "kernel_path_us": 130, + "e2e_us": 40, + "mfu": 0.05, + }, + { + "model": "kimi_k2", + "dtype": "a4w4", + "act": "silu", + "token": 128, + "kernel_path_us": 290, + "e2e_us": 250, + "mfu": 0.3, + }, + ], + ) + _csv( + b, + [ + # token 16: +5us kernel-path -> within 8us small-token floor -> stable. + { + "model": "kimi_k2", + "dtype": "a4w4", + "act": "silu", + "token": 16, + "kernel_path_us": 135, + "e2e_us": 40, + "mfu": 0.05, + }, + # token 128: +7us -> exceeds 2us floor (and 2%) -> unstable. 
+ { + "model": "kimi_k2", + "dtype": "a4w4", + "act": "silu", + "token": 128, + "kernel_path_us": 297, + "e2e_us": 250, + "mfu": 0.3, + }, + ], + ) + res = ledger.repeatability_check(a, b) + kp = res["unstable"]["kernel_path_us"] + assert any(u[0] == ("kimi_k2", "a4w4", "silu", "128") for u in kp) # 128 unstable + assert all(u[0] != ("kimi_k2", "a4w4", "silu", "16") for u in kp) # 16 stable under 8us + + +def test_select_run_points_filters(): + # Candidate selection filters the full grid by model/dtype/token. + pts = harness.select_run_points(model="deepseek_v3", dtype="a4w4", tokens=[16, 16384]) + keys = {(p.model, p.dtype, p.token) for p in pts} + assert keys == {("deepseek_v3", "a4w4", 16), ("deepseek_v3", "a4w4", 16384)} + # dtype filter excludes a8w4. + assert all(p.dtype == "a4w4" for p in harness.select_run_points(model="kimi_k2", dtype="a4w4")) + # whole-grid when unfiltered equals build_run_list. + assert len(harness.select_run_points()) == len(harness.build_run_list()) + + +def test_candidate_tile_for_overrides_and_legality(): + rp = harness.RunPoint("deepseek_v3", 7168, 256, 257, 9, "silu", "a4w4", 16) + # Legal override: stage1 tile_n -> 128 (the DS V3 lead). + t = harness.candidate_tile_for(rp, {"tile_n1": 128}) + assert t["tile_n1"] == 128 and t["tile_m1"] == 64 and t["tile_k1"] == 256 + # No overrides -> the shape's default tiles. + assert harness.candidate_tile_for(rp, {}) == harness.default_tile_for(rp) + # Illegal override is rejected before any compile (e.g. fp4 tile_m < 32). + import pytest as _pytest + + with _pytest.raises(ValueError): + harness.candidate_tile_for(rp, {"tile_m1": 16}) + + +def test_prepare_candidate_run_fail_closed(tmp_path, monkeypatch): + # candidate run is fail-closed: requires explicit tiles, all-legal, non-empty. + import moe_tuning_ledger as _ledger + import pytest as _pytest + + # Capture rejected-candidate records instead of writing to the real ledger. 
+ captured = [] + monkeypatch.setattr(_ledger, "append_rejected_candidate", lambda rec, **k: captured.append(rec) or rec) + + no_override = {k: None for k in ("tile_m1", "tile_n1", "tile_k1", "tile_n2", "tile_k2")} + # (1) no explicit tile -> reject (no silent default-tile fallback). + with _pytest.raises(ValueError, match="at least one explicit"): + harness.prepare_candidate_run(no_override, model="deepseek_v3", dtype="a4w4", tokens=[16]) + + # (2) legal explicit tile -> returns (run_list, tiles) of equal length. + ov = dict(no_override, tile_n1=128) + rl, tiles = harness.prepare_candidate_run(ov, model="deepseek_v3", dtype="a4w4", tokens=[16, 64]) + assert len(rl) == len(tiles) == 2 and all(t["tile_n1"] == 128 for t in tiles) + + # (3) illegal explicit tile -> raise AND record a machine-readable rejection + # carrying the full provenance class (act/stage/branch/commit/command/...). + bad = dict(no_override, tile_m1=16) # fp4 tile_m<32 illegal + prov = harness.Provenance(gpu_id="0", gpu_model="MI350X", branch="b", commit="c") + with _pytest.raises(ValueError, match="illegal candidate"): + harness.prepare_candidate_run( + bad, model="deepseek_v3", dtype="a4w4", tokens=[16], prov=prov, command="python3 harness candidate ..." + ) + rec = captured[-1] + assert rec and rec["reason"] and rec["model"] == "deepseek_v3" + # Every full-provenance field is present and non-empty (stage 0 is valid). + for k in ("act", "gpu_id", "gpu_model", "branch", "commit", "command", "warmup", "iters", "selection"): + assert rec.get(k) not in (None, ""), k + assert rec["stage"] == 0 and rec["act"] == "silu" + # The record satisfies the ledger's own rejected-candidate contract. + assert not [f for f in _ledger.REQUIRED_REJECTED_FIELDS if rec.get(f) in (None, "")] + + # (4) empty selection -> reject. 
+ with _pytest.raises(ValueError, match="matched no points"): + harness.prepare_candidate_run(ov, model="nonesuch", dtype="a4w4", tokens=[16]) diff --git a/tests/unit/test_moe_tuning_legality.py b/tests/unit/test_moe_tuning_legality.py new file mode 100644 index 000000000..17a9eafe8 --- /dev/null +++ b/tests/unit/test_moe_tuning_legality.py @@ -0,0 +1,176 @@ +# SPDX-License-Identifier: MIT +# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved. + +"""Backend-agnostic tests for the MoE tile-config legality filter. + +These tests exercise pure host-side math in ``kernels/moe_tuning.py`` and do not +require a GPU, the FlyROCDL bindings, or a compile. They lock in two properties: + +1. Every tile config currently used by ``scripts/run_benchmark.sh`` for the + in-scope MXFP4 / A8W4 MoE shapes is accepted. +2. Each named illegal case is rejected with the expected machine-readable reason. +""" + +import pytest + +from kernels.moe_tuning import ( + LDS_LIMIT_BYTES, + check_tile_config, + enumerate_legal_configs, +) + +pytestmark = pytest.mark.l0_backend_agnostic + + +# (stage, model_dim, inter_dim, tile_m, tile_n, tile_k, a_dtype) +# Derived from run_benchmark.sh MOE_FP4_SHAPES / MOE_A8W4_SHAPES. Stage1 uses +# (tile_m, tile_n, tile_k); stage2 uses (tile_m, tile_n2, tile_k2). In the +# benchmark tables tile_n2 == tile_k2 == 256 for all in-scope MoE rows. 
+_RUN_BENCHMARK_CONFIGS = [ + # MOE_FP4_SHAPES group A: 7168/256/257/9, tile 64/256/256, n2/k2 256/256 + (1, 7168, 256, 64, 256, 256, "fp4"), + (2, 7168, 256, 64, 256, 256, "fp4"), + # MOE_FP4_SHAPES group B: 7168/2048/32/8, tile 64/256/256 + (1, 7168, 2048, 64, 256, 256, "fp4"), + (2, 7168, 2048, 64, 256, 256, "fp4"), + # MOE_A8W4_SHAPES GPT-OSS: 3072/3072/128/4, stage1 tile 32/128/256 + (1, 3072, 3072, 32, 128, 256, "fp8"), + # stage2 tile_n2=256, tile_k2=256 + (2, 3072, 3072, 32, 256, 256, "fp8"), +] + + +@pytest.mark.parametrize("stage,model_dim,inter_dim,tile_m,tile_n,tile_k,a_dtype", _RUN_BENCHMARK_CONFIGS) +def test_accepts_run_benchmark_configs(stage, model_dim, inter_dim, tile_m, tile_n, tile_k, a_dtype): + res = check_tile_config( + stage=stage, + model_dim=model_dim, + inter_dim=inter_dim, + tile_m=tile_m, + tile_n=tile_n, + tile_k=tile_k, + a_dtype=a_dtype, + gpu_arch="gfx950", + ) + assert res.legal, f"expected legal, got reason={res.reason!r} ({res.detail})" + assert res.lds_bytes is not None and res.lds_bytes <= LDS_LIMIT_BYTES["gfx950"] + + +def test_rejects_tile_k_bytes_not_div_64(): + # fp4 a_elem_bytes=1 -> tile_k_bytes = tile_k; 288 % 64 != 0. tile_k>=256 ok. + res = check_tile_config(stage=1, model_dim=7168, inter_dim=256, tile_m=64, tile_n=256, tile_k=288, a_dtype="fp4") + assert not res.legal + assert res.reason == "tile_k_bytes_not_div_64" + + +def test_rejects_splitk_k_per_batch_not_div_tile_k(): + # model_dim=7168, k_batch=56 -> k_per_batch=128; 128 % 256 != 0. 
+ res = check_tile_config( + stage=1, model_dim=7168, inter_dim=256, tile_m=64, tile_n=256, tile_k=256, a_dtype="fp4", k_batch=56 + ) + assert not res.legal + assert res.reason == "k_per_batch_not_div_tile_k" + + +def test_rejects_splitk_model_dim_not_div_k_batch(): + res = check_tile_config( + stage=1, model_dim=7168, inter_dim=256, tile_m=64, tile_n=256, tile_k=256, a_dtype="fp4", k_batch=3 + ) + assert not res.legal + assert res.reason == "model_dim_not_div_k_batch" + + +def test_rejects_stage2_model_dim_not_div_tile_n(): + # 7168 % 384 != 0 + res = check_tile_config(stage=2, model_dim=7168, inter_dim=256, tile_m=64, tile_n=384, tile_k=256, a_dtype="fp4") + assert not res.legal + assert res.reason == "model_dim_not_div_tile_n" + + +def test_rejects_stage2_inter_dim_not_div_tile_k(): + # inter_dim=2048, tile_k=768 -> 2048 % 768 != 0 (and 768 % 64 == 0, tile_k>=256) + res = check_tile_config(stage=2, model_dim=7168, inter_dim=2048, tile_m=64, tile_n=256, tile_k=768, a_dtype="fp4") + assert not res.legal + assert res.reason == "inter_dim_not_div_tile_k" + + +def test_rejects_lds_over_limit(): + # A very large tile pushes stage1 LDS past the gfx950 163840-byte limit. + res = check_tile_config(stage=1, model_dim=7168, inter_dim=256, tile_m=512, tile_n=512, tile_k=256, a_dtype="fp8") + assert not res.legal + assert res.reason == "lds_over_limit" + assert res.lds_bytes is not None and res.lds_bytes > LDS_LIMIT_BYTES["gfx950"] + + +def test_stage1_fp4_lds_mirrors_builder_no_vec_pack_halving(): + # Regression: stage1 sizes _single_x_bytes from the FULL lds_stride for fp4 + # (no a_elem_vec_pack division), matching compile_mixed_moe_gemm1. These + # large-tile_k fp4 configs overflow the gfx950 163840-byte limit and MUST be + # rejected -- an earlier version halved the fp4 stride and wrongly accepted + # them. Source-faithful footprints: 230400 and 197632 bytes. 
+ from kernels.moe_tuning import stage1_lds_bytes + + r1 = check_tile_config(stage=1, model_dim=7168, inter_dim=256, tile_m=32, tile_n=32, tile_k=3584, a_dtype="fp4") + assert not r1.legal and r1.reason == "lds_over_limit" + assert stage1_lds_bytes(tile_m=32, tile_n=32, tile_k=3584, a_dtype="fp4") == 230400 + + r2 = check_tile_config(stage=1, model_dim=3072, inter_dim=3072, tile_m=32, tile_n=32, tile_k=3072, a_dtype="fp4") + assert not r2.legal and r2.reason == "lds_over_limit" + assert stage1_lds_bytes(tile_m=32, tile_n=32, tile_k=3072, a_dtype="fp4") == 197632 + + # fp4 and fp8 share the same single_x sizing at stage1 (a_elem_bytes==1, no + # vec-pack division), so equal tiles give equal LDS. + assert stage1_lds_bytes(tile_m=64, tile_n=256, tile_k=256, a_dtype="fp4") == stage1_lds_bytes( + tile_m=64, tile_n=256, tile_k=256, a_dtype="fp8" + ) + + +def test_rejects_fp4_tile_m_too_small(): + res = check_tile_config(stage=1, model_dim=7168, inter_dim=256, tile_m=16, tile_n=256, tile_k=256, a_dtype="fp4") + assert not res.legal + assert res.reason == "tile_m_lt_32" + + +def test_rejects_fp4_tile_k_too_small(): + # tile_k=128 is < 256; still tile_k_bytes % 64 == 0, so the MX-FP4 floor must catch it. 
+ res = check_tile_config(stage=1, model_dim=7168, inter_dim=256, tile_m=64, tile_n=256, tile_k=128, a_dtype="fp4") + assert not res.legal + assert res.reason == "tile_k_lt_256" + + +def test_rejects_bad_stage_and_dtype(): + assert ( + check_tile_config( + stage=3, model_dim=7168, inter_dim=256, tile_m=64, tile_n=256, tile_k=256, a_dtype="fp4" + ).reason + == "bad_stage" + ) + assert ( + check_tile_config( + stage=1, model_dim=7168, inter_dim=256, tile_m=64, tile_n=256, tile_k=256, a_dtype="bogus" + ).reason + == "bad_a_dtype" + ) + + +def test_enumerate_logs_rejections_with_reasons(): + rejected = [] + legal = enumerate_legal_configs( + stage=1, + model_dim=7168, + inter_dim=256, + a_dtype="fp4", + tile_m_choices=(16, 32, 64), # 16 is illegal (tile_m_lt_32) + tile_n_choices=(256,), + tile_k_choices=(128, 256), # 128 is illegal (tile_k_lt_256) + rejected_log=rejected, + ) + # At least one legal config (e.g. tile_m in {32,64}, tile_k=256). + assert legal, "expected some legal configs" + assert all(r.legal for r in legal) + # Every rejection carries a machine-readable reason. + assert rejected, "expected some rejected configs" + assert all(r["reason"] for r in rejected) + reasons = {r["reason"] for r in rejected} + assert "tile_m_lt_32" in reasons + assert "tile_k_lt_256" in reasons From 6323615f922412e2069c93ba7e392a84a3865a1e Mon Sep 17 00:00:00 2001 From: Jin Pan Date: Thu, 25 Jun 2026 01:54:13 +0000 Subject: [PATCH 2/2] moe bench: timed-loop p95 observability + MXFP4 MoE target shapes (#708) - tests/test_common.py + tests/kernels/test_moe_gemm.py: capture and print the per-iteration timed-loop p95 alongside the median for MoE stage1/stage2 (additive observability; no kernel logic change). - scripts/run_benchmark.sh: add the #708 MXFP4 MoE target shapes (DeepSeek V3, Kimi K2, GPT-OSS a4w4; plus a8w4 rows) bracketing the small-token latency and large-shape MFU regimes; document the model->shape mapping. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- scripts/run_benchmark.sh | 30 ++++++++++++++-- tests/kernels/test_moe_gemm.py | 14 ++++++-- tests/test_common.py | 65 ++++++++++++++++++++++++++++++++++ 3 files changed, 104 insertions(+), 5 deletions(-) diff --git a/scripts/run_benchmark.sh b/scripts/run_benchmark.sh index d8ef10a0a..6dd1dbded 100755 --- a/scripts/run_benchmark.sh +++ b/scripts/run_benchmark.sh @@ -177,6 +177,9 @@ MOE_SHAPES=' ' # MoE FP4 shapes (requires --in_dtype fp4, gfx950 only): same format as MOE_SHAPES +# Models: DeepSeek V3 (7168/256/257/9), Kimi K2 (7168/256/384/8), GPT-OSS +# (3072/3072/128/4). Token rows bracket the small-token latency regime +# (tokens<=64) and the large-shape MFU regime (tokens>=4096; targets 16384/32768). MOE_FP4_SHAPES=' 16,7168,256,257,9,64,256,256,256,256 128,7168,256,257,9,64,256,256,256,256 @@ -188,6 +191,13 @@ MOE_FP4_SHAPES=' 2048,7168,2048,32,8,64,256,256,256,256 8192,7168,2048,32,8,64,256,256,256,256 32768,7168,2048,32,8,64,256,256,256,256 +16,7168,256,384,8,64,256,256,256,256 +2048,7168,256,384,8,64,256,256,256,256 +16384,7168,256,384,8,64,256,256,256,256 +32768,7168,256,384,8,64,256,256,256,256 +2048,3072,3072,128,4,32,128,256,256,256 +16384,3072,3072,128,4,32,128,256,256,256 +32768,3072,3072,128,4,32,128,256,256,256 ' # MoE W4A16 groupwise shapes (int4_bf16, group_size=32): same format as MOE_SHAPES @@ -199,14 +209,30 @@ MOE_W4A16_SHAPES=' ' # MoE A8W4 shapes (FP8 activation + MX-FP4 weight, gfx950 only): same format as MOE_SHAPES. -# GPT-OSS inspired: model_dim=3072, inter_dim=3072, E=128, topk=4; sweep tokens from 512 to -# bracket memory- and compute-bound regimes. tile_m>=32 / tile_k>=256 are MX-FP4 layout requirements. +# Models: GPT-OSS (3072/3072/128/4), DeepSeek V3 (7168/256/257/9), DeepSeek V4 +# (7168/512/385/7, a8w4 only), Kimi K2 (7168/256/384/8). tile_m>=32 / tile_k>=256 +# are MX-FP4 layout requirements. 
Token rows bracket the small-token latency +# regime (tokens<=64) and the large-shape MFU regime (tokens>=4096; 16384/32768). MOE_A8W4_SHAPES=' 512,3072,3072,128,4,32,128,256,256,256 1024,3072,3072,128,4,32,128,256,256,256 2048,3072,3072,128,4,32,128,256,256,256 4096,3072,3072,128,4,32,128,256,256,256 8192,3072,3072,128,4,32,128,256,256,256 +16384,3072,3072,128,4,32,128,256,256,256 +32768,3072,3072,128,4,32,128,256,256,256 +16,7168,256,257,9,64,256,256,256,256 +2048,7168,256,257,9,64,256,256,256,256 +16384,7168,256,257,9,64,256,256,256,256 +32768,7168,256,257,9,64,256,256,256,256 +16,7168,512,385,7,64,256,256,256,256 +2048,7168,512,385,7,64,256,256,256,256 +16384,7168,512,385,7,64,256,256,256,256 +32768,7168,512,385,7,64,256,256,256,256 +16,7168,256,384,8,64,256,256,256,256 +2048,7168,256,384,8,64,256,256,256,256 +16384,7168,256,384,8,64,256,256,256,256 +32768,7168,256,384,8,64,256,256,256,256 ' # Memory bound threshold (M or tokens <= threshold => memory bound) diff --git a/tests/kernels/test_moe_gemm.py b/tests/kernels/test_moe_gemm.py index e956f83b5..4aaaba798 100644 --- a/tests/kernels/test_moe_gemm.py +++ b/tests/kernels/test_moe_gemm.py @@ -34,7 +34,15 @@ from flydsl.runtime.device import get_rocm_arch # noqa: E402 from tests.kernels.test_ref import torch_moe_gemm1, torch_moe_gemm2 # noqa: E402 -from tests.test_common import run_perftest, verify_output # noqa: E402 +from tests.test_common import LAST_PERF_DIST, run_perftest, verify_output # noqa: E402 + + +def _perf_p95_suffix(): + """Return ' p95= us' when a timed-loop distribution was captured, else ''.""" + p95 = LAST_PERF_DIST.get("p95") + return f" p95={p95:.1f} us" if p95 is not None else "" + + from tests.utils import pertoken_quant, shuffle_scale_for_int4, shuffle_weight # noqa: E402 ARCH = get_rocm_arch() @@ -798,7 +806,7 @@ def launch(o, x, w, sx, sw, st, eids, sw_sorted): print( f"FlyDSL MoE stage1[{in_dtype}]: " - f"{us:.1f} us, " + f"{us:.1f} us,{_perf_p95_suffix()} " f"{tflops:.2f} 
TFLOPS(logical, M={tokens*topk}), " f"{tbps:.3f} TB/s (doweight_stage1={doweight_stage1})" ) @@ -1560,7 +1568,7 @@ def launch(o, x, w, sx, sw, st, eids, sw_sorted): print( f"FlyDSL MoE stage2 [{kernel_name}] {in_dtype} {'reduce' if use_reduce else 'atomic'} | " f"{model_dim}x{inter_dim}, E={experts}, K={topk}, M_eff={tokens*topk} | " - f"{us:.1f} us, {tflops:.2f} TFLOPS, {tbps:.3f} TB/s" + f"{us:.1f} us,{_perf_p95_suffix()} {tflops:.2f} TFLOPS, {tbps:.3f} TB/s" ) # Optional compare vs aiter stage2. if compare_aiter_ck is None: diff --git a/tests/test_common.py b/tests/test_common.py index 28ac28691..1d061ec15 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -20,6 +20,42 @@ # pd.set_option("display.expand_frame_repr", False) +# Distribution (median + p95, microseconds) of the most recent perftest call, +# populated only when FLYDSL_PERF_DIST is set. Lets callers report a true +# timed-loop median+p95 over num_iters without changing the (data, avg) return +# signature shared by every other caller. +LAST_PERF_DIST = {"median": None, "p95": None, "n_rotate": None} + + +def _percentile(sorted_vals, q): + if not sorted_vals: + return None + idx = max(0, min(len(sorted_vals) - 1, int(round(q * (len(sorted_vals) - 1))))) + return sorted_vals[idx] + + +def _timed_distribution(func, rotate_args, num_iters, time_call): + """Run ``func`` for ``num_iters``, CYCLING through ``rotate_args`` (the + cache-sized argument copies = L2-flush behavior), timing each call with + ``time_call(func, args, kwargs) -> microseconds``. + + Returns ``(data, median_us, p95_us, n_rotate)``. Pure/host-testable: the GPU + event timing is injected via ``time_call`` so the rotation contract (iteration + i uses ``rotate_args[i % n]``) can be unit-tested without a device. 
+ """ + n_rot = len(rotate_args) + latencies = [] + data = None + for i in range(num_iters): + a_i, kw_i = rotate_args[i % n_rot] + us, data = time_call(func, a_i, kw_i) + latencies.append(us) + ordered = sorted(latencies) + n = len(ordered) + median = ordered[n // 2] if n % 2 else (ordered[n // 2 - 1] + ordered[n // 2]) / 2.0 + return data, median, _percentile(ordered, 0.95), n_rot + + def perftest(num_iters=20, num_warmup=3, testGraph=False, num_rotate_args=0, needTrace=False): def decorator(func): def wrapper(*args, **kwargs): @@ -46,6 +82,35 @@ def wrapper(*args, **kwargs): run_iters(num_warmup, func, *args, **kwargs) torch.cuda.synchronize() + # True per-iteration timed-loop distribution (median + p95) over + # num_iters, recorded in LAST_PERF_DIST. Opt-in via FLYDSL_PERF_DIST so + # the default profiler/event path is unchanged. Returns the MEDIAN as + # the central-tendency `avg` so the reported us is the median. + # + # Cycles through the SAME ``rotate_args`` set the default path uses + # (``num`` cache-sized argument copies), so each iteration touches a + # different working set -- this is the L2-flush behavior the recorded + # protocol claims (l2_flush_per_iter=True), not a hot-cache reuse of one + # tensor set. LAST_PERF_DIST["n_rotate"] records how many copies cycled. 
+ if int(os.environ.get("FLYDSL_PERF_DIST", 0)): + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + + def _time_call(fn, a_i, kw_i): + start_event.record() + out = fn(*a_i, **kw_i) + end_event.record() + end_event.synchronize() + return start_event.elapsed_time(end_event) * 1000.0, out # ms -> us + + data, median, p95, n_rot = _timed_distribution(func, rotate_args, num_iters, _time_call) + torch.cuda.synchronize() + LAST_PERF_DIST["median"] = median + LAST_PERF_DIST["p95"] = p95 + LAST_PERF_DIST["n_rotate"] = n_rot + logger.info(f"perf_dist: median={median:.3f} us p95={p95:.3f} us over {num_iters} iters") + return data, median + if int(os.environ.get("FLYDSL_LOG_MORE", 0)): latencies = [] start_event = torch.cuda.Event(enable_timing=True)