diff --git a/tests/integration/defs/perf/test_perf_sanity.py b/tests/integration/defs/perf/test_perf_sanity.py index 4a24d6f748d3..2b89d7557c77 100644 --- a/tests/integration/defs/perf/test_perf_sanity.py +++ b/tests/integration/defs/perf/test_perf_sanity.py @@ -362,15 +362,8 @@ def to_match_keys(self) -> List[str]: "l_cp", "l_gpus_per_node", "l_max_batch_size", - "b_disable_overlap_scheduler", - "b_enable_chunked_prefill", "b_enable_attention_dp", - "b_enable_lm_head_tp_in_adp", "s_serving_backend", - # attention_dp_config - "b_attention_dp_balance", - # cuda_graph_config - "b_enable_cuda_graph", # kv_cache_config "s_kv_cache_dtype", # cache_transceiver_config diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index f151b45321ac..f77373bad167 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -363,9 +363,6 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backe accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_chunked_prefill[quant_dtype=fp8-kv_cache_reuse=True-fp8kv=True-overlap_scheduler=True] SKIP (https://nvbugs/6084445) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_cuda_graph_padding_4gpus[attention_dp=True-mtp_nextn=0] SKIP (https://nvbugs/6084447) accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_attention_dp] SKIP (https://nvbugs/6084568) -perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-UCX] SKIP (https://nvbugs/6088149) -perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX] SKIP (https://nvbugs/6088149) -perf/test_perf_sanity.py::test_e2e[aggr_upload-k25_thinking_fp4_2_nodes_grace_blackwell-k25_thinking_fp4_dep8_32k8k] SKIP (https://nvbugs/6088149) accuracy/test_llm_api_pytorch.py::TestNemotronNas::test_auto_dtype_tp8 SKIP (https://nvbugs/6070857) accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_bf16_4gpu[tp4ep4_cudagraph_overlap_adp_on] SKIP (https://nvbugs/6094068) accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.9-fp8kv=True] SKIP (https://nvbugs/6094066) diff --git a/tests/scripts/perf-sanity/aggregated/deepseek_r1_fp4_v2_2_nodes_grace_blackwell.yaml b/tests/scripts/perf-sanity/aggregated/deepseek_r1_fp4_v2_2_nodes_grace_blackwell.yaml index 131e77c5ebc9..0e194a0b1997 100644 --- a/tests/scripts/perf-sanity/aggregated/deepseek_r1_fp4_v2_2_nodes_grace_blackwell.yaml +++ b/tests/scripts/perf-sanity/aggregated/deepseek_r1_fp4_v2_2_nodes_grace_blackwell.yaml @@ -5,7 +5,7 @@ metadata: hardware: gpus_per_node: 4 server_configs: - # 1k1k configs - DEP8 with CUTLASS, MTP1 + # 1k1k configs - DEP8 with CUTEDSL, MTP1 - name: "r1_fp4_v2_dep8_mtp1_1k1k" model_name: "deepseek_r1_0528_fp4_v2" trust_remote_code: true @@ -21,7 +21,7 @@ server_configs: enable_balance: true timeout_iters: 60 moe_config: - backend: 'CUTLASS' + backend: 'CUTEDSL' use_low_precision_moe_combine: true cuda_graph_config: enable_padding: true @@ -42,7 +42,7 @@ server_configs: backend: "openai" dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json - # 8k1k configs - DEP8 with CUTLASS, MTP1 + # 8k1k configs - DEP8 with CUTEDSL, MTP1 - name: "r1_fp4_v2_dep8_mtp1_8k1k" model_name: "deepseek_r1_0528_fp4_v2" trust_remote_code: true @@ -58,7 +58,7 @@ server_configs: enable_balance: true timeout_iters: 60 moe_config: - backend: 'CUTLASS' + backend: 'CUTEDSL' use_low_precision_moe_combine: true cuda_graph_config: enable_padding: true diff --git a/tests/scripts/perf-sanity/aggregated/deepseek_r1_fp4_v2_blackwell.yaml b/tests/scripts/perf-sanity/aggregated/deepseek_r1_fp4_v2_blackwell.yaml index 5dc374cc1383..df123fbc5df5 100644 --- a/tests/scripts/perf-sanity/aggregated/deepseek_r1_fp4_v2_blackwell.yaml +++ b/tests/scripts/perf-sanity/aggregated/deepseek_r1_fp4_v2_blackwell.yaml @@ -36,7 +36,7 @@ server_configs: backend: "openai" dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json - # 1k1k configs - DEP8 with CUTLASS, MTP1 + # 1k1k configs - DEP8 with CUTEDSL, MTP1 - name: "r1_fp4_v2_dep8_mtp1_1k1k" model_name: "deepseek_r1_0528_fp4_v2" tensor_parallel_size: 8 @@ -51,7 +51,7 @@ server_configs: enable_balance: true timeout_iters: 60 moe_config: - backend: 'CUTLASS' + backend: 'CUTEDSL' use_low_precision_moe_combine: true cuda_graph_config: enable_padding: true @@ -103,7 +103,7 @@ server_configs: backend: "openai" dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json - # 8k1k configs - DEP8 with CUTLASS, MTP1 + # 8k1k configs - DEP8 with CUTEDSL, MTP1 - name: "r1_fp4_v2_dep8_mtp1_8k1k" model_name: "deepseek_r1_0528_fp4_v2" tensor_parallel_size: 8 @@ -118,7 +118,7 @@ server_configs: enable_balance: true timeout_iters: 60 moe_config: - backend: 'CUTLASS' + backend: 'CUTEDSL' use_low_precision_moe_combine: true cuda_graph_config: enable_padding: true diff --git a/tests/scripts/perf-sanity/aggregated/deepseek_r1_fp4_v2_grace_blackwell.yaml b/tests/scripts/perf-sanity/aggregated/deepseek_r1_fp4_v2_grace_blackwell.yaml index e6f02c3f2a6d..621a7542a60d 100644 --- a/tests/scripts/perf-sanity/aggregated/deepseek_r1_fp4_v2_grace_blackwell.yaml +++ b/tests/scripts/perf-sanity/aggregated/deepseek_r1_fp4_v2_grace_blackwell.yaml @@ -6,7 +6,7 @@ metadata: hardware: gpus_per_node: 4 server_configs: - # 1k1k configs - DEP4 with CUTLASS, MTP1 + # 1k1k configs - DEP4 with CUTEDSL, MTP1 - name: "r1_fp4_v2_dep4_mtp1_1k1k" model_name: "deepseek_r1_0528_fp4_v2" tensor_parallel_size: 4 @@ -21,7 +21,7 @@ server_configs: enable_balance: true timeout_iters: 60 moe_config: - backend: 'CUTLASS' + backend: 'CUTEDSL' use_low_precision_moe_combine: true cuda_graph_config: enable_padding: true @@ -104,7 +104,7 @@ server_configs: backend: "openai" dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json - # 8k1k configs - DEP4 with CUTLASS, MTP1 + # 8k1k configs - DEP4 with CUTEDSL, MTP1 - name: "r1_fp4_v2_dep4_mtp1_8k1k" model_name: "deepseek_r1_0528_fp4_v2" tensor_parallel_size: 4 @@ -119,7 +119,7 @@ server_configs: enable_balance: true timeout_iters: 60 moe_config: - backend: 'CUTLASS' + backend: 'CUTEDSL' use_low_precision_moe_combine: true cuda_graph_config: enable_padding: true @@ -202,7 +202,7 @@ server_configs: backend: "openai" dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json - # 1k8k configs - DEP4 with CUTLASS, MTP1 + # 1k8k configs - DEP4 with CUTEDSL, MTP1 - name: "r1_fp4_v2_dep4_mtp1_1k8k" model_name: "deepseek_r1_0528_fp4_v2" tensor_parallel_size: 4 @@ -217,7 +217,7 @@ server_configs: enable_balance: true timeout_iters: 60 moe_config: - backend: 'CUTLASS' + backend: 'CUTEDSL' use_low_precision_moe_combine: true cuda_graph_config: enable_padding: true diff --git a/tests/scripts/perf-sanity/aggregated/deepseek_v32_fp4_blackwell.yaml b/tests/scripts/perf-sanity/aggregated/deepseek_v32_fp4_blackwell.yaml index ac307a47c827..b45d899d9e41 100644 --- a/tests/scripts/perf-sanity/aggregated/deepseek_v32_fp4_blackwell.yaml +++ b/tests/scripts/perf-sanity/aggregated/deepseek_v32_fp4_blackwell.yaml @@ -36,7 +36,7 @@ server_configs: backend: "openai" dataset_file: datasets/perf-ci/deepseek_v32-8k1k-20480-ratio-1_for_serve.json - # 8k1k configs - DEP8 with CUTLASS, MTP1 + # 8k1k configs - DEP8 with CUTEDSL, MTP1 - name: "v32_fp4_dep8_mtp1_8k1k" model_name: "deepseek_v32_fp4" tensor_parallel_size: 8 @@ -51,7 +51,7 @@ server_configs: enable_balance: true timeout_iters: 60 moe_config: - backend: 'CUTLASS' + backend: 'CUTEDSL' use_low_precision_moe_combine: true cuda_graph_config: enable_padding: true diff --git a/tests/scripts/perf-sanity/aggregated/deepseek_v32_fp4_grace_blackwell.yaml b/tests/scripts/perf-sanity/aggregated/deepseek_v32_fp4_grace_blackwell.yaml index 39dd452d3d3f..d8b9d1735540 100644 --- a/tests/scripts/perf-sanity/aggregated/deepseek_v32_fp4_grace_blackwell.yaml +++ b/tests/scripts/perf-sanity/aggregated/deepseek_v32_fp4_grace_blackwell.yaml @@ -36,7 +36,7 @@ server_configs: backend: "openai" dataset_file: datasets/perf-ci/deepseek_v32-1k1k-20480-ratio-1_for_serve.json - # 1k1k configs - DEP4 with CUTLASS, MTP1 + # 1k1k configs - DEP4 with CUTEDSL, MTP1 - name: "v32_fp4_dep4_mtp1_1k1k" model_name: "deepseek_v32_fp4" tensor_parallel_size: 4 @@ -51,7 +51,7 @@ server_configs: enable_balance: true timeout_iters: 60 moe_config: - backend: 'CUTLASS' + backend: 'CUTEDSL' use_low_precision_moe_combine: true cuda_graph_config: enable_padding: true @@ -103,7 +103,7 @@ server_configs: backend: "openai" dataset_file: datasets/perf-ci/deepseek_v32-8k1k-20480-ratio-1_for_serve.json - # 8k1k configs - DEP4 with CUTLASS, MTP1 + # 8k1k configs - DEP4 with CUTEDSL, MTP1 - name: "v32_fp4_dep4_mtp1_8k1k" model_name: "deepseek_v32_fp4" tensor_parallel_size: 4 @@ -118,7 +118,7 @@ server_configs: enable_balance: true timeout_iters: 60 moe_config: - backend: 'CUTLASS' + backend: 'CUTEDSL' use_low_precision_moe_combine: true cuda_graph_config: enable_padding: true diff --git a/tests/scripts/perf-sanity/aggregated/gb300_deepseek_r1_fp4_v2_2_nodes_grace_blackwell.yaml b/tests/scripts/perf-sanity/aggregated/gb300_deepseek_r1_fp4_v2_2_nodes_grace_blackwell.yaml index ff17bb63ba41..465a4abf8a4c 100644 --- a/tests/scripts/perf-sanity/aggregated/gb300_deepseek_r1_fp4_v2_2_nodes_grace_blackwell.yaml +++ b/tests/scripts/perf-sanity/aggregated/gb300_deepseek_r1_fp4_v2_2_nodes_grace_blackwell.yaml @@ -5,7 +5,7 @@ metadata: hardware: gpus_per_node: 4 server_configs: - # 1k1k configs - DEP8 with CUTLASS, MTP1 + # 1k1k configs - DEP8 with CUTEDSL, MTP1 - name: "r1_fp4_v2_dep8_mtp1_1k1k" model_name: "deepseek_r1_0528_fp4_v2" trust_remote_code: true @@ -21,7 +21,7 @@ server_configs: enable_balance: true timeout_iters: 60 moe_config: - backend: 'CUTLASS' + backend: 'CUTEDSL' use_low_precision_moe_combine: true cuda_graph_config: enable_padding: true @@ -42,7 +42,7 @@ server_configs: random_range_ratio: 0.2 backend: "openai" - # 8k1k configs - DEP8 with CUTLASS, MTP1 + # 8k1k configs - DEP8 with CUTEDSL, MTP1 - name: "r1_fp4_v2_dep8_mtp1_8k1k" model_name: "deepseek_r1_0528_fp4_v2" trust_remote_code: true @@ -58,7 +58,7 @@ server_configs: enable_balance: true timeout_iters: 60 moe_config: - backend: 'CUTLASS' + backend: 'CUTEDSL' use_low_precision_moe_combine: true cuda_graph_config: enable_padding: true diff --git a/tests/scripts/perf-sanity/aggregated/k25_thinking_fp4_2_nodes_grace_blackwell.yaml b/tests/scripts/perf-sanity/aggregated/k25_thinking_fp4_2_nodes_grace_blackwell.yaml index 5f3cccc52005..864cc8792ab5 100644 --- a/tests/scripts/perf-sanity/aggregated/k25_thinking_fp4_2_nodes_grace_blackwell.yaml +++ b/tests/scripts/perf-sanity/aggregated/k25_thinking_fp4_2_nodes_grace_blackwell.yaml @@ -36,7 +36,7 @@ server_configs: trust_remote_code: true dataset_file: - # 32k8k configs - DEP8 with CUTLASS + # 32k8k configs - DEP8 with CUTEDSL - name: "k25_thinking_fp4_dep8_32k8k" model_name: "k25_thinking_fp4" tensor_parallel_size: 8 @@ -53,7 +53,7 @@ server_configs: enable_balance: true timeout_iters: 60 moe_config: - backend: 'CUTLASS' + backend: 'CUTEDSL' use_low_precision_moe_combine: true cuda_graph_config: enable_padding: true diff --git a/tests/scripts/perf-sanity/aggregated/k25_thinking_fp4_blackwell.yaml b/tests/scripts/perf-sanity/aggregated/k25_thinking_fp4_blackwell.yaml index 9b2f6f5fe925..fe1f29489771 100644 --- a/tests/scripts/perf-sanity/aggregated/k25_thinking_fp4_blackwell.yaml +++ b/tests/scripts/perf-sanity/aggregated/k25_thinking_fp4_blackwell.yaml @@ -35,7 +35,7 @@ server_configs: trust_remote_code: true dataset_file: - # 8k1k configs - DEP8 with CUTLASS + # 8k1k configs - DEP8 with CUTEDSL - name: "k25_thinking_fp4_dep8_8k1k" model_name: "k25_thinking_fp4" tensor_parallel_size: 8 @@ -51,7 +51,7 @@ server_configs: enable_balance: true timeout_iters: 60 moe_config: - backend: 'CUTLASS' + backend: 'CUTEDSL' use_low_precision_moe_combine: true cuda_graph_config: enable_padding: true @@ -101,7 +101,7 @@ server_configs: trust_remote_code: true dataset_file: - # 32k8k configs - DEP8 with CUTLASS + # 32k8k configs - DEP8 with CUTEDSL - name: "k25_thinking_fp4_dep8_32k8k" model_name: "k25_thinking_fp4" tensor_parallel_size: 8 @@ -118,7 +118,7 @@ server_configs: enable_balance: true timeout_iters: 60 moe_config: - backend: 'CUTLASS' + backend: 'CUTEDSL' use_low_precision_moe_combine: true cuda_graph_config: enable_padding: true diff --git a/tests/scripts/perf-sanity/aggregated/k25_thinking_fp4_grace_blackwell.yaml b/tests/scripts/perf-sanity/aggregated/k25_thinking_fp4_grace_blackwell.yaml index 749a71bd80a3..35335416f193 100644 --- a/tests/scripts/perf-sanity/aggregated/k25_thinking_fp4_grace_blackwell.yaml +++ b/tests/scripts/perf-sanity/aggregated/k25_thinking_fp4_grace_blackwell.yaml @@ -35,7 +35,7 @@ server_configs: trust_remote_code: true dataset_file: - # 8k1k configs - DEP4 with CUTLASS + # 8k1k configs - DEP4 with CUTEDSL - name: "k25_thinking_fp4_dep4_8k1k" model_name: "k25_thinking_fp4" tensor_parallel_size: 4 @@ -51,7 +51,7 @@ server_configs: enable_balance: true timeout_iters: 60 moe_config: - backend: 'CUTLASS' + backend: 'CUTEDSL' use_low_precision_moe_combine: true cuda_graph_config: enable_padding: true diff --git a/tests/scripts/perf-sanity/aggregated/k2_thinking_fp4_2_nodes_grace_blackwell.yaml b/tests/scripts/perf-sanity/aggregated/k2_thinking_fp4_2_nodes_grace_blackwell.yaml index 4e3d4d5e00c2..d4c9a30fb56c 100644 --- a/tests/scripts/perf-sanity/aggregated/k2_thinking_fp4_2_nodes_grace_blackwell.yaml +++ b/tests/scripts/perf-sanity/aggregated/k2_thinking_fp4_2_nodes_grace_blackwell.yaml @@ -36,7 +36,7 @@ server_configs: trust_remote_code: true dataset_file: datasets/perf-ci/k2_thinking-32k8k-20480-ratio-1_for_serve.json - # 32k8k configs - DEP8 with CUTLASS + # 32k8k configs - DEP8 with CUTEDSL - name: "k2_thinking_fp4_dep8_32k8k" model_name: "k2_thinking_fp4" tensor_parallel_size: 8 @@ -53,7 +53,7 @@ server_configs: enable_balance: true timeout_iters: 60 moe_config: - backend: 'CUTLASS' + backend: 'CUTEDSL' use_low_precision_moe_combine: true cuda_graph_config: enable_padding: true diff --git a/tests/scripts/perf-sanity/aggregated/k2_thinking_fp4_blackwell.yaml b/tests/scripts/perf-sanity/aggregated/k2_thinking_fp4_blackwell.yaml index a8bda52462c0..180cf3eb9d03 100644 --- a/tests/scripts/perf-sanity/aggregated/k2_thinking_fp4_blackwell.yaml +++ b/tests/scripts/perf-sanity/aggregated/k2_thinking_fp4_blackwell.yaml @@ -35,7 +35,7 @@ server_configs: trust_remote_code: true dataset_file: datasets/perf-ci/k2_thinking-8k1k-20480-ratio-1_for_serve.json - # 8k1k configs - DEP8 with CUTLASS + # 8k1k configs - DEP8 with CUTEDSL - name: "k2_thinking_fp4_dep8_8k1k" model_name: "k2_thinking_fp4" tensor_parallel_size: 8 @@ -51,7 +51,7 @@ server_configs: enable_balance: true timeout_iters: 60 moe_config: - backend: 'CUTLASS' + backend: 'CUTEDSL' use_low_precision_moe_combine: true cuda_graph_config: enable_padding: true @@ -101,7 +101,7 @@ server_configs: trust_remote_code: true dataset_file: datasets/perf-ci/k2_thinking-32k8k-20480-ratio-1_for_serve.json - # 32k8k configs - DEP8 with CUTLASS + # 32k8k configs - DEP8 with CUTEDSL - name: "k2_thinking_fp4_dep8_32k8k" model_name: "k2_thinking_fp4" tensor_parallel_size: 8 @@ -118,7 +118,7 @@ server_configs: enable_balance: true timeout_iters: 60 moe_config: - backend: 'CUTLASS' + backend: 'CUTEDSL' use_low_precision_moe_combine: true cuda_graph_config: enable_padding: true diff --git a/tests/scripts/perf-sanity/aggregated/k2_thinking_fp4_grace_blackwell.yaml b/tests/scripts/perf-sanity/aggregated/k2_thinking_fp4_grace_blackwell.yaml index 209230ff0bb7..9b2f3e165a59 100644 --- a/tests/scripts/perf-sanity/aggregated/k2_thinking_fp4_grace_blackwell.yaml +++ b/tests/scripts/perf-sanity/aggregated/k2_thinking_fp4_grace_blackwell.yaml @@ -35,7 +35,7 @@ server_configs: trust_remote_code: true dataset_file: datasets/perf-ci/k2_thinking-8k1k-20480-ratio-1_for_serve.json - # 8k1k configs - DEP4 with CUTLASS + # 8k1k configs - DEP4 with CUTEDSL - name: "k2_thinking_fp4_dep4_8k1k" model_name: "k2_thinking_fp4" tensor_parallel_size: 4 @@ -51,7 +51,7 @@ server_configs: enable_balance: true timeout_iters: 60 moe_config: - backend: 'CUTLASS' + backend: 'CUTEDSL' use_low_precision_moe_combine: true cuda_graph_config: enable_padding: true