NVIDIA · chenfeiz0326 · Apr 18, 2026 · Apr 18, 2026 · Apr 20, 2026 · Apr 20, 2026
@@ -362,15 +362,8 @@ def to_match_keys(self) -> List[str]:
             "l_cp",
             "l_gpus_per_node",
             "l_max_batch_size",
-            "b_disable_overlap_scheduler",
-            "b_enable_chunked_prefill",
             "b_enable_attention_dp",
-            "b_enable_lm_head_tp_in_adp",
             "s_serving_backend",
-            # attention_dp_config
-            "b_attention_dp_balance",
-            # cuda_graph_config
-            "b_enable_cuda_graph",
             # kv_cache_config
             "s_kv_cache_dtype",
             # cache_transceiver_config

diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
@@ -363,9 +363,6 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backe
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_chunked_prefill[quant_dtype=fp8-kv_cache_reuse=True-fp8kv=True-overlap_scheduler=True] SKIP (https://nvbugs/6084445)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_cuda_graph_padding_4gpus[attention_dp=True-mtp_nextn=0] SKIP (https://nvbugs/6084447)
 accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_attention_dp] SKIP (https://nvbugs/6084568)
-perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-UCX] SKIP (https://nvbugs/6088149)
-perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX] SKIP (https://nvbugs/6088149)
-perf/test_perf_sanity.py::test_e2e[aggr_upload-k25_thinking_fp4_2_nodes_grace_blackwell-k25_thinking_fp4_dep8_32k8k] SKIP (https://nvbugs/6088149)
 accuracy/test_llm_api_pytorch.py::TestNemotronNas::test_auto_dtype_tp8 SKIP (https://nvbugs/6070857)
 accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_bf16_4gpu[tp4ep4_cudagraph_overlap_adp_on] SKIP (https://nvbugs/6094068)
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.9-fp8kv=True] SKIP (https://nvbugs/6094066)

diff --git a/tests/scripts/perf-sanity/aggregated/deepseek_r1_fp4_v2_2_nodes_grace_blackwell.yaml b/tests/scripts/perf-sanity/aggregated/deepseek_r1_fp4_v2_2_nodes_grace_blackwell.yaml
@@ -5,7 +5,7 @@ metadata:
 hardware:
   gpus_per_node: 4
 server_configs:
-  # 1k1k configs - DEP8 with CUTLASS, MTP1
+  # 1k1k configs - DEP8 with CUTEDSL, MTP1
   - name: "r1_fp4_v2_dep8_mtp1_1k1k"
     model_name: "deepseek_r1_0528_fp4_v2"
     trust_remote_code: true
@@ -21,7 +21,7 @@ server_configs:
       enable_balance: true
       timeout_iters: 60
     moe_config:
-      backend: 'CUTLASS'
+      backend: 'CUTEDSL'
       use_low_precision_moe_combine: true
     cuda_graph_config:
       enable_padding: true
@@ -42,7 +42,7 @@ server_configs:
         backend: "openai"
         dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json
 
-  # 8k1k configs - DEP8 with CUTLASS, MTP1
+  # 8k1k configs - DEP8 with CUTEDSL, MTP1
   - name: "r1_fp4_v2_dep8_mtp1_8k1k"
     model_name: "deepseek_r1_0528_fp4_v2"
     trust_remote_code: true
@@ -58,7 +58,7 @@ server_configs:
       enable_balance: true
       timeout_iters: 60
     moe_config:
-      backend: 'CUTLASS'
+      backend: 'CUTEDSL'
       use_low_precision_moe_combine: true
     cuda_graph_config:
       enable_padding: true

diff --git a/tests/scripts/perf-sanity/aggregated/deepseek_r1_fp4_v2_blackwell.yaml b/tests/scripts/perf-sanity/aggregated/deepseek_r1_fp4_v2_blackwell.yaml
@@ -36,7 +36,7 @@ server_configs:
         backend: "openai"
         dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json
 
-  # 1k1k configs - DEP8 with CUTLASS, MTP1
+  # 1k1k configs - DEP8 with CUTEDSL, MTP1
   - name: "r1_fp4_v2_dep8_mtp1_1k1k"
     model_name: "deepseek_r1_0528_fp4_v2"
     tensor_parallel_size: 8
@@ -51,7 +51,7 @@ server_configs:
       enable_balance: true
       timeout_iters: 60
     moe_config:
-      backend: 'CUTLASS'
+      backend: 'CUTEDSL'
       use_low_precision_moe_combine: true
     cuda_graph_config:
       enable_padding: true
@@ -103,7 +103,7 @@ server_configs:
         backend: "openai"
         dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json
 
-  # 8k1k configs - DEP8 with CUTLASS, MTP1
+  # 8k1k configs - DEP8 with CUTEDSL, MTP1
   - name: "r1_fp4_v2_dep8_mtp1_8k1k"
     model_name: "deepseek_r1_0528_fp4_v2"
     tensor_parallel_size: 8
@@ -118,7 +118,7 @@ server_configs:
       enable_balance: true
       timeout_iters: 60
     moe_config:
-      backend: 'CUTLASS'
+      backend: 'CUTEDSL'
       use_low_precision_moe_combine: true
     cuda_graph_config:
       enable_padding: true

diff --git a/tests/scripts/perf-sanity/aggregated/deepseek_r1_fp4_v2_grace_blackwell.yaml b/tests/scripts/perf-sanity/aggregated/deepseek_r1_fp4_v2_grace_blackwell.yaml
@@ -6,7 +6,7 @@ metadata:
 hardware:
   gpus_per_node: 4
 server_configs:
-  # 1k1k configs - DEP4 with CUTLASS, MTP1
+  # 1k1k configs - DEP4 with CUTEDSL, MTP1
   - name: "r1_fp4_v2_dep4_mtp1_1k1k"
     model_name: "deepseek_r1_0528_fp4_v2"
     tensor_parallel_size: 4
@@ -21,7 +21,7 @@ server_configs:
       enable_balance: true
       timeout_iters: 60
     moe_config:
-      backend: 'CUTLASS'
+      backend: 'CUTEDSL'
       use_low_precision_moe_combine: true
     cuda_graph_config:
       enable_padding: true
@@ -104,7 +104,7 @@ server_configs:
         backend: "openai"
         dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json
 
-  # 8k1k configs - DEP4 with CUTLASS, MTP1
+  # 8k1k configs - DEP4 with CUTEDSL, MTP1
   - name: "r1_fp4_v2_dep4_mtp1_8k1k"
     model_name: "deepseek_r1_0528_fp4_v2"
     tensor_parallel_size: 4
@@ -119,7 +119,7 @@ server_configs:
       enable_balance: true
       timeout_iters: 60
     moe_config:
-      backend: 'CUTLASS'
+      backend: 'CUTEDSL'
       use_low_precision_moe_combine: true
     cuda_graph_config:
       enable_padding: true
@@ -202,7 +202,7 @@ server_configs:
         backend: "openai"
         dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json
 
-  # 1k8k configs - DEP4 with CUTLASS, MTP1
+  # 1k8k configs - DEP4 with CUTEDSL, MTP1
   - name: "r1_fp4_v2_dep4_mtp1_1k8k"
     model_name: "deepseek_r1_0528_fp4_v2"
     tensor_parallel_size: 4
@@ -217,7 +217,7 @@ server_configs:
       enable_balance: true
       timeout_iters: 60
     moe_config:
-      backend: 'CUTLASS'
+      backend: 'CUTEDSL'
       use_low_precision_moe_combine: true
     cuda_graph_config:
       enable_padding: true

diff --git a/tests/scripts/perf-sanity/aggregated/deepseek_v32_fp4_blackwell.yaml b/tests/scripts/perf-sanity/aggregated/deepseek_v32_fp4_blackwell.yaml
@@ -36,7 +36,7 @@ server_configs:
         backend: "openai"
         dataset_file: datasets/perf-ci/deepseek_v32-8k1k-20480-ratio-1_for_serve.json
 
-  # 8k1k configs - DEP8 with CUTLASS, MTP1
+  # 8k1k configs - DEP8 with CUTEDSL, MTP1
   - name: "v32_fp4_dep8_mtp1_8k1k"
     model_name: "deepseek_v32_fp4"
     tensor_parallel_size: 8
@@ -51,7 +51,7 @@ server_configs:
       enable_balance: true
       timeout_iters: 60
     moe_config:
-      backend: 'CUTLASS'
+      backend: 'CUTEDSL'
       use_low_precision_moe_combine: true
     cuda_graph_config:
       enable_padding: true

diff --git a/tests/scripts/perf-sanity/aggregated/deepseek_v32_fp4_grace_blackwell.yaml b/tests/scripts/perf-sanity/aggregated/deepseek_v32_fp4_grace_blackwell.yaml
@@ -36,7 +36,7 @@ server_configs:
         backend: "openai"
         dataset_file: datasets/perf-ci/deepseek_v32-1k1k-20480-ratio-1_for_serve.json
 
-  # 1k1k configs - DEP4 with CUTLASS, MTP1
+  # 1k1k configs - DEP4 with CUTEDSL, MTP1
   - name: "v32_fp4_dep4_mtp1_1k1k"
     model_name: "deepseek_v32_fp4"
     tensor_parallel_size: 4
@@ -51,7 +51,7 @@ server_configs:
       enable_balance: true
       timeout_iters: 60
     moe_config:
-      backend: 'CUTLASS'
+      backend: 'CUTEDSL'
       use_low_precision_moe_combine: true
     cuda_graph_config:
       enable_padding: true
@@ -103,7 +103,7 @@ server_configs:
         backend: "openai"
         dataset_file: datasets/perf-ci/deepseek_v32-8k1k-20480-ratio-1_for_serve.json
 
-  # 8k1k configs - DEP4 with CUTLASS, MTP1
+  # 8k1k configs - DEP4 with CUTEDSL, MTP1
   - name: "v32_fp4_dep4_mtp1_8k1k"
     model_name: "deepseek_v32_fp4"
     tensor_parallel_size: 4
@@ -118,7 +118,7 @@ server_configs:
       enable_balance: true
       timeout_iters: 60
     moe_config:
-      backend: 'CUTLASS'
+      backend: 'CUTEDSL'
       use_low_precision_moe_combine: true
     cuda_graph_config:
       enable_padding: true

diff --git a/tests/scripts/perf-sanity/aggregated/gb300_deepseek_r1_fp4_v2_2_nodes_grace_blackwell.yaml b/tests/scripts/perf-sanity/aggregated/gb300_deepseek_r1_fp4_v2_2_nodes_grace_blackwell.yaml
@@ -5,7 +5,7 @@ metadata:
 hardware:
   gpus_per_node: 4
 server_configs:
-  # 1k1k configs - DEP8 with CUTLASS, MTP1
+  # 1k1k configs - DEP8 with CUTEDSL, MTP1
   - name: "r1_fp4_v2_dep8_mtp1_1k1k"
     model_name: "deepseek_r1_0528_fp4_v2"
     trust_remote_code: true
@@ -21,7 +21,7 @@ server_configs:
       enable_balance: true
       timeout_iters: 60
     moe_config:
-      backend: 'CUTLASS'
+      backend: 'CUTEDSL'
       use_low_precision_moe_combine: true
     cuda_graph_config:
       enable_padding: true
@@ -42,7 +42,7 @@ server_configs:
         random_range_ratio: 0.2
         backend: "openai"
 
-  # 8k1k configs - DEP8 with CUTLASS, MTP1
+  # 8k1k configs - DEP8 with CUTEDSL, MTP1
   - name: "r1_fp4_v2_dep8_mtp1_8k1k"
     model_name: "deepseek_r1_0528_fp4_v2"
     trust_remote_code: true
@@ -58,7 +58,7 @@ server_configs:
       enable_balance: true
       timeout_iters: 60
     moe_config:
-      backend: 'CUTLASS'
+      backend: 'CUTEDSL'
       use_low_precision_moe_combine: true
     cuda_graph_config:
       enable_padding: true

diff --git a/tests/scripts/perf-sanity/aggregated/k25_thinking_fp4_2_nodes_grace_blackwell.yaml b/tests/scripts/perf-sanity/aggregated/k25_thinking_fp4_2_nodes_grace_blackwell.yaml
@@ -36,7 +36,7 @@ server_configs:
         trust_remote_code: true
         dataset_file: <dataset_file>
 
-  # 32k8k configs - DEP8 with CUTLASS
+  # 32k8k configs - DEP8 with CUTEDSL
   - name: "k25_thinking_fp4_dep8_32k8k"
     model_name: "k25_thinking_fp4"
     tensor_parallel_size: 8
@@ -53,7 +53,7 @@ server_configs:
       enable_balance: true
       timeout_iters: 60
     moe_config:
-      backend: 'CUTLASS'
+      backend: 'CUTEDSL'
       use_low_precision_moe_combine: true
     cuda_graph_config:
       enable_padding: true

diff --git a/tests/scripts/perf-sanity/aggregated/k25_thinking_fp4_blackwell.yaml b/tests/scripts/perf-sanity/aggregated/k25_thinking_fp4_blackwell.yaml
@@ -35,7 +35,7 @@ server_configs:
         trust_remote_code: true
         dataset_file: <dataset_file>
 
-  # 8k1k configs - DEP8 with CUTLASS
+  # 8k1k configs - DEP8 with CUTEDSL
   - name: "k25_thinking_fp4_dep8_8k1k"
     model_name: "k25_thinking_fp4"
     tensor_parallel_size: 8
@@ -51,7 +51,7 @@ server_configs:
       enable_balance: true
       timeout_iters: 60
     moe_config:
-      backend: 'CUTLASS'
+      backend: 'CUTEDSL'
       use_low_precision_moe_combine: true
     cuda_graph_config:
       enable_padding: true
@@ -101,7 +101,7 @@ server_configs:
         trust_remote_code: true
         dataset_file: <dataset_file>
 
-  # 32k8k configs - DEP8 with CUTLASS
+  # 32k8k configs - DEP8 with CUTEDSL
   - name: "k25_thinking_fp4_dep8_32k8k"
     model_name: "k25_thinking_fp4"
     tensor_parallel_size: 8
@@ -118,7 +118,7 @@ server_configs:
       enable_balance: true
       timeout_iters: 60
     moe_config:
-      backend: 'CUTLASS'
+      backend: 'CUTEDSL'
       use_low_precision_moe_combine: true
     cuda_graph_config:
       enable_padding: true

diff --git a/tests/scripts/perf-sanity/aggregated/k25_thinking_fp4_grace_blackwell.yaml b/tests/scripts/perf-sanity/aggregated/k25_thinking_fp4_grace_blackwell.yaml
@@ -35,7 +35,7 @@ server_configs:
         trust_remote_code: true
         dataset_file: <dataset_file>
 
-  # 8k1k configs - DEP4 with CUTLASS
+  # 8k1k configs - DEP4 with CUTEDSL
   - name: "k25_thinking_fp4_dep4_8k1k"
     model_name: "k25_thinking_fp4"
     tensor_parallel_size: 4
@@ -51,7 +51,7 @@ server_configs:
       enable_balance: true
       timeout_iters: 60
     moe_config:
-      backend: 'CUTLASS'
+      backend: 'CUTEDSL'
       use_low_precision_moe_combine: true
     cuda_graph_config:
       enable_padding: true

diff --git a/tests/scripts/perf-sanity/aggregated/k2_thinking_fp4_2_nodes_grace_blackwell.yaml b/tests/scripts/perf-sanity/aggregated/k2_thinking_fp4_2_nodes_grace_blackwell.yaml
@@ -36,7 +36,7 @@ server_configs:
         trust_remote_code: true
         dataset_file: datasets/perf-ci/k2_thinking-32k8k-20480-ratio-1_for_serve.json
 
-  # 32k8k configs - DEP8 with CUTLASS
+  # 32k8k configs - DEP8 with CUTEDSL
   - name: "k2_thinking_fp4_dep8_32k8k"
     model_name: "k2_thinking_fp4"
     tensor_parallel_size: 8
@@ -53,7 +53,7 @@ server_configs:
       enable_balance: true
       timeout_iters: 60
     moe_config:
-      backend: 'CUTLASS'
+      backend: 'CUTEDSL'
       use_low_precision_moe_combine: true
     cuda_graph_config:
       enable_padding: true

diff --git a/tests/scripts/perf-sanity/aggregated/k2_thinking_fp4_blackwell.yaml b/tests/scripts/perf-sanity/aggregated/k2_thinking_fp4_blackwell.yaml
@@ -35,7 +35,7 @@ server_configs:
         trust_remote_code: true
         dataset_file: datasets/perf-ci/k2_thinking-8k1k-20480-ratio-1_for_serve.json
 
-  # 8k1k configs - DEP8 with CUTLASS
+  # 8k1k configs - DEP8 with CUTEDSL
   - name: "k2_thinking_fp4_dep8_8k1k"
     model_name: "k2_thinking_fp4"
     tensor_parallel_size: 8
@@ -51,7 +51,7 @@ server_configs:
       enable_balance: true
       timeout_iters: 60
     moe_config:
-      backend: 'CUTLASS'
+      backend: 'CUTEDSL'
       use_low_precision_moe_combine: true
     cuda_graph_config:
       enable_padding: true
@@ -101,7 +101,7 @@ server_configs:
         trust_remote_code: true
         dataset_file: datasets/perf-ci/k2_thinking-32k8k-20480-ratio-1_for_serve.json
 
-  # 32k8k configs - DEP8 with CUTLASS
+  # 32k8k configs - DEP8 with CUTEDSL
   - name: "k2_thinking_fp4_dep8_32k8k"
     model_name: "k2_thinking_fp4"
     tensor_parallel_size: 8
@@ -118,7 +118,7 @@ server_configs:
       enable_balance: true
       timeout_iters: 60
     moe_config:
-      backend: 'CUTLASS'
+      backend: 'CUTEDSL'
       use_low_precision_moe_combine: true
     cuda_graph_config:
       enable_padding: true

diff --git a/tests/scripts/perf-sanity/aggregated/k2_thinking_fp4_grace_blackwell.yaml b/tests/scripts/perf-sanity/aggregated/k2_thinking_fp4_grace_blackwell.yaml
@@ -35,7 +35,7 @@ server_configs:
         trust_remote_code: true
         dataset_file: datasets/perf-ci/k2_thinking-8k1k-20480-ratio-1_for_serve.json
 
-  # 8k1k configs - DEP4 with CUTLASS
+  # 8k1k configs - DEP4 with CUTEDSL
   - name: "k2_thinking_fp4_dep4_8k1k"
     model_name: "k2_thinking_fp4"
     tensor_parallel_size: 4
@@ -51,7 +51,7 @@ server_configs:
       enable_balance: true
       timeout_iters: 60
     moe_config:
-      backend: 'CUTLASS'
+      backend: 'CUTEDSL'
       use_low_precision_moe_combine: true
     cuda_graph_config:
       enable_padding: true