Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 0 additions & 7 deletions tests/integration/defs/perf/test_perf_sanity.py
Original file line number Diff line number Diff line change
Expand Up @@ -362,15 +362,8 @@ def to_match_keys(self) -> List[str]:
"l_cp",
"l_gpus_per_node",
"l_max_batch_size",
"b_disable_overlap_scheduler",
"b_enable_chunked_prefill",
"b_enable_attention_dp",
"b_enable_lm_head_tp_in_adp",
"s_serving_backend",
# attention_dp_config
"b_attention_dp_balance",
# cuda_graph_config
"b_enable_cuda_graph",
# kv_cache_config
"s_kv_cache_dtype",
# cache_transceiver_config
Expand Down
3 changes: 0 additions & 3 deletions tests/integration/test_lists/waives.txt
Original file line number Diff line number Diff line change
Expand Up @@ -362,9 +362,6 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backe
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_chunked_prefill[quant_dtype=fp8-kv_cache_reuse=True-fp8kv=True-overlap_scheduler=True] SKIP (https://nvbugs/6084445)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_cuda_graph_padding_4gpus[attention_dp=True-mtp_nextn=0] SKIP (https://nvbugs/6084447)
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_attention_dp] SKIP (https://nvbugs/6084568)
perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-UCX] SKIP (https://nvbugs/6088149)
perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX] SKIP (https://nvbugs/6088149)
perf/test_perf_sanity.py::test_e2e[aggr_upload-k25_thinking_fp4_2_nodes_grace_blackwell-k25_thinking_fp4_dep8_32k8k] SKIP (https://nvbugs/6088149)
accuracy/test_llm_api_pytorch.py::TestNemotronNas::test_auto_dtype_tp8 SKIP (https://nvbugs/6070857)
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[pp4-fp8kv=True-attn_backend=FLASHINFER-torch_compile=False] SKIP (https://nvbugs/6094071)
accuracy/test_llm_api_pytorch_ray.py::TestLlama3_1_8BInstruct::test_pp2_ray SKIP (https://nvbugs/6094070)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ metadata:
hardware:
gpus_per_node: 4
server_configs:
# 1k1k configs - DEP8 with CUTLASS, MTP1
# 1k1k configs - DEP8 with CUTEDSL, MTP1
- name: "r1_fp4_v2_dep8_mtp1_1k1k"
model_name: "deepseek_r1_0528_fp4_v2"
trust_remote_code: true
Expand All @@ -21,7 +21,7 @@ server_configs:
enable_balance: true
timeout_iters: 60
moe_config:
backend: 'CUTLASS'
backend: 'CUTEDSL'
use_low_precision_moe_combine: true
cuda_graph_config:
enable_padding: true
Expand All @@ -42,7 +42,7 @@ server_configs:
backend: "openai"
dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json

# 8k1k configs - DEP8 with CUTLASS, MTP1
# 8k1k configs - DEP8 with CUTEDSL, MTP1
- name: "r1_fp4_v2_dep8_mtp1_8k1k"
model_name: "deepseek_r1_0528_fp4_v2"
trust_remote_code: true
Expand All @@ -58,7 +58,7 @@ server_configs:
enable_balance: true
timeout_iters: 60
moe_config:
backend: 'CUTLASS'
backend: 'CUTEDSL'
use_low_precision_moe_combine: true
cuda_graph_config:
enable_padding: true
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ server_configs:
backend: "openai"
dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json

# 1k1k configs - DEP8 with CUTLASS, MTP1
# 1k1k configs - DEP8 with CUTEDSL, MTP1
- name: "r1_fp4_v2_dep8_mtp1_1k1k"
model_name: "deepseek_r1_0528_fp4_v2"
tensor_parallel_size: 8
Expand All @@ -51,7 +51,7 @@ server_configs:
enable_balance: true
timeout_iters: 60
moe_config:
backend: 'CUTLASS'
backend: 'CUTEDSL'
use_low_precision_moe_combine: true
cuda_graph_config:
enable_padding: true
Expand Down Expand Up @@ -103,7 +103,7 @@ server_configs:
backend: "openai"
dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json

# 8k1k configs - DEP8 with CUTLASS, MTP1
# 8k1k configs - DEP8 with CUTEDSL, MTP1
- name: "r1_fp4_v2_dep8_mtp1_8k1k"
model_name: "deepseek_r1_0528_fp4_v2"
tensor_parallel_size: 8
Expand All @@ -118,7 +118,7 @@ server_configs:
enable_balance: true
timeout_iters: 60
moe_config:
backend: 'CUTLASS'
backend: 'CUTEDSL'
use_low_precision_moe_combine: true
cuda_graph_config:
enable_padding: true
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ metadata:
hardware:
gpus_per_node: 4
server_configs:
# 1k1k configs - DEP4 with CUTLASS, MTP1
# 1k1k configs - DEP4 with CUTEDSL, MTP1
- name: "r1_fp4_v2_dep4_mtp1_1k1k"
model_name: "deepseek_r1_0528_fp4_v2"
tensor_parallel_size: 4
Expand All @@ -21,7 +21,7 @@ server_configs:
enable_balance: true
timeout_iters: 60
moe_config:
backend: 'CUTLASS'
backend: 'CUTEDSL'
use_low_precision_moe_combine: true
cuda_graph_config:
enable_padding: true
Expand Down Expand Up @@ -104,7 +104,7 @@ server_configs:
backend: "openai"
dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json

# 8k1k configs - DEP4 with CUTLASS, MTP1
# 8k1k configs - DEP4 with CUTEDSL, MTP1
- name: "r1_fp4_v2_dep4_mtp1_8k1k"
model_name: "deepseek_r1_0528_fp4_v2"
tensor_parallel_size: 4
Expand All @@ -119,7 +119,7 @@ server_configs:
enable_balance: true
timeout_iters: 60
moe_config:
backend: 'CUTLASS'
backend: 'CUTEDSL'
use_low_precision_moe_combine: true
cuda_graph_config:
enable_padding: true
Expand Down Expand Up @@ -202,7 +202,7 @@ server_configs:
backend: "openai"
dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json

# 1k8k configs - DEP4 with CUTLASS, MTP1
# 1k8k configs - DEP4 with CUTEDSL, MTP1
- name: "r1_fp4_v2_dep4_mtp1_1k8k"
model_name: "deepseek_r1_0528_fp4_v2"
tensor_parallel_size: 4
Expand All @@ -217,7 +217,7 @@ server_configs:
enable_balance: true
timeout_iters: 60
moe_config:
backend: 'CUTLASS'
backend: 'CUTEDSL'
use_low_precision_moe_combine: true
cuda_graph_config:
enable_padding: true
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ server_configs:
backend: "openai"
dataset_file: datasets/perf-ci/deepseek_v32-8k1k-20480-ratio-1_for_serve.json

# 8k1k configs - DEP8 with CUTLASS, MTP1
# 8k1k configs - DEP8 with CUTEDSL, MTP1
- name: "v32_fp4_dep8_mtp1_8k1k"
model_name: "deepseek_v32_fp4"
tensor_parallel_size: 8
Expand All @@ -51,7 +51,7 @@ server_configs:
enable_balance: true
timeout_iters: 60
moe_config:
backend: 'CUTLASS'
backend: 'CUTEDSL'
use_low_precision_moe_combine: true
cuda_graph_config:
enable_padding: true
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ server_configs:
backend: "openai"
dataset_file: datasets/perf-ci/deepseek_v32-1k1k-20480-ratio-1_for_serve.json

# 1k1k configs - DEP4 with CUTLASS, MTP1
# 1k1k configs - DEP4 with CUTEDSL, MTP1
- name: "v32_fp4_dep4_mtp1_1k1k"
model_name: "deepseek_v32_fp4"
tensor_parallel_size: 4
Expand All @@ -51,7 +51,7 @@ server_configs:
enable_balance: true
timeout_iters: 60
moe_config:
backend: 'CUTLASS'
backend: 'CUTEDSL'
use_low_precision_moe_combine: true
cuda_graph_config:
enable_padding: true
Expand Down Expand Up @@ -103,7 +103,7 @@ server_configs:
backend: "openai"
dataset_file: datasets/perf-ci/deepseek_v32-8k1k-20480-ratio-1_for_serve.json

# 8k1k configs - DEP4 with CUTLASS, MTP1
# 8k1k configs - DEP4 with CUTEDSL, MTP1
- name: "v32_fp4_dep4_mtp1_8k1k"
model_name: "deepseek_v32_fp4"
tensor_parallel_size: 4
Expand All @@ -118,7 +118,7 @@ server_configs:
enable_balance: true
timeout_iters: 60
moe_config:
backend: 'CUTLASS'
backend: 'CUTEDSL'
use_low_precision_moe_combine: true
cuda_graph_config:
enable_padding: true
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ metadata:
hardware:
gpus_per_node: 4
server_configs:
# 1k1k configs - DEP8 with CUTLASS, MTP1
# 1k1k configs - DEP8 with CUTEDSL, MTP1
- name: "r1_fp4_v2_dep8_mtp1_1k1k"
model_name: "deepseek_r1_0528_fp4_v2"
trust_remote_code: true
Expand All @@ -21,7 +21,7 @@ server_configs:
enable_balance: true
timeout_iters: 60
moe_config:
backend: 'CUTLASS'
backend: 'CUTEDSL'
use_low_precision_moe_combine: true
cuda_graph_config:
enable_padding: true
Expand All @@ -42,7 +42,7 @@ server_configs:
random_range_ratio: 0.2
backend: "openai"

# 8k1k configs - DEP8 with CUTLASS, MTP1
# 8k1k configs - DEP8 with CUTEDSL, MTP1
- name: "r1_fp4_v2_dep8_mtp1_8k1k"
model_name: "deepseek_r1_0528_fp4_v2"
trust_remote_code: true
Expand All @@ -58,7 +58,7 @@ server_configs:
enable_balance: true
timeout_iters: 60
moe_config:
backend: 'CUTLASS'
backend: 'CUTEDSL'
use_low_precision_moe_combine: true
cuda_graph_config:
enable_padding: true
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ server_configs:
trust_remote_code: true
dataset_file: <dataset_file>

# 32k8k configs - DEP8 with CUTLASS
# 32k8k configs - DEP8 with CUTEDSL
- name: "k25_thinking_fp4_dep8_32k8k"
model_name: "k25_thinking_fp4"
tensor_parallel_size: 8
Expand All @@ -53,7 +53,7 @@ server_configs:
enable_balance: true
timeout_iters: 60
moe_config:
backend: 'CUTLASS'
backend: 'CUTEDSL'
use_low_precision_moe_combine: true
cuda_graph_config:
enable_padding: true
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ server_configs:
trust_remote_code: true
dataset_file: <dataset_file>

# 8k1k configs - DEP8 with CUTLASS
# 8k1k configs - DEP8 with CUTEDSL
- name: "k25_thinking_fp4_dep8_8k1k"
model_name: "k25_thinking_fp4"
tensor_parallel_size: 8
Expand All @@ -51,7 +51,7 @@ server_configs:
enable_balance: true
timeout_iters: 60
moe_config:
backend: 'CUTLASS'
backend: 'CUTEDSL'
use_low_precision_moe_combine: true
cuda_graph_config:
enable_padding: true
Expand Down Expand Up @@ -101,7 +101,7 @@ server_configs:
trust_remote_code: true
dataset_file: <dataset_file>

# 32k8k configs - DEP8 with CUTLASS
# 32k8k configs - DEP8 with CUTEDSL
- name: "k25_thinking_fp4_dep8_32k8k"
model_name: "k25_thinking_fp4"
tensor_parallel_size: 8
Expand All @@ -118,7 +118,7 @@ server_configs:
enable_balance: true
timeout_iters: 60
moe_config:
backend: 'CUTLASS'
backend: 'CUTEDSL'
use_low_precision_moe_combine: true
cuda_graph_config:
enable_padding: true
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ server_configs:
trust_remote_code: true
dataset_file: <dataset_file>

# 8k1k configs - DEP4 with CUTLASS
# 8k1k configs - DEP4 with CUTEDSL
- name: "k25_thinking_fp4_dep4_8k1k"
model_name: "k25_thinking_fp4"
tensor_parallel_size: 4
Expand All @@ -51,7 +51,7 @@ server_configs:
enable_balance: true
timeout_iters: 60
moe_config:
backend: 'CUTLASS'
backend: 'CUTEDSL'
use_low_precision_moe_combine: true
cuda_graph_config:
enable_padding: true
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ server_configs:
trust_remote_code: true
dataset_file: datasets/perf-ci/k2_thinking-32k8k-20480-ratio-1_for_serve.json

# 32k8k configs - DEP8 with CUTLASS
# 32k8k configs - DEP8 with CUTEDSL
- name: "k2_thinking_fp4_dep8_32k8k"
model_name: "k2_thinking_fp4"
tensor_parallel_size: 8
Expand All @@ -53,7 +53,7 @@ server_configs:
enable_balance: true
timeout_iters: 60
moe_config:
backend: 'CUTLASS'
backend: 'CUTEDSL'
use_low_precision_moe_combine: true
cuda_graph_config:
enable_padding: true
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ server_configs:
trust_remote_code: true
dataset_file: datasets/perf-ci/k2_thinking-8k1k-20480-ratio-1_for_serve.json

# 8k1k configs - DEP8 with CUTLASS
# 8k1k configs - DEP8 with CUTEDSL
- name: "k2_thinking_fp4_dep8_8k1k"
model_name: "k2_thinking_fp4"
tensor_parallel_size: 8
Expand All @@ -51,7 +51,7 @@ server_configs:
enable_balance: true
timeout_iters: 60
moe_config:
backend: 'CUTLASS'
backend: 'CUTEDSL'
use_low_precision_moe_combine: true
cuda_graph_config:
enable_padding: true
Expand Down Expand Up @@ -101,7 +101,7 @@ server_configs:
trust_remote_code: true
dataset_file: datasets/perf-ci/k2_thinking-32k8k-20480-ratio-1_for_serve.json

# 32k8k configs - DEP8 with CUTLASS
# 32k8k configs - DEP8 with CUTEDSL
- name: "k2_thinking_fp4_dep8_32k8k"
model_name: "k2_thinking_fp4"
tensor_parallel_size: 8
Expand All @@ -118,7 +118,7 @@ server_configs:
enable_balance: true
timeout_iters: 60
moe_config:
backend: 'CUTLASS'
backend: 'CUTEDSL'
use_low_precision_moe_combine: true
cuda_graph_config:
enable_padding: true
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ server_configs:
trust_remote_code: true
dataset_file: datasets/perf-ci/k2_thinking-8k1k-20480-ratio-1_for_serve.json

# 8k1k configs - DEP4 with CUTLASS
# 8k1k configs - DEP4 with CUTEDSL
- name: "k2_thinking_fp4_dep4_8k1k"
model_name: "k2_thinking_fp4"
tensor_parallel_size: 4
Expand All @@ -51,7 +51,7 @@ server_configs:
enable_balance: true
timeout_iters: 60
moe_config:
backend: 'CUTLASS'
backend: 'CUTEDSL'
use_low_precision_moe_combine: true
cuda_graph_config:
enable_padding: true
Expand Down
Loading