1 change: 1 addition & 0 deletions jenkins/L0_Test.groovy
@@ -3273,6 +3273,7 @@ def launchTestJobs(pipeline, testFilter)
"DGX_B200-8_GPUs-PyTorch-PerfSanity-Post-Merge-2": ["auto:dgx-b200-flex", "l0_b200_multi_gpus_perf_sanity", 2, 4, 8, 1, true],
"DGX_B200-8_GPUs-PyTorch-PerfSanity-Post-Merge-3": ["auto:dgx-b200-flex", "l0_b200_multi_gpus_perf_sanity", 3, 4, 8, 1, true],
"DGX_B200-8_GPUs-PyTorch-PerfSanity-Post-Merge-4": ["auto:dgx-b200-flex", "l0_b200_multi_gpus_perf_sanity", 4, 4, 8, 1, true],
"DGX_H200-8_GPUs-PyTorch-PerfSanity-Post-Merge-1": ["dgx-h200-x8", "l0_dgx_h200_perf_sanity", 1, 1, 8, 1, true],
Collaborator Author:
Question for infra reviewer: This is adding a new stage. Is there anything else we need to do?

]
fullSet += x86SlurmTestConfigs.keySet()
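The positional fields in these stage tuples are not documented in the hunk itself. A minimal sketch of the assumed layout, inferred only from the surrounding entries (every field name below is hypothetical, not from L0_Test.groovy):

```python
from typing import NamedTuple

# Assumed field order for the Slurm stage tuples above; all names are
# guesses from the neighboring entries, not from L0_Test.groovy itself.
class SlurmStageConfig(NamedTuple):
    platform: str      # node pool label, e.g. "dgx-h200-x8"
    test_list: str     # test-db list key, e.g. "l0_dgx_h200_perf_sanity"
    split_id: int      # 1-based shard index into the test list
    split_count: int   # total number of shards (the B200 stages use 4)
    gpu_count: int     # GPUs per job
    node_count: int    # nodes per job (assumption)
    perf_mode: bool    # run as a perf-sanity stage (assumption)

new_stage = SlurmStageConfig("dgx-h200-x8", "l0_dgx_h200_perf_sanity",
                             1, 1, 8, 1, True)
```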

1 change: 1 addition & 0 deletions tests/integration/defs/perf/test_perf_sanity.py
@@ -50,6 +50,7 @@
"qwen3_235b_a22b_fp4": "Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf", # Qwen3-235B-A22B-FP4
"super_nvfp4": "NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4", # Super (Nemotron-H SSM+MoE) NvFP4
"qwen3_235b_a22b_fp8": "Qwen3/saved_models_Qwen3-235B-A22B_fp8_hf", # Qwen3-235B-A22B-FP8
"qwen3_32b_fp8": "Qwen3/Qwen3-32B-FP8",
"llama_v3.3_70b_instruct_fp4": "llama-3.3-models/Llama-3.3-70B-Instruct-FP4",
"deepseek_v3_lite_fp8": "DeepSeek-V3-Lite/fp8",
"llama_v3.1_8b_instruct": "llama-3.1-model/Llama-3.1-8B-Instruct",
@@ -41,6 +41,7 @@ l0_b200_multi_gpus_perf_sanity:
- perf/test_perf_sanity.py::test_e2e[aggr_upload-k25_thinking_fp4_blackwell-k25_thinking_fp4_dep8_8k1k] TIMEOUT (90)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-k25_thinking_fp4_blackwell-k25_thinking_fp4_tep8_32k8k]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-k25_thinking_fp4_blackwell-k25_thinking_fp4_dep8_32k8k] TIMEOUT (90)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-dynamo_k25_thinking_fp4_blackwell-k25_thinking_fp4_tep8_adp_2k1k]
# ctx_only tests (disagg config)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-b200_deepseek-r1-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-b200_deepseek-r1-fp4_1k1k_con2048_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-UCX] TIMEOUT (120)
18 changes: 18 additions & 0 deletions tests/integration/test_lists/test-db/l0_dgx_h200_perf_sanity.yml
@@ -0,0 +1,18 @@
version: 0.0.1
l0_dgx_h200_perf_sanity:
- condition:
    ranges:
      system_gpu_count:
        gte: 8
        lte: 8
    wildcards:
      gpu:
      - '*h200*'
      linux_distribution_name: ubuntu*
      cpu: x86_64
    terms:
      stage: post_merge
      backend: pytorch
  tests:
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-dynamo_qwen3_235b_a22b_fp8_hopper-qwen3_235b_a22b_fp8_tp4_ep4_deepgemm_8k1k]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-dynamo_qwen3_32b_fp8_hopper-qwen3_32b_fp8_tp2_6k1k]
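The condition block gates this new list to 8-GPU x86 H200 machines on Ubuntu, post-merge, PyTorch backend. A minimal sketch of how such a condition could be evaluated (the function and the machine-descriptor fields are hypothetical; the real scheduler logic lives elsewhere in the repo):

```python
from fnmatch import fnmatch

def condition_matches(cond: dict, machine: dict) -> bool:
    """Hypothetical evaluator for a test-db condition block like the one above."""
    rng = cond["ranges"]["system_gpu_count"]
    if not rng["gte"] <= machine["system_gpu_count"] <= rng["lte"]:
        return False
    wc = cond["wildcards"]
    if not any(fnmatch(machine["gpu"].lower(), pat) for pat in wc["gpu"]):
        return False
    if not fnmatch(machine["linux_distribution_name"], wc["linux_distribution_name"]):
        return False
    if machine["cpu"] != wc["cpu"]:
        return False
    # terms must match exactly (stage: post_merge, backend: pytorch)
    return all(machine.get(k) == v for k, v in cond["terms"].items())

# condition_matches(cond, {"system_gpu_count": 8, "gpu": "NVIDIA H200",
#                          "linux_distribution_name": "ubuntu22.04",
#                          "cpu": "x86_64", "stage": "post_merge",
#                          "backend": "pytorch"})  # -> True
```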
@@ -55,6 +55,8 @@ l0_gb200_multi_gpus_perf_sanity:
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tp2_mtp0_1k1k]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tp1_mtp0_1k1k]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tp1_mtp0_8k1k]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-dynamo_gpt_oss_120b_fp4_blackwell-gpt_oss_fp4_tep4_adp_cutlass_1k1k]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-dynamo_gpt_oss_120b_fp4_blackwell-gpt_oss_fp4_tep4_adp_cutlass_8k1k]
# k25-thinking-fp4 aggregated
- perf/test_perf_sanity.py::test_e2e[aggr_upload-k25_thinking_fp4_grace_blackwell-k25_thinking_fp4_tep4_8k1k]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-k25_thinking_fp4_grace_blackwell-k25_thinking_fp4_dep4_8k1k] TIMEOUT (90)
@@ -21,6 +21,7 @@ l0_gb200_multi_nodes_perf_sanity_node2_gpu8:
# k25-thinking-fp4 aggregated 2-nodes
- perf/test_perf_sanity.py::test_e2e[aggr_upload-k25_thinking_fp4_2_nodes_grace_blackwell-k25_thinking_fp4_tep8_32k8k]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-k25_thinking_fp4_2_nodes_grace_blackwell-k25_thinking_fp4_dep8_32k8k] TIMEOUT (90)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-dynamo_deepseek_v32_fp4_2_nodes_grace_blackwell-dsv32_fp4_dep8_trtllm_lpc_mnnvl_8k1k] TIMEOUT (120)
# ctx only tests (disagg config)
# deepseek-r1-fp4
- perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_deepseek-r1-fp4_128k8k_con128_ctx1_pp8_gen1_dep16_eplb0_mtp2_ccb-UCX] TIMEOUT (120)
@@ -0,0 +1,50 @@
# Dynamo-replica: agg-round-robin DeepSeek-V3.2 FP4 TRT-LLM deployment.
#
# Source (upstream Dynamo recipe):
# https://github.com/ai-dynamo/dynamo/tree/main/recipes/deepseek-v32-fp4/trtllm/agg-round-robin

metadata:
  model_name: deepseek_v32_fp4
  supported_gpus:
    - GB200
  hardware:
    gpus_per_node: 4
server_configs:
  - name: "dsv32_fp4_dep8_trtllm_lpc_mnnvl_8k1k"
    model_name: "deepseek_v32_fp4"
    tensor_parallel_size: 8
    moe_expert_parallel_size: 8
    pipeline_parallel_size: 1
    max_batch_size: 8
    max_num_tokens: 8192
    max_seq_len: 121000
    attn_backend: "TRTLLM"
    enable_attention_dp: true
    enable_chunked_prefill: true
    disable_overlap_scheduler: true
    allreduce_strategy: MNNVL
    num_postprocess_workers: 8
    print_iter_log: true
    stream_interval: 10
    moe_config:
      backend: 'TRTLLM'
      use_low_precision_moe_combine: true
    cuda_graph_config:
      enable_padding: true
      max_batch_size: 8
    kv_cache_config:
      dtype: 'fp8'
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.9
      tokens_per_block: 64
    cache_transceiver_config:
      backend: UCX
      max_tokens_in_buffer: 120000
    client_configs:
      - name: "con4_iter10_8k1k"
        concurrency: 4
        iterations: 10
        isl: 8192
        osl: 1024
        backend: "openai"
        dataset_file: <dataset_file>
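Across these new recipe configs, the one invariant the files themselves call out (see the 7k/1k and ~6.7k/1k comments in the Qwen3 configs further down) is that a client's isl + osl must fit within the server's max_seq_len. A minimal sanity-check sketch under that assumption (the script and function names are ours, not part of the harness):

```python
import sys
import yaml  # PyYAML

def check_seq_budget(path: str) -> None:
    """Assert isl + osl <= max_seq_len for every client of every server config."""
    with open(path) as f:
        cfg = yaml.safe_load(f)
    for server in cfg.get("server_configs", []):
        limit = server.get("max_seq_len")
        if limit is None:
            continue  # no explicit sequence budget to check
        for client in server.get("client_configs", []):
            total = client["isl"] + client["osl"]
            assert total <= limit, (
                f"{server['name']}/{client['name']}: "
                f"isl+osl={total} exceeds max_seq_len={limit}")

if __name__ == "__main__":
    for p in sys.argv[1:]:
        check_seq_budget(p)
```

For this DeepSeek-V3.2 config the budget is comfortable: 8192 + 1024 = 9216, well under max_seq_len=121000.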
@@ -0,0 +1,67 @@
# Dynamo-replica: agg GPT-OSS-120B FP4 TRT-LLM deployment.
#
# Source (upstream Dynamo recipe):
# https://github.com/ai-dynamo/dynamo/tree/main/recipes/gpt-oss-120b/trtllm/agg

metadata:
  model_name: gpt_oss_120b_fp4
  supported_gpus:
    - GB200
  hardware:
    gpus_per_node: 4
server_configs:
  - name: "gpt_oss_fp4_tep4_adp_cutlass_1k1k"
    model_name: "gpt_oss_120b_fp4"
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    pipeline_parallel_size: 1
    max_batch_size: 800
    max_num_tokens: 20000
    attn_backend: "TRTLLM"
    enable_attention_dp: true
    stream_interval: 20
    moe_config:
      backend: 'CUTLASS'
    cuda_graph_config:
      enable_padding: true
      max_batch_size: 800
    kv_cache_config:
      dtype: 'fp8'
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.9
    client_configs:
      - name: "con128_iter10_1k1k"
        concurrency: 128
        iterations: 10
        isl: 1024
        osl: 1024
        backend: "openai"
        dataset_file: <dataset_file>

  - name: "gpt_oss_fp4_tep4_adp_cutlass_8k1k"
    model_name: "gpt_oss_120b_fp4"
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    pipeline_parallel_size: 1
    max_batch_size: 800
    max_num_tokens: 20000
    attn_backend: "TRTLLM"
    enable_attention_dp: true
    stream_interval: 20
    moe_config:
      backend: 'CUTLASS'
    cuda_graph_config:
      enable_padding: true
      max_batch_size: 800
    kv_cache_config:
      dtype: 'fp8'
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.9
    client_configs:
      - name: "con128_iter10_8k1k"
        concurrency: 128
        iterations: 10
        isl: 8192
        osl: 1024
        backend: "openai"
        dataset_file: <dataset_file>
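The two server configs differ only in the client's input length (1k vs 8k prompts). Note how they surface in the l0_gb200 list above: each test id appears to be composed of the upload mode, the recipe file's stem, and a server_config name. A sketch of that assumed composition (the naming rule is inferred from the paired entries, not confirmed by the harness):

```python
def perf_sanity_test_id(mode: str, recipe_stem: str, server_name: str) -> str:
    """Compose a perf-sanity test id the way the test-db entries appear to."""
    return f"perf/test_perf_sanity.py::test_e2e[{mode}-{recipe_stem}-{server_name}]"

# perf_sanity_test_id("aggr_upload", "dynamo_gpt_oss_120b_fp4_blackwell",
#                     "gpt_oss_fp4_tep4_adp_cutlass_8k1k")
```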
@@ -0,0 +1,39 @@
# Dynamo-replica: agg Kimi K2.5 FP4 TRT-LLM deployment (NVIDIA recipe variant).
#
# Source (upstream Dynamo recipe):
# https://github.com/ai-dynamo/dynamo/tree/main/recipes/kimi-k2.5/trtllm/agg/nvidia

metadata:
  model_name: k25_thinking_fp4
  supported_gpus:
    - B200
  hardware:
    gpus_per_node: 8
server_configs:
  - name: "k25_thinking_fp4_tep8_adp_2k1k"
    model_name: "k25_thinking_fp4"
    tensor_parallel_size: 8
    moe_expert_parallel_size: 8
    pipeline_parallel_size: 1
    max_batch_size: 128
    max_num_tokens: 8448
    max_seq_len: 8212
    attn_backend: "TRTLLM"
    enable_attention_dp: true
    trust_remote_code: true
    print_iter_log: true
    kv_cache_config:
      dtype: 'fp8'
      free_gpu_memory_fraction: 0.75
    cache_transceiver_config:
      backend: UCX
      max_tokens_in_buffer: 8448
    client_configs:
      - name: "con128_iter10_2k1k"
        concurrency: 128
        iterations: 10
        isl: 2048
        osl: 1024
        backend: "openai"
        trust_remote_code: true
        dataset_file: <dataset_file>
@@ -0,0 +1,48 @@
# Dynamo-replica: agg Qwen3-235B-A22B FP8 TRT-LLM deployment.
#
# Source (upstream Dynamo recipe):
# https://github.com/ai-dynamo/dynamo/tree/main/recipes/qwen3-235b-a22b-fp8/trtllm/agg

metadata:
  model_name: qwen3_235b_a22b_fp8
  supported_gpus:
    - H200
  hardware:
    gpus_per_node: 8
server_configs:
  - name: "qwen3_235b_a22b_fp8_tp4_ep4_deepgemm_8k1k"
    model_name: "qwen3_235b_a22b_fp8"
    trust_remote_code: true
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    moe_tensor_parallel_size: 1
    pipeline_parallel_size: 1
    max_batch_size: 128
    max_num_tokens: 8192
    max_seq_len: 8192
    attn_backend: "TRTLLM"
    enable_attention_dp: false
    enable_chunked_prefill: true
    disable_overlap_scheduler: false
    print_iter_log: false
    moe_config:
      backend: 'DEEPGEMM'
    cuda_graph_config:
      enable_padding: true
      max_batch_size: 128
    kv_cache_config:
      dtype: 'auto'
      enable_block_reuse: true
      free_gpu_memory_fraction: 0.8
    cache_transceiver_config:
      backend: DEFAULT
    client_configs:
      # ISL+OSL must stay within the recipe's max_seq_len=8192
      # (7168 + 1024 = 8192), so use 7k/1k.
      - name: "con128_iter10_7k1k"
        concurrency: 128
        iterations: 10
        isl: 7168
        osl: 1024
        backend: "openai"
        trust_remote_code: true
        dataset_file: <dataset_file>
@@ -0,0 +1,48 @@
# Dynamo-replica: agg Qwen3-32B FP8 TRT-LLM deployment.
#
# Source (upstream Dynamo recipe):
# https://github.com/ai-dynamo/dynamo/tree/main/recipes/qwen3-32b-fp8/trtllm/agg

metadata:
  model_name: qwen3_32b_fp8
  supported_gpus:
    - H200
  hardware:
    gpus_per_node: 8
server_configs:
  - name: "qwen3_32b_fp8_tp2_6k1k"
    model_name: "qwen3_32b_fp8"
    tensor_parallel_size: 2
    pipeline_parallel_size: 1
    max_batch_size: 96
    max_num_tokens: 7964
    max_seq_len: 7964
    attn_backend: "TRTLLM"
    enable_attention_dp: false
    enable_chunked_prefill: false
    disable_overlap_scheduler: false
    print_iter_log: false
    cuda_graph_config:
      enable_padding: true
      batch_sizes:
        - 1
        - 2
        - 4
        - 8
        - 16
        - 32
        - 64
        - 96
    kv_cache_config:
      dtype: 'fp8'
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.9
    client_configs:
      # ISL+OSL must stay within the recipe's max_seq_len=7964
      # (6912 + 1024 = 7936 <= 7964), so use ~6.7k/1k.
      - name: "con96_iter10_6k1k"
        concurrency: 96
        iterations: 10
        isl: 6912
        osl: 1024
        backend: "openai"
        dataset_file: <dataset_file>