Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions jenkins/L0_Test.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -3273,6 +3273,7 @@ def launchTestJobs(pipeline, testFilter)
"DGX_B200-8_GPUs-PyTorch-PerfSanity-Post-Merge-2": ["auto:dgx-b200-flex", "l0_b200_multi_gpus_perf_sanity", 2, 4, 8, 1, true],
"DGX_B200-8_GPUs-PyTorch-PerfSanity-Post-Merge-3": ["auto:dgx-b200-flex", "l0_b200_multi_gpus_perf_sanity", 3, 4, 8, 1, true],
"DGX_B200-8_GPUs-PyTorch-PerfSanity-Post-Merge-4": ["auto:dgx-b200-flex", "l0_b200_multi_gpus_perf_sanity", 4, 4, 8, 1, true],
"DGX_H200-8_GPUs-PyTorch-PerfSanity-Post-Merge-1": ["auto:dgx-h200-x8", "l0_dgx_h200_perf_sanity", 1, 1, 8, 1, true],
Comment thread
brb-nv marked this conversation as resolved.
Outdated
]
fullSet += x86SlurmTestConfigs.keySet()

Expand Down
2 changes: 2 additions & 0 deletions tests/integration/defs/perf/test_perf_sanity.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,9 @@
"k25_thinking_fp4": "Kimi-K2.5-NVFP4",
"qwen3_235b_a22b_fp4": "Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf", # Qwen3-235B-A22B-FP4
"super_nvfp4": "NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4", # Super (Nemotron-H SSM+MoE) NvFP4
"super_fp8": "NVIDIA-Nemotron-3-Super-120B-A12B-FP8",
"qwen3_235b_a22b_fp8": "Qwen3/saved_models_Qwen3-235B-A22B_fp8_hf", # Qwen3-235B-A22B-FP8
"qwen3_32b_fp8": "Qwen3/Qwen3-32B-FP8",
Comment thread
brb-nv marked this conversation as resolved.
"llama_v3.3_70b_instruct_fp4": "llama-3.3-models/Llama-3.3-70B-Instruct-FP4",
"deepseek_v3_lite_fp8": "DeepSeek-V3-Lite/fp8",
"llama_v3.1_8b_instruct": "llama-3.1-model/Llama-3.1-8B-Instruct",
Expand Down
25 changes: 25 additions & 0 deletions tests/integration/test_lists/test-db/l0_dgx_h200_perf_sanity.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
version: 0.0.1
# Post-merge perf-sanity test list for DGX H200: exactly 8 GPUs, x86_64,
# Ubuntu, PyTorch backend.
l0_dgx_h200_perf_sanity:
- condition:
    ranges:
      # Node must have exactly 8 GPUs (gte 8 and lte 8).
      system_gpu_count:
        gte: 8
        lte: 8
    wildcards:
      gpu:
      - '*h200*'
      linux_distribution_name: ubuntu*
      cpu: x86_64
    terms:
      stage: post_merge
      backend: pytorch
  tests:

  # Gen-only disaggregated perf-sanity runs; ctx/gen split, parallelism and
  # cache-transceiver backend (UCX/DEFAULT) are encoded in the test name.
  - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-h200_nemotron-super-fp8_8k1k_con64_ctx1_tp2_gen1_tp2_eplb0_mtp0_ccb-UCX]
  - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-h200_qwen3-235b-a22b-fp8_8k1k_con512_ctx1_tp2_gen1_tep4_eplb0_mtp0_ccb-DEFAULT]
  - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-h200_qwen3-32b-fp8_4k1k_con128_ctx1_tp1_gen1_tp2_eplb0_mtp0_ccb-DEFAULT]

  # Full e2e variants of the same configs, currently disabled (commented out).
  # - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-h200_nemotron-super-fp8_8k1k_con64_ctx1_tp2_gen1_tp2_eplb0_mtp0_ccb-UCX]
  # - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-h200_qwen3-235b-a22b-fp8_8k1k_con512_ctx1_tp2_gen1_tep4_eplb0_mtp0_ccb-DEFAULT]
  # - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-h200_qwen3-32b-fp8_4k1k_con128_ctx1_tp1_gen1_tp2_eplb0_mtp0_ccb-DEFAULT]
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
# Perf-sanity disaggregated-serving config: Nemotron-3-Super-120B-A12B FP8 on
# H200 — 1 context server (TP2) + 1 generation server (TP2), 8 GPUs per node.
metadata:
  model_name: super_fp8
  precision: fp8
  model_dir_name: NVIDIA-Nemotron-3-Super-120B-A12B-FP8
  supported_gpus:
  - H200
  script_file: disaggr_torch.slurm
  benchmark_type: 8k1k
  # Native-target (Hopper) mirror of the Dynamo Nemotron-3-Super-FP8 TRT-LLM
  # disagg deployment recipe:
  # https://github.com/ai-dynamo/dynamo/tree/main/recipes/nemotron-3-super-fp8/trtllm/disagg
slurm:
  script_file: disaggr_torch.slurm
  # <...> values are placeholders — presumably substituted by the test harness
  # at job-submission time; verify against the runner.
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: "--gres=gpu:8"
  numa_bind: true
benchmark:
  mode: e2e
  use_nv_sa_benchmark: false
  multi_round: 10
  benchmark_ratio: 0.0
  streaming: true
  concurrency_list: '64'
  # 8192-in / 1024-out matches benchmark_type 8k1k above.
  input_length: 8192
  output_length: 1024
  dataset_file: datasets/perf-ci/nemotron_super-8k1k-20480-ratio-1_for_serve.json
hardware:
  gpus_per_node: 8
  num_ctx_servers: 1
  num_gen_servers: 1
environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes"
  server_env_var: "TRTLLM_SERVER_DISABLE_GC=1"
profiling:
  nsys_on: false
accuracy:
  enable_accuracy_test: false
worker_config:
  # Generation-side server: TP2, overlap scheduler enabled, UCX KV transfer.
  gen:
    print_iter_log: true
    tensor_parallel_size: 2
    moe_expert_parallel_size: 1
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: false
    enable_chunked_prefill: true
    max_batch_size: 16
    max_num_tokens: 8192
    trust_remote_code: true
    cuda_graph_config:
      enable_padding: true
      # Kept equal to the server-level max_batch_size above.
      max_batch_size: 16
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.85
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      backend: UCX
    disable_overlap_scheduler: false
  # Context-side server: same parallelism as gen, but overlap scheduler off.
  ctx:
    print_iter_log: true
    tensor_parallel_size: 2
    moe_expert_parallel_size: 1
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: false
    enable_chunked_prefill: true
    max_batch_size: 16
    max_num_tokens: 8192
    trust_remote_code: true
    cuda_graph_config:
      enable_padding: true
      max_batch_size: 16
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.85
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      backend: UCX
    disable_overlap_scheduler: true
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
# Perf-sanity disaggregated-serving config: Qwen3-235B-A22B FP8 on H200 —
# 1 context server (TP2) + 1 generation server (TP4/EP4), 8 GPUs per node.
metadata:
  model_name: qwen3_235b_a22b_fp8
  precision: fp8
  model_dir_name: Qwen3-235B-A22B-FP8
  supported_gpus:
  - H200
  script_file: disaggr_torch.slurm
  benchmark_type: 8k1k
  # Native-target (Hopper) mirror of the Dynamo Qwen3-235B-A22B-FP8 TRT-LLM
  # disagg deployment recipe:
  # https://github.com/ai-dynamo/dynamo/tree/main/recipes/qwen3-235b-a22b-fp8/trtllm/disagg
slurm:
  script_file: disaggr_torch.slurm
  # <...> values are placeholders — presumably substituted by the test harness
  # at job-submission time; verify against the runner.
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: "--gres=gpu:8"
  numa_bind: true
benchmark:
  mode: e2e
  use_nv_sa_benchmark: false
  multi_round: 10
  benchmark_ratio: 0.0
  streaming: true
  concurrency_list: '512'
  # 8192-in / 1024-out matches benchmark_type 8k1k above.
  input_length: 8192
  output_length: 1024
  dataset_file: datasets/perf-ci/qwen3_235b-8k1k-20480-ratio-1_for_serve.json
hardware:
  gpus_per_node: 8
  num_ctx_servers: 1
  num_gen_servers: 1
environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes"
  server_env_var: "TRTLLM_SERVER_DISABLE_GC=1"
profiling:
  nsys_on: false
accuracy:
  enable_accuracy_test: false
worker_config:
  # Generation-side server: TP4 with full expert parallelism (EP4),
  # DEEPGEMM MoE backend, FP8 KV cache, overlap scheduler enabled.
  gen:
    print_iter_log: true
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    moe_tensor_parallel_size: 1
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: false
    enable_chunked_prefill: false
    trust_remote_code: true
    # Batch size matches the benchmark concurrency_list ('512') above.
    max_batch_size: 512
    max_num_tokens: 1024
    max_seq_len: 8192
    cuda_graph_config:
      enable_padding: true
      # Kept equal to the server-level max_batch_size above.
      max_batch_size: 512
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.95
      dtype: fp8
    moe_config:
      backend: DEEPGEMM
    cache_transceiver_config:
      backend: DEFAULT
    disable_overlap_scheduler: false
  # Context-side server: TP2 with MoE tensor parallelism instead of expert
  # parallelism; small batch, block reuse on, overlap scheduler off.
  ctx:
    print_iter_log: true
    tensor_parallel_size: 2
    moe_expert_parallel_size: 1
    moe_tensor_parallel_size: 2
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: false
    enable_chunked_prefill: false
    trust_remote_code: true
    max_batch_size: 2
    max_num_tokens: 8192
    max_seq_len: 8192
    cuda_graph_config:
      enable_padding: true
      max_batch_size: 2
    kv_cache_config:
      enable_block_reuse: true
      free_gpu_memory_fraction: 0.7
      dtype: fp8
    moe_config:
      backend: DEEPGEMM
    cache_transceiver_config:
      backend: DEFAULT
    disable_overlap_scheduler: true
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
# Perf-sanity disaggregated-serving config: Qwen3-32B FP8 (dense, no MoE) on
# H200 — 1 context server (TP1) + 1 generation server (TP2), 8 GPUs per node.
metadata:
  model_name: qwen3_32b_fp8
  precision: fp8
  model_dir_name: Qwen3-32B-FP8
  supported_gpus:
  - H200
  script_file: disaggr_torch.slurm
  benchmark_type: 4k1k
  # Native-target (Hopper) mirror of the Dynamo Qwen3-32B-FP8 TRT-LLM disagg
  # deployment recipe:
  # https://github.com/ai-dynamo/dynamo/tree/main/recipes/qwen3-32b-fp8/trtllm/disagg
slurm:
  script_file: disaggr_torch.slurm
  # <...> values are placeholders — presumably substituted by the test harness
  # at job-submission time; verify against the runner.
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: "--gres=gpu:8"
  numa_bind: true
benchmark:
  mode: e2e
  use_nv_sa_benchmark: false
  multi_round: 10
  benchmark_ratio: 0.0
  streaming: true
  concurrency_list: '128'
  # 4096-in / 1024-out matches benchmark_type 4k1k above.
  input_length: 4096
  output_length: 1024
  dataset_file: datasets/perf-ci/qwen3_32b-4k1k-20480-ratio-1_for_serve.json
hardware:
  gpus_per_node: 8
  num_ctx_servers: 1
  num_gen_servers: 1
environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  # Only this config adds TRT_LLM_DISABLE_LOAD_WEIGHTS_IN_PARALLEL=True on top
  # of the env vars shared with the sibling configs.
  worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes TRT_LLM_DISABLE_LOAD_WEIGHTS_IN_PARALLEL=True"
  server_env_var: "TRTLLM_SERVER_DISABLE_GC=1"
profiling:
  nsys_on: false
accuracy:
  enable_accuracy_test: false
worker_config:
  # Generation-side server: TP2, FP8 KV cache, overlap scheduler enabled.
  # No moe_* keys here — Qwen3-32B is a dense model.
  gen:
    print_iter_log: true
    tensor_parallel_size: 2
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: false
    enable_chunked_prefill: false
    trust_remote_code: true
    # Batch size matches the benchmark concurrency_list ('128') above.
    max_batch_size: 128
    max_num_tokens: 7808
    max_seq_len: 7808
    cuda_graph_config:
      enable_padding: true
      # Kept equal to the server-level max_batch_size above.
      max_batch_size: 128
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.9
      dtype: fp8
    cache_transceiver_config:
      backend: DEFAULT
    disable_overlap_scheduler: false
  # Context-side server: single-request prefill (TP1), overlap scheduler off.
  ctx:
    print_iter_log: true
    tensor_parallel_size: 1
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: false
    enable_chunked_prefill: false
    trust_remote_code: true
    max_batch_size: 1
    max_num_tokens: 7808
    max_seq_len: 7808
    cuda_graph_config:
      enable_padding: true
      # NOTE(review): 256 exceeds this server's max_batch_size (1); the gen
      # server here and both servers in the sibling configs keep the CUDA-graph
      # batch equal to the server batch — confirm 256 is intentional.
      max_batch_size: 256
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.9
      dtype: fp8
    cache_transceiver_config:
      backend: DEFAULT
    disable_overlap_scheduler: true
Loading