diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy index 665fda0f6c6a..f7abf4071525 100644 --- a/jenkins/L0_Test.groovy +++ b/jenkins/L0_Test.groovy @@ -3273,6 +3273,7 @@ def launchTestJobs(pipeline, testFilter) "DGX_B200-8_GPUs-PyTorch-PerfSanity-Post-Merge-2": ["auto:dgx-b200-flex", "l0_b200_multi_gpus_perf_sanity", 2, 4, 8, 1, true], "DGX_B200-8_GPUs-PyTorch-PerfSanity-Post-Merge-3": ["auto:dgx-b200-flex", "l0_b200_multi_gpus_perf_sanity", 3, 4, 8, 1, true], "DGX_B200-8_GPUs-PyTorch-PerfSanity-Post-Merge-4": ["auto:dgx-b200-flex", "l0_b200_multi_gpus_perf_sanity", 4, 4, 8, 1, true], + "DGX_H200-8_GPUs-PyTorch-PerfSanity-Post-Merge-1": ["dgx-h200-x8", "l0_dgx_h200_perf_sanity", 1, 1, 8, 1, true], ] fullSet += x86SlurmTestConfigs.keySet() diff --git a/tests/integration/defs/perf/test_perf_sanity.py b/tests/integration/defs/perf/test_perf_sanity.py index 4a24d6f748d3..7056fe24655d 100644 --- a/tests/integration/defs/perf/test_perf_sanity.py +++ b/tests/integration/defs/perf/test_perf_sanity.py @@ -50,6 +50,7 @@ "qwen3_235b_a22b_fp4": "Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf", # Qwen3-235B-A22B-FP4 "super_nvfp4": "NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4", # Super (Nemotron-H SSM+MoE) NvFP4 "qwen3_235b_a22b_fp8": "Qwen3/saved_models_Qwen3-235B-A22B_fp8_hf", # Qwen3-235B-A22B-FP8 + "qwen3_32b_fp8": "Qwen3/Qwen3-32B-FP8", "llama_v3.3_70b_instruct_fp4": "llama-3.3-models/Llama-3.3-70B-Instruct-FP4", "deepseek_v3_lite_fp8": "DeepSeek-V3-Lite/fp8", "llama_v3.1_8b_instruct": "llama-3.1-model/Llama-3.1-8B-Instruct", diff --git a/tests/integration/test_lists/test-db/l0_b200_multi_gpus_perf_sanity.yml b/tests/integration/test_lists/test-db/l0_b200_multi_gpus_perf_sanity.yml index 563191fa52bb..ff841be8868c 100644 --- a/tests/integration/test_lists/test-db/l0_b200_multi_gpus_perf_sanity.yml +++ b/tests/integration/test_lists/test-db/l0_b200_multi_gpus_perf_sanity.yml @@ -41,6 +41,7 @@ l0_b200_multi_gpus_perf_sanity: - 
perf/test_perf_sanity.py::test_e2e[aggr_upload-k25_thinking_fp4_blackwell-k25_thinking_fp4_dep8_8k1k] TIMEOUT (90) - perf/test_perf_sanity.py::test_e2e[aggr_upload-k25_thinking_fp4_blackwell-k25_thinking_fp4_tep8_32k8k] - perf/test_perf_sanity.py::test_e2e[aggr_upload-k25_thinking_fp4_blackwell-k25_thinking_fp4_dep8_32k8k] TIMEOUT (90) + - perf/test_perf_sanity.py::test_e2e[aggr_upload-dynamo_k25_thinking_fp4_blackwell-k25_thinking_fp4_tep8_adp_2k1k] # ctx_only tests (disagg config) - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-b200_deepseek-r1-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (120) - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-b200_deepseek-r1-fp4_1k1k_con2048_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-UCX] TIMEOUT (120) diff --git a/tests/integration/test_lists/test-db/l0_dgx_h200_perf_sanity.yml b/tests/integration/test_lists/test-db/l0_dgx_h200_perf_sanity.yml new file mode 100644 index 000000000000..a6461b1970d6 --- /dev/null +++ b/tests/integration/test_lists/test-db/l0_dgx_h200_perf_sanity.yml @@ -0,0 +1,18 @@ +version: 0.0.1 +l0_dgx_h200_perf_sanity: +- condition: + ranges: + system_gpu_count: + gte: 8 + lte: 8 + wildcards: + gpu: + - '*h200*' + linux_distribution_name: ubuntu* + cpu: x86_64 + terms: + stage: post_merge + backend: pytorch + tests: + - perf/test_perf_sanity.py::test_e2e[aggr_upload-dynamo_qwen3_235b_a22b_fp8_hopper-qwen3_235b_a22b_fp8_tp4_ep4_deepgemm_8k1k] + - perf/test_perf_sanity.py::test_e2e[aggr_upload-dynamo_qwen3_32b_fp8_hopper-qwen3_32b_fp8_tp2_6k1k] diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_gpus_perf_sanity.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_gpus_perf_sanity.yml index 292d8633f2ab..72a19fca4eb4 100644 --- a/tests/integration/test_lists/test-db/l0_gb200_multi_gpus_perf_sanity.yml +++ b/tests/integration/test_lists/test-db/l0_gb200_multi_gpus_perf_sanity.yml @@ -55,6 +55,8 @@ l0_gb200_multi_gpus_perf_sanity: - 
perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tp2_mtp0_1k1k] - perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tp1_mtp0_1k1k] - perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tp1_mtp0_8k1k] + - perf/test_perf_sanity.py::test_e2e[aggr_upload-dynamo_gpt_oss_120b_fp4_blackwell-gpt_oss_fp4_tep4_adp_cutlass_1k1k] + - perf/test_perf_sanity.py::test_e2e[aggr_upload-dynamo_gpt_oss_120b_fp4_blackwell-gpt_oss_fp4_tep4_adp_cutlass_8k1k] # k25-thinking-fp4 aggregated - perf/test_perf_sanity.py::test_e2e[aggr_upload-k25_thinking_fp4_grace_blackwell-k25_thinking_fp4_tep4_8k1k] - perf/test_perf_sanity.py::test_e2e[aggr_upload-k25_thinking_fp4_grace_blackwell-k25_thinking_fp4_dep4_8k1k] TIMEOUT (90) diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity_node2_gpu8.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity_node2_gpu8.yml index 18431b483242..ae3d5665c4ca 100644 --- a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity_node2_gpu8.yml +++ b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity_node2_gpu8.yml @@ -21,6 +21,7 @@ l0_gb200_multi_nodes_perf_sanity_node2_gpu8: # k25-thinking-fp4 aggregated 2-nodes - perf/test_perf_sanity.py::test_e2e[aggr_upload-k25_thinking_fp4_2_nodes_grace_blackwell-k25_thinking_fp4_tep8_32k8k] - perf/test_perf_sanity.py::test_e2e[aggr_upload-k25_thinking_fp4_2_nodes_grace_blackwell-k25_thinking_fp4_dep8_32k8k] TIMEOUT (90) + - perf/test_perf_sanity.py::test_e2e[aggr_upload-dynamo_deepseek_v32_fp4_2_nodes_grace_blackwell-dsv32_fp4_dep8_trtllm_lpc_mnnvl_8k1k] TIMEOUT (120) # ctx only tests (disagg config) # deepseek-r1-fp4 - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_deepseek-r1-fp4_128k8k_con128_ctx1_pp8_gen1_dep16_eplb0_mtp2_ccb-UCX] TIMEOUT (120) diff --git 
a/tests/scripts/perf-sanity/aggregated/dynamo_deepseek_v32_fp4_2_nodes_grace_blackwell.yaml b/tests/scripts/perf-sanity/aggregated/dynamo_deepseek_v32_fp4_2_nodes_grace_blackwell.yaml new file mode 100644 index 000000000000..debea1681f7e --- /dev/null +++ b/tests/scripts/perf-sanity/aggregated/dynamo_deepseek_v32_fp4_2_nodes_grace_blackwell.yaml @@ -0,0 +1,50 @@ +# Dynamo-replica: agg-round-robin DeepSeek-V3.2 FP4 TRT-LLM deployment. +# +# Source (upstream Dynamo recipe): +# https://github.com/ai-dynamo/dynamo/tree/main/recipes/deepseek-v32-fp4/trtllm/agg-round-robin + +metadata: + model_name: deepseek_v32_fp4 + supported_gpus: + - GB200 +hardware: + gpus_per_node: 4 +server_configs: + - name: "dsv32_fp4_dep8_trtllm_lpc_mnnvl_8k1k" + model_name: "deepseek_v32_fp4" + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + max_batch_size: 8 + max_num_tokens: 8192 + max_seq_len: 121000 + attn_backend: "TRTLLM" + enable_attention_dp: true + enable_chunked_prefill: true + disable_overlap_scheduler: true + allreduce_strategy: MNNVL + num_postprocess_workers: 8 + print_iter_log: true + stream_interval: 10 + moe_config: + backend: 'TRTLLM' + use_low_precision_moe_combine: true + cuda_graph_config: + enable_padding: true + max_batch_size: 8 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + tokens_per_block: 64 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 120000 + client_configs: + - name: "con4_iter10_8k1k" + concurrency: 4 + iterations: 10 + isl: 8192 + osl: 1024 + backend: "openai" + dataset_file: diff --git a/tests/scripts/perf-sanity/aggregated/dynamo_gpt_oss_120b_fp4_blackwell.yaml b/tests/scripts/perf-sanity/aggregated/dynamo_gpt_oss_120b_fp4_blackwell.yaml new file mode 100644 index 000000000000..f1d2b6c272cf --- /dev/null +++ b/tests/scripts/perf-sanity/aggregated/dynamo_gpt_oss_120b_fp4_blackwell.yaml @@ -0,0 +1,67 @@ +# Dynamo-replica: agg GPT-OSS-120B FP4 TRT-LLM 
deployment. +# +# Source (upstream Dynamo recipe): +# https://github.com/ai-dynamo/dynamo/tree/main/recipes/gpt-oss-120b/trtllm/agg + +metadata: + model_name: gpt_oss_120b_fp4 + supported_gpus: + - GB200 +hardware: + gpus_per_node: 4 +server_configs: + - name: "gpt_oss_fp4_tep4_adp_cutlass_1k1k" + model_name: "gpt_oss_120b_fp4" + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + max_batch_size: 800 + max_num_tokens: 20000 + attn_backend: "TRTLLM" + enable_attention_dp: true + stream_interval: 20 + moe_config: + backend: 'CUTLASS' + cuda_graph_config: + enable_padding: true + max_batch_size: 800 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + client_configs: + - name: "con128_iter10_1k1k" + concurrency: 128 + iterations: 10 + isl: 1024 + osl: 1024 + backend: "openai" + dataset_file: + + - name: "gpt_oss_fp4_tep4_adp_cutlass_8k1k" + model_name: "gpt_oss_120b_fp4" + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + max_batch_size: 800 + max_num_tokens: 20000 + attn_backend: "TRTLLM" + enable_attention_dp: true + stream_interval: 20 + moe_config: + backend: 'CUTLASS' + cuda_graph_config: + enable_padding: true + max_batch_size: 800 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + client_configs: + - name: "con128_iter10_8k1k" + concurrency: 128 + iterations: 10 + isl: 8192 + osl: 1024 + backend: "openai" + dataset_file: diff --git a/tests/scripts/perf-sanity/aggregated/dynamo_k25_thinking_fp4_blackwell.yaml b/tests/scripts/perf-sanity/aggregated/dynamo_k25_thinking_fp4_blackwell.yaml new file mode 100644 index 000000000000..24216305a8cb --- /dev/null +++ b/tests/scripts/perf-sanity/aggregated/dynamo_k25_thinking_fp4_blackwell.yaml @@ -0,0 +1,39 @@ +# Dynamo-replica: agg Kimi K2.5 FP4 TRT-LLM deployment (NVIDIA recipe variant). 
+# +# Source (upstream Dynamo recipe): +# https://github.com/ai-dynamo/dynamo/tree/main/recipes/kimi-k2.5/trtllm/agg/nvidia + +metadata: + model_name: k25_thinking_fp4 + supported_gpus: + - B200 +hardware: + gpus_per_node: 8 +server_configs: + - name: "k25_thinking_fp4_tep8_adp_2k1k" + model_name: "k25_thinking_fp4" + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + max_batch_size: 128 + max_num_tokens: 8448 + max_seq_len: 8212 + attn_backend: "TRTLLM" + enable_attention_dp: true + trust_remote_code: true + print_iter_log: true + kv_cache_config: + dtype: 'fp8' + free_gpu_memory_fraction: 0.75 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8448 + client_configs: + - name: "con128_iter10_2k1k" + concurrency: 128 + iterations: 10 + isl: 2048 + osl: 1024 + backend: "openai" + trust_remote_code: true + dataset_file: diff --git a/tests/scripts/perf-sanity/aggregated/dynamo_qwen3_235b_a22b_fp8_hopper.yaml b/tests/scripts/perf-sanity/aggregated/dynamo_qwen3_235b_a22b_fp8_hopper.yaml new file mode 100644 index 000000000000..e3c4e9ffb969 --- /dev/null +++ b/tests/scripts/perf-sanity/aggregated/dynamo_qwen3_235b_a22b_fp8_hopper.yaml @@ -0,0 +1,48 @@ +# Dynamo-replica: agg Qwen3-235B-A22B FP8 TRT-LLM deployment. 
+# +# Source (upstream Dynamo recipe): +# https://github.com/ai-dynamo/dynamo/tree/main/recipes/qwen3-235b-a22b-fp8/trtllm/agg + +metadata: + model_name: qwen3_235b_a22b_fp8 + supported_gpus: + - H200 +hardware: + gpus_per_node: 8 +server_configs: + - name: "qwen3_235b_a22b_fp8_tp4_ep4_deepgemm_8k1k" + model_name: "qwen3_235b_a22b_fp8" + trust_remote_code: true + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + moe_tensor_parallel_size: 1 + pipeline_parallel_size: 1 + max_batch_size: 128 + max_num_tokens: 8192 + max_seq_len: 8192 + attn_backend: "TRTLLM" + enable_attention_dp: false + enable_chunked_prefill: true + disable_overlap_scheduler: false + print_iter_log: false + moe_config: + backend: 'DEEPGEMM' + cuda_graph_config: + enable_padding: true + max_batch_size: 128 + kv_cache_config: + dtype: 'auto' + enable_block_reuse: true + free_gpu_memory_fraction: 0.8 + cache_transceiver_config: + backend: DEFAULT + client_configs: + # ISL+OSL must stay within the recipe's max_seq_len=8192, so use 7k/1k (7168 + 1024 = 8192, exactly at the limit). + - name: "con128_iter10_7k1k" + concurrency: 128 + iterations: 10 + isl: 7168 + osl: 1024 + backend: "openai" + trust_remote_code: true + dataset_file: diff --git a/tests/scripts/perf-sanity/aggregated/dynamo_qwen3_32b_fp8_hopper.yaml b/tests/scripts/perf-sanity/aggregated/dynamo_qwen3_32b_fp8_hopper.yaml new file mode 100644 index 000000000000..dfdc056776e4 --- /dev/null +++ b/tests/scripts/perf-sanity/aggregated/dynamo_qwen3_32b_fp8_hopper.yaml @@ -0,0 +1,48 @@ +# Dynamo-replica: agg Qwen3-32B FP8 TRT-LLM deployment.
+# +# Source (upstream Dynamo recipe): +# https://github.com/ai-dynamo/dynamo/tree/main/recipes/qwen3-32b-fp8/trtllm/agg + +metadata: + model_name: qwen3_32b_fp8 + supported_gpus: + - H200 +hardware: + gpus_per_node: 8 +server_configs: + - name: "qwen3_32b_fp8_tp2_6k1k" + model_name: "qwen3_32b_fp8" + tensor_parallel_size: 2 + pipeline_parallel_size: 1 + max_batch_size: 96 + max_num_tokens: 7964 + max_seq_len: 7964 + attn_backend: "TRTLLM" + enable_attention_dp: false + enable_chunked_prefill: false + disable_overlap_scheduler: false + print_iter_log: false + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 96 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + client_configs: + # ISL+OSL must stay within the recipe's max_seq_len=7964, so use ~6.75k/1k (6912 + 1024 = 7936). + - name: "con96_iter10_6k1k" + concurrency: 96 + iterations: 10 + isl: 6912 + osl: 1024 + backend: "openai" + dataset_file: