diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy index 665fda0f6c6a..c78edcfdde39 100644 --- a/jenkins/L0_Test.groovy +++ b/jenkins/L0_Test.groovy @@ -3202,6 +3202,7 @@ def launchTestJobs(pipeline, testFilter) "DGX_H200-4_GPUs-Triton-Post-Merge-1": ["dgx-h200-x4", "l0_dgx_h200", 1, 1, 4], "DGX_H200-8_GPUs-PyTorch-Post-Merge-1": ["dgx-h200-x8", "l0_dgx_h200", 1, 1, 8], "DGX_H200-4_GPUs-PyTorch-Post-Merge-1": ["dgx-h200-x4", "l0_dgx_h200", 1, 1, 4], + "DGX_H200-8_GPUs-PyTorch-PerfSanity-Post-Merge-1": ["dgx-h200-x8", "l0_dgx_h200_perf_sanity", 1, 1, 8], // "DGX_H200-4_GPUs-TensorRT-Post-Merge-1": ["dgx-h200-x4", "l0_dgx_h200", 1, 3, 4], // "DGX_H200-4_GPUs-TensorRT-Post-Merge-2": ["dgx-h200-x4", "l0_dgx_h200", 2, 3, 4], // "DGX_H200-4_GPUs-TensorRT-Post-Merge-3": ["dgx-h200-x4", "l0_dgx_h200", 3, 3, 4], diff --git a/tests/integration/defs/perf/test_perf_sanity.py b/tests/integration/defs/perf/test_perf_sanity.py index b6c7737cbe5a..04a2db1462c6 100644 --- a/tests/integration/defs/perf/test_perf_sanity.py +++ b/tests/integration/defs/perf/test_perf_sanity.py @@ -49,7 +49,9 @@ "k25_thinking_fp4": "Kimi-K2.5-NVFP4", "qwen3_235b_a22b_fp4": "Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf", # Qwen3-235B-A22B-FP4 "super_nvfp4": "NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4", # Super (Nemotron-H SSM+MoE) NvFP4 + "super_fp8": "NVIDIA-Nemotron-3-Super-120B-A12B-FP8", "qwen3_235b_a22b_fp8": "Qwen3/saved_models_Qwen3-235B-A22B_fp8_hf", # Qwen3-235B-A22B-FP8 + "qwen3_32b_fp8": "Qwen3/Qwen3-32B-FP8", "llama_v3.3_70b_instruct_fp4": "llama-3.3-models/Llama-3.3-70B-Instruct-FP4", "deepseek_v3_lite_fp8": "DeepSeek-V3-Lite/fp8", "llama_v3.1_8b_instruct": "llama-3.1-model/Llama-3.1-8B-Instruct", diff --git a/tests/integration/test_lists/test-db/l0_b200_multi_gpus_perf_sanity.yml b/tests/integration/test_lists/test-db/l0_b200_multi_gpus_perf_sanity.yml index 563191fa52bb..ff841be8868c 100644 --- a/tests/integration/test_lists/test-db/l0_b200_multi_gpus_perf_sanity.yml +++ 
b/tests/integration/test_lists/test-db/l0_b200_multi_gpus_perf_sanity.yml @@ -41,6 +41,7 @@ l0_b200_multi_gpus_perf_sanity: - perf/test_perf_sanity.py::test_e2e[aggr_upload-k25_thinking_fp4_blackwell-k25_thinking_fp4_dep8_8k1k] TIMEOUT (90) - perf/test_perf_sanity.py::test_e2e[aggr_upload-k25_thinking_fp4_blackwell-k25_thinking_fp4_tep8_32k8k] - perf/test_perf_sanity.py::test_e2e[aggr_upload-k25_thinking_fp4_blackwell-k25_thinking_fp4_dep8_32k8k] TIMEOUT (90) + - perf/test_perf_sanity.py::test_e2e[aggr_upload-dynamo_k25_thinking_fp4_blackwell-k25_thinking_fp4_tep8_adp_2k1k] # ctx_only tests (disagg config) - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-b200_deepseek-r1-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (120) - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-b200_deepseek-r1-fp4_1k1k_con2048_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-UCX] TIMEOUT (120) diff --git a/tests/integration/test_lists/test-db/l0_dgx_h200_perf_sanity.yml b/tests/integration/test_lists/test-db/l0_dgx_h200_perf_sanity.yml new file mode 100644 index 000000000000..b7b871660a0a --- /dev/null +++ b/tests/integration/test_lists/test-db/l0_dgx_h200_perf_sanity.yml @@ -0,0 +1,25 @@ +version: 0.0.1 +l0_dgx_h200_perf_sanity: +- condition: + ranges: + system_gpu_count: + gte: 8 + lte: 8 + wildcards: + gpu: + - '*h200*' + linux_distribution_name: ubuntu* + cpu: x86_64 + terms: + stage: post_merge + backend: pytorch + tests: + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-h200_nemotron-super-fp8_8k1k_con64_ctx1_tp2_gen1_tp2_eplb0_mtp0_ccb-UCX] + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-h200_qwen3-235b-a22b-fp8_8k1k_con512_ctx1_tp2_gen1_tep4_eplb0_mtp0_ccb-DEFAULT] + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-h200_qwen3-32b-fp8_4k1k_con128_ctx1_tp1_gen1_tp2_eplb0_mtp0_ccb-DEFAULT] + - 
perf/test_perf_sanity.py::test_e2e[aggr_upload-dynamo_qwen3_235b_a22b_fp8_hopper-qwen3_235b_a22b_fp8_tp4_ep4_deepgemm_8k1k] + - perf/test_perf_sanity.py::test_e2e[aggr_upload-dynamo_qwen3_32b_fp8_hopper-qwen3_32b_fp8_tp2_6k1k] + # - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-h200_nemotron-super-fp8_8k1k_con64_ctx1_tp2_gen1_tp2_eplb0_mtp0_ccb-UCX] + # - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-h200_qwen3-235b-a22b-fp8_8k1k_con512_ctx1_tp2_gen1_tep4_eplb0_mtp0_ccb-DEFAULT] + # - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-h200_qwen3-32b-fp8_4k1k_con128_ctx1_tp1_gen1_tp2_eplb0_mtp0_ccb-DEFAULT] + diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_gpus_perf_sanity.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_gpus_perf_sanity.yml index 292d8633f2ab..72a19fca4eb4 100644 --- a/tests/integration/test_lists/test-db/l0_gb200_multi_gpus_perf_sanity.yml +++ b/tests/integration/test_lists/test-db/l0_gb200_multi_gpus_perf_sanity.yml @@ -55,6 +55,8 @@ l0_gb200_multi_gpus_perf_sanity: - perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tp2_mtp0_1k1k] - perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tp1_mtp0_1k1k] - perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tp1_mtp0_8k1k] + - perf/test_perf_sanity.py::test_e2e[aggr_upload-dynamo_gpt_oss_120b_fp4_blackwell-gpt_oss_fp4_tep4_adp_cutlass_1k1k] + - perf/test_perf_sanity.py::test_e2e[aggr_upload-dynamo_gpt_oss_120b_fp4_blackwell-gpt_oss_fp4_tep4_adp_cutlass_8k1k] # k25-thinking-fp4 aggregated - perf/test_perf_sanity.py::test_e2e[aggr_upload-k25_thinking_fp4_grace_blackwell-k25_thinking_fp4_tep4_8k1k] - perf/test_perf_sanity.py::test_e2e[aggr_upload-k25_thinking_fp4_grace_blackwell-k25_thinking_fp4_dep4_8k1k] TIMEOUT (90) diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity_node2_gpu8.yml 
b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity_node2_gpu8.yml index 18431b483242..ae3d5665c4ca 100644 --- a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity_node2_gpu8.yml +++ b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity_node2_gpu8.yml @@ -21,6 +21,7 @@ l0_gb200_multi_nodes_perf_sanity_node2_gpu8: # k25-thinking-fp4 aggregated 2-nodes - perf/test_perf_sanity.py::test_e2e[aggr_upload-k25_thinking_fp4_2_nodes_grace_blackwell-k25_thinking_fp4_tep8_32k8k] - perf/test_perf_sanity.py::test_e2e[aggr_upload-k25_thinking_fp4_2_nodes_grace_blackwell-k25_thinking_fp4_dep8_32k8k] TIMEOUT (90) + - perf/test_perf_sanity.py::test_e2e[aggr_upload-dynamo_deepseek_v32_fp4_2_nodes_grace_blackwell-dsv32_fp4_dep8_trtllm_lpc_mnnvl_8k1k] TIMEOUT (120) # ctx only tests (disagg config) # deepseek-r1-fp4 - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_deepseek-r1-fp4_128k8k_con128_ctx1_pp8_gen1_dep16_eplb0_mtp2_ccb-UCX] TIMEOUT (120) diff --git a/tests/scripts/perf-sanity/aggregated/dynamo_deepseek_v32_fp4_2_nodes_grace_blackwell.yaml b/tests/scripts/perf-sanity/aggregated/dynamo_deepseek_v32_fp4_2_nodes_grace_blackwell.yaml new file mode 100644 index 000000000000..debea1681f7e --- /dev/null +++ b/tests/scripts/perf-sanity/aggregated/dynamo_deepseek_v32_fp4_2_nodes_grace_blackwell.yaml @@ -0,0 +1,50 @@ +# Dynamo-replica: agg-round-robin DeepSeek-V3.2 FP4 TRT-LLM deployment. 
+# +# Source (upstream Dynamo recipe): +# https://github.com/ai-dynamo/dynamo/tree/main/recipes/deepseek-v32-fp4/trtllm/agg-round-robin + +metadata: + model_name: deepseek_v32_fp4 + supported_gpus: + - GB200 +hardware: + gpus_per_node: 4 +server_configs: + - name: "dsv32_fp4_dep8_trtllm_lpc_mnnvl_8k1k" + model_name: "deepseek_v32_fp4" + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + max_batch_size: 8 + max_num_tokens: 8192 + max_seq_len: 121000 + attn_backend: "TRTLLM" + enable_attention_dp: true + enable_chunked_prefill: true + disable_overlap_scheduler: true + allreduce_strategy: MNNVL + num_postprocess_workers: 8 + print_iter_log: true + stream_interval: 10 + moe_config: + backend: 'TRTLLM' + use_low_precision_moe_combine: true + cuda_graph_config: + enable_padding: true + max_batch_size: 8 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + tokens_per_block: 64 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 120000 + client_configs: + - name: "con4_iter10_8k1k" + concurrency: 4 + iterations: 10 + isl: 8192 + osl: 1024 + backend: "openai" + dataset_file: diff --git a/tests/scripts/perf-sanity/aggregated/dynamo_gpt_oss_120b_fp4_blackwell.yaml b/tests/scripts/perf-sanity/aggregated/dynamo_gpt_oss_120b_fp4_blackwell.yaml new file mode 100644 index 000000000000..f1d2b6c272cf --- /dev/null +++ b/tests/scripts/perf-sanity/aggregated/dynamo_gpt_oss_120b_fp4_blackwell.yaml @@ -0,0 +1,67 @@ +# Dynamo-replica: agg GPT-OSS-120B FP4 TRT-LLM deployment. 
+# +# Source (upstream Dynamo recipe): +# https://github.com/ai-dynamo/dynamo/tree/main/recipes/gpt-oss-120b/trtllm/agg + +metadata: + model_name: gpt_oss_120b_fp4 + supported_gpus: + - GB200 +hardware: + gpus_per_node: 4 +server_configs: + - name: "gpt_oss_fp4_tep4_adp_cutlass_1k1k" + model_name: "gpt_oss_120b_fp4" + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + max_batch_size: 800 + max_num_tokens: 20000 + attn_backend: "TRTLLM" + enable_attention_dp: true + stream_interval: 20 + moe_config: + backend: 'CUTLASS' + cuda_graph_config: + enable_padding: true + max_batch_size: 800 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + client_configs: + - name: "con128_iter10_1k1k" + concurrency: 128 + iterations: 10 + isl: 1024 + osl: 1024 + backend: "openai" + dataset_file: + + - name: "gpt_oss_fp4_tep4_adp_cutlass_8k1k" + model_name: "gpt_oss_120b_fp4" + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + max_batch_size: 800 + max_num_tokens: 20000 + attn_backend: "TRTLLM" + enable_attention_dp: true + stream_interval: 20 + moe_config: + backend: 'CUTLASS' + cuda_graph_config: + enable_padding: true + max_batch_size: 800 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + client_configs: + - name: "con128_iter10_8k1k" + concurrency: 128 + iterations: 10 + isl: 8192 + osl: 1024 + backend: "openai" + dataset_file: diff --git a/tests/scripts/perf-sanity/aggregated/dynamo_k25_thinking_fp4_blackwell.yaml b/tests/scripts/perf-sanity/aggregated/dynamo_k25_thinking_fp4_blackwell.yaml new file mode 100644 index 000000000000..24216305a8cb --- /dev/null +++ b/tests/scripts/perf-sanity/aggregated/dynamo_k25_thinking_fp4_blackwell.yaml @@ -0,0 +1,39 @@ +# Dynamo-replica: agg Kimi K2.5 FP4 TRT-LLM deployment (NVIDIA recipe variant). 
+# +# Source (upstream Dynamo recipe): +# https://github.com/ai-dynamo/dynamo/tree/main/recipes/kimi-k2.5/trtllm/agg/nvidia + +metadata: + model_name: k25_thinking_fp4 + supported_gpus: + - B200 +hardware: + gpus_per_node: 8 +server_configs: + - name: "k25_thinking_fp4_tep8_adp_2k1k" + model_name: "k25_thinking_fp4" + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + max_batch_size: 128 + max_num_tokens: 8448 + max_seq_len: 8212 + attn_backend: "TRTLLM" + enable_attention_dp: true + trust_remote_code: true + print_iter_log: true + kv_cache_config: + dtype: 'fp8' + free_gpu_memory_fraction: 0.75 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8448 + client_configs: + - name: "con128_iter10_2k1k" + concurrency: 128 + iterations: 10 + isl: 2048 + osl: 1024 + backend: "openai" + trust_remote_code: true + dataset_file: diff --git a/tests/scripts/perf-sanity/aggregated/dynamo_qwen3_235b_a22b_fp8_hopper.yaml b/tests/scripts/perf-sanity/aggregated/dynamo_qwen3_235b_a22b_fp8_hopper.yaml new file mode 100644 index 000000000000..e3c4e9ffb969 --- /dev/null +++ b/tests/scripts/perf-sanity/aggregated/dynamo_qwen3_235b_a22b_fp8_hopper.yaml @@ -0,0 +1,48 @@ +# Dynamo-replica: agg Qwen3-235B-A22B FP8 TRT-LLM deployment. 
+# +# Source (upstream Dynamo recipe): +# https://github.com/ai-dynamo/dynamo/tree/main/recipes/qwen3-235b-a22b-fp8/trtllm/agg + +metadata: + model_name: qwen3_235b_a22b_fp8 + supported_gpus: + - H200 +hardware: + gpus_per_node: 8 +server_configs: + - name: "qwen3_235b_a22b_fp8_tp4_ep4_deepgemm_8k1k" + model_name: "qwen3_235b_a22b_fp8" + trust_remote_code: true + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + moe_tensor_parallel_size: 1 + pipeline_parallel_size: 1 + max_batch_size: 128 + max_num_tokens: 8192 + max_seq_len: 8192 + attn_backend: "TRTLLM" + enable_attention_dp: false + enable_chunked_prefill: true + disable_overlap_scheduler: false + print_iter_log: false + moe_config: + backend: 'DEEPGEMM' + cuda_graph_config: + enable_padding: true + max_batch_size: 128 + kv_cache_config: + dtype: 'auto' + enable_block_reuse: true + free_gpu_memory_fraction: 0.8 + cache_transceiver_config: + backend: DEFAULT + client_configs: + # ISL+OSL must stay within recipe's max_seq_len=8192, so use 7k/1k. + - name: "con128_iter10_7k1k" + concurrency: 128 + iterations: 10 + isl: 7168 + osl: 1024 + backend: "openai" + trust_remote_code: true + dataset_file: diff --git a/tests/scripts/perf-sanity/aggregated/dynamo_qwen3_32b_fp8_hopper.yaml b/tests/scripts/perf-sanity/aggregated/dynamo_qwen3_32b_fp8_hopper.yaml new file mode 100644 index 000000000000..dfdc056776e4 --- /dev/null +++ b/tests/scripts/perf-sanity/aggregated/dynamo_qwen3_32b_fp8_hopper.yaml @@ -0,0 +1,48 @@ +# Dynamo-replica: agg Qwen3-32B FP8 TRT-LLM deployment. 
+# +# Source (upstream Dynamo recipe): +# https://github.com/ai-dynamo/dynamo/tree/main/recipes/qwen3-32b-fp8/trtllm/agg + +metadata: + model_name: qwen3_32b_fp8 + supported_gpus: + - H200 +hardware: + gpus_per_node: 8 +server_configs: + - name: "qwen3_32b_fp8_tp2_6k1k" + model_name: "qwen3_32b_fp8" + tensor_parallel_size: 2 + pipeline_parallel_size: 1 + max_batch_size: 96 + max_num_tokens: 7964 + max_seq_len: 7964 + attn_backend: "TRTLLM" + enable_attention_dp: false + enable_chunked_prefill: false + disable_overlap_scheduler: false + print_iter_log: false + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 96 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + client_configs: + # ISL+OSL must stay within recipe's max_seq_len=7964, so use ~6.7k/1k. + - name: "con96_iter10_6k1k" + concurrency: 96 + iterations: 10 + isl: 6912 + osl: 1024 + backend: "openai" + dataset_file: diff --git a/tests/scripts/perf-sanity/disaggregated/h200_nemotron-super-fp8_8k1k_con64_ctx1_tp2_gen1_tp2_eplb0_mtp0_ccb-UCX.yaml b/tests/scripts/perf-sanity/disaggregated/h200_nemotron-super-fp8_8k1k_con64_ctx1_tp2_gen1_tp2_eplb0_mtp0_ccb-UCX.yaml new file mode 100644 index 000000000000..2499fb4cf7f7 --- /dev/null +++ b/tests/scripts/perf-sanity/disaggregated/h200_nemotron-super-fp8_8k1k_con64_ctx1_tp2_gen1_tp2_eplb0_mtp0_ccb-UCX.yaml @@ -0,0 +1,91 @@ +metadata: + model_name: super_fp8 + precision: fp8 + model_dir_name: NVIDIA-Nemotron-3-Super-120B-A12B-FP8 + supported_gpus: + - H200 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k + # Native-target (Hopper) mirror of the Dynamo Nemotron-3-Super-FP8 TRT-LLM + # disagg deployment recipe: + # https://github.com/ai-dynamo/dynamo/tree/main/recipes/nemotron-3-super-fp8/trtllm/disagg +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: "--gres=gpu:8" + numa_bind: true 
+benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '64' + input_length: 8192 + output_length: 1024 + dataset_file: datasets/perf-ci/nemotron_super-8k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 8 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + print_iter_log: true + tensor_parallel_size: 2 + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + enable_chunked_prefill: true + max_batch_size: 16 + max_num_tokens: 8192 + trust_remote_code: true + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRTLLM + cache_transceiver_config: + backend: UCX + disable_overlap_scheduler: false + ctx: + print_iter_log: true + tensor_parallel_size: 2 + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + enable_chunked_prefill: true + max_batch_size: 16 + max_num_tokens: 8192 + trust_remote_code: true + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRTLLM + cache_transceiver_config: + backend: UCX + disable_overlap_scheduler: true diff --git a/tests/scripts/perf-sanity/disaggregated/h200_qwen3-235b-a22b-fp8_8k1k_con512_ctx1_tp2_gen1_tep4_eplb0_mtp0_ccb-DEFAULT.yaml 
b/tests/scripts/perf-sanity/disaggregated/h200_qwen3-235b-a22b-fp8_8k1k_con512_ctx1_tp2_gen1_tep4_eplb0_mtp0_ccb-DEFAULT.yaml new file mode 100644 index 000000000000..0bf12a3fe300 --- /dev/null +++ b/tests/scripts/perf-sanity/disaggregated/h200_qwen3-235b-a22b-fp8_8k1k_con512_ctx1_tp2_gen1_tep4_eplb0_mtp0_ccb-DEFAULT.yaml @@ -0,0 +1,97 @@ +metadata: + model_name: qwen3_235b_a22b_fp8 + precision: fp8 + model_dir_name: Qwen3-235B-A22B-FP8 + supported_gpus: + - H200 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k + # Native-target (Hopper) mirror of the Dynamo Qwen3-235B-A22B-FP8 TRT-LLM + # disagg deployment recipe: + # https://github.com/ai-dynamo/dynamo/tree/main/recipes/qwen3-235b-a22b-fp8/trtllm/disagg +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: "--gres=gpu:8" + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '512' + input_length: 8192 + output_length: 1024 + dataset_file: datasets/perf-ci/qwen3_235b-8k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 8 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + print_iter_log: true + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + moe_tensor_parallel_size: 1 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + enable_chunked_prefill: false + trust_remote_code: true + max_batch_size: 512 + max_num_tokens: 1024 + max_seq_len: 8192 + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + 
kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.95 + dtype: fp8 + moe_config: + backend: DEEPGEMM + cache_transceiver_config: + backend: DEFAULT + disable_overlap_scheduler: false + ctx: + print_iter_log: true + tensor_parallel_size: 2 + moe_expert_parallel_size: 1 + moe_tensor_parallel_size: 2 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + enable_chunked_prefill: false + trust_remote_code: true + max_batch_size: 2 + max_num_tokens: 8192 + max_seq_len: 8192 + cuda_graph_config: + enable_padding: true + max_batch_size: 2 + kv_cache_config: + enable_block_reuse: true + free_gpu_memory_fraction: 0.7 + dtype: fp8 + moe_config: + backend: DEEPGEMM + cache_transceiver_config: + backend: DEFAULT + disable_overlap_scheduler: true diff --git a/tests/scripts/perf-sanity/disaggregated/h200_qwen3-32b-fp8_4k1k_con128_ctx1_tp1_gen1_tp2_eplb0_mtp0_ccb-DEFAULT.yaml b/tests/scripts/perf-sanity/disaggregated/h200_qwen3-32b-fp8_4k1k_con128_ctx1_tp1_gen1_tp2_eplb0_mtp0_ccb-DEFAULT.yaml new file mode 100644 index 000000000000..e7eb2ce64804 --- /dev/null +++ b/tests/scripts/perf-sanity/disaggregated/h200_qwen3-32b-fp8_4k1k_con128_ctx1_tp1_gen1_tp2_eplb0_mtp0_ccb-DEFAULT.yaml @@ -0,0 +1,89 @@ +metadata: + model_name: qwen3_32b_fp8 + precision: fp8 + model_dir_name: Qwen3-32B-FP8 + supported_gpus: + - H200 + script_file: disaggr_torch.slurm + benchmark_type: 4k1k + # Native-target (Hopper) mirror of the Dynamo Qwen3-32B-FP8 TRT-LLM disagg + # deployment recipe: + # https://github.com/ai-dynamo/dynamo/tree/main/recipes/qwen3-32b-fp8/trtllm/disagg +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: "--gres=gpu:8" + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '128' + input_length: 4096 + output_length: 1024 + dataset_file: 
datasets/perf-ci/qwen3_32b-4k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 8 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes TRT_LLM_DISABLE_LOAD_WEIGHTS_IN_PARALLEL=True" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + print_iter_log: true + tensor_parallel_size: 2 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + enable_chunked_prefill: false + trust_remote_code: true + max_batch_size: 128 + max_num_tokens: 7808 + max_seq_len: 7808 + cuda_graph_config: + enable_padding: true + max_batch_size: 128 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + cache_transceiver_config: + backend: DEFAULT + disable_overlap_scheduler: false + ctx: + print_iter_log: true + tensor_parallel_size: 1 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + enable_chunked_prefill: false + trust_remote_code: true + max_batch_size: 1 + max_num_tokens: 7808 + max_seq_len: 7808 + cuda_graph_config: + enable_padding: true + max_batch_size: 256 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + cache_transceiver_config: + backend: DEFAULT + disable_overlap_scheduler: true