1 change: 1 addition & 0 deletions jenkins/L0_Test.groovy
@@ -3273,6 +3273,7 @@ def launchTestJobs(pipeline, testFilter)
"DGX_B200-8_GPUs-PyTorch-PerfSanity-Post-Merge-2": ["auto:dgx-b200-flex", "l0_b200_multi_gpus_perf_sanity", 2, 4, 8, 1, true],
"DGX_B200-8_GPUs-PyTorch-PerfSanity-Post-Merge-3": ["auto:dgx-b200-flex", "l0_b200_multi_gpus_perf_sanity", 3, 4, 8, 1, true],
"DGX_B200-8_GPUs-PyTorch-PerfSanity-Post-Merge-4": ["auto:dgx-b200-flex", "l0_b200_multi_gpus_perf_sanity", 4, 4, 8, 1, true],
"DGX_H200-8_GPUs-PyTorch-PerfSanity-Post-Merge-1": ["dgx-h200-x8", "l0_dgx_h200_perf_sanity", 1, 1, 8, 1, true],
Collaborator Author:
Question for infra reviewer: This is adding a new stage. Is there anything else we need to do?

]
fullSet += x86SlurmTestConfigs.keySet()
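The positional fields in these stage tuples are not documented in the hunk itself. A minimal sketch of the assumed layout, inferred only from the surrounding entries (every field name below is hypothetical, not from L0_Test.groovy):

```python
from typing import NamedTuple

# Assumed field order for the Slurm stage tuples above; all names are
# guesses from the neighboring entries, not from L0_Test.groovy itself.
class SlurmStageConfig(NamedTuple):
    platform: str      # node pool label, e.g. "dgx-h200-x8"
    test_list: str     # test-db list key, e.g. "l0_dgx_h200_perf_sanity"
    split_id: int      # 1-based shard index into the test list
    split_count: int   # total number of shards (the B200 stages use 4)
    gpu_count: int     # GPUs per job
    node_count: int    # nodes per job (assumption)
    perf_mode: bool    # run as a perf-sanity stage (assumption)

new_stage = SlurmStageConfig("dgx-h200-x8", "l0_dgx_h200_perf_sanity",
                             1, 1, 8, 1, True)
```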

1 change: 1 addition & 0 deletions tests/integration/defs/perf/test_perf_sanity.py
@@ -50,6 +50,7 @@
"qwen3_235b_a22b_fp4": "Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf", # Qwen3-235B-A22B-FP4
"super_nvfp4": "NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4", # Super (Nemotron-H SSM+MoE) NvFP4
"qwen3_235b_a22b_fp8": "Qwen3/saved_models_Qwen3-235B-A22B_fp8_hf", # Qwen3-235B-A22B-FP8
"qwen3_32b_fp8": "Qwen3/Qwen3-32B-FP8",
"llama_v3.3_70b_instruct_fp4": "llama-3.3-models/Llama-3.3-70B-Instruct-FP4",
"deepseek_v3_lite_fp8": "DeepSeek-V3-Lite/fp8",
"llama_v3.1_8b_instruct": "llama-3.1-model/Llama-3.1-8B-Instruct",
@@ -41,6 +41,7 @@ l0_b200_multi_gpus_perf_sanity:
- perf/test_perf_sanity.py::test_e2e[aggr_upload-k25_thinking_fp4_blackwell-k25_thinking_fp4_dep8_8k1k] TIMEOUT (90)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-k25_thinking_fp4_blackwell-k25_thinking_fp4_tep8_32k8k]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-k25_thinking_fp4_blackwell-k25_thinking_fp4_dep8_32k8k] TIMEOUT (90)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-dynamo_k25_thinking_fp4_blackwell-k25_thinking_fp4_tep8_adp_2k1k]
# ctx_only tests (disagg config)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-b200_deepseek-r1-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-b200_deepseek-r1-fp4_1k1k_con2048_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-UCX] TIMEOUT (120)
18 changes: 18 additions & 0 deletions tests/integration/test_lists/test-db/l0_dgx_h200_perf_sanity.yml
@@ -0,0 +1,18 @@
version: 0.0.1
l0_dgx_h200_perf_sanity:
- condition:
    ranges:
      system_gpu_count:
        gte: 8
        lte: 8
    wildcards:
      gpu:
      - '*h200*'
      linux_distribution_name: ubuntu*
      cpu: x86_64
    terms:
      stage: post_merge
      backend: pytorch
  tests:
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-dynamo_qwen3_235b_a22b_fp8_hopper-qwen3_235b_a22b_fp8_tp4_ep4_deepgemm_8k1k]
  - perf/test_perf_sanity.py::test_e2e[aggr_upload-dynamo_qwen3_32b_fp8_hopper-qwen3_32b_fp8_tp2_6k1k]
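The condition block gates this new list to 8-GPU x86 H200 machines on Ubuntu, post-merge, PyTorch backend. A minimal sketch of how such a condition could be evaluated (the function and the machine-descriptor fields are hypothetical; the real scheduler logic lives elsewhere in the repo):

```python
from fnmatch import fnmatch

def condition_matches(cond: dict, machine: dict) -> bool:
    """Hypothetical evaluator for a test-db condition block like the one above."""
    rng = cond["ranges"]["system_gpu_count"]
    if not rng["gte"] <= machine["system_gpu_count"] <= rng["lte"]:
        return False
    wc = cond["wildcards"]
    if not any(fnmatch(machine["gpu"].lower(), pat) for pat in wc["gpu"]):
        return False
    if not fnmatch(machine["linux_distribution_name"], wc["linux_distribution_name"]):
        return False
    if machine["cpu"] != wc["cpu"]:
        return False
    # terms must match exactly (stage: post_merge, backend: pytorch)
    return all(machine.get(k) == v for k, v in cond["terms"].items())

# condition_matches(cond, {"system_gpu_count": 8, "gpu": "NVIDIA H200",
#                          "linux_distribution_name": "ubuntu22.04",
#                          "cpu": "x86_64", "stage": "post_merge",
#                          "backend": "pytorch"})  # -> True
```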
@@ -55,6 +55,8 @@ l0_gb200_multi_gpus_perf_sanity:
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tp2_mtp0_1k1k]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tp1_mtp0_1k1k]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tp1_mtp0_8k1k]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-dynamo_gpt_oss_120b_fp4_blackwell-gpt_oss_fp4_tep4_adp_cutlass_1k1k]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-dynamo_gpt_oss_120b_fp4_blackwell-gpt_oss_fp4_tep4_adp_cutlass_8k1k]
# k25-thinking-fp4 aggregated
- perf/test_perf_sanity.py::test_e2e[aggr_upload-k25_thinking_fp4_grace_blackwell-k25_thinking_fp4_tep4_8k1k]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-k25_thinking_fp4_grace_blackwell-k25_thinking_fp4_dep4_8k1k] TIMEOUT (90)
@@ -21,6 +21,7 @@ l0_gb200_multi_nodes_perf_sanity_node2_gpu8:
# k25-thinking-fp4 aggregated 2-nodes
- perf/test_perf_sanity.py::test_e2e[aggr_upload-k25_thinking_fp4_2_nodes_grace_blackwell-k25_thinking_fp4_tep8_32k8k]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-k25_thinking_fp4_2_nodes_grace_blackwell-k25_thinking_fp4_dep8_32k8k] TIMEOUT (90)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-dynamo_deepseek_v32_fp4_2_nodes_grace_blackwell-dsv32_fp4_dep8_trtllm_lpc_mnnvl_8k1k] TIMEOUT (120)
# ctx only tests (disagg config)
# deepseek-r1-fp4
- perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_deepseek-r1-fp4_128k8k_con128_ctx1_pp8_gen1_dep16_eplb0_mtp2_ccb-UCX] TIMEOUT (120)
@@ -0,0 +1,50 @@
# Dynamo-replica: agg-round-robin DeepSeek-V3.2 FP4 TRT-LLM deployment.
#
# Source (upstream Dynamo recipe):
# https://github.com/ai-dynamo/dynamo/tree/main/recipes/deepseek-v32-fp4/trtllm/agg-round-robin

metadata:
  model_name: deepseek_v32_fp4
  supported_gpus:
    - GB200
  hardware:
    gpus_per_node: 4
server_configs:
  - name: "dsv32_fp4_dep8_trtllm_lpc_mnnvl_8k1k"
    model_name: "deepseek_v32_fp4"
    tensor_parallel_size: 8
    moe_expert_parallel_size: 8
    pipeline_parallel_size: 1
    max_batch_size: 8
    max_num_tokens: 8192
    max_seq_len: 121000
    attn_backend: "TRTLLM"
    enable_attention_dp: true
    enable_chunked_prefill: true
    disable_overlap_scheduler: true
    allreduce_strategy: MNNVL
    num_postprocess_workers: 8
    print_iter_log: true
    stream_interval: 10
    moe_config:
      backend: 'TRTLLM'
      use_low_precision_moe_combine: true
    cuda_graph_config:
      enable_padding: true
      max_batch_size: 8
    kv_cache_config:
      dtype: 'fp8'
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.9
      tokens_per_block: 64
    cache_transceiver_config:
      backend: UCX
      max_tokens_in_buffer: 120000
    client_configs:
      - name: "con4_iter10_8k1k"
        concurrency: 4
        iterations: 10
        isl: 8192
        osl: 1024
        backend: "openai"
        dataset_file: <dataset_file>
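Across these new recipe configs, the one invariant the files themselves call out (see the 7k/1k and ~6.7k/1k comments in the Qwen3 configs further down) is that a client's isl + osl must fit within the server's max_seq_len. A minimal sanity-check sketch under that assumption (the script and function names are ours, not part of the harness):

```python
import sys
import yaml  # PyYAML

def check_seq_budget(path: str) -> None:
    """Assert isl + osl <= max_seq_len for every client of every server config."""
    with open(path) as f:
        cfg = yaml.safe_load(f)
    for server in cfg.get("server_configs", []):
        limit = server.get("max_seq_len")
        if limit is None:
            continue  # no explicit sequence budget to check
        for client in server.get("client_configs", []):
            total = client["isl"] + client["osl"]
            assert total <= limit, (
                f"{server['name']}/{client['name']}: "
                f"isl+osl={total} exceeds max_seq_len={limit}")

if __name__ == "__main__":
    for p in sys.argv[1:]:
        check_seq_budget(p)
```

For this DeepSeek-V3.2 config the budget is comfortable: 8192 + 1024 = 9216, well under max_seq_len=121000.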
@@ -0,0 +1,67 @@
# Dynamo-replica: agg GPT-OSS-120B FP4 TRT-LLM deployment.
#
# Source (upstream Dynamo recipe):
# https://github.com/ai-dynamo/dynamo/tree/main/recipes/gpt-oss-120b/trtllm/agg

metadata:
  model_name: gpt_oss_120b_fp4
  supported_gpus:
    - GB200
  hardware:
    gpus_per_node: 4
server_configs:
  - name: "gpt_oss_fp4_tep4_adp_cutlass_1k1k"
    model_name: "gpt_oss_120b_fp4"
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    pipeline_parallel_size: 1
    max_batch_size: 800
    max_num_tokens: 20000
    attn_backend: "TRTLLM"
    enable_attention_dp: true
    stream_interval: 20
    moe_config:
      backend: 'CUTLASS'
    cuda_graph_config:
      enable_padding: true
      max_batch_size: 800
    kv_cache_config:
      dtype: 'fp8'
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.9
    client_configs:
      - name: "con128_iter10_1k1k"
        concurrency: 128
        iterations: 10
        isl: 1024
        osl: 1024
        backend: "openai"
        dataset_file: <dataset_file>

  - name: "gpt_oss_fp4_tep4_adp_cutlass_8k1k"
    model_name: "gpt_oss_120b_fp4"
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    pipeline_parallel_size: 1
    max_batch_size: 800
    max_num_tokens: 20000
    attn_backend: "TRTLLM"
    enable_attention_dp: true
    stream_interval: 20
    moe_config:
      backend: 'CUTLASS'
    cuda_graph_config:
      enable_padding: true
      max_batch_size: 800
    kv_cache_config:
      dtype: 'fp8'
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.9
    client_configs:
      - name: "con128_iter10_8k1k"
        concurrency: 128
        iterations: 10
        isl: 8192
        osl: 1024
        backend: "openai"
        dataset_file: <dataset_file>
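The two server configs differ only in the client's input length (1k vs 8k prompts). Note how they surface in the l0_gb200 list above: each test id appears to be composed of the upload mode, the recipe file's stem, and a server_config name. A sketch of that assumed composition (the naming rule is inferred from the paired entries, not confirmed by the harness):

```python
def perf_sanity_test_id(mode: str, recipe_stem: str, server_name: str) -> str:
    """Compose a perf-sanity test id the way the test-db entries appear to."""
    return f"perf/test_perf_sanity.py::test_e2e[{mode}-{recipe_stem}-{server_name}]"

# perf_sanity_test_id("aggr_upload", "dynamo_gpt_oss_120b_fp4_blackwell",
#                     "gpt_oss_fp4_tep4_adp_cutlass_8k1k")
```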
@@ -0,0 +1,39 @@
# Dynamo-replica: agg Kimi K2.5 FP4 TRT-LLM deployment (NVIDIA recipe variant).
#
# Source (upstream Dynamo recipe):
# https://github.com/ai-dynamo/dynamo/tree/main/recipes/kimi-k2.5/trtllm/agg/nvidia

metadata:
  model_name: k25_thinking_fp4
  supported_gpus:
    - B200
  hardware:
    gpus_per_node: 8
server_configs:
  - name: "k25_thinking_fp4_tep8_adp_2k1k"
    model_name: "k25_thinking_fp4"
    tensor_parallel_size: 8
    moe_expert_parallel_size: 8
    pipeline_parallel_size: 1
    max_batch_size: 128
    max_num_tokens: 8448
    max_seq_len: 8212
    attn_backend: "TRTLLM"
    enable_attention_dp: true
    trust_remote_code: true
    print_iter_log: true
    kv_cache_config:
      dtype: 'fp8'
      free_gpu_memory_fraction: 0.75
    cache_transceiver_config:
      backend: UCX
      max_tokens_in_buffer: 8448
    client_configs:
      - name: "con128_iter10_2k1k"
        concurrency: 128
        iterations: 10
        isl: 2048
        osl: 1024
        backend: "openai"
        trust_remote_code: true
        dataset_file: <dataset_file>
@@ -0,0 +1,48 @@
# Dynamo-replica: agg Qwen3-235B-A22B FP8 TRT-LLM deployment.
#
# Source (upstream Dynamo recipe):
# https://github.com/ai-dynamo/dynamo/tree/main/recipes/qwen3-235b-a22b-fp8/trtllm/agg

metadata:
  model_name: qwen3_235b_a22b_fp8
  supported_gpus:
    - H200
  hardware:
    gpus_per_node: 8
server_configs:
  - name: "qwen3_235b_a22b_fp8_tp4_ep4_deepgemm_8k1k"
    model_name: "qwen3_235b_a22b_fp8"
    trust_remote_code: true
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    moe_tensor_parallel_size: 1
    pipeline_parallel_size: 1
    max_batch_size: 128
    max_num_tokens: 8192
    max_seq_len: 8192
    attn_backend: "TRTLLM"
    enable_attention_dp: false
    enable_chunked_prefill: true
    disable_overlap_scheduler: false
    print_iter_log: false
    moe_config:
      backend: 'DEEPGEMM'
    cuda_graph_config:
      enable_padding: true
      max_batch_size: 128
    kv_cache_config:
      dtype: 'auto'
      enable_block_reuse: true
      free_gpu_memory_fraction: 0.8
    cache_transceiver_config:
      backend: DEFAULT
    client_configs:
      # ISL+OSL must stay within the recipe's max_seq_len=8192
      # (7168 + 1024 = 8192), so use 7k/1k.
      - name: "con128_iter10_7k1k"
        concurrency: 128
        iterations: 10
        isl: 7168
        osl: 1024
        backend: "openai"
        trust_remote_code: true
        dataset_file: <dataset_file>
@@ -0,0 +1,48 @@
# Dynamo-replica: agg Qwen3-32B FP8 TRT-LLM deployment.
#
# Source (upstream Dynamo recipe):
# https://github.com/ai-dynamo/dynamo/tree/main/recipes/qwen3-32b-fp8/trtllm/agg

metadata:
  model_name: qwen3_32b_fp8
  supported_gpus:
    - H200
  hardware:
    gpus_per_node: 8
server_configs:
  - name: "qwen3_32b_fp8_tp2_6k1k"
    model_name: "qwen3_32b_fp8"
    tensor_parallel_size: 2
    pipeline_parallel_size: 1
    max_batch_size: 96
    max_num_tokens: 7964
    max_seq_len: 7964
    attn_backend: "TRTLLM"
    enable_attention_dp: false
    enable_chunked_prefill: false
    disable_overlap_scheduler: false
    print_iter_log: false
    cuda_graph_config:
      enable_padding: true
      batch_sizes:
        - 1
        - 2
        - 4
        - 8
        - 16
        - 32
        - 64
        - 96
    kv_cache_config:
      dtype: 'fp8'
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.9
    client_configs:
      # ISL+OSL must stay within the recipe's max_seq_len=7964
      # (6912 + 1024 = 7936 <= 7964), so use ~6.7k/1k.
      - name: "con96_iter10_6k1k"
        concurrency: 96
        iterations: 10
        isl: 6912
        osl: 1024
        backend: "openai"
        dataset_file: <dataset_file>