Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions jenkins/L0_Test.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -3273,6 +3273,7 @@ def launchTestJobs(pipeline, testFilter)
"DGX_B200-8_GPUs-PyTorch-PerfSanity-Post-Merge-2": ["auto:dgx-b200-flex", "l0_b200_multi_gpus_perf_sanity", 2, 4, 8, 1, true],
"DGX_B200-8_GPUs-PyTorch-PerfSanity-Post-Merge-3": ["auto:dgx-b200-flex", "l0_b200_multi_gpus_perf_sanity", 3, 4, 8, 1, true],
"DGX_B200-8_GPUs-PyTorch-PerfSanity-Post-Merge-4": ["auto:dgx-b200-flex", "l0_b200_multi_gpus_perf_sanity", 4, 4, 8, 1, true],
"DGX_H200-8_GPUs-PyTorch-PerfSanity-Post-Merge-1": ["auto:dgx-h200-x8", "l0_dgx_h200_perf_sanity", 1, 1, 8, 1, true],
Comment thread
brb-nv marked this conversation as resolved.
Outdated
]
fullSet += x86SlurmTestConfigs.keySet()

Expand Down
2 changes: 2 additions & 0 deletions tests/integration/defs/perf/test_perf_sanity.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,9 @@
"k25_thinking_fp4": "Kimi-K2.5-NVFP4",
"qwen3_235b_a22b_fp4": "Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf", # Qwen3-235B-A22B-FP4
"super_nvfp4": "NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4", # Super (Nemotron-H SSM+MoE) NvFP4
"super_fp8": "NVIDIA-Nemotron-3-Super-120B-A12B-FP8",
"qwen3_235b_a22b_fp8": "Qwen3/saved_models_Qwen3-235B-A22B_fp8_hf", # Qwen3-235B-A22B-FP8
"qwen3_32b_fp8": "Qwen3/Qwen3-32B-FP8",
Comment thread
brb-nv marked this conversation as resolved.
"llama_v3.3_70b_instruct_fp4": "llama-3.3-models/Llama-3.3-70B-Instruct-FP4",
"deepseek_v3_lite_fp8": "DeepSeek-V3-Lite/fp8",
"llama_v3.1_8b_instruct": "llama-3.1-model/Llama-3.1-8B-Instruct",
Expand Down
25 changes: 25 additions & 0 deletions tests/integration/test_lists/test-db/l0_dgx_h200_perf_sanity.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
version: 0.0.1
# Post-merge perf-sanity test list for DGX H200: exactly 8 GPUs, x86_64,
# Ubuntu, PyTorch backend.
l0_dgx_h200_perf_sanity:
- condition:
    ranges:
      # Node must have exactly 8 GPUs (gte 8 and lte 8).
      system_gpu_count:
        gte: 8
        lte: 8
    wildcards:
      gpu:
      - '*h200*'
      linux_distribution_name: ubuntu*
      cpu: x86_64
    terms:
      stage: post_merge
      backend: pytorch
  tests:

  # Gen-only disaggregated perf-sanity runs; ctx/gen split, parallelism and
  # cache-transceiver backend (UCX/DEFAULT) are encoded in the test name.
  - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-h200_nemotron-super-fp8_8k1k_con64_ctx1_tp2_gen1_tp2_eplb0_mtp0_ccb-UCX]
  - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-h200_qwen3-235b-a22b-fp8_8k1k_con512_ctx1_tp2_gen1_tep4_eplb0_mtp0_ccb-DEFAULT]
  - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-h200_qwen3-32b-fp8_4k1k_con128_ctx1_tp1_gen1_tp2_eplb0_mtp0_ccb-DEFAULT]

  # Full e2e variants of the same configs, currently disabled (commented out).
  # - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-h200_nemotron-super-fp8_8k1k_con64_ctx1_tp2_gen1_tp2_eplb0_mtp0_ccb-UCX]
  # - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-h200_qwen3-235b-a22b-fp8_8k1k_con512_ctx1_tp2_gen1_tep4_eplb0_mtp0_ccb-DEFAULT]
  # - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-h200_qwen3-32b-fp8_4k1k_con128_ctx1_tp1_gen1_tp2_eplb0_mtp0_ccb-DEFAULT]
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
# Perf-sanity disaggregated-serving config: Nemotron-3-Super-120B-A12B FP8 on
# H200 — 1 context server (TP2) + 1 generation server (TP2), 8 GPUs per node.
metadata:
  model_name: super_fp8
  precision: fp8
  model_dir_name: NVIDIA-Nemotron-3-Super-120B-A12B-FP8
  supported_gpus:
  - H200
  script_file: disaggr_torch.slurm
  benchmark_type: 8k1k
  # Native-target (Hopper) mirror of the Dynamo Nemotron-3-Super-FP8 TRT-LLM
  # disagg deployment recipe:
  # https://github.com/ai-dynamo/dynamo/tree/main/recipes/nemotron-3-super-fp8/trtllm/disagg
slurm:
  script_file: disaggr_torch.slurm
  # <...> values are placeholders — presumably substituted by the test harness
  # at job-submission time; verify against the runner.
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: "--gres=gpu:8"
  numa_bind: true
benchmark:
  mode: e2e
  use_nv_sa_benchmark: false
  multi_round: 10
  benchmark_ratio: 0.0
  streaming: true
  concurrency_list: '64'
  # 8192-in / 1024-out matches benchmark_type 8k1k above.
  input_length: 8192
  output_length: 1024
  dataset_file: datasets/perf-ci/nemotron_super-8k1k-20480-ratio-1_for_serve.json
hardware:
  gpus_per_node: 8
  num_ctx_servers: 1
  num_gen_servers: 1
environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes"
  server_env_var: "TRTLLM_SERVER_DISABLE_GC=1"
profiling:
  nsys_on: false
accuracy:
  enable_accuracy_test: false
worker_config:
  # Generation-side server: TP2, overlap scheduler enabled, UCX KV transfer.
  gen:
    print_iter_log: true
    tensor_parallel_size: 2
    moe_expert_parallel_size: 1
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: false
    enable_chunked_prefill: true
    max_batch_size: 16
    max_num_tokens: 8192
    trust_remote_code: true
    cuda_graph_config:
      enable_padding: true
      # Kept equal to the server-level max_batch_size above.
      max_batch_size: 16
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.85
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      backend: UCX
    disable_overlap_scheduler: false
  # Context-side server: same parallelism as gen, but overlap scheduler off.
  ctx:
    print_iter_log: true
    tensor_parallel_size: 2
    moe_expert_parallel_size: 1
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: false
    enable_chunked_prefill: true
    max_batch_size: 16
    max_num_tokens: 8192
    trust_remote_code: true
    cuda_graph_config:
      enable_padding: true
      max_batch_size: 16
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.85
    moe_config:
      backend: TRTLLM
    cache_transceiver_config:
      backend: UCX
    disable_overlap_scheduler: true
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
# Perf-sanity disaggregated-serving config: Qwen3-235B-A22B FP8 on H200 —
# 1 context server (TP2) + 1 generation server (TP4/EP4), 8 GPUs per node.
metadata:
  model_name: qwen3_235b_a22b_fp8
  precision: fp8
  model_dir_name: Qwen3-235B-A22B-FP8
  supported_gpus:
  - H200
  script_file: disaggr_torch.slurm
  benchmark_type: 8k1k
  # Native-target (Hopper) mirror of the Dynamo Qwen3-235B-A22B-FP8 TRT-LLM
  # disagg deployment recipe:
  # https://github.com/ai-dynamo/dynamo/tree/main/recipes/qwen3-235b-a22b-fp8/trtllm/disagg
slurm:
  script_file: disaggr_torch.slurm
  # <...> values are placeholders — presumably substituted by the test harness
  # at job-submission time; verify against the runner.
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: "--gres=gpu:8"
  numa_bind: true
benchmark:
  mode: e2e
  use_nv_sa_benchmark: false
  multi_round: 10
  benchmark_ratio: 0.0
  streaming: true
  concurrency_list: '512'
  # 8192-in / 1024-out matches benchmark_type 8k1k above.
  input_length: 8192
  output_length: 1024
  dataset_file: datasets/perf-ci/qwen3_235b-8k1k-20480-ratio-1_for_serve.json
hardware:
  gpus_per_node: 8
  num_ctx_servers: 1
  num_gen_servers: 1
environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes"
  server_env_var: "TRTLLM_SERVER_DISABLE_GC=1"
profiling:
  nsys_on: false
accuracy:
  enable_accuracy_test: false
worker_config:
  # Generation-side server: TP4 with full expert parallelism (EP4),
  # DEEPGEMM MoE backend, FP8 KV cache, overlap scheduler enabled.
  gen:
    print_iter_log: true
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    moe_tensor_parallel_size: 1
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: false
    enable_chunked_prefill: false
    trust_remote_code: true
    # Batch size matches the benchmark concurrency_list ('512') above.
    max_batch_size: 512
    max_num_tokens: 1024
    max_seq_len: 8192
    cuda_graph_config:
      enable_padding: true
      # Kept equal to the server-level max_batch_size above.
      max_batch_size: 512
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.95
      dtype: fp8
    moe_config:
      backend: DEEPGEMM
    cache_transceiver_config:
      backend: DEFAULT
    disable_overlap_scheduler: false
  # Context-side server: TP2 with MoE tensor parallelism instead of expert
  # parallelism; small batch, block reuse on, overlap scheduler off.
  ctx:
    print_iter_log: true
    tensor_parallel_size: 2
    moe_expert_parallel_size: 1
    moe_tensor_parallel_size: 2
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: false
    enable_chunked_prefill: false
    trust_remote_code: true
    max_batch_size: 2
    max_num_tokens: 8192
    max_seq_len: 8192
    cuda_graph_config:
      enable_padding: true
      max_batch_size: 2
    kv_cache_config:
      enable_block_reuse: true
      free_gpu_memory_fraction: 0.7
      dtype: fp8
    moe_config:
      backend: DEEPGEMM
    cache_transceiver_config:
      backend: DEFAULT
    disable_overlap_scheduler: true
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
# Perf-sanity disaggregated-serving config: Qwen3-32B FP8 (dense, no MoE) on
# H200 — 1 context server (TP1) + 1 generation server (TP2), 8 GPUs per node.
metadata:
  model_name: qwen3_32b_fp8
  precision: fp8
  model_dir_name: Qwen3-32B-FP8
  supported_gpus:
  - H200
  script_file: disaggr_torch.slurm
  benchmark_type: 4k1k
  # Native-target (Hopper) mirror of the Dynamo Qwen3-32B-FP8 TRT-LLM disagg
  # deployment recipe:
  # https://github.com/ai-dynamo/dynamo/tree/main/recipes/qwen3-32b-fp8/trtllm/disagg
slurm:
  script_file: disaggr_torch.slurm
  # <...> values are placeholders — presumably substituted by the test harness
  # at job-submission time; verify against the runner.
  partition: <partition>
  account: <account>
  job_time: 02:00:00
  job_name: unified-benchmark
  extra_args: "--gres=gpu:8"
  numa_bind: true
benchmark:
  mode: e2e
  use_nv_sa_benchmark: false
  multi_round: 10
  benchmark_ratio: 0.0
  streaming: true
  concurrency_list: '128'
  # 4096-in / 1024-out matches benchmark_type 4k1k above.
  input_length: 4096
  output_length: 1024
  dataset_file: datasets/perf-ci/qwen3_32b-4k1k-20480-ratio-1_for_serve.json
hardware:
  gpus_per_node: 8
  num_ctx_servers: 1
  num_gen_servers: 1
environment:
  container_mount: <container_mount>
  container_image: <container_image>
  model_path: <model_path>
  trtllm_repo: ''
  build_wheel: false
  work_dir: <full_path_to_work_dir>
  # Only this config adds TRT_LLM_DISABLE_LOAD_WEIGHTS_IN_PARALLEL=True on top
  # of the env vars shared with the sibling configs.
  worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes TRT_LLM_DISABLE_LOAD_WEIGHTS_IN_PARALLEL=True"
  server_env_var: "TRTLLM_SERVER_DISABLE_GC=1"
profiling:
  nsys_on: false
accuracy:
  enable_accuracy_test: false
worker_config:
  # Generation-side server: TP2, FP8 KV cache, overlap scheduler enabled.
  # No moe_* keys here — Qwen3-32B is a dense model.
  gen:
    print_iter_log: true
    tensor_parallel_size: 2
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: false
    enable_chunked_prefill: false
    trust_remote_code: true
    # Batch size matches the benchmark concurrency_list ('128') above.
    max_batch_size: 128
    max_num_tokens: 7808
    max_seq_len: 7808
    cuda_graph_config:
      enable_padding: true
      # Kept equal to the server-level max_batch_size above.
      max_batch_size: 128
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.9
      dtype: fp8
    cache_transceiver_config:
      backend: DEFAULT
    disable_overlap_scheduler: false
  # Context-side server: single-request prefill (TP1), overlap scheduler off.
  ctx:
    print_iter_log: true
    tensor_parallel_size: 1
    pipeline_parallel_size: 1
    context_parallel_size: 1
    enable_attention_dp: false
    enable_chunked_prefill: false
    trust_remote_code: true
    max_batch_size: 1
    max_num_tokens: 7808
    max_seq_len: 7808
    cuda_graph_config:
      enable_padding: true
      # NOTE(review): 256 exceeds this server's max_batch_size (1); the gen
      # server here and both servers in the sibling configs keep the CUDA-graph
      # batch equal to the server batch — confirm 256 is intentional.
      max_batch_size: 256
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.9
      dtype: fp8
    cache_transceiver_config:
      backend: DEFAULT
    disable_overlap_scheduler: true
Loading