diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy index 665fda0f6c6a..c78edcfdde39 100644 --- a/jenkins/L0_Test.groovy +++ b/jenkins/L0_Test.groovy @@ -3202,6 +3202,7 @@ def launchTestJobs(pipeline, testFilter) "DGX_H200-4_GPUs-Triton-Post-Merge-1": ["dgx-h200-x4", "l0_dgx_h200", 1, 1, 4], "DGX_H200-8_GPUs-PyTorch-Post-Merge-1": ["dgx-h200-x8", "l0_dgx_h200", 1, 1, 8], "DGX_H200-4_GPUs-PyTorch-Post-Merge-1": ["dgx-h200-x4", "l0_dgx_h200", 1, 1, 4], + "DGX_H200-8_GPUs-PyTorch-PerfSanity-Post-Merge-1": ["dgx-h200-x8", "l0_dgx_h200_perf_sanity", 1, 1, 8], // "DGX_H200-4_GPUs-TensorRT-Post-Merge-1": ["dgx-h200-x4", "l0_dgx_h200", 1, 3, 4], // "DGX_H200-4_GPUs-TensorRT-Post-Merge-2": ["dgx-h200-x4", "l0_dgx_h200", 2, 3, 4], // "DGX_H200-4_GPUs-TensorRT-Post-Merge-3": ["dgx-h200-x4", "l0_dgx_h200", 3, 3, 4], diff --git a/tests/integration/defs/perf/test_perf_sanity.py b/tests/integration/defs/perf/test_perf_sanity.py index b6c7737cbe5a..04a2db1462c6 100644 --- a/tests/integration/defs/perf/test_perf_sanity.py +++ b/tests/integration/defs/perf/test_perf_sanity.py @@ -49,7 +49,9 @@ "k25_thinking_fp4": "Kimi-K2.5-NVFP4", "qwen3_235b_a22b_fp4": "Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf", # Qwen3-235B-A22B-FP4 "super_nvfp4": "NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4", # Super (Nemotron-H SSM+MoE) NvFP4 + "super_fp8": "NVIDIA-Nemotron-3-Super-120B-A12B-FP8", "qwen3_235b_a22b_fp8": "Qwen3/saved_models_Qwen3-235B-A22B_fp8_hf", # Qwen3-235B-A22B-FP8 + "qwen3_32b_fp8": "Qwen3/Qwen3-32B-FP8", "llama_v3.3_70b_instruct_fp4": "llama-3.3-models/Llama-3.3-70B-Instruct-FP4", "deepseek_v3_lite_fp8": "DeepSeek-V3-Lite/fp8", "llama_v3.1_8b_instruct": "llama-3.1-model/Llama-3.1-8B-Instruct", diff --git a/tests/integration/test_lists/test-db/l0_b200_multi_gpus_perf_sanity.yml b/tests/integration/test_lists/test-db/l0_b200_multi_gpus_perf_sanity.yml index 563191fa52bb..ff841be8868c 100644 --- a/tests/integration/test_lists/test-db/l0_b200_multi_gpus_perf_sanity.yml +++ 
b/tests/integration/test_lists/test-db/l0_b200_multi_gpus_perf_sanity.yml @@ -41,6 +41,7 @@ l0_b200_multi_gpus_perf_sanity: - perf/test_perf_sanity.py::test_e2e[aggr_upload-k25_thinking_fp4_blackwell-k25_thinking_fp4_dep8_8k1k] TIMEOUT (90) - perf/test_perf_sanity.py::test_e2e[aggr_upload-k25_thinking_fp4_blackwell-k25_thinking_fp4_tep8_32k8k] - perf/test_perf_sanity.py::test_e2e[aggr_upload-k25_thinking_fp4_blackwell-k25_thinking_fp4_dep8_32k8k] TIMEOUT (90) + - perf/test_perf_sanity.py::test_e2e[aggr_upload-dynamo_k25_thinking_fp4_blackwell-k25_thinking_fp4_tep8_adp_2k1k] # ctx_only tests (disagg config) - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-b200_deepseek-r1-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (120) - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-b200_deepseek-r1-fp4_1k1k_con2048_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-UCX] TIMEOUT (120) diff --git a/tests/integration/test_lists/test-db/l0_dgx_h200_perf_sanity.yml b/tests/integration/test_lists/test-db/l0_dgx_h200_perf_sanity.yml new file mode 100644 index 000000000000..b7b871660a0a --- /dev/null +++ b/tests/integration/test_lists/test-db/l0_dgx_h200_perf_sanity.yml @@ -0,0 +1,25 @@ +version: 0.0.1 +l0_dgx_h200_perf_sanity: +- condition: + ranges: + system_gpu_count: + gte: 8 + lte: 8 + wildcards: + gpu: + - '*h200*' + linux_distribution_name: ubuntu* + cpu: x86_64 + terms: + stage: post_merge + backend: pytorch + tests: + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-h200_nemotron-super-fp8_8k1k_con64_ctx1_tp2_gen1_tp2_eplb0_mtp0_ccb-UCX] + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-h200_qwen3-235b-a22b-fp8_8k1k_con512_ctx1_tp2_gen1_tep4_eplb0_mtp0_ccb-DEFAULT] + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-h200_qwen3-32b-fp8_4k1k_con128_ctx1_tp1_gen1_tp2_eplb0_mtp0_ccb-DEFAULT] + - 
perf/test_perf_sanity.py::test_e2e[aggr_upload-dynamo_qwen3_235b_a22b_fp8_hopper-qwen3_235b_a22b_fp8_tp4_ep4_deepgemm_8k1k] + - perf/test_perf_sanity.py::test_e2e[aggr_upload-dynamo_qwen3_32b_fp8_hopper-qwen3_32b_fp8_tp2_6k1k] + # - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-h200_nemotron-super-fp8_8k1k_con64_ctx1_tp2_gen1_tp2_eplb0_mtp0_ccb-UCX] + # - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-h200_qwen3-235b-a22b-fp8_8k1k_con512_ctx1_tp2_gen1_tep4_eplb0_mtp0_ccb-DEFAULT] + # - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-h200_qwen3-32b-fp8_4k1k_con128_ctx1_tp1_gen1_tp2_eplb0_mtp0_ccb-DEFAULT] + diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_gpus_perf_sanity.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_gpus_perf_sanity.yml index 292d8633f2ab..72a19fca4eb4 100644 --- a/tests/integration/test_lists/test-db/l0_gb200_multi_gpus_perf_sanity.yml +++ b/tests/integration/test_lists/test-db/l0_gb200_multi_gpus_perf_sanity.yml @@ -55,6 +55,8 @@ l0_gb200_multi_gpus_perf_sanity: - perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tp2_mtp0_1k1k] - perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tp1_mtp0_1k1k] - perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tp1_mtp0_8k1k] + - perf/test_perf_sanity.py::test_e2e[aggr_upload-dynamo_gpt_oss_120b_fp4_blackwell-gpt_oss_fp4_tep4_adp_cutlass_1k1k] + - perf/test_perf_sanity.py::test_e2e[aggr_upload-dynamo_gpt_oss_120b_fp4_blackwell-gpt_oss_fp4_tep4_adp_cutlass_8k1k] # k25-thinking-fp4 aggregated - perf/test_perf_sanity.py::test_e2e[aggr_upload-k25_thinking_fp4_grace_blackwell-k25_thinking_fp4_tep4_8k1k] - perf/test_perf_sanity.py::test_e2e[aggr_upload-k25_thinking_fp4_grace_blackwell-k25_thinking_fp4_dep4_8k1k] TIMEOUT (90) diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity_node2_gpu8.yml 
b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity_node2_gpu8.yml index 18431b483242..ae3d5665c4ca 100644 --- a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity_node2_gpu8.yml +++ b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity_node2_gpu8.yml @@ -21,6 +21,7 @@ l0_gb200_multi_nodes_perf_sanity_node2_gpu8: # k25-thinking-fp4 aggregated 2-nodes - perf/test_perf_sanity.py::test_e2e[aggr_upload-k25_thinking_fp4_2_nodes_grace_blackwell-k25_thinking_fp4_tep8_32k8k] - perf/test_perf_sanity.py::test_e2e[aggr_upload-k25_thinking_fp4_2_nodes_grace_blackwell-k25_thinking_fp4_dep8_32k8k] TIMEOUT (90) + - perf/test_perf_sanity.py::test_e2e[aggr_upload-dynamo_deepseek_v32_fp4_2_nodes_grace_blackwell-dsv32_fp4_dep8_trtllm_lpc_mnnvl_8k1k] TIMEOUT (120) # ctx only tests (disagg config) # deepseek-r1-fp4 - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_deepseek-r1-fp4_128k8k_con128_ctx1_pp8_gen1_dep16_eplb0_mtp2_ccb-UCX] TIMEOUT (120) diff --git a/tests/scripts/perf-sanity/aggregated/dynamo_deepseek_v32_fp4_2_nodes_grace_blackwell.yaml b/tests/scripts/perf-sanity/aggregated/dynamo_deepseek_v32_fp4_2_nodes_grace_blackwell.yaml new file mode 100644 index 000000000000..debea1681f7e --- /dev/null +++ b/tests/scripts/perf-sanity/aggregated/dynamo_deepseek_v32_fp4_2_nodes_grace_blackwell.yaml @@ -0,0 +1,50 @@ +# Dynamo-replica: agg-round-robin DeepSeek-V3.2 FP4 TRT-LLM deployment. 
+# +# Source (upstream Dynamo recipe): +# https://github.com/ai-dynamo/dynamo/tree/main/recipes/deepseek-v32-fp4/trtllm/agg-round-robin + +metadata: + model_name: deepseek_v32_fp4 + supported_gpus: + - GB200 +hardware: + gpus_per_node: 4 +server_configs: + - name: "dsv32_fp4_dep8_trtllm_lpc_mnnvl_8k1k" + model_name: "deepseek_v32_fp4" + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + max_batch_size: 8 + max_num_tokens: 8192 + max_seq_len: 121000 + attn_backend: "TRTLLM" + enable_attention_dp: true + enable_chunked_prefill: true + disable_overlap_scheduler: true + allreduce_strategy: MNNVL + num_postprocess_workers: 8 + print_iter_log: true + stream_interval: 10 + moe_config: + backend: 'TRTLLM' + use_low_precision_moe_combine: true + cuda_graph_config: + enable_padding: true + max_batch_size: 8 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + tokens_per_block: 64 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 120000 + client_configs: + - name: "con4_iter10_8k1k" + concurrency: 4 + iterations: 10 + isl: 8192 + osl: 1024 + backend: "openai" + dataset_file: diff --git a/tests/scripts/perf-sanity/aggregated/dynamo_gpt_oss_120b_fp4_blackwell.yaml b/tests/scripts/perf-sanity/aggregated/dynamo_gpt_oss_120b_fp4_blackwell.yaml new file mode 100644 index 000000000000..f1d2b6c272cf --- /dev/null +++ b/tests/scripts/perf-sanity/aggregated/dynamo_gpt_oss_120b_fp4_blackwell.yaml @@ -0,0 +1,67 @@ +# Dynamo-replica: agg GPT-OSS-120B FP4 TRT-LLM deployment. 
+# +# Source (upstream Dynamo recipe): +# https://github.com/ai-dynamo/dynamo/tree/main/recipes/gpt-oss-120b/trtllm/agg + +metadata: + model_name: gpt_oss_120b_fp4 + supported_gpus: + - GB200 +hardware: + gpus_per_node: 4 +server_configs: + - name: "gpt_oss_fp4_tep4_adp_cutlass_1k1k" + model_name: "gpt_oss_120b_fp4" + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + max_batch_size: 800 + max_num_tokens: 20000 + attn_backend: "TRTLLM" + enable_attention_dp: true + stream_interval: 20 + moe_config: + backend: 'CUTLASS' + cuda_graph_config: + enable_padding: true + max_batch_size: 800 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + client_configs: + - name: "con128_iter10_1k1k" + concurrency: 128 + iterations: 10 + isl: 1024 + osl: 1024 + backend: "openai" + dataset_file: + + - name: "gpt_oss_fp4_tep4_adp_cutlass_8k1k" + model_name: "gpt_oss_120b_fp4" + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + max_batch_size: 800 + max_num_tokens: 20000 + attn_backend: "TRTLLM" + enable_attention_dp: true + stream_interval: 20 + moe_config: + backend: 'CUTLASS' + cuda_graph_config: + enable_padding: true + max_batch_size: 800 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + client_configs: + - name: "con128_iter10_8k1k" + concurrency: 128 + iterations: 10 + isl: 8192 + osl: 1024 + backend: "openai" + dataset_file: diff --git a/tests/scripts/perf-sanity/aggregated/dynamo_k25_thinking_fp4_blackwell.yaml b/tests/scripts/perf-sanity/aggregated/dynamo_k25_thinking_fp4_blackwell.yaml new file mode 100644 index 000000000000..24216305a8cb --- /dev/null +++ b/tests/scripts/perf-sanity/aggregated/dynamo_k25_thinking_fp4_blackwell.yaml @@ -0,0 +1,39 @@ +# Dynamo-replica: agg Kimi K2.5 FP4 TRT-LLM deployment (NVIDIA recipe variant). 
+# +# Source (upstream Dynamo recipe): +# https://github.com/ai-dynamo/dynamo/tree/main/recipes/kimi-k2.5/trtllm/agg/nvidia + +metadata: + model_name: k25_thinking_fp4 + supported_gpus: + - B200 +hardware: + gpus_per_node: 8 +server_configs: + - name: "k25_thinking_fp4_tep8_adp_2k1k" + model_name: "k25_thinking_fp4" + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + max_batch_size: 128 + max_num_tokens: 8448 + max_seq_len: 8212 + attn_backend: "TRTLLM" + enable_attention_dp: true + trust_remote_code: true + print_iter_log: true + kv_cache_config: + dtype: 'fp8' + free_gpu_memory_fraction: 0.75 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8448 + client_configs: + - name: "con128_iter10_2k1k" + concurrency: 128 + iterations: 10 + isl: 2048 + osl: 1024 + backend: "openai" + trust_remote_code: true + dataset_file: diff --git a/tests/scripts/perf-sanity/aggregated/dynamo_qwen3_235b_a22b_fp8_hopper.yaml b/tests/scripts/perf-sanity/aggregated/dynamo_qwen3_235b_a22b_fp8_hopper.yaml new file mode 100644 index 000000000000..e3c4e9ffb969 --- /dev/null +++ b/tests/scripts/perf-sanity/aggregated/dynamo_qwen3_235b_a22b_fp8_hopper.yaml @@ -0,0 +1,48 @@ +# Dynamo-replica: agg Qwen3-235B-A22B FP8 TRT-LLM deployment. 
+# +# Source (upstream Dynamo recipe): +# https://github.com/ai-dynamo/dynamo/tree/main/recipes/qwen3-235b-a22b-fp8/trtllm/agg + +metadata: + model_name: qwen3_235b_a22b_fp8 + supported_gpus: + - H200 +hardware: + gpus_per_node: 8 +server_configs: + - name: "qwen3_235b_a22b_fp8_tp4_ep4_deepgemm_8k1k" + model_name: "qwen3_235b_a22b_fp8" + trust_remote_code: true + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + moe_tensor_parallel_size: 1 + pipeline_parallel_size: 1 + max_batch_size: 128 + max_num_tokens: 8192 + max_seq_len: 8192 + attn_backend: "TRTLLM" + enable_attention_dp: false + enable_chunked_prefill: true + disable_overlap_scheduler: false + print_iter_log: false + moe_config: + backend: 'DEEPGEMM' + cuda_graph_config: + enable_padding: true + max_batch_size: 128 + kv_cache_config: + dtype: 'auto' + enable_block_reuse: true + free_gpu_memory_fraction: 0.8 + cache_transceiver_config: + backend: DEFAULT + client_configs: + # ISL+OSL must stay within recipe's max_seq_len=8192, so use 7k/1k. + - name: "con128_iter10_7k1k" + concurrency: 128 + iterations: 10 + isl: 7168 + osl: 1024 + backend: "openai" + trust_remote_code: true + dataset_file: diff --git a/tests/scripts/perf-sanity/aggregated/dynamo_qwen3_32b_fp8_hopper.yaml b/tests/scripts/perf-sanity/aggregated/dynamo_qwen3_32b_fp8_hopper.yaml new file mode 100644 index 000000000000..dfdc056776e4 --- /dev/null +++ b/tests/scripts/perf-sanity/aggregated/dynamo_qwen3_32b_fp8_hopper.yaml @@ -0,0 +1,48 @@ +# Dynamo-replica: agg Qwen3-32B FP8 TRT-LLM deployment. 
+# +# Source (upstream Dynamo recipe): +# https://github.com/ai-dynamo/dynamo/tree/main/recipes/qwen3-32b-fp8/trtllm/agg + +metadata: + model_name: qwen3_32b_fp8 + supported_gpus: + - H200 +hardware: + gpus_per_node: 8 +server_configs: + - name: "qwen3_32b_fp8_tp2_6k1k" + model_name: "qwen3_32b_fp8" + tensor_parallel_size: 2 + pipeline_parallel_size: 1 + max_batch_size: 96 + max_num_tokens: 7964 + max_seq_len: 7964 + attn_backend: "TRTLLM" + enable_attention_dp: false + enable_chunked_prefill: false + disable_overlap_scheduler: false + print_iter_log: false + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 96 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + client_configs: + # ISL+OSL must stay within recipe's max_seq_len=7964, so use ~6.7k/1k. + - name: "con96_iter10_6k1k" + concurrency: 96 + iterations: 10 + isl: 6912 + osl: 1024 + backend: "openai" + dataset_file: diff --git a/tests/scripts/perf-sanity/disaggregated/h200_nemotron-super-fp8_8k1k_con64_ctx1_tp2_gen1_tp2_eplb0_mtp0_ccb-UCX.yaml b/tests/scripts/perf-sanity/disaggregated/h200_nemotron-super-fp8_8k1k_con64_ctx1_tp2_gen1_tp2_eplb0_mtp0_ccb-UCX.yaml new file mode 100644 index 000000000000..2499fb4cf7f7 --- /dev/null +++ b/tests/scripts/perf-sanity/disaggregated/h200_nemotron-super-fp8_8k1k_con64_ctx1_tp2_gen1_tp2_eplb0_mtp0_ccb-UCX.yaml @@ -0,0 +1,91 @@ +metadata: + model_name: super_fp8 + precision: fp8 + model_dir_name: NVIDIA-Nemotron-3-Super-120B-A12B-FP8 + supported_gpus: + - H200 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k + # Native-target (Hopper) mirror of the Dynamo Nemotron-3-Super-FP8 TRT-LLM + # disagg deployment recipe: + # https://github.com/ai-dynamo/dynamo/tree/main/recipes/nemotron-3-super-fp8/trtllm/disagg +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: "--gres=gpu:8" + numa_bind: true 
+benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '64' + input_length: 8192 + output_length: 1024 + dataset_file: datasets/perf-ci/nemotron_super-8k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 8 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + print_iter_log: true + tensor_parallel_size: 2 + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + enable_chunked_prefill: true + max_batch_size: 16 + max_num_tokens: 8192 + trust_remote_code: true + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRTLLM + cache_transceiver_config: + backend: UCX + disable_overlap_scheduler: false + ctx: + print_iter_log: true + tensor_parallel_size: 2 + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + enable_chunked_prefill: true + max_batch_size: 16 + max_num_tokens: 8192 + trust_remote_code: true + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRTLLM + cache_transceiver_config: + backend: UCX + disable_overlap_scheduler: true diff --git a/tests/scripts/perf-sanity/disaggregated/h200_qwen3-235b-a22b-fp8_8k1k_con512_ctx1_tp2_gen1_tep4_eplb0_mtp0_ccb-DEFAULT.yaml 
b/tests/scripts/perf-sanity/disaggregated/h200_qwen3-235b-a22b-fp8_8k1k_con512_ctx1_tp2_gen1_tep4_eplb0_mtp0_ccb-DEFAULT.yaml new file mode 100644 index 000000000000..0bf12a3fe300 --- /dev/null +++ b/tests/scripts/perf-sanity/disaggregated/h200_qwen3-235b-a22b-fp8_8k1k_con512_ctx1_tp2_gen1_tep4_eplb0_mtp0_ccb-DEFAULT.yaml @@ -0,0 +1,97 @@ +metadata: + model_name: qwen3_235b_a22b_fp8 + precision: fp8 + model_dir_name: Qwen3-235B-A22B-FP8 + supported_gpus: + - H200 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k + # Native-target (Hopper) mirror of the Dynamo Qwen3-235B-A22B-FP8 TRT-LLM + # disagg deployment recipe: + # https://github.com/ai-dynamo/dynamo/tree/main/recipes/qwen3-235b-a22b-fp8/trtllm/disagg +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: "--gres=gpu:8" + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '512' + input_length: 8192 + output_length: 1024 + dataset_file: datasets/perf-ci/qwen3_235b-8k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 8 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + print_iter_log: true + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + moe_tensor_parallel_size: 1 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + enable_chunked_prefill: false + trust_remote_code: true + max_batch_size: 512 + max_num_tokens: 1024 + max_seq_len: 8192 + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + 
kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.95 + dtype: fp8 + moe_config: + backend: DEEPGEMM + cache_transceiver_config: + backend: DEFAULT + disable_overlap_scheduler: false + ctx: + print_iter_log: true + tensor_parallel_size: 2 + moe_expert_parallel_size: 1 + moe_tensor_parallel_size: 2 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + enable_chunked_prefill: false + trust_remote_code: true + max_batch_size: 2 + max_num_tokens: 8192 + max_seq_len: 8192 + cuda_graph_config: + enable_padding: true + max_batch_size: 2 + kv_cache_config: + enable_block_reuse: true + free_gpu_memory_fraction: 0.7 + dtype: fp8 + moe_config: + backend: DEEPGEMM + cache_transceiver_config: + backend: DEFAULT + disable_overlap_scheduler: true diff --git a/tests/scripts/perf-sanity/disaggregated/h200_qwen3-32b-fp8_4k1k_con128_ctx1_tp1_gen1_tp2_eplb0_mtp0_ccb-DEFAULT.yaml b/tests/scripts/perf-sanity/disaggregated/h200_qwen3-32b-fp8_4k1k_con128_ctx1_tp1_gen1_tp2_eplb0_mtp0_ccb-DEFAULT.yaml new file mode 100644 index 000000000000..e7eb2ce64804 --- /dev/null +++ b/tests/scripts/perf-sanity/disaggregated/h200_qwen3-32b-fp8_4k1k_con128_ctx1_tp1_gen1_tp2_eplb0_mtp0_ccb-DEFAULT.yaml @@ -0,0 +1,89 @@ +metadata: + model_name: qwen3_32b_fp8 + precision: fp8 + model_dir_name: Qwen3-32B-FP8 + supported_gpus: + - H200 + script_file: disaggr_torch.slurm + benchmark_type: 4k1k + # Native-target (Hopper) mirror of the Dynamo Qwen3-32B-FP8 TRT-LLM disagg + # deployment recipe: + # https://github.com/ai-dynamo/dynamo/tree/main/recipes/qwen3-32b-fp8/trtllm/disagg +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: "--gres=gpu:8" + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '128' + input_length: 4096 + output_length: 1024 + dataset_file: 
datasets/perf-ci/qwen3_32b-4k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 8 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes TRT_LLM_DISABLE_LOAD_WEIGHTS_IN_PARALLEL=True" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + print_iter_log: true + tensor_parallel_size: 2 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + enable_chunked_prefill: false + trust_remote_code: true + max_batch_size: 128 + max_num_tokens: 7808 + max_seq_len: 7808 + cuda_graph_config: + enable_padding: true + max_batch_size: 128 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + cache_transceiver_config: + backend: DEFAULT + disable_overlap_scheduler: false + ctx: + print_iter_log: true + tensor_parallel_size: 1 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + enable_chunked_prefill: false + trust_remote_code: true + max_batch_size: 1 + max_num_tokens: 7808 + max_seq_len: 7808 + cuda_graph_config: + enable_padding: true + max_batch_size: 256 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + cache_transceiver_config: + backend: DEFAULT + disable_overlap_scheduler: true