Commits (23)
43dd843  [None][chore] Update flashinfer-python from 0.6.6 to 0.6.8rc1 (yihwang-nv, Apr 15, 2026)
a30fc21  [None][chore] Update Docker to upgrade nvidia-cutlass-dsl to 4.4.2 (yihwang-nv, Apr 15, 2026)
f3b96ee  [None][chore] Update Docker to upgrade nvidia-cutlass-dsl to 4.4.2 (yihwang-nv, Apr 15, 2026)
3ba87d1  [None][chore] Update CI image tags to PR-13064 staging images (yihwang-nv, Apr 15, 2026)
fcfbc7e  [None][fix] Remove stale nvidia-cutlass-dsl before pip install in CI (yihwang-nv, Apr 15, 2026)
8243eb0  Revert "[None][chore] Update CI image tags to PR-13064 staging images" (yihwang-nv, Apr 15, 2026)
4ae0955  [None][fix] Clean stale nvidia-cutlass-dsl before pip install in CI (yihwang-nv, Apr 15, 2026)
2dce42f  [None][fix] Guard cutlass-dsl imports and clean site-packages in Chec… (yihwang-nv, Apr 16, 2026)
4d742b0  Merge branch 'main' into yihwang-nv/update_flashinfer_0.6.8rc1 (yihwang-nv, Apr 16, 2026)
9c52d89  [None][chore] Regenerate security_scanning/poetry.lock content-hash (yihwang-nv, Apr 16, 2026)
7311dda  [None][fix] Clean stale packages in SLURM install and update CI images (yihwang-nv, Apr 16, 2026)
64dd416  [None][chore] Remove clean_site_packages.py and all references (yihwang-nv, Apr 16, 2026)
272acf8  [None][chore] Update flashinfer-python from 0.6.8rc1 to 0.6.8 (yihwang-nv, Apr 16, 2026)
d3dfb17  Merge remote-tracking branch 'origin/main' into yihwang-nv/update_fla… (yihwang-nv, Apr 16, 2026)
4d9cc08  Merge branch 'main' into yihwang-nv/update_flashinfer_0.6.8rc1 (yihwang-nv, Apr 16, 2026)
a91c891  [None][fix] Remove stale PersistentTileSchedulerParams hooks for cutl… (yihwang-nv, Apr 17, 2026)
13f8620  [None][fix] Route custom pipelines through sm100 _make_sync_object fo… (yihwang-nv, Apr 18, 2026)
95ec8e9  [None][fix] Accept loc/ip kwargs in custom pipeline overrides for cut… (yihwang-nv, Apr 18, 2026)
82b22a8  [None][chore] Revert security_scanning/poetry.lock to origin/main (yihwang-nv, Apr 18, 2026)
8fc097d  Merge branch 'main' into yihwang-nv/update_flashinfer_0.6.8rc1 (yihwang-nv, Apr 20, 2026)
31da856  [None][chore] Update CI image tags to PR-13064 build 23 staging images (yihwang-nv, Apr 20, 2026)
ae2a461  [None][chore] Update CI docker image tags to flashinfer-0.6.8 build (yihwang-nv, Apr 21, 2026)
914b315  Merge branch 'main' into yihwang-nv/update_flashinfer_0.6.8rc1 (yihwang-nv, Apr 21, 2026)
4 changes: 2 additions & 2 deletions ATTRIBUTIONS-Python.md
@@ -5261,7 +5261,7 @@ For more information, please refer to <http://unlicense.org>
 - `Tracker`: https://github.com/tox-dev/py-filelock/issues
 
 
-## flashinfer-python (0.6.6)
+## flashinfer-python (0.6.8)
 
 ### Licenses
 License: `Apache-2.0`
@@ -33239,7 +33239,7 @@ License: `NVIDIA Proprietary Software`
 - `Homepage`: https://developer.nvidia.com/cusparselt
 
 
-## nvidia-cutlass-dsl (4.2.1)
+## nvidia-cutlass-dsl (4.4.2)
 
 ### Licenses
 License: `None`
2 changes: 2 additions & 0 deletions constraints.txt
@@ -6,3 +6,5 @@ wheel>=0.46.2
 tornado>=6.5.5
 # WAR against https://github.com/advisories/GHSA-3936-cmfr-pm3m
 black>=26.3.1
+# Upgrade base image nvidia-cutlass-dsl 4.3.5 to 4.4.2
+nvidia-cutlass-dsl>=4.4.2
6 changes: 4 additions & 2 deletions docker/Dockerfile.multi
@@ -52,8 +52,10 @@ RUN --mount=type=bind,source=docker/common,target=/opt/docker/common \
 # Install constraints after install.sh so cleanup() doesn't delete the file mid-RUN
 COPY constraints.txt /tmp/constraints.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
-    # WAR: uninstall dependencies that has vulnerability
-    pip3 uninstall -y tornado black nbconvert || true && \
+    # WAR: uninstall dependencies that have vulnerabilities or need upgrading
+    pip3 uninstall -y tornado black nbconvert nvidia-cutlass-dsl nvidia-cutlass-dsl-libs-base || true && \
+    # Remove any leftover namespace dirs or dist-info that pip missed
+    rm -rf $(python3 -c "import site; print(site.getsitepackages()[0])")/nvidia_cutlass_dsl* && \
     pip3 install --ignore-installed --no-cache-dir -r /tmp/constraints.txt && \
     rm /tmp/constraints.txt
 
4 changes: 4 additions & 0 deletions jenkins/L0_Test.groovy
@@ -3620,6 +3620,10 @@ def launchTestJobs(pipeline, testFilter)
     trtllm_utils.llmExecStepWithRetry(pipeline, script: "[ -f /etc/pip/constraint.txt ] && : > /etc/pip/constraint.txt || true")
     // Remove the python3-pygments pip package because the dlfw image already includes a Debian pygments package, which conflicts with the pip-installed version.
     trtllm_utils.llmExecStepWithRetry(pipeline, script: "apt-get remove -y python3-pygments")
+    // Remove stale nvidia-cutlass-dsl from the base image to prevent namespace
+    // directory corruption when pip upgrades to the version required by tensorrt_llm.
+    trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 uninstall -y nvidia-cutlass-dsl nvidia-cutlass-dsl-libs-base || true")
+    trtllm_utils.llmExecStepWithRetry(pipeline, script: 'rm -rf $(python3 -c "import site; print(site.getsitepackages()[0])")/nvidia_cutlass_dsl*')
 }
 trtllm_utils.llmExecStepWithRetry(pipeline, script: "apt-get update && apt-get install -y python3-pip git rsync curl wget")
 trtllm_utils.checkoutSource(LLM_REPO, env.gitlabCommit, LLM_ROOT, false, true)
8 changes: 4 additions & 4 deletions jenkins/current_image_tags.properties
@@ -13,7 +13,7 @@
 # images are adopted from PostMerge pipelines, the abbreviated commit hash is used instead.
 IMAGE_NAME=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm
 
-LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-26.02-py3-x86_64-ubuntu24.04-trt10.15.1.29-skip-tritondevel-202604011104-12600
-LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-26.02-py3-sbsa-ubuntu24.04-trt10.15.1.29-skip-tritondevel-202604011104-12600
-LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.1.1-devel-rocky8-x86_64-rocky8-py310-trt10.15.1.29-skip-tritondevel-202604011104-12600
-LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.1.1-devel-rocky8-x86_64-rocky8-py312-trt10.15.1.29-skip-tritondevel-202604011104-12600
+LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-26.02-py3-x86_64-ubuntu24.04-trt10.15.1.29-skip-tritondevel-202604200956-13064
+LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-26.02-py3-sbsa-ubuntu24.04-trt10.15.1.29-skip-tritondevel-202604200956-13064
+LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.1.0-devel-rocky8-x86_64-rocky8-py310-trt10.15.1.29-skip-tritondevel-202604200956-13064
+LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.1.0-devel-rocky8-x86_64-rocky8-py312-trt10.15.1.29-skip-tritondevel-202604200956-13064
4 changes: 2 additions & 2 deletions requirements.txt
@@ -54,7 +54,7 @@ ordered-set
 peft>=0.18.1,<0.19.0
 patchelf
 einops
-flashinfer-python==0.6.6
+flashinfer-python==0.6.8
 opencv-python-headless
 xgrammar==0.1.32
 llguidance==0.7.29
@@ -71,7 +71,7 @@ xdsl>=0.59.0 # Optional: required for MLIR-based elementwise fusion in AutoDeplo
 tiktoken
 blobfile
 openai-harmony==0.0.4
-nvidia-cutlass-dsl==4.3.4; python_version >= "3.10"
+nvidia-cutlass-dsl==4.4.2; python_version >= "3.10"
 plotly
 numexpr
 partial_json_parser
4 changes: 2 additions & 2 deletions security_scanning/pyproject.toml
@@ -55,7 +55,7 @@ dependencies = [
     "peft (>=0.18.1,<0.19.0)",
     "patchelf (>=0.17.2.4,<0.18.0.0)",
     "einops (>=0.8.2,<0.9.0)",
-    "flashinfer-python (==0.6.6)",
+    "flashinfer-python (==0.6.8)",
     "opencv-python-headless (>=4.13.0.92,<5.0.0.0)",
     "xgrammar (==0.1.32)",
     "llguidance (==0.7.29)",
@@ -72,7 +72,7 @@ dependencies = [
     "tiktoken (>=0.12.0,<0.13.0)",
     "blobfile (>=3.2.0,<4.0.0)",
    "openai-harmony (==0.0.4)",
-    "nvidia-cutlass-dsl (==4.3.4)",
+    "nvidia-cutlass-dsl (==4.4.2)",
     "plotly (>=6.7.0,<7.0.0)",
     "numexpr (>=2.14.1,<3.0.0)",
     "partial-json-parser (>=0.2.1.1.post7,<0.3.0.0)",
@@ -37,7 +37,6 @@
 import cutlass.utils.blockscaled_layout as blockscaled_utils
 from cutlass._mlir.dialects import math
 from cutlass.cute.nvgpu import cpasync, tcgen05
-from cutlass.cutlass_dsl import Int32
 
 from .custom_pipeline import PipelineCpAsyncUmma
 from .utils import (
@@ -155,144 +154,6 @@
"""


# TODO: Remove this hook helper function after nvidia-cutlass-dsl 4.4 is released.
def hooked_PersistentTileSchedulerParams_init(
self,
problem_shape_ntile_mnl: cute.Shape,
cluster_shape_mnk: cute.Shape,
swizzle_size: int = 1,
raster_along_m: bool = True,
*,
loc=None,
ip=None,
):
if cluster_shape_mnk[2] != 1:
raise ValueError(f"unsupported cluster_shape_k {cluster_shape_mnk[2]}")
if swizzle_size < 1:
raise ValueError(f"expect swizzle_size >= 1, but get {swizzle_size}")

self.problem_shape_ntile_mnl = problem_shape_ntile_mnl
# cluster_shape_mnk is kept for reconstruction
self._cluster_shape_mnk = cluster_shape_mnk
self.cluster_shape_mn = cluster_shape_mnk[:2]
self.swizzle_size = swizzle_size
self._raster_along_m = raster_along_m
self._loc = loc

# Apply swizzle if swizzle_size > 1
if swizzle_size > 1:
problem_shape_ncluster_mnl = cute.round_up(
self.problem_layout_ncluster_mnl.shape,
(1, swizzle_size, 1) if raster_along_m else (swizzle_size, 1, 1),
)

if raster_along_m:
self.problem_layout_ncluster_mnl = cute.make_layout(
(
problem_shape_ncluster_mnl[0],
(swizzle_size, problem_shape_ncluster_mnl[1] // swizzle_size),
problem_shape_ncluster_mnl[2],
),
stride=(
swizzle_size,
(1, swizzle_size * problem_shape_ncluster_mnl[0]),
problem_shape_ncluster_mnl[0] * problem_shape_ncluster_mnl[1],
),
loc=loc,
ip=ip,
)
else:
self.problem_layout_ncluster_mnl = cute.make_layout(
(
(swizzle_size, problem_shape_ncluster_mnl[0] // swizzle_size),
problem_shape_ncluster_mnl[1],
problem_shape_ncluster_mnl[2],
),
stride=(
(1, swizzle_size * problem_shape_ncluster_mnl[1]),
swizzle_size,
problem_shape_ncluster_mnl[0] * problem_shape_ncluster_mnl[1],
),
loc=loc,
ip=ip,
)

# Create FastDivmod divisors (only when swizzle_size == 1 for correctness)
# FastDivmod assumes simple col-major/row-major layout, incompatible with swizzled layouts
if swizzle_size == 1:
problem_shape_ncluster_mnl = cute.ceil_div(
self.problem_shape_ntile_mnl, cluster_shape_mnk[:2], loc=loc, ip=ip
)
if raster_along_m:
self.problem_layout_ncluster_mnl = cute.make_layout(
problem_shape_ncluster_mnl,
stride=(
1,
problem_shape_ncluster_mnl[0],
problem_shape_ncluster_mnl[0] * problem_shape_ncluster_mnl[1],
),
loc=loc,
ip=ip,
)
else:
self.problem_layout_ncluster_mnl = cute.make_layout(
problem_shape_ncluster_mnl,
stride=(
problem_shape_ncluster_mnl[1],
1,
problem_shape_ncluster_mnl[0] * problem_shape_ncluster_mnl[1],
),
loc=loc,
ip=ip,
)
problem_layout_size = cute.size(self.problem_layout_ncluster_mnl, loc=loc, ip=ip)
cluster_count_m = self.problem_layout_ncluster_mnl.shape[0]
cluster_count_n = self.problem_layout_ncluster_mnl.shape[1]

# batch_fdd: Used to map linear_idx to work_unit_id (handles persistent scheduling)
self.batch_fdd = cute.fast_divmod_create_divisor(problem_layout_size, loc=loc, ip=ip)

# cluster_shape_m_fdd: Used to decode work_unit_id to cluster coordinates
self.cluster_shape_m_fdd = cute.fast_divmod_create_divisor(cluster_count_m, loc=loc, ip=ip)

# cluster_shape_n_fdd: Used for the second level decomposition
self.cluster_shape_n_fdd = cute.fast_divmod_create_divisor(cluster_count_n, loc=loc, ip=ip)
else:
# FastDivmod not applicable with swizzling, set to None
self.batch_fdd = None
self.cluster_shape_m_fdd = None
self.cluster_shape_n_fdd = None


def hooked_get_cluster_work_idx_with_fastdivmod(
self, current_work_linear_idx: Int32, *, loc=None, ip=None
) -> Tuple[Int32, Int32, Int32]:
work_iteration, work_unit_id = divmod(current_work_linear_idx, self.params.batch_fdd)

if self.params._raster_along_m:
# raster_along_m=True means column major (m is fastest)
# First, get cluster_m using cluster_shape_m_fdd
cluster_n_batch, cluster_m = divmod(work_unit_id, self.params.cluster_shape_m_fdd)

# Then decode cluster_n_batch to get cluster_n and batch_l using FastDivmod
batch_l, cluster_n = divmod(cluster_n_batch, self.params.cluster_shape_n_fdd)
else:
# raster_along_m=False means row major (n is fastest)
# First, get cluster_n using cluster_shape_n_fdd
cluster_m_batch, cluster_n = divmod(work_unit_id, self.params.cluster_shape_n_fdd)

# Then decode cluster_m_batch to get cluster_m and batch_l using FastDivmod
batch_l, cluster_m = divmod(cluster_m_batch, self.params.cluster_shape_m_fdd)

return (cluster_m, cluster_n, batch_l)


cutlass.utils.PersistentTileSchedulerParams.__init__ = hooked_PersistentTileSchedulerParams_init
cutlass.utils.StaticPersistentTileScheduler._get_cluster_work_idx_with_fastdivmod = (
hooked_get_cluster_work_idx_with_fastdivmod
)


class BlockScaledContiguousGatherGroupedGemmKernel:
"""This class implements contiguous grouped matrix multiplication with gather operation and SwiGLU fusion
for FC1 layer computation (C = up * silu(gate), where up/gate come from interleaved GEMM result).
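The block deleted above was an import-time monkey patch: the module replaced `cutlass.utils.PersistentTileSchedulerParams.__init__` and `StaticPersistentTileScheduler._get_cluster_work_idx_with_fastdivmod` with local copies that back-ported FastDivmod-based scheduling, and the TODO marked the hooks for removal once nvidia-cutlass-dsl 4.4 shipped. The pattern itself, reduced to a placeholder class (nothing below is the real cutlass API):

```python
class SchedulerParams:
    """Placeholder for a third-party class missing a feature."""

    def __init__(self, shape):
        self.shape = shape

# Keep a reference to the library's initializer so the hook can delegate to it.
_original_init = SchedulerParams.__init__

def hooked_init(self, shape):
    # Run the library's own initializer first, then bolt on the
    # attribute a newer library version would have provided, which is
    # how the removed hooks back-ported 4.4 behavior onto older releases.
    _original_init(self, shape)
    self.fast_divisor = max(1, shape[0])

# Import-time hook: every later instantiation picks up the patch.
SchedulerParams.__init__ = hooked_init

p = SchedulerParams((4, 3))
```

The cost of this pattern is exactly what this PR pays down: once the upstream library ships the feature, the hook silently shadows (and can conflict with) the real implementation, so it has to be deleted in lockstep with the version bump.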
@@ -60,143 +60,6 @@
)


def hooked_PersistentTileSchedulerParams_init(
self,
problem_shape_ntile_mnl: cute.Shape,
cluster_shape_mnk: cute.Shape,
swizzle_size: int = 1,
raster_along_m: bool = True,
*,
loc=None,
ip=None,
):
if cluster_shape_mnk[2] != 1:
raise ValueError(f"unsupported cluster_shape_k {cluster_shape_mnk[2]}")
if swizzle_size < 1:
raise ValueError(f"expect swizzle_size >= 1, but get {swizzle_size}")

self.problem_shape_ntile_mnl = problem_shape_ntile_mnl
# cluster_shape_mnk is kept for reconstruction
self._cluster_shape_mnk = cluster_shape_mnk
self.cluster_shape_mn = cluster_shape_mnk[:2]
self.swizzle_size = swizzle_size
self._raster_along_m = raster_along_m
self._loc = loc

# Apply swizzle if swizzle_size > 1
if swizzle_size > 1:
problem_shape_ncluster_mnl = cute.round_up(
self.problem_layout_ncluster_mnl.shape,
(1, swizzle_size, 1) if raster_along_m else (swizzle_size, 1, 1),
)

if raster_along_m:
self.problem_layout_ncluster_mnl = cute.make_layout(
(
problem_shape_ncluster_mnl[0],
(swizzle_size, problem_shape_ncluster_mnl[1] // swizzle_size),
problem_shape_ncluster_mnl[2],
),
stride=(
swizzle_size,
(1, swizzle_size * problem_shape_ncluster_mnl[0]),
problem_shape_ncluster_mnl[0] * problem_shape_ncluster_mnl[1],
),
loc=loc,
ip=ip,
)
else:
self.problem_layout_ncluster_mnl = cute.make_layout(
(
(swizzle_size, problem_shape_ncluster_mnl[0] // swizzle_size),
problem_shape_ncluster_mnl[1],
problem_shape_ncluster_mnl[2],
),
stride=(
(1, swizzle_size * problem_shape_ncluster_mnl[1]),
swizzle_size,
problem_shape_ncluster_mnl[0] * problem_shape_ncluster_mnl[1],
),
loc=loc,
ip=ip,
)

# Create FastDivmod divisors (only when swizzle_size == 1 for correctness)
# FastDivmod assumes simple col-major/row-major layout, incompatible with swizzled layouts
if swizzle_size == 1:
problem_shape_ncluster_mnl = cute.ceil_div(
self.problem_shape_ntile_mnl, cluster_shape_mnk[:2], loc=loc, ip=ip
)
if raster_along_m:
self.problem_layout_ncluster_mnl = cute.make_layout(
problem_shape_ncluster_mnl,
stride=(
1,
problem_shape_ncluster_mnl[0],
problem_shape_ncluster_mnl[0] * problem_shape_ncluster_mnl[1],
),
loc=loc,
ip=ip,
)
else:
self.problem_layout_ncluster_mnl = cute.make_layout(
problem_shape_ncluster_mnl,
stride=(
problem_shape_ncluster_mnl[1],
1,
problem_shape_ncluster_mnl[0] * problem_shape_ncluster_mnl[1],
),
loc=loc,
ip=ip,
)
problem_layout_size = cute.size(self.problem_layout_ncluster_mnl, loc=loc, ip=ip)
cluster_count_m = self.problem_layout_ncluster_mnl.shape[0]
cluster_count_n = self.problem_layout_ncluster_mnl.shape[1]

# batch_fdd: Used to map linear_idx to work_unit_id (handles persistent scheduling)
self.batch_fdd = cute.fast_divmod_create_divisor(problem_layout_size, loc=loc, ip=ip)

# cluster_shape_m_fdd: Used to decode work_unit_id to cluster coordinates
self.cluster_shape_m_fdd = cute.fast_divmod_create_divisor(cluster_count_m, loc=loc, ip=ip)

# cluster_shape_n_fdd: Used for the second level decomposition
self.cluster_shape_n_fdd = cute.fast_divmod_create_divisor(cluster_count_n, loc=loc, ip=ip)
else:
# FastDivmod not applicable with swizzling, set to None
self.batch_fdd = None
self.cluster_shape_m_fdd = None
self.cluster_shape_n_fdd = None


def hooked_get_cluster_work_idx_with_fastdivmod(
self, current_work_linear_idx: cutlass.Int32, *, loc=None, ip=None
) -> Tuple[cutlass.Int32, cutlass.Int32, cutlass.Int32]:
work_iteration, work_unit_id = divmod(current_work_linear_idx, self.params.batch_fdd)

if self.params._raster_along_m:
# raster_along_m=True means column major (m is fastest)
# First, get cluster_m using cluster_shape_m_fdd
cluster_n_batch, cluster_m = divmod(work_unit_id, self.params.cluster_shape_m_fdd)

# Then decode cluster_n_batch to get cluster_n and batch_l using FastDivmod
batch_l, cluster_n = divmod(cluster_n_batch, self.params.cluster_shape_n_fdd)
else:
# raster_along_m=False means row major (n is fastest)
# First, get cluster_n using cluster_shape_n_fdd
cluster_m_batch, cluster_n = divmod(work_unit_id, self.params.cluster_shape_n_fdd)

# Then decode cluster_m_batch to get cluster_m and batch_l using FastDivmod
batch_l, cluster_m = divmod(cluster_m_batch, self.params.cluster_shape_m_fdd)

return (cluster_m, cluster_n, batch_l)


cutlass.utils.PersistentTileSchedulerParams.__init__ = hooked_PersistentTileSchedulerParams_init
cutlass.utils.StaticPersistentTileScheduler._get_cluster_work_idx_with_fastdivmod = (
hooked_get_cluster_work_idx_with_fastdivmod
)


class Sm100BlockScaledContiguousGroupedGemmKernel:
"""This class implements batched matrix multiplication (C = A x SFA x B x SFB) with support for various data types
and architectural features specific to Blackwell GPUs with persistent tile scheduling and warp specialization.
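The deleted `hooked_get_cluster_work_idx_with_fastdivmod` decodes a persistent scheduler's linear work index into `(cluster_m, cluster_n, batch_l)` with two chained divmods, where the fastest-varying coordinate depends on the rasterization direction. The same arithmetic restated with plain Python ints (the function name and parameters are illustrative; the real code uses precomputed FastDivmod divisors, and it discards the pass index that is returned here for clarity):

```python
def decode_work_idx(linear_idx, count_m, count_n, count_l, raster_along_m=True):
    # One "pass" covers every cluster tile of every batch; a persistent
    # scheduler keeps incrementing linear_idx across passes.
    units_per_pass = count_m * count_n * count_l
    work_iteration, work_unit_id = divmod(linear_idx, units_per_pass)
    if raster_along_m:
        # Column-major rasterization: m varies fastest.
        cluster_n_batch, cluster_m = divmod(work_unit_id, count_m)
        batch_l, cluster_n = divmod(cluster_n_batch, count_n)
    else:
        # Row-major rasterization: n varies fastest.
        cluster_m_batch, cluster_n = divmod(work_unit_id, count_n)
        batch_l, cluster_m = divmod(cluster_m_batch, count_m)
    return cluster_m, cluster_n, batch_l, work_iteration
```

For example, with `count_m=4, count_n=3, count_l=2` and m-fastest rasterization, index 18 decomposes as `18 = 1*(4*3) + 1*4 + 2`, giving cluster `(2, 1)` in batch 1; adding one full pass (24) to the index changes only the iteration.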