diff --git a/ATTRIBUTIONS-Python.md b/ATTRIBUTIONS-Python.md index a5e5ab387733..e14ff9f79ce4 100644 --- a/ATTRIBUTIONS-Python.md +++ b/ATTRIBUTIONS-Python.md @@ -5261,7 +5261,7 @@ For more information, please refer to - `Tracker`: https://github.com/tox-dev/py-filelock/issues -## flashinfer-python (0.6.6) +## flashinfer-python (0.6.8) ### Licenses License: `Apache-2.0` @@ -33239,7 +33239,7 @@ License: `NVIDIA Proprietary Software` - `Homepage`: https://developer.nvidia.com/cusparselt -## nvidia-cutlass-dsl (4.2.1) +## nvidia-cutlass-dsl (4.4.2) ### Licenses License: `None` diff --git a/constraints.txt b/constraints.txt index 7dd0d6747765..6f4ae99d5d95 100644 --- a/constraints.txt +++ b/constraints.txt @@ -6,3 +6,5 @@ wheel>=0.46.2 tornado>=6.5.5 # WAR against https://github.com/advisories/GHSA-3936-cmfr-pm3m black>=26.3.1 +# Upgrade base image nvidia-cutlass-dsl 4.3.5 to 4.4.2 +nvidia-cutlass-dsl>=4.4.2 diff --git a/docker/Dockerfile.multi b/docker/Dockerfile.multi index 96b75cd96a43..3fbcc0b5659e 100644 --- a/docker/Dockerfile.multi +++ b/docker/Dockerfile.multi @@ -52,8 +52,10 @@ RUN --mount=type=bind,source=docker/common,target=/opt/docker/common \ # Install constraints after install.sh so cleanup() doesn't delete the file mid-RUN COPY constraints.txt /tmp/constraints.txt RUN --mount=type=cache,target=/root/.cache/pip \ - # WAR: uninstall dependencies that has vulnerability - pip3 uninstall -y tornado black nbconvert || true && \ + # WAR: uninstall dependencies that has vulnerability or need upgrading + pip3 uninstall -y tornado black nbconvert nvidia-cutlass-dsl nvidia-cutlass-dsl-libs-base || true && \ + # Remove any leftover namespace dirs or dist-info that pip missed + rm -rf $(python3 -c "import site; print(site.getsitepackages()[0])")/nvidia_cutlass_dsl* && \ pip3 install --ignore-installed --no-cache-dir -r /tmp/constraints.txt && \ rm /tmp/constraints.txt diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy index 43844da06d22..e5c353877b1f 100644 --- a/jenkins/L0_Test.groovy +++ b/jenkins/L0_Test.groovy @@ -3620,6 +3620,10 @@ def launchTestJobs(pipeline, testFilter) trtllm_utils.llmExecStepWithRetry(pipeline, script: "[ -f /etc/pip/constraint.txt ] && : > /etc/pip/constraint.txt || true") // Remove the python3-pygments pip package because the dlfw image already includes a Debian pygments package, which conflicts with the pip-installed version. trtllm_utils.llmExecStepWithRetry(pipeline, script: "apt-get remove -y python3-pygments") + // Remove stale nvidia-cutlass-dsl from the base image to prevent namespace + // directory corruption when pip upgrades to the version required by tensorrt_llm. + trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 uninstall -y nvidia-cutlass-dsl nvidia-cutlass-dsl-libs-base || true") + trtllm_utils.llmExecStepWithRetry(pipeline, script: 'rm -rf $(python3 -c "import site; print(site.getsitepackages()[0])")/nvidia_cutlass_dsl*') } trtllm_utils.llmExecStepWithRetry(pipeline, script: "apt-get update && apt-get install -y python3-pip git rsync curl wget") trtllm_utils.checkoutSource(LLM_REPO, env.gitlabCommit, LLM_ROOT, false, true) diff --git a/jenkins/current_image_tags.properties b/jenkins/current_image_tags.properties index 8d751664640b..d8d16c56ee5d 100644 --- a/jenkins/current_image_tags.properties +++ b/jenkins/current_image_tags.properties @@ -13,7 +13,7 @@ # images are adopted from PostMerge pipelines, the abbreviated commit hash is used instead. IMAGE_NAME=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm -LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-26.02-py3-x86_64-ubuntu24.04-trt10.15.1.29-skip-tritondevel-202604011104-12600 -LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-26.02-py3-sbsa-ubuntu24.04-trt10.15.1.29-skip-tritondevel-202604011104-12600 -LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.1.1-devel-rocky8-x86_64-rocky8-py310-trt10.15.1.29-skip-tritondevel-202604011104-12600 -LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.1.1-devel-rocky8-x86_64-rocky8-py312-trt10.15.1.29-skip-tritondevel-202604011104-12600 +LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-26.02-py3-x86_64-ubuntu24.04-trt10.15.1.29-skip-tritondevel-202604200956-13064 +LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-26.02-py3-sbsa-ubuntu24.04-trt10.15.1.29-skip-tritondevel-202604200956-13064 +LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.1.0-devel-rocky8-x86_64-rocky8-py310-trt10.15.1.29-skip-tritondevel-202604200956-13064 +LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.1.0-devel-rocky8-x86_64-rocky8-py312-trt10.15.1.29-skip-tritondevel-202604200956-13064 diff --git a/requirements.txt b/requirements.txt index eaa5d0082f31..385f5269d08a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -54,7 +54,7 @@ ordered-set peft>=0.18.1,<0.19.0 patchelf einops -flashinfer-python==0.6.6 +flashinfer-python==0.6.8 opencv-python-headless xgrammar==0.1.32 llguidance==0.7.29 @@ -71,7 +71,7 @@ xdsl>=0.59.0 # Optional: required for MLIR-based elementwise fusion in AutoDeplo tiktoken blobfile openai-harmony==0.0.4 -nvidia-cutlass-dsl==4.3.4; python_version >= "3.10" +nvidia-cutlass-dsl==4.4.2; python_version >= "3.10" plotly numexpr partial_json_parser diff --git a/security_scanning/pyproject.toml b/security_scanning/pyproject.toml index 54e70cca5869..042c236dfb53 100644 --- a/security_scanning/pyproject.toml +++ b/security_scanning/pyproject.toml @@ -55,7 +55,7 @@ dependencies = [ "peft (>=0.18.1,<0.19.0)", "patchelf (>=0.17.2.4,<0.18.0.0)", "einops (>=0.8.2,<0.9.0)", - "flashinfer-python (==0.6.6)", + "flashinfer-python (==0.6.8)", "opencv-python-headless (>=4.13.0.92,<5.0.0.0)", "xgrammar (==0.1.32)", "llguidance (==0.7.29)", @@ -72,7 +72,7 @@ dependencies = [ "tiktoken (>=0.12.0,<0.13.0)", "blobfile (>=3.2.0,<4.0.0)", "openai-harmony (==0.0.4)", - "nvidia-cutlass-dsl (==4.3.4)", + "nvidia-cutlass-dsl (==4.4.2)", "plotly (>=6.7.0,<7.0.0)", "numexpr (>=2.14.1,<3.0.0)", "partial-json-parser (>=0.2.1.1.post7,<0.3.0.0)", diff --git a/tensorrt_llm/_torch/cute_dsl_kernels/blackwell/blockscaled_contiguous_gather_grouped_gemm_swiglu_fusion.py b/tensorrt_llm/_torch/cute_dsl_kernels/blackwell/blockscaled_contiguous_gather_grouped_gemm_swiglu_fusion.py index c339787301e6..66b55bf56afd 100644 --- a/tensorrt_llm/_torch/cute_dsl_kernels/blackwell/blockscaled_contiguous_gather_grouped_gemm_swiglu_fusion.py +++ b/tensorrt_llm/_torch/cute_dsl_kernels/blackwell/blockscaled_contiguous_gather_grouped_gemm_swiglu_fusion.py @@ -37,7 +37,6 @@ import cutlass.utils.blockscaled_layout as blockscaled_utils from cutlass._mlir.dialects import math from cutlass.cute.nvgpu import cpasync, tcgen05 -from cutlass.cutlass_dsl import Int32 from .custom_pipeline import PipelineCpAsyncUmma from .utils import ( @@ -155,144 +154,6 @@ """ -# TODO: Remove this hook helper function after nvidia-cutlass-dsl 4.4 is released. -def hooked_PersistentTileSchedulerParams_init( - self, - problem_shape_ntile_mnl: cute.Shape, - cluster_shape_mnk: cute.Shape, - swizzle_size: int = 1, - raster_along_m: bool = True, - *, - loc=None, - ip=None, -): - if cluster_shape_mnk[2] != 1: - raise ValueError(f"unsupported cluster_shape_k {cluster_shape_mnk[2]}") - if swizzle_size < 1: - raise ValueError(f"expect swizzle_size >= 1, but get {swizzle_size}") - - self.problem_shape_ntile_mnl = problem_shape_ntile_mnl - # cluster_shape_mnk is kept for reconstruction - self._cluster_shape_mnk = cluster_shape_mnk - self.cluster_shape_mn = cluster_shape_mnk[:2] - self.swizzle_size = swizzle_size - self._raster_along_m = raster_along_m - self._loc = loc - - # Apply swizzle if swizzle_size > 1 - if swizzle_size > 1: - problem_shape_ncluster_mnl = cute.round_up( - self.problem_layout_ncluster_mnl.shape, - (1, swizzle_size, 1) if raster_along_m else (swizzle_size, 1, 1), - ) - - if raster_along_m: - self.problem_layout_ncluster_mnl = cute.make_layout( - ( - problem_shape_ncluster_mnl[0], - (swizzle_size, problem_shape_ncluster_mnl[1] // swizzle_size), - problem_shape_ncluster_mnl[2], - ), - stride=( - swizzle_size, - (1, swizzle_size * problem_shape_ncluster_mnl[0]), - problem_shape_ncluster_mnl[0] * problem_shape_ncluster_mnl[1], - ), - loc=loc, - ip=ip, - ) - else: - self.problem_layout_ncluster_mnl = cute.make_layout( - ( - (swizzle_size, problem_shape_ncluster_mnl[0] // swizzle_size), - problem_shape_ncluster_mnl[1], - problem_shape_ncluster_mnl[2], - ), - stride=( - (1, swizzle_size * problem_shape_ncluster_mnl[1]), - swizzle_size, - problem_shape_ncluster_mnl[0] * problem_shape_ncluster_mnl[1], - ), - loc=loc, - ip=ip, - ) - - # Create FastDivmod divisors (only when swizzle_size == 1 for correctness) - # FastDivmod assumes simple col-major/row-major layout, incompatible with swizzled layouts - if swizzle_size == 1: - problem_shape_ncluster_mnl = cute.ceil_div( - self.problem_shape_ntile_mnl, cluster_shape_mnk[:2], loc=loc, ip=ip - ) - if raster_along_m: - self.problem_layout_ncluster_mnl = cute.make_layout( - problem_shape_ncluster_mnl, - stride=( - 1, - problem_shape_ncluster_mnl[0], - problem_shape_ncluster_mnl[0] * problem_shape_ncluster_mnl[1], - ), - loc=loc, - ip=ip, - ) - else: - self.problem_layout_ncluster_mnl = cute.make_layout( - problem_shape_ncluster_mnl, - stride=( - problem_shape_ncluster_mnl[1], - 1, - problem_shape_ncluster_mnl[0] * problem_shape_ncluster_mnl[1], - ), - loc=loc, - ip=ip, - ) - problem_layout_size = cute.size(self.problem_layout_ncluster_mnl, loc=loc, ip=ip) - cluster_count_m = self.problem_layout_ncluster_mnl.shape[0] - cluster_count_n = self.problem_layout_ncluster_mnl.shape[1] - - # batch_fdd: Used to map linear_idx to work_unit_id (handles persistent scheduling) - self.batch_fdd = cute.fast_divmod_create_divisor(problem_layout_size, loc=loc, ip=ip) - - # cluster_shape_m_fdd: Used to decode work_unit_id to cluster coordinates - self.cluster_shape_m_fdd = cute.fast_divmod_create_divisor(cluster_count_m, loc=loc, ip=ip) - - # cluster_shape_n_fdd: Used for the second level decomposition - self.cluster_shape_n_fdd = cute.fast_divmod_create_divisor(cluster_count_n, loc=loc, ip=ip) - else: - # FastDivmod not applicable with swizzling, set to None - self.batch_fdd = None - self.cluster_shape_m_fdd = None - self.cluster_shape_n_fdd = None - - -def hooked_get_cluster_work_idx_with_fastdivmod( - self, current_work_linear_idx: Int32, *, loc=None, ip=None -) -> Tuple[Int32, Int32, Int32]: - work_iteration, work_unit_id = divmod(current_work_linear_idx, self.params.batch_fdd) - - if self.params._raster_along_m: - # raster_along_m=True means column major (m is fastest) - # First, get cluster_m using cluster_shape_m_fdd - cluster_n_batch, cluster_m = divmod(work_unit_id, self.params.cluster_shape_m_fdd) - - # Then decode cluster_n_batch to get cluster_n and batch_l using FastDivmod - batch_l, cluster_n = divmod(cluster_n_batch, self.params.cluster_shape_n_fdd) - else: - # raster_along_m=False means row major (n is fastest) - # First, get cluster_n using cluster_shape_n_fdd - cluster_m_batch, cluster_n = divmod(work_unit_id, self.params.cluster_shape_n_fdd) - - # Then decode cluster_m_batch to get cluster_m and batch_l using FastDivmod - batch_l, cluster_m = divmod(cluster_m_batch, self.params.cluster_shape_m_fdd) - - return (cluster_m, cluster_n, batch_l) - - -cutlass.utils.PersistentTileSchedulerParams.__init__ = hooked_PersistentTileSchedulerParams_init -cutlass.utils.StaticPersistentTileScheduler._get_cluster_work_idx_with_fastdivmod = ( - hooked_get_cluster_work_idx_with_fastdivmod -) - - class BlockScaledContiguousGatherGroupedGemmKernel: """This class implements contiguous grouped matrix multiplication with gather operation and SwiGLU fusion for FC1 layer computation (C = up * silu(gate), where up/gate come from interleaved GEMM result). diff --git a/tensorrt_llm/_torch/cute_dsl_kernels/blackwell/blockscaled_contiguous_grouped_gemm.py b/tensorrt_llm/_torch/cute_dsl_kernels/blackwell/blockscaled_contiguous_grouped_gemm.py index a5571f616dda..80d87451de83 100644 --- a/tensorrt_llm/_torch/cute_dsl_kernels/blackwell/blockscaled_contiguous_grouped_gemm.py +++ b/tensorrt_llm/_torch/cute_dsl_kernels/blackwell/blockscaled_contiguous_grouped_gemm.py @@ -60,143 +60,6 @@ ) -def hooked_PersistentTileSchedulerParams_init( - self, - problem_shape_ntile_mnl: cute.Shape, - cluster_shape_mnk: cute.Shape, - swizzle_size: int = 1, - raster_along_m: bool = True, - *, - loc=None, - ip=None, -): - if cluster_shape_mnk[2] != 1: - raise ValueError(f"unsupported cluster_shape_k {cluster_shape_mnk[2]}") - if swizzle_size < 1: - raise ValueError(f"expect swizzle_size >= 1, but get {swizzle_size}") - - self.problem_shape_ntile_mnl = problem_shape_ntile_mnl - # cluster_shape_mnk is kept for reconstruction - self._cluster_shape_mnk = cluster_shape_mnk - self.cluster_shape_mn = cluster_shape_mnk[:2] - self.swizzle_size = swizzle_size - self._raster_along_m = raster_along_m - self._loc = loc - - # Apply swizzle if swizzle_size > 1 - if swizzle_size > 1: - problem_shape_ncluster_mnl = cute.round_up( - self.problem_layout_ncluster_mnl.shape, - (1, swizzle_size, 1) if raster_along_m else (swizzle_size, 1, 1), - ) - - if raster_along_m: - self.problem_layout_ncluster_mnl = cute.make_layout( - ( - problem_shape_ncluster_mnl[0], - (swizzle_size, problem_shape_ncluster_mnl[1] // swizzle_size), - problem_shape_ncluster_mnl[2], - ), - stride=( - swizzle_size, - (1, swizzle_size * problem_shape_ncluster_mnl[0]), - problem_shape_ncluster_mnl[0] * problem_shape_ncluster_mnl[1], - ), - loc=loc, - ip=ip, - ) - else: - self.problem_layout_ncluster_mnl = cute.make_layout( - ( - (swizzle_size, problem_shape_ncluster_mnl[0] // swizzle_size), - problem_shape_ncluster_mnl[1], - problem_shape_ncluster_mnl[2], - ), - stride=( - (1, swizzle_size * problem_shape_ncluster_mnl[1]), - swizzle_size, - problem_shape_ncluster_mnl[0] * problem_shape_ncluster_mnl[1], - ), - loc=loc, - ip=ip, - ) - - # Create FastDivmod divisors (only when swizzle_size == 1 for correctness) - # FastDivmod assumes simple col-major/row-major layout, incompatible with swizzled layouts - if swizzle_size == 1: - problem_shape_ncluster_mnl = cute.ceil_div( - self.problem_shape_ntile_mnl, cluster_shape_mnk[:2], loc=loc, ip=ip - ) - if raster_along_m: - self.problem_layout_ncluster_mnl = cute.make_layout( - problem_shape_ncluster_mnl, - stride=( - 1, - problem_shape_ncluster_mnl[0], - problem_shape_ncluster_mnl[0] * problem_shape_ncluster_mnl[1], - ), - loc=loc, - ip=ip, - ) - else: - self.problem_layout_ncluster_mnl = cute.make_layout( - problem_shape_ncluster_mnl, - stride=( - problem_shape_ncluster_mnl[1], - 1, - problem_shape_ncluster_mnl[0] * problem_shape_ncluster_mnl[1], - ), - loc=loc, - ip=ip, - ) - problem_layout_size = cute.size(self.problem_layout_ncluster_mnl, loc=loc, ip=ip) - cluster_count_m = self.problem_layout_ncluster_mnl.shape[0] - cluster_count_n = self.problem_layout_ncluster_mnl.shape[1] - - # batch_fdd: Used to map linear_idx to work_unit_id (handles persistent scheduling) - self.batch_fdd = cute.fast_divmod_create_divisor(problem_layout_size, loc=loc, ip=ip) - - # cluster_shape_m_fdd: Used to decode work_unit_id to cluster coordinates - self.cluster_shape_m_fdd = cute.fast_divmod_create_divisor(cluster_count_m, loc=loc, ip=ip) - - # cluster_shape_n_fdd: Used for the second level decomposition - self.cluster_shape_n_fdd = cute.fast_divmod_create_divisor(cluster_count_n, loc=loc, ip=ip) - else: - # FastDivmod not applicable with swizzling, set to None - self.batch_fdd = None - self.cluster_shape_m_fdd = None - self.cluster_shape_n_fdd = None - - -def hooked_get_cluster_work_idx_with_fastdivmod( - self, current_work_linear_idx: cutlass.Int32, *, loc=None, ip=None -) -> Tuple[cutlass.Int32, cutlass.Int32, cutlass.Int32]: - work_iteration, work_unit_id = divmod(current_work_linear_idx, self.params.batch_fdd) - - if self.params._raster_along_m: - # raster_along_m=True means column major (m is fastest) - # First, get cluster_m using cluster_shape_m_fdd - cluster_n_batch, cluster_m = divmod(work_unit_id, self.params.cluster_shape_m_fdd) - - # Then decode cluster_n_batch to get cluster_n and batch_l using FastDivmod - batch_l, cluster_n = divmod(cluster_n_batch, self.params.cluster_shape_n_fdd) - else: - # raster_along_m=False means row major (n is fastest) - # First, get cluster_n using cluster_shape_n_fdd - cluster_m_batch, cluster_n = divmod(work_unit_id, self.params.cluster_shape_n_fdd) - - # Then decode cluster_m_batch to get cluster_m and batch_l using FastDivmod - batch_l, cluster_m = divmod(cluster_m_batch, self.params.cluster_shape_m_fdd) - - return (cluster_m, cluster_n, batch_l) - - -cutlass.utils.PersistentTileSchedulerParams.__init__ = hooked_PersistentTileSchedulerParams_init -cutlass.utils.StaticPersistentTileScheduler._get_cluster_work_idx_with_fastdivmod = ( - hooked_get_cluster_work_idx_with_fastdivmod -) - - class Sm100BlockScaledContiguousGroupedGemmKernel: """This class implements batched matrix multiplication (C = A x SFA x B x SFB) with support for various data types and architectural features specific to Blackwell GPUs with persistent tile scheduling and warp specialization. diff --git a/tensorrt_llm/_torch/cute_dsl_kernels/blackwell/blockscaled_contiguous_grouped_gemm_finalize_fusion.py b/tensorrt_llm/_torch/cute_dsl_kernels/blackwell/blockscaled_contiguous_grouped_gemm_finalize_fusion.py index babf3dbcb261..f82374ec3094 100644 --- a/tensorrt_llm/_torch/cute_dsl_kernels/blackwell/blockscaled_contiguous_grouped_gemm_finalize_fusion.py +++ b/tensorrt_llm/_torch/cute_dsl_kernels/blackwell/blockscaled_contiguous_grouped_gemm_finalize_fusion.py @@ -158,144 +158,6 @@ """ -# TODO(zhichenj): Remove this hook helper function after nvidia-cutlass-dsl 4.4 is released. -def hooked_PersistentTileSchedulerParams_init( - self, - problem_shape_ntile_mnl: cute.Shape, - cluster_shape_mnk: cute.Shape, - swizzle_size: int = 1, - raster_along_m: bool = True, - *, - loc=None, - ip=None, -): - if cluster_shape_mnk[2] != 1: - raise ValueError(f"unsupported cluster_shape_k {cluster_shape_mnk[2]}") - if swizzle_size < 1: - raise ValueError(f"expect swizzle_size >= 1, but get {swizzle_size}") - - self.problem_shape_ntile_mnl = problem_shape_ntile_mnl - # cluster_shape_mnk is kept for reconstruction - self._cluster_shape_mnk = cluster_shape_mnk - self.cluster_shape_mn = cluster_shape_mnk[:2] - self.swizzle_size = swizzle_size - self._raster_along_m = raster_along_m - self._loc = loc - - # Apply swizzle if swizzle_size > 1 - if swizzle_size > 1: - problem_shape_ncluster_mnl = cute.round_up( - self.problem_layout_ncluster_mnl.shape, - (1, swizzle_size, 1) if raster_along_m else (swizzle_size, 1, 1), - ) - - if raster_along_m: - self.problem_layout_ncluster_mnl = cute.make_layout( - ( - problem_shape_ncluster_mnl[0], - (swizzle_size, problem_shape_ncluster_mnl[1] // swizzle_size), - problem_shape_ncluster_mnl[2], - ), - stride=( - swizzle_size, - (1, swizzle_size * problem_shape_ncluster_mnl[0]), - problem_shape_ncluster_mnl[0] * problem_shape_ncluster_mnl[1], - ), - loc=loc, - ip=ip, - ) - else: - self.problem_layout_ncluster_mnl = cute.make_layout( - ( - (swizzle_size, problem_shape_ncluster_mnl[0] // swizzle_size), - problem_shape_ncluster_mnl[1], - problem_shape_ncluster_mnl[2], - ), - stride=( - (1, swizzle_size * problem_shape_ncluster_mnl[1]), - swizzle_size, - problem_shape_ncluster_mnl[0] * problem_shape_ncluster_mnl[1], - ), - loc=loc, - ip=ip, - ) - - # Create FastDivmod divisors (only when swizzle_size == 1 for correctness) - # FastDivmod assumes simple col-major/row-major layout, incompatible with swizzled layouts - if swizzle_size == 1: - problem_shape_ncluster_mnl = cute.ceil_div( - self.problem_shape_ntile_mnl, cluster_shape_mnk[:2], loc=loc, ip=ip - ) - if raster_along_m: - self.problem_layout_ncluster_mnl = cute.make_layout( - problem_shape_ncluster_mnl, - stride=( - 1, - problem_shape_ncluster_mnl[0], - problem_shape_ncluster_mnl[0] * problem_shape_ncluster_mnl[1], - ), - loc=loc, - ip=ip, - ) - else: - self.problem_layout_ncluster_mnl = cute.make_layout( - problem_shape_ncluster_mnl, - stride=( - problem_shape_ncluster_mnl[1], - 1, - problem_shape_ncluster_mnl[0] * problem_shape_ncluster_mnl[1], - ), - loc=loc, - ip=ip, - ) - problem_layout_size = cute.size(self.problem_layout_ncluster_mnl, loc=loc, ip=ip) - cluster_count_m = self.problem_layout_ncluster_mnl.shape[0] - cluster_count_n = self.problem_layout_ncluster_mnl.shape[1] - - # batch_fdd: Used to map linear_idx to work_unit_id (handles persistent scheduling) - self.batch_fdd = cute.fast_divmod_create_divisor(problem_layout_size, loc=loc, ip=ip) - - # cluster_shape_m_fdd: Used to decode work_unit_id to cluster coordinates - self.cluster_shape_m_fdd = cute.fast_divmod_create_divisor(cluster_count_m, loc=loc, ip=ip) - - # cluster_shape_n_fdd: Used for the second level decomposition - self.cluster_shape_n_fdd = cute.fast_divmod_create_divisor(cluster_count_n, loc=loc, ip=ip) - else: - # FastDivmod not applicable with swizzling, set to None - self.batch_fdd = None - self.cluster_shape_m_fdd = None - self.cluster_shape_n_fdd = None - - -def hooked_get_cluster_work_idx_with_fastdivmod( - self, current_work_linear_idx: cutlass.Int32, *, loc=None, ip=None -) -> Tuple[cutlass.Int32, cutlass.Int32, cutlass.Int32]: - work_iteration, work_unit_id = divmod(current_work_linear_idx, self.params.batch_fdd) - - if self.params._raster_along_m: - # raster_along_m=True means column major (m is fastest) - # First, get cluster_m using cluster_shape_m_fdd - cluster_n_batch, cluster_m = divmod(work_unit_id, self.params.cluster_shape_m_fdd) - - # Then decode cluster_n_batch to get cluster_n and batch_l using FastDivmod - batch_l, cluster_n = divmod(cluster_n_batch, self.params.cluster_shape_n_fdd) - else: - # raster_along_m=False means row major (n is fastest) - # First, get cluster_n using cluster_shape_n_fdd - cluster_m_batch, cluster_n = divmod(work_unit_id, self.params.cluster_shape_n_fdd) - - # Then decode cluster_m_batch to get cluster_m and batch_l using FastDivmod - batch_l, cluster_m = divmod(cluster_m_batch, self.params.cluster_shape_m_fdd) - - return (cluster_m, cluster_n, batch_l) - - -cutlass.utils.PersistentTileSchedulerParams.__init__ = hooked_PersistentTileSchedulerParams_init -cutlass.utils.StaticPersistentTileScheduler._get_cluster_work_idx_with_fastdivmod = ( - hooked_get_cluster_work_idx_with_fastdivmod -) - - class Sm100BlockScaledContiguousGroupedGemmFinalizeFusionKernel: """This class implements batched matrix multiplication (C = A x SFA x B x SFB) with support for various data types and architectural features specific to Blackwell GPUs with persistent tile scheduling and warp specialization. diff --git a/tensorrt_llm/_torch/cute_dsl_kernels/blackwell/custom_pipeline.py b/tensorrt_llm/_torch/cute_dsl_kernels/blackwell/custom_pipeline.py index 42791b3d994a..760a26dfe774 100644 --- a/tensorrt_llm/_torch/cute_dsl_kernels/blackwell/custom_pipeline.py +++ b/tensorrt_llm/_torch/cute_dsl_kernels/blackwell/custom_pipeline.py @@ -1,4 +1,4 @@ -# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # Redistribution and use in source and binary forms, with or without @@ -48,8 +48,17 @@ import cutlass.cute as cute from cutlass.cutlass_dsl import Boolean, if_generate +# nvidia-cutlass-dsl 4.4.2 split the sync-object factory: sm90's +# PipelineAsync._make_sync_object no longer accepts Blackwell ops like +# TCGen05Mma/ClcLoad. The sm100 PipelineTmaUmma provides the expanded variant +# that handles every op used by the custom pipelines below. Alias to avoid +# colliding with the local PipelineTmaUmma defined in this module. from cutlass.pipeline import (Agent, CooperativeGroup, PipelineAsync, - PipelineOp, PipelineState, agent_sync) + PipelineOp, PipelineState) +from cutlass.pipeline import PipelineTmaUmma as _Sm100PipelineFactory +from cutlass.pipeline import agent_sync + +_make_sync_object = _Sm100PipelineFactory._make_sync_object def pipeline_init_wait(cta_layout_vmnk: Optional[cute.Layout] = None): @@ -179,9 +188,9 @@ def create( producer = (producer_type, producer_group) consumer = (consumer_type, consumer_group) - sync_object_full = PipelineAsync._make_sync_object( - barrier_storage.align(min_align=8), num_stages, producer, tx_count) - sync_object_empty = PipelineAsync._make_sync_object( + sync_object_full = _make_sync_object(barrier_storage.align(min_align=8), + num_stages, producer, tx_count) + sync_object_empty = _make_sync_object( barrier_storage.align(min_align=8) + num_stages, num_stages, consumer) @@ -214,7 +223,7 @@ def create( cta_group, ) - def consumer_release(self, state: PipelineState): + def consumer_release(self, state: PipelineState, *, loc=None, ip=None): """ UMMA consumer release buffer empty, cta_group needs to be provided. @@ -225,12 +234,18 @@ def consumer_release(self, state: PipelineState): Returns: None """ - self.sync_object_empty.arrive(state.index, self.consumer_mask, - self.cta_group) + self.sync_object_empty.arrive(state.index, + self.consumer_mask, + self.cta_group, + loc=loc, + ip=ip) def producer_acquire(self, state: PipelineState, - try_acquire_token: Optional[Boolean] = None): + try_acquire_token: Optional[Boolean] = None, + *, + loc=None, + ip=None): """ Conditionally waits on buffer empty and sets the transaction barrier for leader threadblocks. @@ -246,15 +261,20 @@ def producer_acquire(self, """ if_generate( try_acquire_token is None or try_acquire_token == 0, - lambda: self.sync_object_empty.wait(state.index, state.phase), + lambda: self.sync_object_empty.wait( + state.index, state.phase, loc=loc, ip=ip), + loc=loc, + ip=ip, ) if_generate( self.is_leader_cta, - lambda: self.sync_object_full.arrive(state.index, self.producer_mask - ), + lambda: self.sync_object_full.arrive( + state.index, self.producer_mask, loc=loc, ip=ip), + loc=loc, + ip=ip, ) - def producer_commit(self, state: PipelineState): + def producer_commit(self, state: PipelineState, *, loc=None, ip=None): """ TMA producer commit is a noop since TMA instruction itself updates the transaction count. @@ -323,9 +343,9 @@ def create( producer = (producer_type, producer_group) consumer = (consumer_type, consumer_group) - sync_object_full = PipelineAsync._make_sync_object( - barrier_storage.align(min_align=8), num_stages, producer) - sync_object_empty = PipelineAsync._make_sync_object( + sync_object_full = _make_sync_object(barrier_storage.align(min_align=8), + num_stages, producer) + sync_object_empty = _make_sync_object( barrier_storage.align(min_align=8) + num_stages, num_stages, consumer) @@ -357,23 +377,26 @@ def create( cta_group, ) - def producer_commit(self, state: PipelineState): - self.sync_object_full.arrive(state.index, self.producer_mask, - self.cta_group) + def producer_commit(self, state: PipelineState, *, loc=None, ip=None): + self.sync_object_full.arrive(state.index, + self.producer_mask, + self.cta_group, + loc=loc, + ip=ip) - def producer_tail(self, state: PipelineState): + def producer_tail(self, state: PipelineState, *, loc=None, ip=None): cta_rank_in_cluster = cute.arch.make_warp_uniform( - cute.arch.block_idx_in_cluster()) + cute.arch.block_idx_in_cluster(loc=loc, ip=ip), loc=loc, ip=ip) is_leader_cta = cta_rank_in_cluster % 2 == 0 def then_body(): # Assume state contains that next useful buffer # So we only need to advance to num_stages - 1 times to last used buffer for i in range(self.num_stages - 1): - state.advance() - self.producer_acquire(state) + state.advance(loc=loc, ip=ip) + self.producer_acquire(state, loc=loc, ip=ip) - if_generate(is_leader_cta, then_body) + if_generate(is_leader_cta, then_body, loc=loc, ip=ip) @dataclass(frozen=True) @@ -473,12 +496,12 @@ def create( producer = (producer_type, producer_group) consumer = (consumer_type, consumer_group) - sync_object_full = PipelineAsync._make_sync_object( + sync_object_full = _make_sync_object( barrier_storage.align(min_align=8), num_stages, producer, ) - sync_object_empty = PipelineAsync._make_sync_object( + sync_object_empty = _make_sync_object( barrier_storage.align(min_align=8) + num_stages, num_stages, consumer) @@ -515,9 +538,12 @@ def create( cta_group, ) - def consumer_release(self, state: PipelineState): + def consumer_release(self, state: PipelineState, *, loc=None, ip=None): """ UMMA consumer release buffer empty, cta_group needs to be provided. """ - self.sync_object_empty.arrive(state.index, self.consumer_mask, - self.cta_group) + self.sync_object_empty.arrive(state.index, + self.consumer_mask, + self.cta_group, + loc=loc, + ip=ip)