NVIDIA · Wanli-Jiang · Apr 15, 2026 · Apr 15, 2026 · Apr 15, 2026 · Apr 15, 2026
@@ -5260,8 +5260,7 @@ For more information, please refer to <http://unlicense.org>
   - `Source`: https://github.com/tox-dev/py-filelock
   - `Tracker`: https://github.com/tox-dev/py-filelock/issues
 
-
-## flashinfer-python (0.6.6)
+## flashinfer-python (0.6.8)
 
 ### Licenses
 License: `Apache-2.0`
@@ -33239,7 +33238,7 @@ License: `NVIDIA Proprietary Software`
   - `Homepage`: https://developer.nvidia.com/cusparselt
 
 
-## nvidia-cutlass-dsl (4.2.1)
+## nvidia-cutlass-dsl (4.4.2)
 
 ### Licenses
 License: `None`

diff --git a/constraints.txt b/constraints.txt
@@ -6,3 +6,5 @@ wheel>=0.46.2
 tornado>=6.5.5
 # WAR against https://github.com/advisories/GHSA-3936-cmfr-pm3m
 black>=26.3.1
+# Upgrade base image nvidia-cutlass-dsl 4.3.5 to 4.4.2
+nvidia-cutlass-dsl>=4.4.2
diff --git a/cpp/tensorrt_llm/kernels/causalConv1d/causalConv1d.cu b/cpp/tensorrt_llm/kernels/causalConv1d/causalConv1d.cu
diff --git a/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.cu b/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.cu
@@ -60,12 +60,30 @@ using tensorrt_llm::common::launchWithPdlWhenEnabled;
         __VA_ARGS__;                                                                                                   \
         break;                                                                                                         \
     }                                                                                                                  \
+    case 18:                                                                                                           \
+    {                                                                                                                  \
+        constexpr int TOP_K = 18;                                                                                      \
+        __VA_ARGS__;                                                                                                   \
+        break;                                                                                                         \
+    }                                                                                                                  \
     case 16:                                                                                                           \
     {                                                                                                                  \
         constexpr int TOP_K = 16;                                                                                      \
         __VA_ARGS__;                                                                                                   \
         break;                                                                                                         \
     }                                                                                                                  \
+    case 14:                                                                                                           \
+    {                                                                                                                  \
+        constexpr int TOP_K = 14;                                                                                      \
+        __VA_ARGS__;                                                                                                   \
+        break;                                                                                                         \
+    }                                                                                                                  \
+    case 12:                                                                                                           \
+    {                                                                                                                  \
+        constexpr int TOP_K = 12;                                                                                      \
+        __VA_ARGS__;                                                                                                   \
+        break;                                                                                                         \
+    }                                                                                                                  \
     case 10:                                                                                                           \
     {                                                                                                                  \
         constexpr int TOP_K = 10;                                                                                      \

diff --git a/cpp/tensorrt_llm/kernels/moeTopKFuncs.cuh b/cpp/tensorrt_llm/kernels/moeTopKFuncs.cuh
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2025-2026, NVIDIA CORPORATION.  All rights reserved.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -123,8 +123,75 @@ struct TopKIdx<K_, true>
         topK[J].compValIdx = pairMin;                                                                                  \
     }
 
+template <int N>
+struct IsPowerOf2
+{
+    static constexpr bool value = (N > 0) && ((N & (N - 1)) == 0);
+};
+
 template <int N, typename RedType>
-struct Sort;
+struct Sort
+{
+    static_assert(N > 0 && N <= 32, "Sort supports N in [1, 32]");
+
+    static __device__ void run(RedType* topK)
+    {
+        if constexpr (IsPowerOf2<N>::value)
+        {
+#pragma unroll
+            for (int k = 2; k <= N; k *= 2)
+            {
+#pragma unroll
+                for (int j = k / 2; j > 0; j /= 2)
+                {
+#pragma unroll
+                    for (int i = 0; i < N; ++i)
+                    {
+                        int ixj = i ^ j;
+                        if (ixj > i)
+                        {
+                            if ((i & k) == 0)
+                            {
+                                if (topK[i].compValIdx < topK[ixj].compValIdx)
+                                {
+                                    auto tmp = topK[i].compValIdx;
+                                    topK[i].compValIdx = topK[ixj].compValIdx;
+                                    topK[ixj].compValIdx = tmp;
+                                }
+                            }
+                            else
+                            {
+                                if (topK[i].compValIdx > topK[ixj].compValIdx)
+                                {
+                                    auto tmp = topK[i].compValIdx;
+                                    topK[i].compValIdx = topK[ixj].compValIdx;
+                                    topK[ixj].compValIdx = tmp;
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        else
+        {
+#pragma unroll
+            for (int pass = 0; pass < N; ++pass)
+            {
+#pragma unroll
+                for (int i = 0; i < N - 1; i += 2)
+                {
+                    TOPK_SWAP(i, i + 1);
+                }
+#pragma unroll
+                for (int i = 1; i < N - 1; i += 2)
+                {
+                    TOPK_SWAP(i, i + 1);
+                }
+            }
+        }
+    }
+};
 
 template <typename RedType>
 struct Sort<1, RedType>
@@ -170,28 +237,27 @@ __forceinline__ __device__ void reduceTopK(cg::thread_block_tile<kWARP_SIZE> con
     int32_t (&outIdx)[K], Type value, int32_t idx, Type const minValue, int actualK = K)
 {
     static_assert(K > 0, "Top K must have K > 0");
-    static_assert(K < kWARP_SIZE, "Top K must have K < kWARP_SIZE");
+    static_assert(K <= kWARP_SIZE, "Top K must have K <= kWARP_SIZE");
     using RedType = TopKRedType<Type>;
     RedType topK{value, idx};
     typename RedType::TypeCmp packedMax{};
 #pragma unroll
-    for (int kk = 0; kk < actualK; ++kk) //@todo: check if actualK is correct
+    for (int kk = 0; kk < actualK; ++kk)
     {
         topK = kk > 0 && packedMax == topK.compValIdx ? RedType{minValue, idx} : topK;
-        // get the next largest value
         packedMax = topK.reduce(warp);
         RedType::unpack(out[kk], outIdx[kk], packedMax);
     }
 };
 
-template <int K, typename Type, int N, bool IsSorted = false>
-__device__ void reduceTopKFunc(cg::thread_block_tile<kWARP_SIZE> const& warp, Type (&out)[K], int32_t (&outIdx)[K],
-    Type (&value)[N], int32_t (&idx)[N], Type minValue, int actualK = K)
+template <int K, typename Type, int N>
+__forceinline__ __device__ void reduceTopK(cg::thread_block_tile<kWARP_SIZE> const& warp, Type (&out)[K],
+    int32_t (&outIdx)[K], Type (&value)[N], int32_t (&idx)[N], Type const minValue, int actualK = K)
 {
     static_assert(K > 0, "Top K must have K > 0");
-    static_assert(K < kWARP_SIZE, "Top K must have K < kWARP_SIZE");
+    static_assert(K <= kWARP_SIZE, "Top K must have K <= kWARP_SIZE");
     static_assert(N > 0, "Top K must have N > 0");
-    static_assert(N < 5, "Only support candidates number less than or equal to 128");
+    static_assert(N <= 32, "Only support candidates number less than or equal to 32*32=1024");
     using RedType = TopKRedType<Type>;
     RedType topK[N];
 #pragma unroll
@@ -200,12 +266,9 @@ __device__ void reduceTopKFunc(cg::thread_block_tile<kWARP_SIZE> const& warp, Ty
         topK[nn] = RedType{value[nn], idx[nn]};
     }
 
-    if constexpr (!IsSorted)
-    {
-        Sort<N, RedType>::run(topK);
-    }
+    Sort<N, RedType>::run(topK);
+
     typename RedType::TypeCmp packedMax{};
-#pragma unroll
     for (int kk = 0; kk < actualK; ++kk)
     {
         bool update = kk > 0 && packedMax == topK[0].compValIdx;
@@ -214,73 +277,11 @@ __device__ void reduceTopKFunc(cg::thread_block_tile<kWARP_SIZE> const& warp, Ty
         {
             topK[nn] = update && nn == N - 1 ? RedType{minValue, idx[nn]} : update ? topK[nn + 1] : topK[nn];
         }
-        // get the next largest value
         packedMax = topK[0].reduce(warp);
         RedType::unpack(out[kk], outIdx[kk], packedMax);
     }
 };
 
-template <int K, typename Type, int N>
-__forceinline__ __device__ void reduceTopK(cg::thread_block_tile<kWARP_SIZE> const& warp, Type (&out)[K],
-    int32_t (&outIdx)[K], Type (&value)[N], int32_t (&idx)[N], Type const minValue, int actualK = K)
-{
-    static_assert(K > 0, "Top K must have K > 0");
-    static_assert(K < kWARP_SIZE, "Top K must have K < kWARP_SIZE");
-    static_assert(N > 0, "Top K must have N > 0");
-    static_assert(N <= 16, "Only support candidates number less than or equal to 16*32=512");
-    static_assert(
-        N <= 4 || N % 4 == 0, "Only support candidates number is a multiple of 4*32=128 or less than or equal to 4");
-    using RedType = TopKRedType<Type>;
-
-    if constexpr (N <= 4)
-    {
-        reduceTopKFunc<K, Type, N>(warp, out, outIdx, value, idx, minValue, actualK);
-    }
-    else
-    {
-
-        constexpr int numLoops = N / 4;
-        constexpr int numResults = (numLoops * K - 1) / kWARP_SIZE + 1;
-
-        Type topKBufferValue[numResults];
-        int32_t topKBufferIdx[numResults];
-        int32_t laneIdx = threadIdx.x % kWARP_SIZE;
-
-        for (int ii = 0; ii < numResults; ++ii)
-        {
-            topKBufferValue[ii] = minValue;
-            topKBufferIdx[ii] = ii * kWARP_SIZE - 1; //@todo: check if this is correct
-        }
-        for (int loop = 0; loop < numLoops; ++loop)
-        {
-            int start = loop * 4;
-            Type topKValue[K];
-            int32_t topKIdx[K];
-            Type inValue[4];
-            int32_t inIdx[4];
-            for (int i = 0; i < 4; ++i)
-            {
-                inValue[i] = value[start + i];
-                inIdx[i] = idx[start + i];
-            }
-            reduceTopKFunc<K, Type, 4>(warp, topKValue, topKIdx, inValue, inIdx, minValue, actualK);
-            int inOffset = laneIdx % K;
-            if (laneIdx >= loop * K && laneIdx < (loop + 1) * K)
-            {
-                topKBufferValue[0] = topKValue[inOffset];
-                topKBufferIdx[0] = topKIdx[inOffset];
-            }
-            if (loop == numLoops - 1 && (laneIdx < (numLoops * K - kWARP_SIZE)))
-            {
-                topKBufferValue[1] = topKValue[inOffset];
-                topKBufferIdx[1] = topKIdx[inOffset];
-            }
-        }
-
-        reduceTopKFunc<K, Type, numResults>(warp, out, outIdx, topKBufferValue, topKBufferIdx, minValue, actualK);
-    }
-};
-
 #undef TOPK_SWAP
 
 } // namespace reduce_topk