Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
169af37
[None][chore] Update flashinfer-python from 0.6.6 to 0.6.8rc1
yihwang-nv Apr 15, 2026
0db1325
[None][chore] Update Docker to upgrade nvidia-cutlass-dsl to 4.4.2
yihwang-nv Apr 15, 2026
4e5f479
[None][chore] Update Docker to upgrade nvidia-cutlass-dsl to 4.4.2
yihwang-nv Apr 15, 2026
8e7f120
[None][chore] Update CI image tags to PR-13064 staging images
yihwang-nv Apr 15, 2026
33d79c4
[None][fix] Remove stale nvidia-cutlass-dsl before pip install in CI
yihwang-nv Apr 15, 2026
7d82d57
Revert "[None][chore] Update CI image tags to PR-13064 staging images"
yihwang-nv Apr 15, 2026
209a50e
[None][fix] Clean stale nvidia-cutlass-dsl before pip install in CI
yihwang-nv Apr 15, 2026
b4f0f82
[None][fix] Guard cutlass-dsl imports and clean site-packages in Chec…
yihwang-nv Apr 16, 2026
0036e8e
[None][chore] Regenerate security_scanning/poetry.lock content-hash
yihwang-nv Apr 16, 2026
11c6a31
[None][fix] Clean stale packages in SLURM install and update CI images
yihwang-nv Apr 16, 2026
0a31528
[None][chore] Remove clean_site_packages.py and all references
yihwang-nv Apr 16, 2026
4c4cc14
[None][chore] Update flashinfer-python from 0.6.8rc1 to 0.6.8
yihwang-nv Apr 16, 2026
a90284b
[None][fix] Remove stale PersistentTileSchedulerParams hooks for cutl…
yihwang-nv Apr 17, 2026
0817c24
[None][fix] Route custom pipelines through sm100 _make_sync_object fo…
yihwang-nv Apr 18, 2026
366eeb3
[None][fix] Accept loc/ip kwargs in custom pipeline overrides for cut…
yihwang-nv Apr 18, 2026
d69b0cc
[None][chore] Revert security_scanning/poetry.lock to origin/main
yihwang-nv Apr 18, 2026
c01a1eb
Revert docker image since the staging one is removed
Wanli-Jiang Apr 20, 2026
125c775
[None][feat] Add NemotronHPuzzleConfig modeling
Wanli-Jiang Apr 3, 2026
367a8b8
Upgrade flashinfer to nightly build
Wanli-Jiang Apr 7, 2026
39895b2
Fix reviewer's comment
Wanli-Jiang Apr 7, 2026
39845be
Update mamba SSD to use flashinfer kernel
Wanli-Jiang Apr 7, 2026
8d555a8
Fix python deps for cutlass-dsl
Wanli-Jiang Apr 7, 2026
f479286
Fix error when rebasing commit
Wanli-Jiang Apr 20, 2026
ab9441e
[None][feat] Optimize nemotron-h from python level
Wanli-Jiang Apr 14, 2026
f386993
[None][feat] Optimize causal_conv1d prefill and decode kernels
Wanli-Jiang Apr 15, 2026
8d78eb2
Fix unittests of nemotron-h
Wanli-Jiang Apr 20, 2026
a62c172
[TRTLLM-11585][feat] Add CUTEDSL moe backend for nemotron-h
Wanli-Jiang Apr 9, 2026
5a3ff0d
[TRTLLM-11585][refactor] Rename gather MoE SwiGLU kernel to act_fusion
Wanli-Jiang Apr 17, 2026
13041dd
Fix cutedsl upgrade issues
Wanli-Jiang Apr 20, 2026
e889569
Refactor the routing part in trtllmgen
ChristinaZ Feb 13, 2026
d8a2479
Revise based on review
ChristinaZ Apr 16, 2026
00ddc3d
Update the deepseek routing
ChristinaZ Apr 19, 2026
18af181
[None][fix] Fix Mamba cache correctness under MTP + CUDA-graph padding
Wanli-Jiang Apr 16, 2026
7b2fd39
Use updated docker image
Wanli-Jiang Apr 20, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions ATTRIBUTIONS-Python.md
Original file line number Diff line number Diff line change
Expand Up @@ -5260,8 +5260,7 @@ For more information, please refer to <http://unlicense.org>
- `Source`: https://github.com/tox-dev/py-filelock
- `Tracker`: https://github.com/tox-dev/py-filelock/issues


## flashinfer-python (0.6.6)
## flashinfer-python (0.6.8)

### Licenses
License: `Apache-2.0`
Expand Down Expand Up @@ -33239,7 +33238,7 @@ License: `NVIDIA Proprietary Software`
- `Homepage`: https://developer.nvidia.com/cusparselt


## nvidia-cutlass-dsl (4.2.1)
## nvidia-cutlass-dsl (4.4.2)

### Licenses
License: `None`
Expand Down
2 changes: 2 additions & 0 deletions constraints.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,5 @@ wheel>=0.46.2
tornado>=6.5.5
# WAR against https://github.com/advisories/GHSA-3936-cmfr-pm3m
black>=26.3.1
# Upgrade base image nvidia-cutlass-dsl 4.3.5 to 4.4.2
nvidia-cutlass-dsl>=4.4.2
295 changes: 173 additions & 122 deletions cpp/tensorrt_llm/kernels/causalConv1d/causalConv1d.cu

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -60,12 +60,30 @@ using tensorrt_llm::common::launchWithPdlWhenEnabled;
__VA_ARGS__; \
break; \
} \
case 18: \
{ \
constexpr int TOP_K = 18; \
__VA_ARGS__; \
break; \
} \
case 16: \
{ \
constexpr int TOP_K = 16; \
__VA_ARGS__; \
break; \
} \
case 14: \
{ \
constexpr int TOP_K = 14; \
__VA_ARGS__; \
break; \
} \
case 12: \
{ \
constexpr int TOP_K = 12; \
__VA_ARGS__; \
break; \
} \
case 10: \
{ \
constexpr int TOP_K = 10; \
Expand Down
155 changes: 78 additions & 77 deletions cpp/tensorrt_llm/kernels/moeTopKFuncs.cuh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@

/*
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2025-2026, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -123,8 +123,75 @@ struct TopKIdx<K_, true>
topK[J].compValIdx = pairMin; \
}

template <int N>
struct IsPowerOf2
{
static constexpr bool value = (N > 0) && ((N & (N - 1)) == 0);
};

template <int N, typename RedType>
struct Sort;
struct Sort
{
static_assert(N > 0 && N <= 32, "Sort supports N in [1, 32]");

static __device__ void run(RedType* topK)
{
if constexpr (IsPowerOf2<N>::value)
{
#pragma unroll
for (int k = 2; k <= N; k *= 2)
{
#pragma unroll
for (int j = k / 2; j > 0; j /= 2)
{
#pragma unroll
for (int i = 0; i < N; ++i)
{
int ixj = i ^ j;
if (ixj > i)
{
if ((i & k) == 0)
{
if (topK[i].compValIdx < topK[ixj].compValIdx)
{
auto tmp = topK[i].compValIdx;
topK[i].compValIdx = topK[ixj].compValIdx;
topK[ixj].compValIdx = tmp;
}
}
else
{
if (topK[i].compValIdx > topK[ixj].compValIdx)
{
auto tmp = topK[i].compValIdx;
topK[i].compValIdx = topK[ixj].compValIdx;
topK[ixj].compValIdx = tmp;
}
}
}
}
}
}
}
else
{
#pragma unroll
for (int pass = 0; pass < N; ++pass)
{
#pragma unroll
for (int i = 0; i < N - 1; i += 2)
{
TOPK_SWAP(i, i + 1);
}
#pragma unroll
for (int i = 1; i < N - 1; i += 2)
{
TOPK_SWAP(i, i + 1);
}
}
}
}
};

template <typename RedType>
struct Sort<1, RedType>
Expand Down Expand Up @@ -170,28 +237,27 @@ __forceinline__ __device__ void reduceTopK(cg::thread_block_tile<kWARP_SIZE> con
int32_t (&outIdx)[K], Type value, int32_t idx, Type const minValue, int actualK = K)
{
static_assert(K > 0, "Top K must have K > 0");
static_assert(K < kWARP_SIZE, "Top K must have K < kWARP_SIZE");
static_assert(K <= kWARP_SIZE, "Top K must have K <= kWARP_SIZE");
using RedType = TopKRedType<Type>;
RedType topK{value, idx};
typename RedType::TypeCmp packedMax{};
#pragma unroll
for (int kk = 0; kk < actualK; ++kk) //@todo: check if actualK is correct
for (int kk = 0; kk < actualK; ++kk)
{
topK = kk > 0 && packedMax == topK.compValIdx ? RedType{minValue, idx} : topK;
// get the next largest value
packedMax = topK.reduce(warp);
RedType::unpack(out[kk], outIdx[kk], packedMax);
}
};

template <int K, typename Type, int N, bool IsSorted = false>
__device__ void reduceTopKFunc(cg::thread_block_tile<kWARP_SIZE> const& warp, Type (&out)[K], int32_t (&outIdx)[K],
Type (&value)[N], int32_t (&idx)[N], Type minValue, int actualK = K)
template <int K, typename Type, int N>
__forceinline__ __device__ void reduceTopK(cg::thread_block_tile<kWARP_SIZE> const& warp, Type (&out)[K],
int32_t (&outIdx)[K], Type (&value)[N], int32_t (&idx)[N], Type const minValue, int actualK = K)
{
static_assert(K > 0, "Top K must have K > 0");
static_assert(K < kWARP_SIZE, "Top K must have K < kWARP_SIZE");
static_assert(K <= kWARP_SIZE, "Top K must have K <= kWARP_SIZE");
static_assert(N > 0, "Top K must have N > 0");
static_assert(N < 5, "Only support candidates number less than or equal to 128");
static_assert(N <= 32, "Only support candidates number less than or equal to 32*32=1024");
using RedType = TopKRedType<Type>;
RedType topK[N];
#pragma unroll
Expand All @@ -200,12 +266,9 @@ __device__ void reduceTopKFunc(cg::thread_block_tile<kWARP_SIZE> const& warp, Ty
topK[nn] = RedType{value[nn], idx[nn]};
}

if constexpr (!IsSorted)
{
Sort<N, RedType>::run(topK);
}
Sort<N, RedType>::run(topK);

typename RedType::TypeCmp packedMax{};
#pragma unroll
for (int kk = 0; kk < actualK; ++kk)
{
bool update = kk > 0 && packedMax == topK[0].compValIdx;
Expand All @@ -214,73 +277,11 @@ __device__ void reduceTopKFunc(cg::thread_block_tile<kWARP_SIZE> const& warp, Ty
{
topK[nn] = update && nn == N - 1 ? RedType{minValue, idx[nn]} : update ? topK[nn + 1] : topK[nn];
}
// get the next largest value
packedMax = topK[0].reduce(warp);
RedType::unpack(out[kk], outIdx[kk], packedMax);
}
};

template <int K, typename Type, int N>
__forceinline__ __device__ void reduceTopK(cg::thread_block_tile<kWARP_SIZE> const& warp, Type (&out)[K],
int32_t (&outIdx)[K], Type (&value)[N], int32_t (&idx)[N], Type const minValue, int actualK = K)
{
static_assert(K > 0, "Top K must have K > 0");
static_assert(K < kWARP_SIZE, "Top K must have K < kWARP_SIZE");
static_assert(N > 0, "Top K must have N > 0");
static_assert(N <= 16, "Only support candidates number less than or equal to 16*32=512");
static_assert(
N <= 4 || N % 4 == 0, "Only support candidates number is a multiple of 4*32=128 or less than or equal to 4");
using RedType = TopKRedType<Type>;

if constexpr (N <= 4)
{
reduceTopKFunc<K, Type, N>(warp, out, outIdx, value, idx, minValue, actualK);
}
else
{

constexpr int numLoops = N / 4;
constexpr int numResults = (numLoops * K - 1) / kWARP_SIZE + 1;

Type topKBufferValue[numResults];
int32_t topKBufferIdx[numResults];
int32_t laneIdx = threadIdx.x % kWARP_SIZE;

for (int ii = 0; ii < numResults; ++ii)
{
topKBufferValue[ii] = minValue;
topKBufferIdx[ii] = ii * kWARP_SIZE - 1; //@todo: check if this is correct
}
for (int loop = 0; loop < numLoops; ++loop)
{
int start = loop * 4;
Type topKValue[K];
int32_t topKIdx[K];
Type inValue[4];
int32_t inIdx[4];
for (int i = 0; i < 4; ++i)
{
inValue[i] = value[start + i];
inIdx[i] = idx[start + i];
}
reduceTopKFunc<K, Type, 4>(warp, topKValue, topKIdx, inValue, inIdx, minValue, actualK);
int inOffset = laneIdx % K;
if (laneIdx >= loop * K && laneIdx < (loop + 1) * K)
{
topKBufferValue[0] = topKValue[inOffset];
topKBufferIdx[0] = topKIdx[inOffset];
}
if (loop == numLoops - 1 && (laneIdx < (numLoops * K - kWARP_SIZE)))
{
topKBufferValue[1] = topKValue[inOffset];
topKBufferIdx[1] = topKIdx[inOffset];
}
}

reduceTopKFunc<K, Type, numResults>(warp, out, outIdx, topKBufferValue, topKBufferIdx, minValue, actualK);
}
};

#undef TOPK_SWAP

} // namespace reduce_topk
Expand Down
Loading