Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
54 commits
Select commit Hold shift + click to select a range
66d7fd3
combine impls
tarang-jain Apr 10, 2026
0a09e6f
rm inertia_check
tarang-jain Apr 13, 2026
99a5730
change to warning
tarang-jain Apr 13, 2026
a077406
style
tarang-jain Apr 13, 2026
d659875
add init_size param
tarang-jain Apr 13, 2026
ec2e8b7
Merge branch 'main' into combine-batch
tarang-jain Apr 13, 2026
03a6473
docs
tarang-jain Apr 13, 2026
42a8d9d
Merge branch 'combine-batch' of https://github.com/tarang-jain/cuvs i…
tarang-jain Apr 13, 2026
86af2fa
rm direct cuda api calls
tarang-jain Apr 13, 2026
d4e4e2c
std::swap instead of raft::copy
tarang-jain Apr 14, 2026
0819af5
cache batch norms
tarang-jain Apr 14, 2026
e0f079c
centroid norms can also be cached per iteration
tarang-jain Apr 14, 2026
c2f7390
mg n_iter
tarang-jain Apr 14, 2026
b9c3102
pre-commit
tarang-jain Apr 14, 2026
e3956c1
do not break c abi
tarang-jain Apr 14, 2026
986d78a
Merge branch 'main' into combine-batch
tarang-jain Apr 14, 2026
a8e1d26
Merge branch 'main' into combine-batch
tarang-jain Apr 16, 2026
384d054
fix checkWeight
tarang-jain Apr 21, 2026
455b286
merge upstream:
tarang-jain Apr 21, 2026
5462809
Merge branch 'combine-batch' of https://github.com/tarang-jain/cuvs i…
tarang-jain Apr 21, 2026
6ba759c
fix compilation
tarang-jain Apr 21, 2026
e76eaac
rel_tol
tarang-jain Apr 22, 2026
afbefdf
pass workspace
tarang-jain Apr 22, 2026
e62a63c
Merge branch 'combine-batch' of https://github.com/tarang-jain/cuvs i…
tarang-jain Apr 22, 2026
e4f08bf
style
tarang-jain Apr 22, 2026
6e4a8f0
Merge branch 'main' of https://github.com/rapidsai/cuvs into combine-…
tarang-jain Apr 22, 2026
4a8a85c
do not use batch scratch space; rm update_centroids
tarang-jain Apr 22, 2026
bbf2a9f
move the debug log
tarang-jain Apr 22, 2026
410092c
add new suffixed param struct
tarang-jain Apr 22, 2026
c515c1e
address pr reviews
tarang-jain Apr 22, 2026
e8e63ab
fix docstring
tarang-jain Apr 22, 2026
30c457c
fix wt_sum warning
tarang-jain Apr 22, 2026
ab96623
rm deprecationwarning and instead add FutureWarning:=
tarang-jain Apr 22, 2026
269f23c
unweighted to never materialize batch weights
tarang-jain Apr 22, 2026
80a22ca
add cpp tests
tarang-jain Apr 23, 2026
ac06b05
update cpp tests
tarang-jain Apr 23, 2026
0569340
revert batch norms cache
tarang-jain Apr 23, 2026
8cac63a
increase zero cost threshold
tarang-jain Apr 24, 2026
f6df4ae
apply cuda event plus re-add h_norm_cache
tarang-jain Apr 24, 2026
9fc74b1
rm cosine expanded stuff
tarang-jain Apr 24, 2026
dec3dc4
resolve merge conflicts
tarang-jain Apr 28, 2026
0d030a2
change suffix of the params struct
tarang-jain Apr 28, 2026
b1c034e
replace 06 by 08, add todo and note
tarang-jain Apr 28, 2026
a482495
update to v2
tarang-jain Apr 28, 2026
8ecfdc1
avoid stream sync inside weight sum
tarang-jain Apr 29, 2026
1e1525e
Merge branch 'combine-batch' of https://github.com/tarang-jain/cuvs i…
tarang-jain Apr 29, 2026
ec22e07
empty
tarang-jain Apr 29, 2026
d2e410d
empty
tarang-jain Apr 29, 2026
b791c38
Merge branch 'main' into combine-batch
tarang-jain Apr 29, 2026
a05a006
new signatures with new struct
tarang-jain Apr 29, 2026
73293cf
Merge branch 'combine-batch' of https://github.com/tarang-jain/cuvs i…
tarang-jain Apr 29, 2026
880c7b9
Merge branch 'main' of https://github.com/rapidsai/cuvs into combine-…
tarang-jain Apr 30, 2026
e2035ec
revert change to calls in py and rust; add c tests
tarang-jain Apr 30, 2026
e28c200
Merge branch 'main' into combine-batch
tarang-jain May 1, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
188 changes: 186 additions & 2 deletions c/include/cuvs/cluster/kmeans.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ typedef enum {

/**
* @brief Hyper-parameters for the kmeans algorithm
* NB: The inertia_check field is kept for ABI compatibility. Removed in cuvsKMeansParams_v2.
* TODO: CalVer for the replacement: 26.08
*/
struct cuvsKMeansParams {
cuvsDistanceType metric;
Expand Down Expand Up @@ -91,7 +93,7 @@ struct cuvsKMeansParams {
*/
int batch_centroids;

/** Check inertia during iterations for early convergence. */
/** Deprecated, ignored. Kept for ABI compatibility. */
Comment thread
tarang-jain marked this conversation as resolved.
bool inertia_check;

/**
Expand All @@ -108,14 +110,104 @@ struct cuvsKMeansParams {
* Number of samples to process per GPU batch for the batched (host-data) API.
* When set to 0, defaults to n_samples (process all at once).
*/
int64_t streaming_batch_size;
int64_t streaming_batch_size;

/**
* Number of samples to draw for KMeansPlusPlus initialization.
* When set to 0, uses heuristic min(3 * n_clusters, n_samples) for host data,
* or n_samples for device data.
*/
int64_t init_size;
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Question: adding init_size at the tail changes sizeof(cuvsKMeansParams) and therefore still breaks ABI for any caller that stack-allocates or sizeofs the struct, no?

In practice cuVS consumers go through cuvsKMeansParamsCreate, so this is a soft break and might not be much of an issue, but I wanted to raise it for discussion.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes I was just following the instructions in the doc that we have as a guide for ABI stability. And we also have a CI check for the same, which will fail if there is a breaking change. I also added streaming_batch_size as a new param in the last release, which would have also changed the sizeof(cuvsKMeansParams)

};

/**
 * @brief Hyper-parameters for the kmeans algorithm (v2 layout).
 *
 * Identical to cuvsKMeansParams except that the deprecated, ignored
 * inertia_check field is removed from the layout.
 * TODO: Remove the _v2 suffix once cuvsKMeansParams is replaced in ABI 2.0 (cuVS 26.08).
 */
struct cuvsKMeansParams_v2 {
  /** Distance metric used when assigning samples to centroids. */
  cuvsDistanceType metric;

  /**
   * The number of clusters to form as well as the number of centroids to generate (default:8).
   */
  int n_clusters;

  /**
   * Method for initialization, defaults to k-means++:
   * - cuvsKMeansInitMethod::KMeansPlusPlus (k-means++): Use scalable k-means++ algorithm
   * to select the initial cluster centers.
   * - cuvsKMeansInitMethod::Random (random): Choose 'n_clusters' observations (rows) at
   * random from the input data for the initial centroids.
   * - cuvsKMeansInitMethod::Array (ndarray): Use 'centroids' as initial cluster centers.
   */
  cuvsKMeansInitMethod init;

  /**
   * Maximum number of iterations of the k-means algorithm for a single run.
   */
  int max_iter;

  /**
   * Relative tolerance with regards to inertia to declare convergence.
   */
  double tol;

  /**
   * Number of instances the k-means algorithm will be run with different seeds.
   */
  int n_init;

  /**
   * Oversampling factor for use in the k-means|| algorithm.
   */
  double oversampling_factor;

  /**
   * batch_samples and batch_centroids are used to tile 1NN computation which is
   * useful to optimize/control the memory footprint
   * Default tile is [batch_samples x n_clusters] i.e. when batch_centroids is 0
   * then don't tile the centroids
   */
  int batch_samples;

  /**
   * if 0 then batch_centroids = n_clusters
   */
  int batch_centroids;

  /**
   * Whether to use hierarchical (balanced) kmeans or not
   */
  bool hierarchical;

  /**
   * For hierarchical k-means, defines the number of training iterations
   */
  int hierarchical_n_iters;

  /**
   * Number of samples to process per GPU batch for the batched (host-data) API.
   * When set to 0, defaults to n_samples (process all at once).
   */
  int64_t streaming_batch_size;

  /**
   * Number of samples to draw for KMeansPlusPlus initialization.
   * When set to 0, uses heuristic min(3 * n_clusters, n_samples) for host data,
   * or n_samples for device data.
   */
  int64_t init_size;
};
Comment thread
coderabbitai[bot] marked this conversation as resolved.

typedef struct cuvsKMeansParams* cuvsKMeansParams_t;
typedef struct cuvsKMeansParams_v2* cuvsKMeansParams_v2_t;

/**
* @brief Allocate KMeans params, and populate with default values
*
* @note In cuVS 26.08 (next ABI major version) this signature will be
* replaced by cuvsKMeansParamsCreate_v2.
*
* @param[in] params cuvsKMeansParams_t to allocate
* @return cuvsError_t
*/
Expand All @@ -124,11 +216,33 @@ cuvsError_t cuvsKMeansParamsCreate(cuvsKMeansParams_t* params);
/**
 * @brief De-allocate KMeans params previously allocated with cuvsKMeansParamsCreate.
 *
 * @note In cuVS 26.08 (next ABI major version) this signature will be
 * replaced by cuvsKMeansParamsDestroy_v2.
 *
 * @param[in] params cuvsKMeansParams_t to de-allocate
 * @return cuvsError_t
 */
cuvsError_t cuvsKMeansParamsDestroy(cuvsKMeansParams_t params);

/**
 * @brief Allocate KMeans params, and populate with default values.
 *
 * Mirrors cuvsKMeansParamsCreate but operates on cuvsKMeansParams_v2.
 * Will become the unsuffixed cuvsKMeansParamsCreate in cuVS 26.08.
 *
 * @param[in] params cuvsKMeansParams_v2_t to allocate
 * @return cuvsError_t
 */
cuvsError_t cuvsKMeansParamsCreate_v2(cuvsKMeansParams_v2_t* params);

/**
 * @brief De-allocate KMeans params allocated by cuvsKMeansParamsCreate_v2.
 *
 * @param[in] params cuvsKMeansParams_v2_t to de-allocate
 * @return cuvsError_t
 */
cuvsError_t cuvsKMeansParamsDestroy_v2(cuvsKMeansParams_v2_t params);

/**
* @brief Type of k-means algorithm.
*/
Expand All @@ -154,6 +268,9 @@ typedef enum { CUVS_KMEANS_TYPE_KMEANS = 0, CUVS_KMEANS_TYPE_KMEANS_BALANCED = 1
* When X is on the host the data is streamed to the GPU in
* batches controlled by params->streaming_batch_size.
*
* @note In cuVS 26.08 (next ABI major version) this signature will be
* replaced by cuvsKMeansFit_v2.
*
* @param[in] res opaque C handle
* @param[in] params Parameters for KMeans model.
* @param[in] X Training instances to cluster. The data must
Expand Down Expand Up @@ -181,9 +298,45 @@ cuvsError_t cuvsKMeansFit(cuvsResources_t res,
double* inertia,
int* n_iter);

/**
 * @brief Find clusters with k-means algorithm (v2 params layout).
 *
 * Mirrors cuvsKMeansFit but takes cuvsKMeansParams_v2_t. Will become the
 * unsuffixed cuvsKMeansFit in cuVS 26.08.
 *
 * @param[in] res opaque C handle
 * @param[in] params Parameters for KMeans model (v2 layout).
 * @param[in] X Training instances to cluster. The data must
 *             be in row-major format. May be on host or
 *             device memory.
 *             [dim = n_samples x n_features]
 * @param[in] sample_weight Optional weights for each observation in X.
 *             Must be on the same memory space as X.
 *             [len = n_samples]
 * @param[inout] centroids [in] When init is InitMethod::Array, use
 *             centroids as the initial cluster centers.
 *             [out] The generated centroids from the
 *             kmeans algorithm are stored at the address
 *             pointed by 'centroids'. Must be on device.
 *             [dim = n_clusters x n_features]
 * @param[out] inertia Sum of squared distances of samples to their
 *             closest cluster center.
 * @param[out] n_iter Number of iterations run.
 * @return cuvsError_t
 */
cuvsError_t cuvsKMeansFit_v2(cuvsResources_t res,
                             cuvsKMeansParams_v2_t params,
                             DLManagedTensor* X,
                             DLManagedTensor* sample_weight,
                             DLManagedTensor* centroids,
                             double* inertia,
                             int* n_iter);

/**
* @brief Predict the closest cluster each sample in X belongs to.
*
* @note In cuVS 26.08 (next ABI major version) this signature will be
* replaced by cuvsKMeansPredict_v2.
*
* @param[in] res opaque C handle
* @param[in] params Parameters for KMeans model.
* @param[in] X New data to predict.
Expand All @@ -209,6 +362,37 @@ cuvsError_t cuvsKMeansPredict(cuvsResources_t res,
bool normalize_weight,
double* inertia);

/**
 * @brief Predict the closest cluster each sample in X belongs to (v2 params layout).
 *
 * Mirrors cuvsKMeansPredict but takes cuvsKMeansParams_v2_t. Will become the
 * unsuffixed cuvsKMeansPredict in cuVS 26.08.
 *
 * @param[in] res opaque C handle
 * @param[in] params Parameters for KMeans model (v2 layout).
 * @param[in] X New data to predict.
 *             [dim = n_samples x n_features]
 * @param[in] sample_weight Optional weights for each observation in X.
 *             [len = n_samples]
 * @param[in] centroids Cluster centroids. The data must be in
 *             row-major format.
 *             [dim = n_clusters x n_features]
 * @param[out] labels Index of the cluster each sample in X
 *             belongs to.
 *             [len = n_samples]
 * @param[in] normalize_weight True if the weights should be normalized
 * @param[out] inertia Sum of squared distances of samples to
 *             their closest cluster center.
 * @return cuvsError_t
 */
cuvsError_t cuvsKMeansPredict_v2(cuvsResources_t res,
                                 cuvsKMeansParams_v2_t params,
                                 DLManagedTensor* X,
                                 DLManagedTensor* sample_weight,
                                 DLManagedTensor* centroids,
                                 DLManagedTensor* labels,
                                 bool normalize_weight,
                                 double* inertia);

/**
* @brief Compute cluster cost
*
Expand Down
95 changes: 86 additions & 9 deletions c/src/cluster/kmeans.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@

namespace {

cuvs::cluster::kmeans::params convert_params(const cuvsKMeansParams& params)
// The conversions are templated on the C struct type and reused by both API surfaces.
template <typename ParamsT>
cuvs::cluster::kmeans::params convert_params(const ParamsT& params)
{
auto kmeans_params = cuvs::cluster::kmeans::params();
kmeans_params.metric = static_cast<cuvs::distance::DistanceType>(params.metric);
Expand All @@ -28,22 +30,23 @@ cuvs::cluster::kmeans::params convert_params(const cuvsKMeansParams& params)
kmeans_params.oversampling_factor = params.oversampling_factor;
kmeans_params.batch_samples = params.batch_samples;
kmeans_params.batch_centroids = params.batch_centroids;
kmeans_params.inertia_check = params.inertia_check;
kmeans_params.init_size = params.init_size;
kmeans_params.streaming_batch_size = params.streaming_batch_size;
return kmeans_params;
}

cuvs::cluster::kmeans::balanced_params convert_balanced_params(const cuvsKMeansParams& params)
// Build the C++ balanced-kmeans parameter struct from a C params struct.
// Only metric and the hierarchical iteration count are carried over; all
// other fields keep their C++ defaults.
template <typename ParamsT>
cuvs::cluster::kmeans::balanced_params convert_balanced_params(const ParamsT& params)
{
  cuvs::cluster::kmeans::balanced_params converted{};
  converted.metric  = static_cast<cuvs::distance::DistanceType>(params.metric);
  converted.n_iters = params.hierarchical_n_iters;
  return converted;
}

template <typename T, typename IdxT = int64_t>
template <typename T, typename ParamsT, typename IdxT = int64_t>
void _fit(cuvsResources_t res,
const cuvsKMeansParams& params,
const ParamsT& params,
DLManagedTensor* X_tensor,
DLManagedTensor* sample_weight_tensor,
DLManagedTensor* centroids_tensor,
Expand Down Expand Up @@ -140,9 +143,9 @@ void _fit(cuvsResources_t res,
}
}

template <typename T, typename IdxT = int32_t, typename LabelsT = int32_t>
template <typename T, typename ParamsT, typename IdxT = int32_t, typename LabelsT = int32_t>
void _predict(cuvsResources_t res,
const cuvsKMeansParams& params,
const ParamsT& params,
DLManagedTensor* X_tensor,
DLManagedTensor* sample_weight_tensor,
DLManagedTensor* centroids_tensor,
Expand Down Expand Up @@ -237,10 +240,11 @@ extern "C" cuvsError_t cuvsKMeansParamsCreate(cuvsKMeansParams_t* params)
.oversampling_factor = cpp_params.oversampling_factor,
.batch_samples = cpp_params.batch_samples,
.batch_centroids = cpp_params.batch_centroids,
.inertia_check = cpp_params.inertia_check,
.inertia_check = false,
.hierarchical = false,
.hierarchical_n_iters = static_cast<int>(cpp_balanced_params.n_iters),
.streaming_batch_size = cpp_params.streaming_batch_size};
.streaming_batch_size = cpp_params.streaming_batch_size,
.init_size = cpp_params.init_size};
});
}

Expand Down Expand Up @@ -294,6 +298,79 @@ extern "C" cuvsError_t cuvsKMeansPredict(cuvsResources_t res,
});
}

// Allocate a v2 C params struct populated with the C++ library defaults.
// Exceptions are translated into cuvsError_t by translate_exceptions.
extern "C" cuvsError_t cuvsKMeansParamsCreate_v2(cuvsKMeansParams_v2_t* params)
{
  return cuvs::core::translate_exceptions([=] {
    // Source of truth for defaults: the C++ parameter structs.
    cuvs::cluster::kmeans::params cpp_params;
    cuvs::cluster::kmeans::balanced_params cpp_balanced_params;
    auto* out                 = new cuvsKMeansParams_v2{};
    out->metric               = static_cast<cuvsDistanceType>(cpp_params.metric);
    out->n_clusters           = cpp_params.n_clusters;
    out->init                 = static_cast<cuvsKMeansInitMethod>(cpp_params.init);
    out->max_iter             = cpp_params.max_iter;
    out->tol                  = cpp_params.tol;
    out->n_init               = cpp_params.n_init;
    out->oversampling_factor  = cpp_params.oversampling_factor;
    out->batch_samples        = cpp_params.batch_samples;
    out->batch_centroids      = cpp_params.batch_centroids;
    // Hierarchical (balanced) kmeans is opt-in; default to the standard algorithm.
    out->hierarchical         = false;
    out->hierarchical_n_iters = static_cast<int>(cpp_balanced_params.n_iters);
    out->streaming_batch_size = cpp_params.streaming_batch_size;
    out->init_size            = cpp_params.init_size;
    *params                   = out;
  });
}

// Release a v2 params struct allocated by cuvsKMeansParamsCreate_v2.
// delete on a null pointer is a no-op, so passing NULL is harmless.
extern "C" cuvsError_t cuvsKMeansParamsDestroy_v2(cuvsKMeansParams_v2_t params)
{
  return cuvs::core::translate_exceptions([params] { delete params; });
}

// C entry point for kmeans fit with the v2 params layout.
// Dispatches on the dataset dtype (fp32/fp64) to the templated _fit helper.
extern "C" cuvsError_t cuvsKMeansFit_v2(cuvsResources_t res,
                                        cuvsKMeansParams_v2_t params,
                                        DLManagedTensor* X,
                                        DLManagedTensor* sample_weight,
                                        DLManagedTensor* centroids,
                                        double* inertia,
                                        int* n_iter)
{
  return cuvs::core::translate_exceptions([=] {
    const auto dtype    = X->dl_tensor.dtype;
    const bool is_float = dtype.code == kDLFloat;
    if (is_float && dtype.bits == 32) {
      _fit<float>(res, *params, X, sample_weight, centroids, inertia, n_iter);
    } else if (is_float && dtype.bits == 64) {
      _fit<double>(res, *params, X, sample_weight, centroids, inertia, n_iter);
    } else {
      // Only fp32/fp64 datasets are supported by the C API.
      RAFT_FAIL("Unsupported dataset DLtensor dtype: %d and bits: %d", dtype.code, dtype.bits);
    }
  });
}

// C entry point for kmeans predict with the v2 params layout.
// Dispatches on the dataset dtype (fp32/fp64) to the templated _predict helper.
extern "C" cuvsError_t cuvsKMeansPredict_v2(cuvsResources_t res,
                                            cuvsKMeansParams_v2_t params,
                                            DLManagedTensor* X,
                                            DLManagedTensor* sample_weight,
                                            DLManagedTensor* centroids,
                                            DLManagedTensor* labels,
                                            bool normalize_weight,
                                            double* inertia)
{
  return cuvs::core::translate_exceptions([=] {
    const auto dtype    = X->dl_tensor.dtype;
    const bool is_float = dtype.code == kDLFloat;
    if (is_float && dtype.bits == 32) {
      _predict<float>(res, *params, X, sample_weight, centroids, labels, normalize_weight, inertia);
    } else if (is_float && dtype.bits == 64) {
      _predict<double>(
        res, *params, X, sample_weight, centroids, labels, normalize_weight, inertia);
    } else {
      // Only fp32/fp64 datasets are supported by the C API.
      RAFT_FAIL("Unsupported dataset DLtensor dtype: %d and bits: %d", dtype.code, dtype.bits);
    }
  });
}

extern "C" cuvsError_t cuvsKMeansClusterCost(cuvsResources_t res,
DLManagedTensor* X,
DLManagedTensor* centroids,
Expand Down
Loading
Loading