-
Notifications
You must be signed in to change notification settings - Fork 184
[Cleanup] Combine Batched and Regular KMeans Impl #2015
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
66d7fd3
0a09e6f
99a5730
a077406
d659875
ec2e8b7
03a6473
42a8d9d
86af2fa
d4e4e2c
0819af5
e0f079c
c2f7390
b9c3102
e3956c1
986d78a
a8e1d26
384d054
455b286
5462809
6ba759c
e76eaac
afbefdf
e62a63c
e4f08bf
6e4a8f0
4a8a85c
bbf2a9f
410092c
c515c1e
e8e63ab
30c457c
ab96623
269f23c
80a22ca
ac06b05
0569340
8cac63a
f6df4ae
9fc74b1
dec3dc4
0d030a2
b1c034e
a482495
8ecfdc1
1e1525e
ec22e07
d2e410d
b791c38
a05a006
73293cf
880c7b9
e2035ec
e28c200
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -39,6 +39,8 @@ typedef enum { | |
|
|
||
| /** | ||
| * @brief Hyper-parameters for the kmeans algorithm | ||
| * NB: The inertia_check field is kept for ABI compatibility. Removed in cuvsKMeansParams_v2. | ||
| * TODO: CalVer for the replacement: 26.08 | ||
| */ | ||
| struct cuvsKMeansParams { | ||
| cuvsDistanceType metric; | ||
|
|
@@ -91,7 +93,7 @@ struct cuvsKMeansParams { | |
| */ | ||
| int batch_centroids; | ||
|
|
||
| /** Check inertia during iterations for early convergence. */ | ||
| /** Deprecated, ignored. Kept for ABI compatibility. */ | ||
| bool inertia_check; | ||
|
|
||
| /** | ||
|
|
@@ -108,14 +110,104 @@ struct cuvsKMeansParams { | |
| * Number of samples to process per GPU batch for the batched (host-data) API. | ||
| * When set to 0, defaults to n_samples (process all at once). | ||
| */ | ||
| int64_t streaming_batch_size; | ||
| int64_t streaming_batch_size; | ||
|
|
||
| /** | ||
| * Number of samples to draw for KMeansPlusPlus initialization. | ||
| * When set to 0, uses heuristic min(3 * n_clusters, n_samples) for host data, | ||
| * or n_samples for device data. | ||
| */ | ||
| int64_t init_size; | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Question: adding In practice cuVS consumers go through
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, I was just following the instructions in the doc that we have as a guide for ABI stability. And we also have a CI check for the same, which will fail if there is a breaking change. I also added |
||
| }; | ||
|
|
||
| /** | ||
| * @brief Hyper-parameters for the kmeans algorithm | ||
| * TODO: Remove this after cuvsKMeansParams is replaced in ABI 2.0 | ||
| */ | ||
| struct cuvsKMeansParams_v2 { | ||
| cuvsDistanceType metric; | ||
|
|
||
| /** | ||
| * The number of clusters to form as well as the number of centroids to generate (default:8). | ||
| */ | ||
| int n_clusters; | ||
|
|
||
| /** | ||
| * Method for initialization, defaults to k-means++: | ||
| * - cuvsKMeansInitMethod::KMeansPlusPlus (k-means++): Use scalable k-means++ algorithm | ||
| * to select the initial cluster centers. | ||
| * - cuvsKMeansInitMethod::Random (random): Choose 'n_clusters' observations (rows) at | ||
| * random from the input data for the initial centroids. | ||
| * - cuvsKMeansInitMethod::Array (ndarray): Use 'centroids' as initial cluster centers. | ||
| */ | ||
| cuvsKMeansInitMethod init; | ||
|
|
||
| /** | ||
| * Maximum number of iterations of the k-means algorithm for a single run. | ||
| */ | ||
| int max_iter; | ||
|
|
||
| /** | ||
| * Relative tolerance with regard to inertia to declare convergence. | ||
| */ | ||
| double tol; | ||
|
|
||
| /** | ||
| * Number of times the k-means algorithm will be run with different seeds. | ||
| */ | ||
| int n_init; | ||
|
|
||
| /** | ||
| * Oversampling factor for use in the k-means|| algorithm | ||
| */ | ||
| double oversampling_factor; | ||
|
|
||
| /** | ||
| * batch_samples and batch_centroids are used to tile 1NN computation which is | ||
| * useful to optimize/control the memory footprint | ||
| * Default tile is [batch_samples x n_clusters] i.e. when batch_centroids is 0 | ||
| * then don't tile the centroids | ||
| */ | ||
| int batch_samples; | ||
|
|
||
| /** | ||
| * if 0 then batch_centroids = n_clusters | ||
| */ | ||
| int batch_centroids; | ||
|
|
||
| /** | ||
| * Whether to use hierarchical (balanced) kmeans or not | ||
| */ | ||
| bool hierarchical; | ||
|
|
||
| /** | ||
| * For hierarchical k-means, defines the number of training iterations | ||
| */ | ||
| int hierarchical_n_iters; | ||
|
|
||
| /** | ||
| * Number of samples to process per GPU batch for the batched (host-data) API. | ||
| * When set to 0, defaults to n_samples (process all at once). | ||
| */ | ||
| int64_t streaming_batch_size; | ||
|
|
||
| /** | ||
| * Number of samples to draw for KMeansPlusPlus initialization. | ||
| * When set to 0, uses heuristic min(3 * n_clusters, n_samples) for host data, | ||
| * or n_samples for device data. | ||
| */ | ||
| int64_t init_size; | ||
| }; | ||
|
coderabbitai[bot] marked this conversation as resolved.
|
||
|
|
||
| typedef struct cuvsKMeansParams* cuvsKMeansParams_t; | ||
| typedef struct cuvsKMeansParams_v2* cuvsKMeansParams_v2_t; | ||
|
|
||
| /** | ||
| * @brief Allocate KMeans params, and populate with default values | ||
| * | ||
| * @note In cuVS 26.08 (next ABI major version) this signature will be | ||
| * replaced by cuvsKMeansParamsCreate_v2. | ||
| * | ||
| * @param[in] params cuvsKMeansParams_t to allocate | ||
| * @return cuvsError_t | ||
| */ | ||
|
|
@@ -124,11 +216,33 @@ cuvsError_t cuvsKMeansParamsCreate(cuvsKMeansParams_t* params); | |
| /** | ||
| * @brief De-allocate KMeans params | ||
| * | ||
| * @note In cuVS 26.08 (next ABI major version) this signature will be | ||
| * replaced by cuvsKMeansParamsDestroy_v2. | ||
| * | ||
| * @param[in] params | ||
| * @return cuvsError_t | ||
| */ | ||
| cuvsError_t cuvsKMeansParamsDestroy(cuvsKMeansParams_t params); | ||
|
|
||
| /** | ||
| * @brief Allocate KMeans params | ||
| * | ||
| * Mirrors cuvsKMeansParamsCreate but operates on cuvsKMeansParams_v2. | ||
| * Will become the unsuffixed cuvsKMeansParamsCreate in cuVS 26.08. | ||
| * | ||
| * @param[in] params cuvsKMeansParams_v2_t to allocate | ||
| * @return cuvsError_t | ||
| */ | ||
| cuvsError_t cuvsKMeansParamsCreate_v2(cuvsKMeansParams_v2_t* params); | ||
|
|
||
| /** | ||
| * @brief De-allocate KMeans params allocated by cuvsKMeansParamsCreate_v2. | ||
| * | ||
| * @param[in] params | ||
| * @return cuvsError_t | ||
| */ | ||
| cuvsError_t cuvsKMeansParamsDestroy_v2(cuvsKMeansParams_v2_t params); | ||
|
|
||
| /** | ||
| * @brief Type of k-means algorithm. | ||
| */ | ||
|
|
@@ -154,6 +268,9 @@ typedef enum { CUVS_KMEANS_TYPE_KMEANS = 0, CUVS_KMEANS_TYPE_KMEANS_BALANCED = 1 | |
| * When X is on the host the data is streamed to the GPU in | ||
| * batches controlled by params->streaming_batch_size. | ||
| * | ||
| * @note In cuVS 26.08 (next ABI major version) this signature will be | ||
| * replaced by cuvsKMeansFit_v2. | ||
| * | ||
| * @param[in] res opaque C handle | ||
| * @param[in] params Parameters for KMeans model. | ||
| * @param[in] X Training instances to cluster. The data must | ||
|
|
@@ -181,9 +298,45 @@ cuvsError_t cuvsKMeansFit(cuvsResources_t res, | |
| double* inertia, | ||
| int* n_iter); | ||
|
|
||
| /** | ||
| * @brief Find clusters with k-means algorithm (v2 params layout). | ||
| * | ||
| * Mirrors cuvsKMeansFit but takes cuvsKMeansParams_v2_t. Will become the | ||
| * unsuffixed cuvsKMeansFit in cuVS 26.08. | ||
| * | ||
| * @param[in] res opaque C handle | ||
| * @param[in] params Parameters for KMeans model (v2 layout). | ||
| * @param[in] X Training instances to cluster. The data must | ||
| * be in row-major format. May be on host or | ||
| * device memory. | ||
| * [dim = n_samples x n_features] | ||
| * @param[in] sample_weight Optional weights for each observation in X. | ||
| * Must be on the same memory space as X. | ||
| * [len = n_samples] | ||
| * @param[inout] centroids [in] When init is InitMethod::Array, use | ||
| * centroids as the initial cluster centers. | ||
| * [out] The generated centroids from the | ||
| * kmeans algorithm are stored at the address | ||
| * pointed to by 'centroids'. Must be on device. | ||
| * [dim = n_clusters x n_features] | ||
| * @param[out] inertia Sum of squared distances of samples to their | ||
| * closest cluster center. | ||
| * @param[out] n_iter Number of iterations run. | ||
| */ | ||
| cuvsError_t cuvsKMeansFit_v2(cuvsResources_t res, | ||
| cuvsKMeansParams_v2_t params, | ||
| DLManagedTensor* X, | ||
| DLManagedTensor* sample_weight, | ||
| DLManagedTensor* centroids, | ||
| double* inertia, | ||
| int* n_iter); | ||
|
|
||
| /** | ||
| * @brief Predict the closest cluster each sample in X belongs to. | ||
| * | ||
| * @note In cuVS 26.08 (next ABI major version) this signature will be | ||
| * replaced by cuvsKMeansPredict_v2. | ||
| * | ||
| * @param[in] res opaque C handle | ||
| * @param[in] params Parameters for KMeans model. | ||
| * @param[in] X New data to predict. | ||
|
|
@@ -209,6 +362,37 @@ cuvsError_t cuvsKMeansPredict(cuvsResources_t res, | |
| bool normalize_weight, | ||
| double* inertia); | ||
|
|
||
| /** | ||
| * @brief Predict the closest cluster each sample in X belongs to (v2 params layout). | ||
| * | ||
| * Mirrors cuvsKMeansPredict but takes cuvsKMeansParams_v2_t. Will become the | ||
| * unsuffixed cuvsKMeansPredict in cuVS 26.08. | ||
| * | ||
| * @param[in] res opaque C handle | ||
| * @param[in] params Parameters for KMeans model (v2 layout). | ||
| * @param[in] X New data to predict. | ||
| * [dim = n_samples x n_features] | ||
| * @param[in] sample_weight Optional weights for each observation in X. | ||
| * [len = n_samples] | ||
| * @param[in] centroids Cluster centroids. The data must be in | ||
| * row-major format. | ||
| * [dim = n_clusters x n_features] | ||
| * @param[in] normalize_weight True if the weights should be normalized | ||
| * @param[out] labels Index of the cluster each sample in X | ||
| * belongs to. | ||
| * [len = n_samples] | ||
| * @param[out] inertia Sum of squared distances of samples to | ||
| * their closest cluster center. | ||
| */ | ||
| cuvsError_t cuvsKMeansPredict_v2(cuvsResources_t res, | ||
| cuvsKMeansParams_v2_t params, | ||
| DLManagedTensor* X, | ||
| DLManagedTensor* sample_weight, | ||
| DLManagedTensor* centroids, | ||
| DLManagedTensor* labels, | ||
| bool normalize_weight, | ||
| double* inertia); | ||
|
|
||
| /** | ||
| * @brief Compute cluster cost | ||
| * | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.