rapidsai · tarang-jain · Apr 10, 2026 · Apr 13, 2026 · Apr 13, 2026 · Apr 13, 2026
@@ -91,7 +91,7 @@ struct cuvsKMeansParams {
    */
   int batch_centroids;
 
-  /** Check inertia during iterations for early convergence. */
+  /** Deprecated: this field is ignored and is kept only for ABI compatibility. */
   bool inertia_check;
 
   /**
@@ -108,7 +108,14 @@ struct cuvsKMeansParams {
    * Number of samples to process per GPU batch for the batched (host-data) API.
    * When set to 0, defaults to n_samples (process all at once).
    */
-   int64_t streaming_batch_size;
+  int64_t streaming_batch_size;
+
+  /**
+   * Number of samples to draw for KMeansPlusPlus initialization.
+   * When set to 0, the heuristic min(3 * n_clusters, n_samples) is used for host data,
+   * and n_samples (the full dataset) is used for device data.
+   */
+  int64_t init_size;
 };
 
 typedef struct cuvsKMeansParams* cuvsKMeansParams_t;

@@ -28,7 +28,7 @@ cuvs::cluster::kmeans::params convert_params(const cuvsKMeansParams& params)
   kmeans_params.oversampling_factor = params.oversampling_factor;
   kmeans_params.batch_samples       = params.batch_samples;
   kmeans_params.batch_centroids     = params.batch_centroids;
-  kmeans_params.inertia_check       = params.inertia_check;
+  kmeans_params.init_size             = params.init_size;
   kmeans_params.streaming_batch_size  = params.streaming_batch_size;
   return kmeans_params;
 }
@@ -237,10 +237,11 @@ extern "C" cuvsError_t cuvsKMeansParamsCreate(cuvsKMeansParams_t* params)
       .oversampling_factor  = cpp_params.oversampling_factor,
       .batch_samples        = cpp_params.batch_samples,
       .batch_centroids      = cpp_params.batch_centroids,
-      .inertia_check        = cpp_params.inertia_check,
+      .inertia_check        = false,
       .hierarchical         = false,
       .hierarchical_n_iters = static_cast<int>(cpp_balanced_params.n_iters),
-      .streaming_batch_size           = cpp_params.streaming_batch_size};
+      .streaming_batch_size = cpp_params.streaming_batch_size,
+      .init_size            = cpp_params.init_size};
   });
 }
 

@@ -113,9 +113,14 @@ struct params : base_params {
   int batch_centroids = 0;
 
   /**
-   * If true, check inertia during iterations for early convergence.
+   * Number of samples to randomly draw for the KMeansPlusPlus initialization
+   * step. A random subset of this size is used for centroid seeding.
+   * When set to 0 the default depends on the data location:
+   *   - Device data: n_samples (use the full dataset).
+   *   - Host data:   min(3 * n_clusters, n_samples).
+   * Default: 0.
    */
-  bool inertia_check = false;
+  int64_t init_size = 0;
 
   /**
    * Number of samples to process per GPU batch when fitting with host data.