Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
146 commits
Select commit Hold shift + click to select a range
e90929a
Add library for IVF-RaBitQ
jamxia155 Nov 3, 2025
4a5c6dc
Add benchmarking executables for IVF-RaBitQ
jamxia155 Nov 3, 2025
ec88c25
Add --executable-dir option
tfeher Feb 12, 2025
bcc80e2
Add documentation about how to describe new datasets
tfeher Feb 12, 2025
4db552f
fix style
tfeher Feb 12, 2025
fbfe242
update docstring
tfeher Apr 1, 2025
5c53816
Do not prompt for executable-dir
tfeher Jul 9, 2025
285a9dd
Enable IVF-RaBitQ in cuvs_bench python wrapper
jamxia155 Nov 3, 2025
9f5a3d2
Use SPDX for copyright headers
jamxia155 Nov 3, 2025
133f808
Add documentation for 3rd-party dependency
jamxia155 Nov 3, 2025
09075bc
Add FAISS CPU IVF-RaBitQ algorithm
jamxia155 Nov 4, 2025
4884e89
Enable FAISS CPU IVF-RaBitQ in cuvs_bench python wrapper
jamxia155 Nov 5, 2025
cbd2a05
Rename parameter for consistency
jamxia155 Nov 6, 2025
7249455
Fix cuVS build issues with RaBitQ (#4)
Stardust-SJF Nov 12, 2025
1a26a71
Handle host and device data in build
jamxia155 Nov 12, 2025
d49bd0b
Merge remote-tracking branch 'Stardust-SJF_fork/jamxia_cuvs_ivf_rabit…
jamxia155 Nov 13, 2025
794b421
Disable separable compilation for IVF-RaBitQ code
jamxia155 Nov 13, 2025
f1fc50b
Brev benchmark instructions (#1)
tfeher Nov 13, 2025
928945b
Remove outdated instructions
jamxia155 Nov 13, 2025
9a5c0ef
Plumbing for passing raft handle to IVF-RaBitQ
jamxia155 Nov 18, 2025
389c917
Update rotator_gpu class
jamxia155 Nov 18, 2025
114d560
Migrate RotatorGPU class to RAFT
jamxia155 Nov 19, 2025
db8a437
Remove cuBLAS from RotatorGPU class
jamxia155 Nov 19, 2025
b263628
Remove Eigen dependency in `DataQuantizerGPU`
jamxia155 Nov 20, 2025
8404b99
Remove uses of Eigen library
jamxia155 Nov 20, 2025
035e978
Remove dependency `Eigen`
jamxia155 Nov 20, 2025
c73ef60
(WIP) Add tests for IVF-RaBitQ
jamxia155 Nov 21, 2025
6cac52d
Merge remote-tracking branch 'upstream/main' into jamxia_cuvs_ivf_rabitq
jamxia155 Nov 21, 2025
f03ec47
Replace header guards with `#pragma once`
jamxia155 Nov 21, 2025
f0be854
Add namespace
jamxia155 Nov 22, 2025
1c444c7
Add tests for IVF-RaBitQ
jamxia155 Nov 24, 2025
a680ca7
Check errors on CUDA API calls and kernel launches
jamxia155 Nov 24, 2025
596997d
Rename class member cudaStream_t in RotatorGPU
jamxia155 Nov 24, 2025
1cb8299
Migrate IVFGPU to async CUDA calls and launches
jamxia155 Nov 24, 2025
ef476ba
Avoid using default stream
jamxia155 Nov 24, 2025
a844b9b
Use async CUDA calls in InitializerGPU classes
jamxia155 Nov 25, 2025
aa0a195
Use async CUDA calls for device results pool
jamxia155 Nov 25, 2025
b2f8a7b
Use async calls in DataQuantizerGPU class
jamxia155 Nov 25, 2025
e2d5d1a
Use async CUDA calls in BatchedQueryGatherer class
jamxia155 Nov 25, 2025
c698e18
Use async calls in SearcherGPU class
jamxia155 Nov 25, 2025
35e9249
Add class members for resource handle and stream
jamxia155 Nov 26, 2025
c381ab2
Clean up member ownership and access in IVFGPU
jamxia155 Nov 26, 2025
abc657b
Use RAFT containers in RotatorGPU class
jamxia155 Nov 26, 2025
0883c37
Use RAFT containers in InitializerGPU class
jamxia155 Nov 26, 2025
c7ecd1c
Change mdarray index type to int64_t
jamxia155 Nov 27, 2025
55bb4fa
Use RAFT containers in IVFGPU class
jamxia155 Nov 27, 2025
f2d159a
Use RAFT containers in DeviceResultPool struct
jamxia155 Dec 2, 2025
cc28ed8
Use RAFT containers and smart pointers in SearcherGPU class
jamxia155 Dec 3, 2025
384da65
Move IVF-RaBitQ internal headers to cpp/src
jamxia155 Dec 3, 2025
0885531
Revert "Move IVF-RaBitQ internal headers to cpp/src"
jamxia155 Dec 3, 2025
6350521
Synchronize with the updates of IVF-RaBitQ-GPU. (#6)
Stardust-SJF Dec 3, 2025
0421efa
Move IVF-RaBitQ internal headers to cpp/src
jamxia155 Dec 3, 2025
984f2df
Merge remote-tracking branch 'Stardust-SJF_fork/jamxia_cuvs_ivf_rabit…
jamxia155 Dec 3, 2025
5ac201a
Fix a bug
jamxia155 Dec 3, 2025
b10ecae
Remove debug code
jamxia155 Dec 3, 2025
f0a61d6
Remove commented-out code
jamxia155 Dec 4, 2025
f0dc124
Fix memory leaks
jamxia155 Dec 4, 2025
f587114
Initialize elements in padded queries
jamxia155 Dec 4, 2025
3214bca
Add default initializations for class members
jamxia155 Dec 4, 2025
b4970f7
Remove unused declarations
jamxia155 Dec 4, 2025
d128515
Remove unused utils code
jamxia155 Dec 5, 2025
5bf3215
Only create padded queries matrix if needed
jamxia155 Dec 5, 2025
0548b54
Merge remote-tracking branch 'upstream/main' into jamxia_cuvs_ivf_rabitq
jamxia155 Dec 5, 2025
ef94d1f
Remove unused code in memory.hpp
jamxia155 Dec 5, 2025
14c9df5
Remove unused code in InitializerGPU class
jamxia155 Dec 5, 2025
5fc2e00
Remove BatchedQueryGatherer class
jamxia155 Dec 5, 2025
1df4385
Remove unused code from IVFGPU class
jamxia155 Dec 5, 2025
7bff107
Remove unused code in DataQuantizerGPU class
jamxia155 Dec 5, 2025
9ee467e
Remove unused code in SearcherGPU class
jamxia155 Dec 5, 2025
dcd36d3
Remove pool_gpu.cu/.cuh (no longer used)
jamxia155 Dec 5, 2025
322df78
Update API for 1-bit quantization support
jamxia155 Dec 5, 2025
8a3096e
Replace cuBLAS calls with templated RAFT wrapper
jamxia155 Dec 6, 2025
443f1ca
Set max dynamic shared mem size as needed
jamxia155 Dec 8, 2025
fb839c0
Support 1-bit search for all search modes (#7)
Stardust-SJF Dec 10, 2025
6aee332
Update API for 1-bit RaBitQ
jamxia155 Dec 10, 2025
e401067
Split up SearcherGPU impl
jamxia155 Dec 10, 2025
9402e9e
Consolidate kernel parameters
jamxia155 Dec 10, 2025
0d5d678
Remove a comment
jamxia155 Dec 11, 2025
80b9fac
Enable handling of large top-k value (up to 16384).
jamxia155 Dec 11, 2025
a43fda2
Check that topk value is below max supported.
jamxia155 Dec 16, 2025
35732b2
Remove commented code
jamxia155 Dec 16, 2025
078fb06
Merge remote-tracking branch 'upstream/main' into jamxia_cuvs_ivf_rabitq
jamxia155 Dec 18, 2025
eb1efbb
Updates after merge from upstream
jamxia155 Dec 18, 2025
1298494
Remove unnecessary code
jamxia155 Dec 18, 2025
3b4e71e
Compute max_cluster_length in load_transposed
jamxia155 Dec 18, 2025
a129b78
Add condition to updating threshold
jamxia155 Dec 22, 2025
475179a
Do not use block sort for large top-k
jamxia155 Dec 23, 2025
ccbd0ff
Fix a bug in thresholding
jamxia155 Dec 24, 2025
87809cf
Remove thresholding for some search code paths
jamxia155 Dec 24, 2025
444058b
Reduce atomics and shared mem in LUT32 search mode
jamxia155 Dec 26, 2025
144a9e9
Reduce atomics and shared mem in LUT16 search mode
jamxia155 Dec 27, 2025
e3af96e
Reduce atomics and shared mem in QUANT4/8 search
jamxia155 Dec 29, 2025
6848b3a
Remove unnecessary shared memory variables
jamxia155 Dec 29, 2025
07e7a88
Replace device mem allocation with RAFT containers
jamxia155 Jan 1, 2026
f93550b
Replace some raw allocations with RAFT containers
jamxia155 Jan 4, 2026
ada5c0b
Minimize allocation for intermediate output
jamxia155 Jan 6, 2026
c056510
Merge remote-tracking branch 'upstream/main' into jamxia_cuvs_ivf_rabitq
jamxia155 Jan 12, 2026
3453328
Rename library
jamxia155 Jan 13, 2026
5ae864b
Optimize recall calculation for large k
jamxia155 Jan 20, 2026
1c87662
Add build API accepting host dataset
jamxia155 Jan 21, 2026
c5e97e8
Enable subsampling of raw dataset for clustering
jamxia155 Feb 3, 2026
03e93db
Change default kmeans_trainset_fraction for tests
jamxia155 Feb 4, 2026
70068eb
Optimize recall calculation
jamxia155 Feb 4, 2026
7634ae3
Preallocate hashset if possible
jamxia155 Feb 5, 2026
29ecc55
Add API for returning index length
jamxia155 Feb 11, 2026
f6e2a86
Improve error reporting
jamxia155 Feb 11, 2026
cbb7154
Merge remote-tracking branch 'upstream/main' into jamxia_cuvs_ivf_rabitq
jamxia155 Feb 12, 2026
0f0c444
Update member declaration order
jamxia155 Feb 17, 2026
8ade5e9
Merge remote-tracking branch 'upstream/main' into jamxia_cuvs_ivf_rabitq
jamxia155 Feb 17, 2026
dfc0708
Merge remote-tracking branch 'upstream/main' into jamxia_cuvs_ivf_rabitq
jamxia155 Feb 22, 2026
bcb230d
Fix #includes
jamxia155 Feb 22, 2026
107659d
Use safely_launch_kernel_with_smem_size
jamxia155 Feb 22, 2026
f4dd5a8
Fix linalg/gemm include
tfeher Feb 25, 2026
1553d56
Revert "Update member declaration order"
jamxia155 Feb 25, 2026
d42b61f
Revert "Preallocate hashset if possible"
jamxia155 Feb 25, 2026
2fd96a1
Revert "Optimize recall calculation"
jamxia155 Feb 25, 2026
9503329
Revert "Optimize recall calculation for large k"
jamxia155 Feb 25, 2026
3ccf526
Refactor JIT LTO kernel generation (#1812)
KyleFromNVIDIA Feb 23, 2026
3f8b76c
Merge remote-tracking branch 'upstream/main' into cuvs_ivf_rabitq
jamxia155 Mar 3, 2026
fb32dcb
Remove feature branch-specific content
jamxia155 Mar 3, 2026
443aa93
Restore cuVS Bench CMakeLists
jamxia155 Mar 3, 2026
fb26176
Fix style check failures
jamxia155 Mar 3, 2026
994e951
Merge remote-tracking branch 'upstream/main' into cuvs_ivf_rabitq
jamxia155 Mar 4, 2026
5684f7b
Remove outdated algo name
jamxia155 Mar 18, 2026
bbeb058
Minor updates based on review comments
jamxia155 Mar 18, 2026
75b7b19
Merge remote-tracking branch 'upstream/main' into cuvs_ivf_rabitq
jamxia155 Mar 18, 2026
adad8bc
Update index build parameters
jamxia155 Mar 18, 2026
dd6b4ba
Update index build parameter default
jamxia155 Mar 18, 2026
088665b
Remove outdated parameter
jamxia155 Mar 18, 2026
04ae19c
Remove unnecessary overload
jamxia155 Mar 18, 2026
4496d93
Let `build` API return the built index
jamxia155 Mar 19, 2026
bef16e9
Merge remote-tracking branch 'upstream/main' into cuvs_ivf_rabitq
jamxia155 Mar 19, 2026
485eade
Revert change to unrelated file
jamxia155 Mar 19, 2026
9761933
Use public header for kmeans clustering
jamxia155 Mar 19, 2026
09c42a5
Remove unnecessary casting
jamxia155 Mar 19, 2026
8ec06e3
Add streaming construction of IVF-RaBitQ index
jamxia155 Mar 21, 2026
f00624b
Add force_streaming parameter to IVF-RaBitQ build
jamxia155 Mar 21, 2026
e8f8984
Expose force_streaming in benchmark configuration parser
jamxia155 Mar 21, 2026
9e76b26
Clarify force_streaming only applies to datasets on host
jamxia155 Mar 21, 2026
3543caa
Add forced streaming test for IVF-RaBitQ
jamxia155 Mar 22, 2026
24e1e3d
Code cleanup
jamxia155 Mar 23, 2026
e3ef24b
Refactor IVF-RaBitQ: Remove batch_flag and improve encapsulation
jamxia155 Mar 23, 2026
52793fc
Consolidate quantizer_gpu implementation into single file
jamxia155 Mar 23, 2026
69299d1
Replace CUDA memory calls with RMM/RAFT primitives
jamxia155 Apr 23, 2026
96d9460
Merge remote-tracking branch 'upstream/main' into cuvs_ivf_rabitq
jamxia155 Apr 23, 2026
e28491b
Fixes after merging in main branch
jamxia155 Apr 23, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,32 @@ if(NOT BUILD_CPU_ONLY)
)
target_link_libraries(cuvs_cpp_headers INTERFACE raft::raft rmm::rmm)

# Static library with the IVF-RaBitQ GPU index implementation (internal sources,
# linked into the main cuvs target below).
add_library(
  ivf_rabitq STATIC
  src/neighbors/ivf_rabitq/gpu_index/ivf_gpu.cu
  src/neighbors/ivf_rabitq/gpu_index/initializer_gpu.cu
  src/neighbors/ivf_rabitq/gpu_index/quantizer_gpu.cu
  src/neighbors/ivf_rabitq/gpu_index/rotator_gpu.cu
  src/neighbors/ivf_rabitq/gpu_index/searcher_gpu.cu
  src/neighbors/ivf_rabitq/gpu_index/searcher_gpu_shared_mem_opt.cu
  src/neighbors/ivf_rabitq/gpu_index/searcher_gpu_quantize_query.cu
  src/neighbors/ivf_rabitq/utils/searcher_gpu_utils.cu
)

# Dependencies are PRIVATE: consumers only need the public headers exposed below.
target_link_libraries(ivf_rabitq PRIVATE OpenMP::OpenMP_CXX CUDA::cudart raft::raft rmm)

# BUILD_INTERFACE only: this internal target is not installed/exported.
target_include_directories(
  ivf_rabitq PUBLIC "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>"
)

# CUDA: device debug info in Debug builds, extended lambdas / relaxed constexpr,
# and OpenMP for both the host compiler invoked by nvcc and plain C++ TUs.
target_compile_options(
  ivf_rabitq
  PRIVATE $<$<COMPILE_LANGUAGE:CUDA>: $<$<CONFIG:Debug>:-G;-g> --extended-lambda
  --expt-relaxed-constexpr -Xcompiler=-fopenmp > $<$<COMPILE_LANGUAGE:CXX>:-fopenmp>
)

# Selects the higher-accuracy fast-scan code path in the IVF-RaBitQ sources.
target_compile_definitions(ivf_rabitq PRIVATE HIGH_ACC_FAST_SCAN)

generate_inst_matrix(
cagra_search_inst_files
MATRIX_JSON_FILE "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/cagra_search_matrix.json"
Expand Down Expand Up @@ -897,6 +923,7 @@ if(NOT BUILD_CPU_ONLY)
src/neighbors/ivf_pq/detail/ivf_pq_process_and_fill_codes.cu
${ivf_pq_search_inst_files}
${ivf_pq_transform_inst_files}
src/neighbors/ivf_rabitq.cu
src/neighbors/knn_merge_parts.cu
src/neighbors/nn_descent.cu
${nn_descent_inst_files}
Expand Down Expand Up @@ -1040,6 +1067,7 @@ if(NOT BUILD_CPU_ONLY)
$<COMPILE_ONLY:cuco::cuco>
CUDA::nvJitLink
CUDA::nvrtc
ivf_rabitq
)
set_property(TARGET cuvs PROPERTY NO_CUDART_DEP ON)

Expand Down
18 changes: 18 additions & 0 deletions cpp/bench/ann/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@ option(CUVS_ANN_BENCH_USE_FAISS_CPU_IVF_FLAT "Include faiss' cpu ivf flat algori
ON
)
option(CUVS_ANN_BENCH_USE_FAISS_CPU_IVF_PQ "Include faiss' cpu ivf pq algorithm in benchmark" ON)
option(CUVS_ANN_BENCH_USE_FAISS_CPU_IVF_RABITQ
"Include faiss' cpu ivf rabitq algorithm in benchmark" OFF
)
option(CUVS_ANN_BENCH_USE_FAISS_CPU_HNSW_FLAT "Include faiss' hnsw algorithm in benchmark" ON)
option(CUVS_ANN_BENCH_USE_CUVS_IVF_FLAT "Include cuVS ivf flat algorithm in benchmark" ON)
option(CUVS_ANN_BENCH_USE_CUVS_IVF_PQ "Include cuVS ivf pq algorithm in benchmark" ON)
Expand All @@ -45,6 +48,7 @@ option(CUVS_ANN_BENCH_SINGLE_EXE
"Make a single executable with benchmark as shared library modules" OFF
)
option(CUVS_KNN_BENCH_USE_CUVS_BRUTE_FORCE "Include cuVS brute force knn in benchmark" ON)
option(CUVS_ANN_BENCH_USE_CUVS_IVF_RABITQ "Include cuVS ivf RaBitQ algorithm in benchmark" ON)

# ##################################################################################################
# * Process options ----------------------------------------------------------
Expand Down Expand Up @@ -244,6 +248,13 @@ if(CUVS_ANN_BENCH_USE_CUVS_IVF_FLAT)
)
endif()

if(CUVS_ANN_BENCH_USE_CUVS_IVF_RABITQ)
ConfigureAnnBench(
NAME CUVS_IVF_RABITQ PATH src/cuvs/cuvs_benchmark.cu src/cuvs/cuvs_ivf_rabitq.cu LINKS cuvs
ivf_rabitq
)
endif()

if(CUVS_ANN_BENCH_USE_CUVS_BRUTE_FORCE)
ConfigureAnnBench(NAME CUVS_BRUTE_FORCE PATH src/cuvs/cuvs_benchmark.cu LINKS cuvs)
endif()
Expand Down Expand Up @@ -309,6 +320,13 @@ if(CUVS_ANN_BENCH_USE_FAISS_CPU_IVF_PQ)
)
endif()

if(CUVS_ANN_BENCH_USE_FAISS_CPU_IVF_RABITQ)
ConfigureAnnBench(
NAME FAISS_CPU_IVF_RABITQ PATH src/faiss/faiss_cpu_benchmark.cpp LINKS ${CUVS_FAISS_TARGETS}
cuvs ivf_rabitq
)
endif()

if(CUVS_ANN_BENCH_USE_FAISS_CPU_HNSW_FLAT)
ConfigureAnnBench(
NAME FAISS_CPU_HNSW_FLAT PATH src/faiss/faiss_cpu_benchmark.cpp LINKS ${CUVS_FAISS_TARGETS}
Expand Down
47 changes: 47 additions & 0 deletions cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,12 @@ extern template class cuvs::bench::cuvs_ivf_pq<float, int64_t>;
extern template class cuvs::bench::cuvs_ivf_pq<uint8_t, int64_t>;
extern template class cuvs::bench::cuvs_ivf_pq<int8_t, int64_t>;
#endif
#if defined(CUVS_ANN_BENCH_USE_CUVS_IVF_RABITQ)
#include "cuvs_ivf_rabitq_wrapper.h"
#endif
#ifdef CUVS_ANN_BENCH_USE_CUVS_IVF_RABITQ
extern template class cuvs::bench::cuvs_ivf_rabitq<float, int64_t>;
#endif
#if defined(CUVS_ANN_BENCH_USE_CUVS_CAGRA) || defined(CUVS_ANN_BENCH_USE_CUVS_CAGRA_HNSWLIB) || \
defined(CUVS_ANN_BENCH_USE_CUVS_CAGRA_DISKANN)
#include "cuvs_cagra_wrapper.h"
Expand Down Expand Up @@ -178,6 +184,47 @@ void parse_search_param(const nlohmann::json& conf,
}
#endif

#if defined(CUVS_ANN_BENCH_USE_CUVS_IVF_RABITQ)
/// Populate IVF-RaBitQ build parameters from a benchmark JSON configuration.
/// Keys absent from `conf` leave the corresponding defaults in `param` untouched;
/// present keys override them (a type mismatch throws nlohmann::json::type_error,
/// same as the previous `at()`-based parsing).
template <typename T, typename IdxT>
void parse_build_param(const nlohmann::json& conf,
                       typename cuvs::bench::cuvs_ivf_rabitq<T, IdxT>::build_param& param)
{
  param.n_lists        = conf.value("nlist", param.n_lists);
  param.kmeans_n_iters = conf.value("niter", param.kmeans_n_iters);
  param.max_train_points_per_cluster =
    conf.value("max_points_per_cluster", param.max_train_points_per_cluster);
  param.bits_per_dim       = conf.value("bits_per_dim", param.bits_per_dim);
  param.fast_quantize_flag = conf.value("fast_quantize_flag", param.fast_quantize_flag);
  param.force_streaming    = conf.value("force_streaming", param.force_streaming);
}

/// Populate IVF-RaBitQ search parameters from a benchmark JSON configuration.
/// Recognized keys: "nprobe" (number of IVF lists probed per query) and "mode"
/// (one of "lut16", "lut32", "quant4", "quant8"); an unknown mode string throws.
template <typename T, typename IdxT>
void parse_search_param(const nlohmann::json& conf,
                        typename cuvs::bench::cuvs_ivf_rabitq<T, IdxT>::search_param& param)
{
  if (conf.contains("nprobe")) { param.rabitq_param.n_probes = conf.at("nprobe"); }

  if (conf.contains("mode")) {
    const std::string mode = conf.at("mode");
    using search_mode      = cuvs::neighbors::ivf_rabitq::search_mode;
    // Map the mode string to the enum; reject anything unrecognized.
    param.rabitq_param.mode = [&mode]() -> search_mode {
      if (mode == "lut16") { return search_mode::LUT16; }
      if (mode == "lut32") { return search_mode::LUT32; }
      if (mode == "quant4") { return search_mode::QUANT4; }
      if (mode == "quant8") { return search_mode::QUANT8; }
      throw std::runtime_error("mode: '" + mode +
                               "', should be either 'lut16', 'lut32', 'quant4' or 'quant8'");
    }();
  }
}
#endif

#if defined(CUVS_ANN_BENCH_USE_CUVS_CAGRA) || defined(CUVS_ANN_BENCH_USE_CUVS_CAGRA_HNSWLIB) || \
defined(CUVS_ANN_BENCH_USE_CUVS_MG) || defined(CUVS_ANN_BENCH_USE_CUVS_CAGRA_DISKANN)
template <typename T, typename IdxT>
Expand Down
21 changes: 20 additions & 1 deletion cpp/bench/ann/src/cuvs/cuvs_benchmark.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION.
* SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
* SPDX-License-Identifier: Apache-2.0
*/

Expand Down Expand Up @@ -91,6 +91,15 @@ auto create_algo(const std::string& algo_name,
a = std::make_unique<cuvs::bench::cuvs_ivf_pq<T, int64_t>>(metric, dim, param);
}
#endif
#ifdef CUVS_ANN_BENCH_USE_CUVS_IVF_RABITQ
if constexpr (std::is_same_v<T, float>) {
if (algo_name == "cuvs_ivf_rabitq") {
typename cuvs::bench::cuvs_ivf_rabitq<T, int64_t>::build_param param;
parse_build_param<T, int64_t>(conf, param);
a = std::make_unique<cuvs::bench::cuvs_ivf_rabitq<T, int64_t>>(metric, dim, param);
}
}
#endif
#ifdef CUVS_ANN_BENCH_USE_CUVS_CAGRA
if (algo_name == "raft_cagra" || algo_name == "cuvs_cagra") {
typename cuvs::bench::cuvs_cagra<T, uint32_t>::build_param param;
Expand Down Expand Up @@ -158,6 +167,16 @@ auto create_search_param(const std::string& algo_name, const nlohmann::json& con
return param;
}
#endif
#ifdef CUVS_ANN_BENCH_USE_CUVS_IVF_RABITQ
if constexpr (std::is_same_v<T, float>) {
if (algo_name == "cuvs_ivf_rabitq") {
auto param =
std::make_unique<typename cuvs::bench::cuvs_ivf_rabitq<T, int64_t>::search_param>();
parse_search_param<T, int64_t>(conf, *param);
return param;
}
}
#endif
#ifdef CUVS_ANN_BENCH_USE_CUVS_CAGRA
if (algo_name == "raft_cagra" || algo_name == "cuvs_cagra") {
auto param = std::make_unique<typename cuvs::bench::cuvs_cagra<T, uint32_t>::search_param>();
Expand Down
9 changes: 9 additions & 0 deletions cpp/bench/ann/src/cuvs/cuvs_ivf_rabitq.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION.
* SPDX-License-Identifier: Apache-2.0
*/
#include "cuvs_ivf_rabitq_wrapper.h"

namespace cuvs::bench {
// Explicit instantiation of the benchmark wrapper for the only supported
// data/index type combination; matches the `extern template` declaration in
// cuvs_ann_bench_param_parser.h.
template class cuvs_ivf_rabitq<float, int64_t>;
}  // namespace cuvs::bench
166 changes: 166 additions & 0 deletions cpp/bench/ann/src/cuvs/cuvs_ivf_rabitq_wrapper.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION.
* SPDX-License-Identifier: Apache-2.0
*/
#pragma once

#include "../common/ann_types.hpp"
#include "cuvs_ann_bench_utils.h"

#include <cuvs/neighbors/ivf_rabitq.hpp>

#include <raft/core/device_mdarray.hpp>
#include <raft/core/device_mdspan.hpp>
#include <raft/core/device_resources.hpp>
#include <raft/core/host_mdarray.hpp>
#include <raft/core/host_mdspan.hpp>
#include <raft/core/logger.hpp>
#include <raft/core/resource/cuda_stream.hpp>
#include <raft/linalg/unary_op.cuh>
#include <raft/util/cudart_utils.hpp>
#include <rmm/cuda_stream_pool.hpp>

#include <cassert>
#include <memory>
#include <optional>
#include <string>
#include <type_traits>

namespace cuvs::bench {

/// Benchmark wrapper adapting the cuVS IVF-RaBitQ index to the ANN-bench
/// `algo`/`algo_gpu` interfaces. Only T=float, IdxT=int64_t is instantiated.
template <typename T, typename IdxT>
class cuvs_ivf_rabitq : public algo<T>, public algo_gpu {
 public:
  using search_param_base = typename algo<T>::search_param;
  using algo<T>::dim_;

  struct search_param : public search_param_base {
    // Parameters forwarded to cuvs::neighbors::ivf_rabitq::search.
    cuvs::neighbors::ivf_rabitq::search_params rabitq_param;
    // Refinement oversampling factor; the raw dataset is only needed when > 1.
    float refine_ratio = 1.0f;
    [[nodiscard]] auto needs_dataset() const -> bool override { return refine_ratio > 1.0f; }
  };

  using build_param = cuvs::neighbors::ivf_rabitq::index_params;

  cuvs_ivf_rabitq(Metric metric, int dim, const build_param& param)
    : algo<T>(metric, dim), index_params_(param), dimension_(dim)
  {
  }

  void build(const T* dataset, size_t nrow) final;

  void set_search_param(const search_param_base& param, const void* filter_bitset) override;
  void set_search_dataset(const T* dataset, size_t nrow) override;

  void search(const T* queries,
              int batch_size,
              int k,
              algo_base::index_type* neighbors,
              float* distances) const override;

  [[nodiscard]] auto get_sync_stream() const noexcept -> cudaStream_t override
  {
    return handle_.get_sync_stream();
  }

  // Benchmark harness memory placement: the dataset is provided in host memory
  // (the IVF-RaBitQ build accepts host pointers), queries in device memory.
  [[nodiscard]] auto get_preference() const -> algo_property override
  {
    algo_property property;
    property.dataset_memory_type = MemoryType::kHost;
    property.query_memory_type   = MemoryType::kDevice;
    return property;
  }
  void save(const std::string& file) const override;
  void load(const std::string&) override;
  std::unique_ptr<algo<T>> copy() override;

 private:
  // handle_ must go first to make sure it dies last and all memory allocated in pool
  configured_raft_resources handle_{};
  build_param index_params_;
  cuvs::neighbors::ivf_rabitq::search_params search_params_;
  // Shared so that copy() clones share the (immutable after build) index.
  std::shared_ptr<cuvs::neighbors::ivf_rabitq::index<IdxT>> index_;
  int dimension_;
  // NOTE(review): refine_ratio_ and dataset_ are stored for refinement but do
  // not appear to be consumed in search() below — confirm refinement support.
  float refine_ratio_ = 1.0;
  // Non-owning view; the caller keeps the dataset alive (set_search_dataset).
  raft::device_matrix_view<const T, IdxT> dataset_;
};

/// Serialize the built index to `file` via the cuVS IVF-RaBitQ API.
/// Precondition: build() or load() was called (index_ is non-null).
template <typename T, typename IdxT>
void cuvs_ivf_rabitq<T, IdxT>::save(const std::string& file) const
{
  cuvs::neighbors::ivf_rabitq::serialize(handle_, file, *index_);
}

/// Deserialize an index from `file`, replacing any previously held index.
template <typename T, typename IdxT>
void cuvs_ivf_rabitq<T, IdxT>::load(const std::string& file)
{
  // Construct an empty index first; deserialize fills it in place.
  index_ = std::make_shared<cuvs::neighbors::ivf_rabitq::index<IdxT>>(handle_);
  cuvs::neighbors::ivf_rabitq::deserialize(handle_, file, index_.get());
}

/// Build the IVF-RaBitQ index over `nrow` vectors of dimension dim_ starting
/// at `dataset`, replacing any previously held index.
template <typename T, typename IdxT>
void cuvs_ivf_rabitq<T, IdxT>::build(const T* dataset, size_t nrow)
{
  // Create a CUDA stream pool with 1 stream (besides main stream) for kernel/copy overlapping.
  size_t n_streams = 1;
  raft::resource::set_cuda_stream_pool(handle_, std::make_shared<rmm::cuda_stream_pool>(n_streams));
  // Note: internally the IVF-RaBitQ build works with simple pointers, and accepts both host and
  // device pointer. Therefore, although we provide here a device_mdspan, this works with host
  // pointer too.
  auto dataset_v = raft::make_device_matrix_view<const T, IdxT>(dataset, IdxT(nrow), dim_);
  // build() returns the index by value; move-construct it into a fresh
  // shared_ptr and assign (replaces the previous make_shared(...).swap(index_)
  // dance, which also applied a redundant std::move to a prvalue).
  index_ = std::make_shared<cuvs::neighbors::ivf_rabitq::index<IdxT>>(
    cuvs::neighbors::ivf_rabitq::build(handle_, index_params_, dataset_v));
}

/// Clone this wrapper for concurrent benchmark threads. The copy constructor
/// copies the shared_ptr, so all clones share the same underlying index.
template <typename T, typename IdxT>
std::unique_ptr<algo<T>> cuvs_ivf_rabitq<T, IdxT>::copy()
{
  return std::make_unique<cuvs_ivf_rabitq<T, IdxT>>(*this);  // use copy constructor
}

/// Install the search parameters for subsequent search() calls.
/// @throws std::bad_cast if `param` is not a cuvs_ivf_rabitq::search_param.
template <typename T, typename IdxT>
void cuvs_ivf_rabitq<T, IdxT>::set_search_param(const search_param_base& param, const void*)
{
  // Bind by const reference: `auto sp = dynamic_cast<const search_param&>(...)`
  // copied the whole parameter struct on every call.
  const auto& sp = dynamic_cast<const search_param&>(param);
  search_params_ = sp.rabitq_param;
  refine_ratio_  = sp.refine_ratio;
  // Probing more lists than exist is a configuration error (debug-only check).
  assert(search_params_.n_probes <= index_params_.n_lists);
}

/// Record a non-owning device view of the raw dataset (requested by the
/// harness when search_param::needs_dataset() is true, i.e. refine_ratio > 1).
/// The caller keeps `dataset` alive for the lifetime of this view.
template <typename T, typename IdxT>
void cuvs_ivf_rabitq<T, IdxT>::set_search_dataset(const T* dataset, size_t nrow)
{
  dataset_ = raft::make_device_matrix_view<const T, IdxT>(dataset, nrow, index_->dim());
}

/// Run a batched k-NN search, writing `batch_size * k` neighbor ids and
/// distances into the caller-provided device buffers. Work is enqueued on the
/// handle's stream; the harness synchronizes via get_sync_stream().
template <typename T, typename IdxT>
void cuvs_ivf_rabitq<T, IdxT>::search(
  const T* queries, int batch_size, int k, algo_base::index_type* neighbors, float* distances) const
{
  static_assert(std::is_integral_v<algo_base::index_type>);
  static_assert(std::is_integral_v<IdxT>);

  // Compute the result count once in 64 bits: `batch_size * k` as int can
  // overflow (k may be as large as 16384).
  const auto n_results = static_cast<size_t>(batch_size) * static_cast<size_t>(k);

  // If IdxT matches the harness index type in width, search directly into the
  // caller's buffer; otherwise search into scratch storage and cast afterwards.
  IdxT* neighbors_idx;
  std::optional<rmm::device_uvector<IdxT>> neighbors_storage{std::nullopt};
  if constexpr (sizeof(IdxT) == sizeof(algo_base::index_type)) {
    neighbors_idx = reinterpret_cast<IdxT*>(neighbors);
  } else {
    neighbors_storage.emplace(n_results, raft::resource::get_cuda_stream(handle_));
    neighbors_idx = neighbors_storage->data();
  }

  auto queries_view =
    raft::make_device_matrix_view<const T, int64_t>(queries, batch_size, dimension_);
  auto neighbors_view = raft::make_device_matrix_view<IdxT, int64_t>(neighbors_idx, batch_size, k);
  auto distances_view = raft::make_device_matrix_view<float, int64_t>(distances, batch_size, k);

  cuvs::neighbors::ivf_rabitq::search(
    handle_, search_params_, *index_, queries_view, neighbors_view, distances_view);

  if constexpr (sizeof(IdxT) != sizeof(algo_base::index_type)) {
    // Element-wise cast of the scratch ids into the caller's index type.
    raft::linalg::unaryOp(neighbors,
                          neighbors_idx,
                          n_results,
                          raft::cast_op<algo_base::index_type>(),
                          raft::resource::get_cuda_stream(handle_));
  }
}
} // namespace cuvs::bench
Loading
Loading