Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions .github/workflows/python-testing.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,17 +22,43 @@ on:
paths:
- "**.py"
- "pyproject.toml"
- "qdp/qdp-core/**/*.rs"
- "qdp/qdp-kernels/**/*.rs"
- "qdp/qdp-python/**/*.rs"
- "qdp/**/Cargo.toml"
- ".github/workflows/python-testing.yml"
pull_request:
branches: [main]
paths:
- "**.py"
- "pyproject.toml"
- "qdp/qdp-core/**/*.rs"
- "qdp/qdp-kernels/**/*.rs"
- "qdp/qdp-python/**/*.rs"
- "qdp/**/Cargo.toml"
- ".github/workflows/python-testing.yml"
workflow_dispatch:

jobs:
rust-check:
# Fast gate: type-check both with and without CUDA stubs so duplicate
# stub definitions or cfg mismatches fail in ~30s instead of surfacing
# during the slower maturin build below.
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v6

- name: Install Rust toolchain
uses: dtolnay/rust-toolchain@stable

- name: Cargo check (no-CUDA stubs)
working-directory: qdp
env:
QDP_NO_CUDA: "1"
run: cargo check --workspace --tests

test:
needs: rust-check
runs-on: ubuntu-latest
strategy:
matrix:
Expand Down
17 changes: 17 additions & 0 deletions qdp/DEVELOPMENT.md
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,23 @@ uv run pytest testing/qdp -v
uv run pytest testing/qdp_python -v
```

### Pre-push sanity: no-CUDA build

CI builds on a runner without `nvcc`, which activates the `qdp_no_cuda`
cfg and swaps every `extern "C"` CUDA launcher for its stub. If you only
ever build locally with CUDA, duplicate stubs or cfg mismatches can slip
through unnoticed. Before pushing Rust / FFI changes, run:

```bash
cd qdp
QDP_NO_CUDA=1 cargo build --workspace --lib --release
cargo check --workspace --tests
cd ..
```

The first command is what `maturin develop --release` runs on CI; the
second verifies tests type-check in the CUDA build.

## 4. Benchmarks

From the repo root, set up and prepare benchmarks:
Expand Down
32 changes: 16 additions & 16 deletions qdp/qdp-core/src/gpu/buffer_pool.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,14 @@ use crate::gpu::pool_metrics::PoolMetrics;

/// Handle that automatically returns a buffer to the pool on drop.
#[cfg(target_os = "linux")]
pub struct PinnedBufferHandle {
buffer: Option<PinnedHostBuffer>,
pool: Arc<PinnedBufferPool>,
pub struct PinnedBufferHandle<T: Copy = f64> {
// `Some` while the handle owns the buffer; `Drop` takes it and pushes it
// back onto the pool's free list, so after drop this is `None`.
buffer: Option<PinnedHostBuffer<T>>,
// Owning pool the buffer is returned to on drop.
pool: Arc<PinnedBufferPool<T>>,
}

#[cfg(target_os = "linux")]
impl std::ops::Deref for PinnedBufferHandle {
type Target = PinnedHostBuffer;
impl<T: Copy> std::ops::Deref for PinnedBufferHandle<T> {
type Target = PinnedHostBuffer<T>;

fn deref(&self) -> &Self::Target {
self.buffer
Expand All @@ -45,7 +45,7 @@ impl std::ops::Deref for PinnedBufferHandle {
}

#[cfg(target_os = "linux")]
impl std::ops::DerefMut for PinnedBufferHandle {
impl<T: Copy> std::ops::DerefMut for PinnedBufferHandle<T> {
fn deref_mut(&mut self) -> &mut Self::Target {
self.buffer
.as_mut()
Expand All @@ -54,7 +54,7 @@ impl std::ops::DerefMut for PinnedBufferHandle {
}

#[cfg(target_os = "linux")]
impl Drop for PinnedBufferHandle {
impl<T: Copy> Drop for PinnedBufferHandle<T> {
fn drop(&mut self) {
if let Some(buf) = self.buffer.take() {
let mut free = self.pool.lock_free();
Expand All @@ -66,16 +66,16 @@ impl Drop for PinnedBufferHandle {

/// Pool of pinned host buffers sized for a fixed batch shape.
#[cfg(target_os = "linux")]
pub struct PinnedBufferPool {
free: Mutex<Vec<PinnedHostBuffer>>,
pub struct PinnedBufferPool<T: Copy = f64> {
// Buffers currently available for checkout; guarded by a mutex that is
// deliberately recovered from poisoning (see `lock_free`).
free: Mutex<Vec<PinnedHostBuffer<T>>>,
// Condvar paired with `free` — presumably notified when a handle returns
// a buffer, waking blocked `acquire` callers. TODO(review): confirm the
// notify site in the Drop impl (truncated in this view).
available_cv: Condvar,
// Total number of buffers the pool was created with.
capacity: usize,
// Element count (of `T`) each buffer is sized for.
elements_per_buffer: usize,
}

#[cfg(target_os = "linux")]
impl PinnedBufferPool {
/// Create a pool with `pool_size` pinned buffers, each sized for `elements_per_buffer` f64 values.
impl<T: Copy> PinnedBufferPool<T> {
/// Create a pool with `pool_size` pinned buffers, each sized for `elements_per_buffer` values of `T`.
pub fn new(pool_size: usize, elements_per_buffer: usize) -> Result<Arc<Self>> {
if pool_size == 0 {
return Err(MahoutError::InvalidInput(
Expand All @@ -90,7 +90,7 @@ impl PinnedBufferPool {

let mut buffers = Vec::with_capacity(pool_size);
for _ in 0..pool_size {
buffers.push(PinnedHostBuffer::new(elements_per_buffer)?);
buffers.push(PinnedHostBuffer::<T>::new(elements_per_buffer)?);
}

Ok(Arc::new(Self {
Expand All @@ -101,15 +101,15 @@ impl PinnedBufferPool {
}))
}

/// Lock the free-buffer list, recovering the guard even if the mutex was
/// poisoned by a panic on another thread.
fn lock_free(&self) -> MutexGuard<'_, Vec<PinnedHostBuffer<T>>> {
// Ignore poisoning to keep the pool usable after a panic elsewhere.
self.free
.lock()
.unwrap_or_else(|poisoned| poisoned.into_inner())
}
}

/// Acquire a pinned buffer, blocking until one is available.
///
/// Convenience wrapper around [`acquire_with_metrics`] with metrics
/// recording disabled (`None`).
pub fn acquire(self: &Arc<Self>) -> PinnedBufferHandle<T> {
self.acquire_with_metrics(None)
}
}

Expand All @@ -123,7 +123,7 @@ impl PinnedBufferPool {
pub fn acquire_with_metrics(
self: &Arc<Self>,
metrics: Option<&PoolMetrics>,
) -> PinnedBufferHandle {
) -> PinnedBufferHandle<T> {
let mut free = self.lock_free();

// Record available count while holding the lock to avoid TOCTOU race condition
Expand Down Expand Up @@ -161,7 +161,7 @@ impl PinnedBufferPool {
///
/// Returns `None` if the pool is currently empty; callers can choose to spin/wait
/// or fall back to synchronous paths.
pub fn try_acquire(self: &Arc<Self>) -> Option<PinnedBufferHandle> {
pub fn try_acquire(self: &Arc<Self>) -> Option<PinnedBufferHandle<T>> {
let mut free = self.lock_free();
free.pop().map(|buffer| PinnedBufferHandle {
buffer: Some(buffer),
Expand Down
Loading
Loading