Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions .github/workflows/python-testing.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,17 +22,43 @@ on:
paths:
- "**.py"
- "pyproject.toml"
- "qdp/qdp-core/**/*.rs"
- "qdp/qdp-kernels/**/*.rs"
- "qdp/qdp-python/**/*.rs"
- "qdp/**/Cargo.toml"
- ".github/workflows/python-testing.yml"
pull_request:
branches: [main]
paths:
- "**.py"
- "pyproject.toml"
- "qdp/qdp-core/**/*.rs"
- "qdp/qdp-kernels/**/*.rs"
- "qdp/qdp-python/**/*.rs"
- "qdp/**/Cargo.toml"
- ".github/workflows/python-testing.yml"
workflow_dispatch:

jobs:
rust-check:
# Fast gate: type-check both with and without CUDA stubs so duplicate
# stub definitions or cfg mismatches fail in ~30s instead of surfacing
# during the slower maturin build below.
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v6

- name: Install Rust toolchain
uses: dtolnay/rust-toolchain@stable

- name: Cargo check (no-CUDA stubs)
working-directory: qdp
env:
QDP_NO_CUDA: "1"
run: cargo check --workspace --tests

test:
needs: rust-check
runs-on: ubuntu-latest
strategy:
matrix:
Expand Down
17 changes: 17 additions & 0 deletions qdp/DEVELOPMENT.md
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,23 @@ uv run pytest testing/qdp -v
uv run pytest testing/qdp_python -v
```

### Pre-push sanity: no-CUDA build

CI builds on a runner without `nvcc`, which activates the `qdp_no_cuda`
cfg and swaps every `extern "C"` CUDA launcher for its stub. If you only
ever build locally with CUDA, duplicate stubs or cfg mismatches can slip
through unnoticed. Before pushing Rust / FFI changes, run:

```bash
cd qdp
QDP_NO_CUDA=1 cargo build --workspace --lib --release
cargo check --workspace --tests
cd ..
```

The first command is what `maturin develop --release` runs on CI; the
second verifies tests type-check in the CUDA build.

## 4. Benchmarks

From the repo root, set up and prepare benchmarks:
Expand Down
32 changes: 16 additions & 16 deletions qdp/qdp-core/src/gpu/buffer_pool.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,14 @@ use crate::gpu::pool_metrics::PoolMetrics;

/// Handle that automatically returns a buffer to the pool on drop.
#[cfg(target_os = "linux")]
pub struct PinnedBufferHandle {
buffer: Option<PinnedHostBuffer>,
pool: Arc<PinnedBufferPool>,
pub struct PinnedBufferHandle<T: Copy = f64> {
// `Some` while the handle owns the buffer; `Drop` takes it and pushes it
// back onto the pool's free list, so after drop this is `None`.
buffer: Option<PinnedHostBuffer<T>>,
// Owning pool the buffer is returned to on drop.
pool: Arc<PinnedBufferPool<T>>,
}

#[cfg(target_os = "linux")]
impl std::ops::Deref for PinnedBufferHandle {
type Target = PinnedHostBuffer;
impl<T: Copy> std::ops::Deref for PinnedBufferHandle<T> {
type Target = PinnedHostBuffer<T>;

fn deref(&self) -> &Self::Target {
self.buffer
Expand All @@ -45,7 +45,7 @@ impl std::ops::Deref for PinnedBufferHandle {
}

#[cfg(target_os = "linux")]
impl std::ops::DerefMut for PinnedBufferHandle {
impl<T: Copy> std::ops::DerefMut for PinnedBufferHandle<T> {
fn deref_mut(&mut self) -> &mut Self::Target {
self.buffer
.as_mut()
Expand All @@ -54,7 +54,7 @@ impl std::ops::DerefMut for PinnedBufferHandle {
}

#[cfg(target_os = "linux")]
impl Drop for PinnedBufferHandle {
impl<T: Copy> Drop for PinnedBufferHandle<T> {
fn drop(&mut self) {
if let Some(buf) = self.buffer.take() {
let mut free = self.pool.lock_free();
Expand All @@ -66,16 +66,16 @@ impl Drop for PinnedBufferHandle {

/// Pool of pinned host buffers sized for a fixed batch shape.
#[cfg(target_os = "linux")]
pub struct PinnedBufferPool {
free: Mutex<Vec<PinnedHostBuffer>>,
pub struct PinnedBufferPool<T: Copy = f64> {
// Buffers currently available for checkout; guarded by a mutex that is
// deliberately recovered from poisoning (see `lock_free`).
free: Mutex<Vec<PinnedHostBuffer<T>>>,
// Condvar paired with `free` — presumably notified when a handle returns
// a buffer, waking blocked `acquire` callers. TODO(review): confirm the
// notify site in the Drop impl (truncated in this view).
available_cv: Condvar,
// Total number of buffers the pool was created with.
capacity: usize,
// Element count (of `T`) each buffer is sized for.
elements_per_buffer: usize,
}

#[cfg(target_os = "linux")]
impl PinnedBufferPool {
/// Create a pool with `pool_size` pinned buffers, each sized for `elements_per_buffer` f64 values.
impl<T: Copy> PinnedBufferPool<T> {
/// Create a pool with `pool_size` pinned buffers, each sized for `elements_per_buffer` values of `T`.
pub fn new(pool_size: usize, elements_per_buffer: usize) -> Result<Arc<Self>> {
if pool_size == 0 {
return Err(MahoutError::InvalidInput(
Expand All @@ -90,7 +90,7 @@ impl PinnedBufferPool {

let mut buffers = Vec::with_capacity(pool_size);
for _ in 0..pool_size {
buffers.push(PinnedHostBuffer::new(elements_per_buffer)?);
buffers.push(PinnedHostBuffer::<T>::new(elements_per_buffer)?);
}

Ok(Arc::new(Self {
Expand All @@ -101,15 +101,15 @@ impl PinnedBufferPool {
}))
}

/// Lock the free-buffer list, recovering the guard even if the mutex was
/// poisoned by a panic on another thread.
fn lock_free(&self) -> MutexGuard<'_, Vec<PinnedHostBuffer<T>>> {
// Ignore poisoning to keep the pool usable after a panic elsewhere.
self.free
.lock()
.unwrap_or_else(|poisoned| poisoned.into_inner())
}
}

/// Acquire a pinned buffer, blocking until one is available.
///
/// Convenience wrapper around [`acquire_with_metrics`] with metrics
/// recording disabled (`None`).
pub fn acquire(self: &Arc<Self>) -> PinnedBufferHandle<T> {
self.acquire_with_metrics(None)
}
}

Expand All @@ -123,7 +123,7 @@ impl PinnedBufferPool {
pub fn acquire_with_metrics(
self: &Arc<Self>,
metrics: Option<&PoolMetrics>,
) -> PinnedBufferHandle {
) -> PinnedBufferHandle<T> {
let mut free = self.lock_free();

// Record available count while holding the lock to avoid TOCTOU race condition
Expand Down Expand Up @@ -161,7 +161,7 @@ impl PinnedBufferPool {
///
/// Returns `None` if the pool is currently empty; callers can choose to spin/wait
/// or fall back to synchronous paths.
pub fn try_acquire(self: &Arc<Self>) -> Option<PinnedBufferHandle> {
pub fn try_acquire(self: &Arc<Self>) -> Option<PinnedBufferHandle<T>> {
let mut free = self.lock_free();
free.pop().map(|buffer| PinnedBufferHandle {
buffer: Some(buffer),
Expand Down
Loading
Loading