From 622f4b858b19175b8e49737d6c8165775020227c Mon Sep 17 00:00:00 2001 From: viiccwen Date: Mon, 4 May 2026 17:18:17 +0000 Subject: [PATCH 1/5] refactor(gpu): improve device-aware allocation diagnostics --- qdp/qdp-core/src/gpu/cuda_ffi.rs | 8 +- qdp/qdp-core/src/gpu/encodings/amplitude.rs | 88 +++++++++++++- qdp/qdp-core/src/gpu/encodings/angle.rs | 24 +++- qdp/qdp-core/src/gpu/encodings/basis.rs | 1 + qdp/qdp-core/src/gpu/encodings/iqp.rs | 16 ++- qdp/qdp-core/src/gpu/encodings/phase.rs | 16 ++- qdp/qdp-core/src/gpu/memory.rs | 124 ++++++++++++++++++-- qdp/qdp-core/src/gpu/pipeline.rs | 15 ++- 8 files changed, 269 insertions(+), 23 deletions(-) diff --git a/qdp/qdp-core/src/gpu/cuda_ffi.rs b/qdp/qdp-core/src/gpu/cuda_ffi.rs index 2ed60c311e..efafe7c2b0 100644 --- a/qdp/qdp-core/src/gpu/cuda_ffi.rs +++ b/qdp/qdp-core/src/gpu/cuda_ffi.rs @@ -54,7 +54,14 @@ unsafe extern "C" { ptr: *const c_void, ) -> i32; + pub(crate) fn cudaGetDevice(device: *mut i32) -> i32; + pub(crate) fn cudaSetDevice(device: i32) -> i32; pub(crate) fn cudaMemGetInfo(free: *mut usize, total: *mut usize) -> i32; + pub(crate) fn cudaDeviceCanAccessPeer( + can_access_peer: *mut i32, + device: i32, + peer_device: i32, + ) -> i32; pub(crate) fn cudaMemcpyAsync( dst: *mut c_void, @@ -63,7 +70,6 @@ unsafe extern "C" { kind: u32, stream: *mut c_void, ) -> i32; - pub(crate) fn cudaEventCreateWithFlags(event: *mut *mut c_void, flags: u32) -> i32; pub(crate) fn cudaEventRecord(event: *mut c_void, stream: *mut c_void) -> i32; pub(crate) fn cudaEventDestroy(event: *mut c_void) -> i32; diff --git a/qdp/qdp-core/src/gpu/encodings/amplitude.rs b/qdp/qdp-core/src/gpu/encodings/amplitude.rs index e3081b65d6..aa187649a1 100644 --- a/qdp/qdp-core/src/gpu/encodings/amplitude.rs +++ b/qdp/qdp-core/src/gpu/encodings/amplitude.rs @@ -86,6 +86,7 @@ impl QuantumEncoder for AmplitudeEncoder { input_bytes, "input staging buffer", Some(num_qubits), + Some(_device.ordinal()), )?; let input_slice = { @@ -95,6 +96,7 @@ impl 
QuantumEncoder for AmplitudeEncoder { input_bytes, "input staging buffer", Some(num_qubits), + Some(_device.ordinal()), e, ) })? @@ -693,6 +695,29 @@ impl QuantumEncoder for AmplitudeEncoder { } impl AmplitudeEncoder { + #[cfg(target_os = "linux")] + fn async_chunk_state_len( + state_len: usize, + chunk_offset: usize, + chunk_len: usize, + ) -> Result { + let remaining = state_len.checked_sub(chunk_offset).ok_or_else(|| { + MahoutError::InvalidInput(format!( + "Async amplitude chunk offset {} exceeds state length {}", + chunk_offset, state_len + )) + })?; + + if chunk_len > remaining { + return Err(MahoutError::InvalidInput(format!( + "Async amplitude chunk length {} at offset {} exceeds remaining state length {}", + chunk_len, chunk_offset, remaining + ))); + } + + Ok(chunk_len) + } + /// Async pipeline encoding for large data /// /// Uses the generic dual-stream pipeline infrastructure to overlap @@ -720,6 +745,8 @@ impl AmplitudeEncoder { device, host_data, |stream, input_ptr, chunk_offset, chunk_len| { + let chunk_state_len = + Self::async_chunk_state_len(state_len, chunk_offset, chunk_len)?; // Calculate offset pointer for state vector (type-safe pointer arithmetic) // Offset is in complex numbers (CuDoubleComplex), not f64 elements let state_ptr_offset = unsafe { @@ -735,7 +762,7 @@ impl AmplitudeEncoder { input_ptr, state_ptr_offset, chunk_len, - state_len, + chunk_state_len, inv_norm, stream.stream as *mut c_void, ) @@ -1093,3 +1120,62 @@ impl AmplitudeEncoder { Ok(()) } } + +#[cfg(test)] +mod tests { + use super::AmplitudeEncoder; + #[cfg(target_os = "linux")] + use crate::gpu::QuantumEncoder; + #[cfg(target_os = "linux")] + use cudarc::driver::CudaDevice; + + #[test] + #[cfg(target_os = "linux")] + fn async_chunk_state_len_uses_chunk_local_extent() { + assert_eq!( + AmplitudeEncoder::async_chunk_state_len(1024, 256, 128).unwrap(), + 128 + ); + } + + #[test] + #[cfg(target_os = "linux")] + fn async_chunk_state_len_rejects_overrun() { + let err = 
AmplitudeEncoder::async_chunk_state_len(1024, 1000, 64).unwrap_err(); + assert!(matches!(err, crate::error::MahoutError::InvalidInput(_))); + } + + #[test] + #[cfg(target_os = "linux")] + fn async_pipeline_large_encode_q22_matches_reference_slices() { + let device = match CudaDevice::new(0) { + Ok(device) => device, + Err(_) => return, + }; + + let num_qubits = 22usize; + let state_len = 1usize << num_qubits; + let mut input = vec![0.0f64; state_len]; + for (idx, value) in input.iter_mut().enumerate() { + *value = ((idx % 1024) as f64) + 1.0; + } + + let encoder = AmplitudeEncoder; + let state = encoder.encode(&device, &input, num_qubits).unwrap(); + let host = state.copy_to_host_f64(&device).unwrap(); + let norm = input.iter().map(|value| value * value).sum::().sqrt(); + + for idx in [0usize, 1, 1024, state_len / 2, state_len - 1] { + let expected = input[idx] / norm; + let actual = host[idx].x; + assert!( + (actual - expected).abs() < 1e-12, + "Mismatch at amplitude {}: actual={} expected={}", + idx, + actual, + expected + ); + assert_eq!(host[idx].y, 0.0); + } + } +} diff --git a/qdp/qdp-core/src/gpu/encodings/angle.rs b/qdp/qdp-core/src/gpu/encodings/angle.rs index b0c6773ff5..ef39f14611 100644 --- a/qdp/qdp-core/src/gpu/encodings/angle.rs +++ b/qdp/qdp-core/src/gpu/encodings/angle.rs @@ -57,7 +57,13 @@ impl QuantumEncoder for AngleEncoder { let angles_gpu = { crate::profile_scope!("GPU::H2D_Angles"); device.htod_sync_copy(data).map_err(|e| { - map_allocation_error(input_bytes, "angle input upload", Some(num_qubits), e) + map_allocation_error( + input_bytes, + "angle input upload", + Some(num_qubits), + Some(device.ordinal()), + e, + ) })? 
}; @@ -175,7 +181,13 @@ impl QuantumEncoder for AngleEncoder { let angles_gpu = { crate::profile_scope!("GPU::H2D_BatchAngles"); device.htod_sync_copy(batch_data).map_err(|e| { - map_allocation_error(input_bytes, "angle batch upload", Some(num_qubits), e) + map_allocation_error( + input_bytes, + "angle batch upload", + Some(num_qubits), + Some(device.ordinal()), + e, + ) })? }; @@ -435,7 +447,13 @@ impl QuantumEncoder for AngleEncoder { let angles_gpu = { crate::profile_scope!("GPU::H2D_BatchAnglesF32"); device.htod_sync_copy(batch_data).map_err(|e| { - map_allocation_error(input_bytes, "angle batch upload", Some(num_qubits), e) + map_allocation_error( + input_bytes, + "angle batch upload", + Some(num_qubits), + Some(device.ordinal()), + e, + ) })? }; diff --git a/qdp/qdp-core/src/gpu/encodings/basis.rs b/qdp/qdp-core/src/gpu/encodings/basis.rs index 494b385af2..db07c74b4f 100644 --- a/qdp/qdp-core/src/gpu/encodings/basis.rs +++ b/qdp/qdp-core/src/gpu/encodings/basis.rs @@ -180,6 +180,7 @@ impl QuantumEncoder for BasisEncoder { num_samples * std::mem::size_of::(), "basis indices upload", Some(num_qubits), + Some(device.ordinal()), e, ) })? diff --git a/qdp/qdp-core/src/gpu/encodings/iqp.rs b/qdp/qdp-core/src/gpu/encodings/iqp.rs index c6ecf17624..fc925c2393 100644 --- a/qdp/qdp-core/src/gpu/encodings/iqp.rs +++ b/qdp/qdp-core/src/gpu/encodings/iqp.rs @@ -82,7 +82,13 @@ impl QuantumEncoder for IqpEncoder { let data_gpu = { crate::profile_scope!("GPU::H2D_IqpData"); device.htod_sync_copy(data).map_err(|e| { - map_allocation_error(input_bytes, "IQP input upload", Some(num_qubits), e) + map_allocation_error( + input_bytes, + "IQP input upload", + Some(num_qubits), + Some(device.ordinal()), + e, + ) })? 
}; @@ -193,7 +199,13 @@ impl QuantumEncoder for IqpEncoder { let data_gpu = { crate::profile_scope!("GPU::H2D_BatchIqpData"); device.htod_sync_copy(batch_data).map_err(|e| { - map_allocation_error(input_bytes, "IQP batch upload", Some(num_qubits), e) + map_allocation_error( + input_bytes, + "IQP batch upload", + Some(num_qubits), + Some(device.ordinal()), + e, + ) })? }; diff --git a/qdp/qdp-core/src/gpu/encodings/phase.rs b/qdp/qdp-core/src/gpu/encodings/phase.rs index b329a2429e..335bcfd85d 100644 --- a/qdp/qdp-core/src/gpu/encodings/phase.rs +++ b/qdp/qdp-core/src/gpu/encodings/phase.rs @@ -80,7 +80,13 @@ impl QuantumEncoder for PhaseEncoder { let phases_gpu = { crate::profile_scope!("GPU::H2D_Phases"); device.htod_sync_copy(data).map_err(|e| { - map_allocation_error(input_bytes, "phase input upload", Some(num_qubits), e) + map_allocation_error( + input_bytes, + "phase input upload", + Some(num_qubits), + Some(device.ordinal()), + e, + ) })? }; @@ -199,7 +205,13 @@ impl QuantumEncoder for PhaseEncoder { let phases_gpu = { crate::profile_scope!("GPU::H2D_BatchPhases"); device.htod_sync_copy(batch_data).map_err(|e| { - map_allocation_error(input_bytes, "phase batch upload", Some(num_qubits), e) + map_allocation_error( + input_bytes, + "phase batch upload", + Some(num_qubits), + Some(device.ordinal()), + e, + ) })? 
}; diff --git a/qdp/qdp-core/src/gpu/memory.rs b/qdp/qdp-core/src/gpu/memory.rs index 661060b550..c39f6c5616 100644 --- a/qdp/qdp-core/src/gpu/memory.rs +++ b/qdp/qdp-core/src/gpu/memory.rs @@ -43,7 +43,9 @@ pub enum GpuDeviceType { } #[cfg(target_os = "linux")] -use crate::gpu::cuda_ffi::{cudaFreeHost, cudaHostAlloc, cudaMemGetInfo}; +use crate::gpu::cuda_ffi::{ + CUDA_SUCCESS, cudaFreeHost, cudaGetDevice, cudaHostAlloc, cudaMemGetInfo, cudaSetDevice, +}; #[cfg(target_os = "linux")] fn bytes_to_mib(bytes: usize) -> f64 { @@ -51,7 +53,49 @@ fn bytes_to_mib(bytes: usize) -> f64 { } #[cfg(target_os = "linux")] -fn query_cuda_mem_info() -> Result<(usize, usize)> { +struct DeviceContextGuard { + original_device: i32, +} + +#[cfg(target_os = "linux")] +impl DeviceContextGuard { + fn switch_to(target_device: Option) -> Result { + let mut original_device = 0i32; + let get_ret = unsafe { cudaGetDevice(&mut original_device as *mut i32) }; + if get_ret != CUDA_SUCCESS { + return Err(MahoutError::Cuda(format!( + "cudaGetDevice failed: {} ({})", + get_ret, + cuda_error_to_string(get_ret) + ))); + } + + if let Some(device_id) = target_device { + let set_ret = unsafe { cudaSetDevice(device_id as i32) }; + if set_ret != CUDA_SUCCESS { + return Err(MahoutError::Cuda(format!( + "cudaSetDevice(cuda:{}) failed: {} ({})", + device_id, + set_ret, + cuda_error_to_string(set_ret) + ))); + } + } + + Ok(Self { original_device }) + } +} + +#[cfg(target_os = "linux")] +impl Drop for DeviceContextGuard { + fn drop(&mut self) { + let _ = unsafe { cudaSetDevice(self.original_device) }; + } +} + +#[cfg(target_os = "linux")] +fn query_cuda_mem_info(device_id: Option) -> Result<(usize, usize)> { + let _guard = DeviceContextGuard::switch_to(device_id)?; unsafe { let mut free_bytes: usize = 0; let mut total_bytes: usize = 0; @@ -101,8 +145,9 @@ pub(crate) fn ensure_device_memory_available( requested_bytes: usize, context: &str, qubits: Option, + device_id: Option, ) -> Result<()> { - let (free, 
total) = query_cuda_mem_info()?; + let (free, total) = query_cuda_mem_info(device_id)?; if (requested_bytes as u64) > (free as u64) { return Err(MahoutError::MemoryAllocation(build_oom_message( @@ -123,9 +168,10 @@ pub(crate) fn map_allocation_error( requested_bytes: usize, context: &str, qubits: Option, + device_id: Option, source: impl std::fmt::Debug, ) -> MahoutError { - match query_cuda_mem_info() { + match query_cuda_mem_info(device_id) { Ok((free, total)) => { if (requested_bytes as u64) > (free as u64) { MahoutError::MemoryAllocation(build_oom_message( @@ -175,28 +221,28 @@ pub enum BufferStorage { } impl BufferStorage { - fn precision(&self) -> Precision { + pub(crate) fn precision(&self) -> Precision { match self { BufferStorage::F32(_) => Precision::Float32, BufferStorage::F64(_) => Precision::Float64, } } - fn ptr_void(&self) -> *mut c_void { + pub(crate) fn ptr_void(&self) -> *mut c_void { match self { BufferStorage::F32(buf) => buf.ptr() as *mut c_void, BufferStorage::F64(buf) => buf.ptr() as *mut c_void, } } - fn ptr_f64(&self) -> Option<*mut CuDoubleComplex> { + pub(crate) fn ptr_f64(&self) -> Option<*mut CuDoubleComplex> { match self { BufferStorage::F64(buf) => Some(buf.ptr()), _ => None, } } - fn ptr_f32(&self) -> Option<*mut CuComplex> { + pub(crate) fn ptr_f32(&self) -> Option<*mut CuComplex> { match self { BufferStorage::F32(buf) => Some(buf.ptr()), _ => None, @@ -250,6 +296,7 @@ impl GpuStateVector { requested_bytes, "state vector allocation (f32)", Some(qubits), + Some(_device.ordinal()), )?; let slice = unsafe { _device.alloc::(_size_elements) }.map_err(|e| { @@ -257,6 +304,7 @@ impl GpuStateVector { requested_bytes, "state vector allocation (f32)", Some(qubits), + Some(_device.ordinal()), e, ) })?; @@ -277,6 +325,7 @@ impl GpuStateVector { requested_bytes, "state vector allocation", Some(qubits), + Some(_device.ordinal()), )?; let slice = @@ -285,6 +334,7 @@ impl GpuStateVector { requested_bytes, "state vector allocation", Some(qubits), + 
Some(_device.ordinal()), e, ) })?; @@ -339,6 +389,30 @@ impl GpuStateVector { self.num_qubits } + #[cfg(target_os = "linux")] + pub fn copy_to_host_f64(&self, device: &Arc) -> Result> { + match self.buffer.as_ref() { + BufferStorage::F64(buf) => device.dtoh_sync_copy(&buf.slice).map_err(|e| { + MahoutError::Cuda(format!("Failed to copy float64 state to host: {:?}", e)) + }), + BufferStorage::F32(_) => Err(MahoutError::InvalidInput( + "State vector stores float32 data, not float64".to_string(), + )), + } + } + + #[cfg(target_os = "linux")] + pub fn copy_to_host_f32(&self, device: &Arc) -> Result> { + match self.buffer.as_ref() { + BufferStorage::F32(buf) => device.dtoh_sync_copy(&buf.slice).map_err(|e| { + MahoutError::Cuda(format!("Failed to copy float32 state to host: {:?}", e)) + }), + BufferStorage::F64(_) => Err(MahoutError::InvalidInput( + "State vector stores float64 data, not float32".to_string(), + )), + } + } + /// Get the size in elements (2^n where n is number of qubits) pub fn size_elements(&self) -> usize { self.size_elements @@ -374,11 +448,22 @@ impl GpuStateVector { })?; let context = "batch state vector allocation (f32)"; - ensure_device_memory_available(requested_bytes, context, Some(qubits))?; + ensure_device_memory_available( + requested_bytes, + context, + Some(qubits), + Some(_device.ordinal()), + )?; let slice = unsafe { _device.alloc::(total_elements) }.map_err(|e| { - map_allocation_error(requested_bytes, context, Some(qubits), e) + map_allocation_error( + requested_bytes, + context, + Some(qubits), + Some(_device.ordinal()), + e, + ) })?; BufferStorage::F32(GpuBufferRaw { slice }) @@ -394,11 +479,22 @@ impl GpuStateVector { })?; let context = "batch state vector allocation"; - ensure_device_memory_available(requested_bytes, context, Some(qubits))?; + ensure_device_memory_available( + requested_bytes, + context, + Some(qubits), + Some(_device.ordinal()), + )?; let slice = unsafe { _device.alloc::(total_elements) } .map_err(|e| { - 
map_allocation_error(requested_bytes, context, Some(qubits), e) + map_allocation_error( + requested_bytes, + context, + Some(qubits), + Some(_device.ordinal()), + e, + ) })?; BufferStorage::F64(GpuBufferRaw { slice }) @@ -450,6 +546,7 @@ impl GpuStateVector { requested_bytes, "state vector precision conversion", Some(self.num_qubits), + Some(device.ordinal()), )?; let slice = unsafe { device.alloc::(self.size_elements) } @@ -458,6 +555,7 @@ impl GpuStateVector { requested_bytes, "state vector precision conversion", Some(self.num_qubits), + Some(device.ordinal()), e, ) })?; @@ -526,6 +624,7 @@ impl GpuStateVector { requested_bytes, "state vector precision conversion", Some(self.num_qubits), + Some(device.ordinal()), )?; let slice = @@ -534,6 +633,7 @@ impl GpuStateVector { requested_bytes, "state vector precision conversion", Some(self.num_qubits), + Some(device.ordinal()), e, ) })?; diff --git a/qdp/qdp-core/src/gpu/pipeline.rs b/qdp/qdp-core/src/gpu/pipeline.rs index 9a715df6f8..d960d0ce95 100644 --- a/qdp/qdp-core/src/gpu/pipeline.rs +++ b/qdp/qdp-core/src/gpu/pipeline.rs @@ -379,12 +379,23 @@ where crate::profile_scope!("GPU::ChunkProcess"); let chunk_bytes = std::mem::size_of_val(chunk); - ensure_device_memory_available(chunk_bytes, "pipeline chunk buffer allocation", None)?; + ensure_device_memory_available( + chunk_bytes, + "pipeline chunk buffer allocation", + None, + Some(device.ordinal()), + )?; // Allocate temporary device buffer for this chunk #[allow(clippy::collapsible_if, clippy::manual_is_multiple_of)] let input_chunk_dev = unsafe { device.alloc::(chunk.len()) }.map_err(|e| { - map_allocation_error(chunk_bytes, "pipeline chunk buffer allocation", None, e) + map_allocation_error( + chunk_bytes, + "pipeline chunk buffer allocation", + None, + Some(device.ordinal()), + e, + ) })?; // Acquire pinned staging buffer and populate it with the current chunk From be01bc6f7244ba1f0b1d0cac5664fe9569683df2 Mon Sep 17 00:00:00 2001 From: viiccwen Date: Mon, 4 May 
2026 17:18:29 +0000 Subject: [PATCH 2/5] feat(gpu): add distributed amplitude scaffolding --- qdp/qdp-core/src/gpu/communicator.rs | 39 +++ qdp/qdp-core/src/gpu/distributed/amplitude.rs | 132 ++++++++ qdp/qdp-core/src/gpu/distributed/layout.rs | 122 +++++++ qdp/qdp-core/src/gpu/distributed/mod.rs | 30 ++ qdp/qdp-core/src/gpu/distributed/planner.rs | 247 +++++++++++++++ qdp/qdp-core/src/gpu/distributed/runtime.rs | 298 ++++++++++++++++++ qdp/qdp-core/src/gpu/distributed/shared.rs | 64 ++++ qdp/qdp-core/src/gpu/distributed/state.rs | 162 ++++++++++ qdp/qdp-core/src/gpu/mod.rs | 10 + qdp/qdp-core/src/gpu/topology.rs | 293 +++++++++++++++++ qdp/qdp-core/src/lib.rs | 86 +++++ 11 files changed, 1483 insertions(+) create mode 100644 qdp/qdp-core/src/gpu/communicator.rs create mode 100644 qdp/qdp-core/src/gpu/distributed/amplitude.rs create mode 100644 qdp/qdp-core/src/gpu/distributed/layout.rs create mode 100644 qdp/qdp-core/src/gpu/distributed/mod.rs create mode 100644 qdp/qdp-core/src/gpu/distributed/planner.rs create mode 100644 qdp/qdp-core/src/gpu/distributed/runtime.rs create mode 100644 qdp/qdp-core/src/gpu/distributed/shared.rs create mode 100644 qdp/qdp-core/src/gpu/distributed/state.rs create mode 100644 qdp/qdp-core/src/gpu/topology.rs diff --git a/qdp/qdp-core/src/gpu/communicator.rs b/qdp/qdp-core/src/gpu/communicator.rs new file mode 100644 index 0000000000..c7b9776ef8 --- /dev/null +++ b/qdp/qdp-core/src/gpu/communicator.rs @@ -0,0 +1,39 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use crate::error::{MahoutError, Result}; + +/// Abstracts cross-device coordination. PR1 provides a host-coordinated fallback; +/// a later PR can add NCCL-backed implementations behind the same trait. +pub trait Communicator: Send + Sync { + fn reduce_sum_f64(&self, values: &[f64]) -> Result; +} + +/// Host-coordinated reduction placeholder used for early distributed amplitude prototypes. +#[derive(Default, Debug, Clone, Copy)] +pub struct HostCommunicator; + +impl Communicator for HostCommunicator { + fn reduce_sum_f64(&self, values: &[f64]) -> Result { + if values.is_empty() { + return Err(MahoutError::InvalidInput( + "Host communicator requires at least one value for reduction".to_string(), + )); + } + + Ok(values.iter().copied().sum()) + } +} diff --git a/qdp/qdp-core/src/gpu/distributed/amplitude.rs b/qdp/qdp-core/src/gpu/distributed/amplitude.rs new file mode 100644 index 0000000000..f5f4f94bd9 --- /dev/null +++ b/qdp/qdp-core/src/gpu/distributed/amplitude.rs @@ -0,0 +1,132 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use crate::error::{MahoutError, Result}; +use crate::gpu::distributed::{ + DistributionMode, PlacementPlan, PlacementPlanner, PlacementRequest, ShardPlacement, + ShardPolicy, +}; +use crate::gpu::encodings::MAX_QUBITS; +use crate::gpu::topology::DeviceMesh; + +/// Shared planning math for amplitude-sharded state construction. +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct DistributedAmplitudePlan { + pub request: PlacementRequest, + pub placement: PlacementPlan, + pub num_qubits: usize, + pub global_len: usize, + pub num_devices: usize, + pub shard_bits: Option, + pub uniform_shard_len: Option, +} + +/// Result of preparing a distributed amplitude encode without yet allocating +/// concrete shard buffers. This fixes the public API surface for later PRs that +/// will populate `state` with real device allocations. 
+#[derive(Clone)] +pub struct PreparedDistributedAmplitudeEncode { + pub mesh: DeviceMesh, + pub plan: DistributedAmplitudePlan, + pub inv_norm: f64, + pub layout: super::layout::DistributedStateLayout, +} + +impl DistributedAmplitudePlan { + pub fn for_request(mesh: &DeviceMesh, request: PlacementRequest) -> Result { + if request.num_qubits == 0 { + return Err(MahoutError::InvalidInput( + "Number of qubits must be at least 1 for distributed amplitude planning" + .to_string(), + )); + } + if mesh.num_devices() == 0 { + return Err(MahoutError::InvalidInput( + "Distributed amplitude planning requires at least one device".to_string(), + )); + } + if request.mode != DistributionMode::ShardedCapacity { + return Err(MahoutError::InvalidInput(format!( + "Distributed amplitude planning currently supports only {:?}, got {:?}", + DistributionMode::ShardedCapacity, + request.mode + ))); + } + + let num_devices = mesh.num_devices(); + let placement = PlacementPlanner::plan(mesh, &request)?; + Self::validate_local_shard_capacity(request.num_qubits, &placement)?; + let global_len = placement.global_len; + let num_qubits = request.num_qubits; + let (shard_bits, uniform_shard_len) = match request.shard_policy { + ShardPolicy::Equal => { + debug_assert!(num_devices.is_power_of_two()); + let shard_bits = num_devices.trailing_zeros() as usize; + if shard_bits > request.num_qubits { + return Err(MahoutError::InvalidInput(format!( + "Cannot shard {} qubits across {} devices: shard bits {} exceed qubit count", + request.num_qubits, num_devices, shard_bits + ))); + } + (Some(shard_bits), Some(placement.shard_len()?)) + } + ShardPolicy::BalancedUneven => (None, None), + }; + + Ok(Self { + request, + placement, + num_qubits, + global_len, + num_devices, + shard_bits, + uniform_shard_len, + }) + } + + pub fn shard_range(&self, shard_id: usize) -> Result<(usize, usize)> { + let placement = self.placement.placements.get(shard_id).ok_or_else(|| { + MahoutError::InvalidInput(format!( + "Shard 
ID {} out of range for {} devices", + shard_id, self.num_devices + )) + })?; + Ok((placement.start_idx, placement.end_idx)) + } + + fn validate_local_shard_capacity(num_qubits: usize, placement: &PlacementPlan) -> Result<()> { + let max_local_len = 1usize << MAX_QUBITS; + let required_local_len = placement + .placements + .iter() + .map(ShardPlacement::local_len) + .max() + .ok_or_else(|| { + MahoutError::InvalidInput( + "Placement plan must contain at least one shard".to_string(), + ) + })?; + + if required_local_len > max_local_len { + return Err(MahoutError::InvalidInput(format!( + "Distributed amplitude request for {} qubits still exceeds per-device capacity: largest shard has {} amplitudes, limit is 2^{} = {}. Add more GPUs or reduce qubits.", + num_qubits, required_local_len, MAX_QUBITS, max_local_len + ))); + } + + Ok(()) + } +} diff --git a/qdp/qdp-core/src/gpu/distributed/layout.rs b/qdp/qdp-core/src/gpu/distributed/layout.rs new file mode 100644 index 0000000000..143b1637c2 --- /dev/null +++ b/qdp/qdp-core/src/gpu/distributed/layout.rs @@ -0,0 +1,122 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::sync::Arc; + +use cudarc::driver::CudaDevice; + +use crate::error::{MahoutError, Result}; +use crate::gpu::memory::Precision; +use crate::gpu::topology::{DeviceMesh, GpuTopology}; + +use super::DistributedAmplitudePlan; +use super::shared; + +/// One shard of a logically distributed state vector. +#[derive(Clone)] +pub struct StateShardLayout { + pub device: Arc, + pub device_id: usize, + pub shard_id: usize, + pub start_idx: usize, + pub end_idx: usize, + pub local_len: usize, +} + +/// Metadata and placement plan for a future multi-GPU state vector. +#[derive(Clone)] +pub struct DistributedStateLayout { + pub num_qubits: usize, + pub precision: Precision, + pub global_len: usize, + pub shard_bits: Option, + pub topology: GpuTopology, + pub shards: Vec, +} + +impl DistributedStateLayout { + #[cfg(target_os = "linux")] + pub fn recommended_gather_device_id(&self) -> Option { + shared::policy_device_ids( + &self.topology, + self.shards.iter().map(|shard| shard.device_id), + ) + .0 + } + + #[cfg(target_os = "linux")] + pub fn recommended_placement_device_ids(&self) -> Vec { + shared::policy_device_ids( + &self.topology, + self.shards.iter().map(|shard| shard.device_id), + ) + .1 + } + + pub fn num_shards(&self) -> usize { + self.shards.len() + } + + pub fn new( + mesh: &DeviceMesh, + plan: &DistributedAmplitudePlan, + precision: Precision, + ) -> Result { + if mesh.num_devices() != plan.num_devices { + return Err(MahoutError::InvalidInput(format!( + "Device mesh / amplitude plan mismatch: {} devices vs {} planned shards", + mesh.num_devices(), + plan.num_devices + ))); + } + if mesh.devices.len() != plan.num_devices { + return Err(MahoutError::InvalidInput(format!( + "Device mesh / device handles mismatch: {} handles for {} planned shards", + mesh.devices.len(), + plan.num_devices + ))); + } + if plan.placement.placements.len() != plan.num_devices { + return Err(MahoutError::InvalidInput(format!( + "Placement plan mismatch: {} placements for {} planned 
shards", + plan.placement.placements.len(), + plan.num_devices + ))); + } + + let mut shards = Vec::with_capacity(mesh.num_devices()); + for (placement, device) in plan.placement.placements.iter().zip(mesh.devices.iter()) { + let (start_idx, end_idx) = (placement.start_idx, placement.end_idx); + shards.push(StateShardLayout { + device: Arc::clone(device), + device_id: placement.device_id, + shard_id: placement.shard_id, + start_idx, + end_idx, + local_len: placement.local_len(), + }); + } + + Ok(Self { + num_qubits: plan.num_qubits, + precision, + global_len: plan.global_len, + shard_bits: plan.shard_bits, + topology: mesh.topology.clone(), + shards, + }) + } +} diff --git a/qdp/qdp-core/src/gpu/distributed/mod.rs b/qdp/qdp-core/src/gpu/distributed/mod.rs new file mode 100644 index 0000000000..acb02918f8 --- /dev/null +++ b/qdp/qdp-core/src/gpu/distributed/mod.rs @@ -0,0 +1,30 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +mod amplitude; +mod layout; +mod planner; +pub(crate) mod runtime; +mod shared; +mod state; + +pub use amplitude::{DistributedAmplitudePlan, PreparedDistributedAmplitudeEncode}; +pub use layout::{DistributedStateLayout, StateShardLayout}; +pub use planner::{ + DistributionMode, PlacementPlan, PlacementPlanner, PlacementRequest, ShardPlacement, + ShardPolicy, +}; +pub use state::{DistributedStateVector, StateShard}; diff --git a/qdp/qdp-core/src/gpu/distributed/planner.rs b/qdp/qdp-core/src/gpu/distributed/planner.rs new file mode 100644 index 0000000000..747edace98 --- /dev/null +++ b/qdp/qdp-core/src/gpu/distributed/planner.rs @@ -0,0 +1,247 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use crate::error::{MahoutError, Result}; +use crate::gpu::topology::DeviceMesh; + +/// Runtime distribution modes for future multi-GPU state construction. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum DistributionMode { + Single, + ShardedCapacity, + Replicated, +} + +/// Placement policy for slicing the logical output state across devices. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum ShardPolicy { + /// Evenly partition the logical state into contiguous shards. 
+ /// + /// Because the global state length is always `2^n`, equal-width integer + /// shards are only possible for power-of-two device counts. + Equal, + BalancedUneven, +} + +/// Planner input describing the logical distributed state request. +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct PlacementRequest { + pub num_qubits: usize, + pub mode: DistributionMode, + pub shard_policy: ShardPolicy, +} + +impl PlacementRequest { + pub fn new(num_qubits: usize, mode: DistributionMode, shard_policy: ShardPolicy) -> Self { + Self { + num_qubits, + mode, + shard_policy, + } + } + + pub fn global_len(&self) -> Result { + 1usize.checked_shl(self.num_qubits as u32).ok_or_else(|| { + MahoutError::InvalidInput(format!( + "Global amplitude length overflow for {} qubits", + self.num_qubits + )) + }) + } +} + +/// One logical placement decision produced by the planner. +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct ShardPlacement { + pub device_id: usize, + pub shard_id: usize, + pub start_idx: usize, + pub end_idx: usize, +} + +impl ShardPlacement { + pub fn local_len(&self) -> usize { + self.end_idx - self.start_idx + } +} + +/// Planner output consumed by distributed encoders and future gather/export APIs. 
+#[derive(Clone, Debug, PartialEq, Eq)] +pub struct PlacementPlan { + pub mode: DistributionMode, + pub global_len: usize, + pub placements: Vec, + pub gather_device_id: Option, +} + +impl PlacementPlan { + pub fn num_devices(&self) -> usize { + self.placements.len() + } + + pub fn shard_len(&self) -> Result { + let Some(first) = self.placements.first() else { + return Err(MahoutError::InvalidInput( + "Placement plan must contain at least one shard".to_string(), + )); + }; + let shard_len = first.local_len(); + if self + .placements + .iter() + .any(|placement| placement.local_len() != shard_len) + { + return Err(MahoutError::InvalidInput( + "Placement plan contains uneven shard lengths".to_string(), + )); + } + Ok(shard_len) + } +} + +/// Stateless planner for device placement decisions. +#[derive(Clone, Debug, Default)] +pub struct PlacementPlanner; + +impl PlacementPlanner { + pub fn plan(mesh: &DeviceMesh, request: &PlacementRequest) -> Result { + if mesh.num_devices() == 0 { + return Err(MahoutError::InvalidInput( + "Placement planner requires at least one device".to_string(), + )); + } + + let global_len = request.global_len()?; + match request.mode { + DistributionMode::Single => { + let device_ids = Self::select_device_ids(mesh, request)?; + let device_id = *device_ids.first().ok_or_else(|| { + MahoutError::InvalidInput( + "Single-device placement requires one device ID".to_string(), + ) + })?; + Ok(PlacementPlan { + mode: request.mode, + global_len, + placements: vec![ShardPlacement { + device_id, + shard_id: 0, + start_idx: 0, + end_idx: global_len, + }], + gather_device_id: Some(device_id), + }) + } + DistributionMode::ShardedCapacity => { + let device_ids = Self::select_device_ids(mesh, request)?; + let shard_lengths = + Self::plan_shard_lengths(global_len, device_ids.len(), request.shard_policy)?; + let placements = Self::build_shard_placements(&device_ids, &shard_lengths); + + Ok(PlacementPlan { + mode: request.mode, + global_len, + placements, + 
gather_device_id: mesh.recommended_gather_device_id(), + }) + } + DistributionMode::Replicated => Err(MahoutError::NotImplemented( + "Replicated placement is not implemented yet".to_string(), + )), + } + } + + fn select_device_ids(mesh: &DeviceMesh, request: &PlacementRequest) -> Result> { + match request.mode { + DistributionMode::Single => mesh + .device_ids + .first() + .copied() + .map(|device_id| vec![device_id]) + .ok_or_else(|| { + MahoutError::InvalidInput( + "Single-device placement requires one device ID".to_string(), + ) + }), + DistributionMode::ShardedCapacity => { + let recommended = mesh.recommended_placement_device_ids(); + if recommended.is_empty() { + Ok(mesh.device_ids.clone()) + } else { + Ok(recommended) + } + } + DistributionMode::Replicated => Err(MahoutError::NotImplemented( + "Replicated placement is not implemented yet".to_string(), + )), + } + } + + fn plan_shard_lengths( + global_len: usize, + num_devices: usize, + shard_policy: ShardPolicy, + ) -> Result> { + if num_devices == 0 { + return Err(MahoutError::InvalidInput( + "Shard planning requires at least one device".to_string(), + )); + } + + match shard_policy { + ShardPolicy::Equal => { + if !num_devices.is_power_of_two() { + return Err(MahoutError::InvalidInput(format!( + "Equal shard policy requires a power-of-two device count, got {}", + num_devices + ))); + } + Ok(vec![global_len / num_devices; num_devices]) + } + ShardPolicy::BalancedUneven => { + let base_len = global_len / num_devices; + let remainder = global_len % num_devices; + Ok((0..num_devices) + .map(|shard_id| base_len + usize::from(shard_id < remainder)) + .collect()) + } + } + } + + fn build_shard_placements( + device_ids: &[usize], + shard_lengths: &[usize], + ) -> Vec { + let mut start_idx = 0usize; + device_ids + .iter() + .copied() + .zip(shard_lengths.iter().copied()) + .enumerate() + .map(|(shard_id, (device_id, local_len))| { + let end_idx = start_idx + local_len; + let placement = ShardPlacement { + 
device_id, + shard_id, + start_idx, + end_idx, + }; + start_idx = end_idx; + placement + }) + .collect() + } +} diff --git a/qdp/qdp-core/src/gpu/distributed/runtime.rs b/qdp/qdp-core/src/gpu/distributed/runtime.rs new file mode 100644 index 0000000000..2d7f42370c --- /dev/null +++ b/qdp/qdp-core/src/gpu/distributed/runtime.rs @@ -0,0 +1,298 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::sync::Arc; + +#[cfg(target_os = "linux")] +use crate::error::cuda_error_to_string; +use crate::error::{MahoutError, Result}; +use crate::gpu::communicator::Communicator; +#[cfg(target_os = "linux")] +use crate::gpu::memory::{BufferStorage, GpuBufferRaw}; +use crate::gpu::memory::{Precision, ensure_device_memory_available, map_allocation_error}; +use crate::gpu::topology::DeviceMesh; +use crate::gpu::{ + DistributedAmplitudePlan, DistributedStateLayout, DistributedStateVector, PlacementRequest, +}; +#[cfg(target_os = "linux")] +use cudarc::driver::{DevicePtr, DevicePtrMut}; +#[cfg(target_os = "linux")] +use qdp_kernels::{ + CuComplex, CuDoubleComplex, launch_amplitude_encode, launch_amplitude_encode_f32, +}; +#[cfg(target_os = "linux")] +use std::ffi::c_void; + +pub(crate) fn validate_distributed_input( + host_data: &[f64], + request: &PlacementRequest, +) -> Result<()> { + if request.num_qubits == 0 { + return Err(MahoutError::InvalidInput( + "Number of qubits must be at least 1 for distributed amplitude planning".to_string(), + )); + } + + if host_data.is_empty() { + return Err(MahoutError::InvalidInput( + "Input data cannot be empty".to_string(), + )); + } + + let state_len = request.global_len()?; + if host_data.len() > state_len { + return Err(MahoutError::InvalidInput(format!( + "Input data length {} exceeds state vector size {}", + host_data.len(), + state_len + ))); + } + + Ok(()) +} + +pub(crate) fn plan_distributed_encode( + mesh: &DeviceMesh, + host_data: &[f64], + request: PlacementRequest, +) -> Result { + validate_distributed_input(host_data, &request)?; + DistributedAmplitudePlan::for_request(mesh, request) +} + +pub(crate) fn calculate_local_norm_sq( + host_data: &[f64], + start_idx: usize, + end_idx: usize, +) -> Result { + if start_idx > end_idx { + return Err(MahoutError::InvalidInput(format!( + "Invalid shard range: start {} exceeds end {}", + start_idx, end_idx + ))); + } + + let slice_end = end_idx.min(host_data.len()); + if 
start_idx >= slice_end { + return Ok(0.0); + } + + let mut local_sum = 0.0f64; + for &value in &host_data[start_idx..slice_end] { + if !value.is_finite() { + return Err(MahoutError::InvalidInput( + "Input data contains NaN or Inf".to_string(), + )); + } + local_sum += value * value; + } + Ok(local_sum) +} + +pub(crate) fn calculate_inv_norm_distributed( + plan: &DistributedAmplitudePlan, + host_data: &[f64], + communicator: &dyn Communicator, +) -> Result { + let mut partials = Vec::with_capacity(plan.num_devices); + for shard_id in 0..plan.num_devices { + let (start_idx, end_idx) = plan.shard_range(shard_id)?; + partials.push(calculate_local_norm_sq(host_data, start_idx, end_idx)?); + } + + let global_norm_sq = communicator.reduce_sum_f64(&partials)?; + if global_norm_sq <= 0.0 || !global_norm_sq.is_finite() { + return Err(MahoutError::InvalidInput( + "Input data has zero or non-finite norm (contains NaN, Inf, or all zeros)".to_string(), + )); + } + + Ok(1.0 / global_norm_sq.sqrt()) +} + +pub(crate) fn prepare_distributed_encode( + mesh: &DeviceMesh, + host_data: &[f64], + precision: Precision, + request: PlacementRequest, + communicator: &dyn Communicator, +) -> Result<(DistributedAmplitudePlan, f64, DistributedStateLayout)> { + let plan = plan_distributed_encode(mesh, host_data, request)?; + let inv_norm = calculate_inv_norm_distributed(&plan, host_data, communicator)?; + let layout = DistributedStateLayout::new(mesh, &plan, precision)?; + Ok((plan, inv_norm, layout)) +} + +#[cfg(target_os = "linux")] +pub(crate) fn encode_distributed_to_shards( + mesh: &DeviceMesh, + host_data: &[f64], + precision: Precision, + request: PlacementRequest, + communicator: &dyn Communicator, +) -> Result { + let (plan, inv_norm, layout) = + prepare_distributed_encode(mesh, host_data, precision, request, communicator)?; + let num_qubits = plan.request.num_qubits; + + let mut buffers = Vec::with_capacity(plan.num_devices); + for (shard_id, device) in mesh.devices.iter().enumerate() 
{ + let (start_idx, end_idx) = plan.shard_range(shard_id)?; + let local_len = end_idx - start_idx; + let slice_end = end_idx.min(host_data.len()); + let present_len = slice_end.saturating_sub(start_idx); + + let buffer = match precision { + Precision::Float32 => { + let requested_bytes = distributed_shard_bytes::(local_len)?; + ensure_device_memory_available( + requested_bytes, + "distributed amplitude shard allocation (f32)", + Some(num_qubits), + Some(device.ordinal()), + )?; + let mut state_slice = device.alloc_zeros::(local_len).map_err(|e| { + map_allocation_error( + requested_bytes, + "distributed amplitude shard allocation (f32)", + Some(num_qubits), + Some(device.ordinal()), + e, + ) + })?; + + if present_len > 0 { + let host_input = host_data[start_idx..slice_end] + .iter() + .map(|&value| value as f32) + .collect::>(); + let input_slice = device.htod_sync_copy(&host_input).map_err(|e| { + MahoutError::MemoryAllocation(format!( + "Failed to upload distributed amplitude shard input (f32): {:?}", + e + )) + })?; + + let ret = unsafe { + launch_amplitude_encode_f32( + *input_slice.device_ptr() as *const f32, + *state_slice.device_ptr_mut() as *mut c_void, + present_len, + local_len, + inv_norm as f32, + std::ptr::null_mut(), + ) + }; + + if ret != 0 { + return Err(MahoutError::KernelLaunch(format!( + "Distributed amplitude shard kernel failed on cuda:{} with CUDA error code: {} ({})", + device.ordinal(), + ret, + cuda_error_to_string(ret) + ))); + } + + device.synchronize().map_err(|e| { + MahoutError::Cuda(format!( + "Distributed amplitude shard synchronize failed on cuda:{}: {:?}", + device.ordinal(), + e + )) + })?; + } + + Arc::new(BufferStorage::F32(GpuBufferRaw { slice: state_slice })) + } + Precision::Float64 => { + let requested_bytes = distributed_shard_bytes::(local_len)?; + ensure_device_memory_available( + requested_bytes, + "distributed amplitude shard allocation", + Some(num_qubits), + Some(device.ordinal()), + )?; + let mut state_slice = + 
device + .alloc_zeros::(local_len) + .map_err(|e| { + map_allocation_error( + requested_bytes, + "distributed amplitude shard allocation", + Some(num_qubits), + Some(device.ordinal()), + e, + ) + })?; + + if present_len > 0 { + let input_slice = device + .htod_sync_copy(&host_data[start_idx..slice_end]) + .map_err(|e| { + MahoutError::MemoryAllocation(format!( + "Failed to upload distributed amplitude shard input: {:?}", + e + )) + })?; + + let ret = unsafe { + launch_amplitude_encode( + *input_slice.device_ptr() as *const f64, + *state_slice.device_ptr_mut() as *mut c_void, + present_len, + local_len, + inv_norm, + std::ptr::null_mut(), + ) + }; + + if ret != 0 { + return Err(MahoutError::KernelLaunch(format!( + "Distributed amplitude shard kernel failed on cuda:{} with CUDA error code: {} ({})", + device.ordinal(), + ret, + cuda_error_to_string(ret) + ))); + } + + device.synchronize().map_err(|e| { + MahoutError::Cuda(format!( + "Distributed amplitude shard synchronize failed on cuda:{}: {:?}", + device.ordinal(), + e + )) + })?; + } + + Arc::new(BufferStorage::F64(GpuBufferRaw { slice: state_slice })) + } + }; + + buffers.push(buffer); + } + + DistributedStateVector::new_with_buffers(layout, buffers) +} + +#[cfg(target_os = "linux")] +fn distributed_shard_bytes(len: usize) -> Result { + len.checked_mul(std::mem::size_of::()).ok_or_else(|| { + MahoutError::MemoryAllocation(format!( + "Distributed shard allocation size overflow (elements={})", + len + )) + }) +} diff --git a/qdp/qdp-core/src/gpu/distributed/shared.rs b/qdp/qdp-core/src/gpu/distributed/shared.rs new file mode 100644 index 0000000000..28f5708de6 --- /dev/null +++ b/qdp/qdp-core/src/gpu/distributed/shared.rs @@ -0,0 +1,64 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. 
+// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use crate::gpu::topology::GpuTopology; + +pub(crate) fn policy_device_ids( + topology: &GpuTopology, + shard_device_ids: impl IntoIterator, +) -> (Option, Vec) { + let shard_device_ids = shard_device_ids.into_iter().collect::>(); + let recommended_gather_device_id = topology + .preferred_gather_index() + .and_then(|idx| shard_device_ids.get(idx).copied()); + let recommended_placement_device_ids = topology + .recommended_placement_order() + .into_iter() + .filter_map(|idx| shard_device_ids.get(idx).copied()) + .collect(); + + ( + recommended_gather_device_id, + recommended_placement_device_ids, + ) +} + +#[cfg(test)] +mod tests { + use super::policy_device_ids; + use crate::gpu::topology::{GpuTopology, LinkKind}; + + #[test] + fn policy_device_ids_skips_out_of_range_topology_indices() { + let topology = GpuTopology { + peer_access: vec![ + vec![true, true, false], + vec![true, true, true], + vec![false, true, true], + ], + links: vec![ + vec![LinkKind::SameDevice, LinkKind::Pix, LinkKind::Unknown], + vec![LinkKind::Pix, LinkKind::SameDevice, LinkKind::Node], + vec![LinkKind::Unknown, LinkKind::Node, LinkKind::SameDevice], + ], + }; + + let (gather, placement) = policy_device_ids(&topology, [17, 23]); + + assert_eq!(gather, Some(23)); + assert_eq!(placement, vec![23, 17]); + } +} diff --git a/qdp/qdp-core/src/gpu/distributed/state.rs b/qdp/qdp-core/src/gpu/distributed/state.rs new file 
mode 100644 index 0000000000..9f1cf4a38b --- /dev/null +++ b/qdp/qdp-core/src/gpu/distributed/state.rs @@ -0,0 +1,162 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; + +use cudarc::driver::CudaDevice; +#[cfg(target_os = "linux")] +use qdp_kernels::{CuComplex, CuDoubleComplex}; + +use crate::error::{MahoutError, Result}; +use crate::gpu::memory::{BufferStorage, Precision}; + +use super::DistributedStateLayout; +use super::shared; + +/// One materialized shard of a distributed state vector. +#[derive(Clone)] +pub struct StateShard { + pub device: Arc, + pub device_id: usize, + pub shard_id: usize, + pub start_idx: usize, + pub end_idx: usize, + pub local_len: usize, + pub buffer: Arc, +} + +/// Materialized multi-GPU state vector with one live buffer per shard. 
+#[derive(Clone)] +pub struct DistributedStateVector { + pub num_qubits: usize, + pub precision: Precision, + pub global_len: usize, + pub shard_bits: Option, + pub topology: crate::gpu::topology::GpuTopology, + pub shards: Vec, +} + +impl DistributedStateVector { + pub fn recommended_gather_device_id(&self) -> Option { + shared::policy_device_ids( + &self.topology, + self.shards.iter().map(|shard| shard.device_id), + ) + .0 + } + + pub fn recommended_placement_device_ids(&self) -> Vec { + shared::policy_device_ids( + &self.topology, + self.shards.iter().map(|shard| shard.device_id), + ) + .1 + } + + pub fn num_shards(&self) -> usize { + self.shards.len() + } + + #[cfg(target_os = "linux")] + pub fn copy_shard_to_host_f64(&self, shard_id: usize) -> Result> { + let shard = self.shards.get(shard_id).ok_or_else(|| { + MahoutError::InvalidInput(format!( + "Shard ID {} out of range for {} shards", + shard_id, + self.shards.len() + )) + })?; + match shard.buffer.as_ref() { + BufferStorage::F64(buf) => shard.device.dtoh_sync_copy(&buf.slice).map_err(|e| { + MahoutError::Cuda(format!( + "Failed to copy distributed float64 shard {} to host: {:?}", + shard_id, e + )) + }), + BufferStorage::F32(_) => Err(MahoutError::InvalidInput(format!( + "Shard {} stores float32 data, not float64", + shard_id + ))), + } + } + + #[cfg(target_os = "linux")] + pub fn copy_shard_to_host_f32(&self, shard_id: usize) -> Result> { + let shard = self.shards.get(shard_id).ok_or_else(|| { + MahoutError::InvalidInput(format!( + "Shard ID {} out of range for {} shards", + shard_id, + self.shards.len() + )) + })?; + match shard.buffer.as_ref() { + BufferStorage::F32(buf) => shard.device.dtoh_sync_copy(&buf.slice).map_err(|e| { + MahoutError::Cuda(format!( + "Failed to copy distributed float32 shard {} to host: {:?}", + shard_id, e + )) + }), + BufferStorage::F64(_) => Err(MahoutError::InvalidInput(format!( + "Shard {} stores float64 data, not float32", + shard_id + ))), + } + } + + /// Construct a 
distributed state vector with concrete shard buffers already allocated. + pub fn new_with_buffers( + layout: DistributedStateLayout, + buffers: Vec>, + ) -> Result { + if buffers.len() != layout.shards.len() { + return Err(MahoutError::InvalidInput(format!( + "Distributed state buffer mismatch: {} buffers for {} shards", + buffers.len(), + layout.shards.len() + ))); + } + + let mut shards = Vec::with_capacity(layout.shards.len()); + for (shard_layout, buffer) in layout.shards.into_iter().zip(buffers.into_iter()) { + if buffer.precision() != layout.precision { + return Err(MahoutError::InvalidInput(format!( + "Distributed shard precision mismatch on shard {}: expected {:?}, got {:?}", + shard_layout.shard_id, + layout.precision, + buffer.precision() + ))); + } + shards.push(StateShard { + device: shard_layout.device, + device_id: shard_layout.device_id, + shard_id: shard_layout.shard_id, + start_idx: shard_layout.start_idx, + end_idx: shard_layout.end_idx, + local_len: shard_layout.local_len, + buffer, + }); + } + + Ok(Self { + num_qubits: layout.num_qubits, + precision: layout.precision, + global_len: layout.global_len, + shard_bits: layout.shard_bits, + topology: layout.topology, + shards, + }) + } +} diff --git a/qdp/qdp-core/src/gpu/mod.rs b/qdp/qdp-core/src/gpu/mod.rs index 7e16be7be3..eee6d2014a 100644 --- a/qdp/qdp-core/src/gpu/mod.rs +++ b/qdp/qdp-core/src/gpu/mod.rs @@ -16,8 +16,10 @@ #[cfg(target_os = "linux")] pub mod buffer_pool; +pub mod communicator; #[cfg(target_os = "linux")] pub(crate) mod cuda_sync; +pub mod distributed; pub mod encodings; pub mod memory; #[cfg(target_os = "linux")] @@ -25,15 +27,23 @@ pub mod overlap_tracker; pub mod pipeline; #[cfg(target_os = "linux")] pub mod pool_metrics; +pub mod topology; #[cfg(target_os = "linux")] pub(crate) mod cuda_ffi; #[cfg(target_os = "linux")] pub use buffer_pool::{PinnedBufferHandle, PinnedBufferPool}; +pub use communicator::{Communicator, HostCommunicator}; +pub use distributed::{ + 
DistributedAmplitudePlan, DistributedStateLayout, DistributedStateVector, DistributionMode, + PlacementPlan, PlacementPlanner, PlacementRequest, PreparedDistributedAmplitudeEncode, + ShardPlacement, ShardPolicy, StateShard, StateShardLayout, +}; pub use encodings::{AmplitudeEncoder, AngleEncoder, BasisEncoder, QuantumEncoder, get_encoder}; pub use memory::GpuStateVector; pub use pipeline::run_dual_stream_pipeline; +pub use topology::{DeviceMesh, GpuTopology, LinkKind}; #[cfg(target_os = "linux")] pub use overlap_tracker::OverlapTracker; diff --git a/qdp/qdp-core/src/gpu/topology.rs b/qdp/qdp-core/src/gpu/topology.rs new file mode 100644 index 0000000000..170b1200ab --- /dev/null +++ b/qdp/qdp-core/src/gpu/topology.rs @@ -0,0 +1,293 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; + +use cudarc::driver::CudaDevice; + +use crate::error::{MahoutError, Result}; +#[cfg(target_os = "linux")] +use crate::{ + cuda_error_to_string, + gpu::cuda_ffi::{CUDA_SUCCESS, cudaDeviceCanAccessPeer}, +}; + +/// Coarse-grained GPU interconnect classification used for future placement policies. 
+#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum LinkKind { + SameDevice, + Pix, + Node, + Sys, + Unknown, +} + +impl LinkKind { + fn score(self) -> usize { + match self { + LinkKind::SameDevice => 4, + LinkKind::Pix => 3, + LinkKind::Node => 2, + LinkKind::Sys => 1, + LinkKind::Unknown => 0, + } + } +} + +/// Runtime topology metadata for a device mesh. +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct GpuTopology { + pub peer_access: Vec>, + pub links: Vec>, +} + +impl GpuTopology { + /// Create a placeholder topology where only self-access is guaranteed. + pub fn placeholder(num_devices: usize) -> Self { + let mut peer_access = vec![vec![false; num_devices]; num_devices]; + let mut links = vec![vec![LinkKind::Unknown; num_devices]; num_devices]; + + for idx in 0..num_devices { + peer_access[idx][idx] = true; + links[idx][idx] = LinkKind::SameDevice; + } + + Self { peer_access, links } + } + + #[cfg(target_os = "linux")] + pub fn probe(device_ids: &[usize]) -> Result { + let num_devices = device_ids.len(); + let mut topology = Self::placeholder(num_devices); + + for (src_idx, &src_device_id) in device_ids.iter().enumerate() { + for (dst_idx, &dst_device_id) in device_ids.iter().enumerate() { + if src_idx == dst_idx { + continue; + } + + let mut can_access_peer = 0i32; + let ret = unsafe { + cudaDeviceCanAccessPeer( + &mut can_access_peer as *mut i32, + src_device_id as i32, + dst_device_id as i32, + ) + }; + + if ret != CUDA_SUCCESS { + return Err(MahoutError::Cuda(format!( + "cudaDeviceCanAccessPeer(cuda:{}, cuda:{}) failed: {} ({})", + src_device_id, + dst_device_id, + ret, + cuda_error_to_string(ret) + ))); + } + + topology.peer_access[src_idx][dst_idx] = can_access_peer != 0; + } + } + + Ok(topology) + } + + pub fn preferred_gather_index(&self) -> Option { + if self.peer_access.is_empty() { + return None; + } + + let mut best_idx = 0usize; + let mut best_score = 0usize; + for idx in 0..self.peer_access.len() { + let mut score = 0usize; + for peer_idx 
in 0..self.peer_access.len() { + if self.peer_access[idx][peer_idx] { + score += 10 + self.links[idx][peer_idx].score(); + } + if self.peer_access[peer_idx][idx] { + score += 10 + self.links[peer_idx][idx].score(); + } + } + + if score > best_score { + best_score = score; + best_idx = idx; + } + } + + Some(best_idx) + } + + pub fn recommended_placement_order(&self) -> Vec { + let Some(root_idx) = self.preferred_gather_index() else { + return Vec::new(); + }; + + let mut indices: Vec = (0..self.peer_access.len()).collect(); + indices.sort_by_key(|&idx| { + let mut score = 0usize; + if self.peer_access[root_idx][idx] { + score += 10 + self.links[root_idx][idx].score(); + } + if self.peer_access[idx][root_idx] { + score += 10 + self.links[idx][root_idx].score(); + } + (usize::MAX - score, idx) + }); + indices + } +} + +/// A validated collection of CUDA devices that will eventually back distributed state vectors. +#[derive(Clone)] +pub struct DeviceMesh { + pub device_ids: Vec, + pub devices: Vec>, + pub topology: GpuTopology, +} + +impl DeviceMesh { + /// Build a mesh from CUDA device IDs with a placeholder topology. + /// + /// This intentionally keeps PR1 lightweight: it validates shape and initializes + /// CUDA devices, while a later PR can replace placeholder topology discovery with + /// explicit peer-access probing and richer link metadata. 
+ pub fn new(device_ids: Vec) -> Result { + Self::validate_device_ids(&device_ids)?; + + let mut devices = Vec::with_capacity(device_ids.len()); + for &device_id in &device_ids { + let device = CudaDevice::new(device_id).map_err(|e| { + MahoutError::Cuda(format!( + "Failed to initialize CUDA device {} for device mesh: {:?}", + device_id, e + )) + })?; + devices.push(device); + } + + #[cfg(target_os = "linux")] + let topology = GpuTopology::probe(&device_ids)?; + #[cfg(not(target_os = "linux"))] + let topology = GpuTopology::placeholder(device_ids.len()); + + Ok(Self { + device_ids, + devices, + topology, + }) + } + + /// Build a mesh from explicit parts. Intended for tests and future topology injection. + pub fn from_parts( + device_ids: Vec, + devices: Vec>, + topology: GpuTopology, + ) -> Result { + Self::validate_device_ids(&device_ids)?; + + if device_ids.len() != devices.len() { + return Err(MahoutError::InvalidInput(format!( + "Device mesh mismatch: {} device IDs but {} device handles", + device_ids.len(), + devices.len() + ))); + } + for (&device_id, device) in device_ids.iter().zip(devices.iter()) { + if device.ordinal() != device_id { + return Err(MahoutError::InvalidInput(format!( + "Device mesh ordinal mismatch: declared cuda:{} but handle targets cuda:{}", + device_id, + device.ordinal() + ))); + } + } + + let num_devices = device_ids.len(); + if topology.peer_access.len() != num_devices || topology.links.len() != num_devices { + return Err(MahoutError::InvalidInput(format!( + "Topology dimension mismatch for {} devices", + num_devices + ))); + } + if topology + .peer_access + .iter() + .any(|row| row.len() != num_devices) + || topology.links.iter().any(|row| row.len() != num_devices) + { + return Err(MahoutError::InvalidInput(format!( + "Topology row width mismatch for {} devices", + num_devices + ))); + } + + Ok(Self { + device_ids, + devices, + topology, + }) + } + + pub fn num_devices(&self) -> usize { + self.device_ids.len() + } + + pub fn 
validate_power_of_two(&self) -> Result<()> { + let num_devices = self.num_devices(); + if !num_devices.is_power_of_two() { + return Err(MahoutError::InvalidInput(format!( + "Distributed QDP currently requires a power-of-two device count, got {}", + num_devices + ))); + } + Ok(()) + } + + pub fn recommended_gather_device_id(&self) -> Option { + self.topology + .preferred_gather_index() + .map(|idx| self.device_ids[idx]) + } + + pub fn recommended_placement_device_ids(&self) -> Vec { + self.topology + .recommended_placement_order() + .into_iter() + .map(|idx| self.device_ids[idx]) + .collect() + } + + fn validate_device_ids(device_ids: &[usize]) -> Result<()> { + if device_ids.is_empty() { + return Err(MahoutError::InvalidInput( + "Device mesh requires at least one device ID".to_string(), + )); + } + + let mut sorted = device_ids.to_vec(); + sorted.sort_unstable(); + sorted.dedup(); + if sorted.len() != device_ids.len() { + return Err(MahoutError::InvalidInput( + "Device mesh contains duplicate device IDs".to_string(), + )); + } + + Ok(()) + } +} diff --git a/qdp/qdp-core/src/lib.rs b/qdp/qdp-core/src/lib.rs index 799eb7b180..7bbb963cfe 100644 --- a/qdp/qdp-core/src/lib.rs +++ b/qdp/qdp-core/src/lib.rs @@ -36,6 +36,11 @@ mod profiling; pub use error::{MahoutError, Result, cuda_error_to_string}; pub use gpu::memory::Precision; +pub use gpu::{ + DeviceMesh, DistributedAmplitudePlan, DistributedStateLayout, DistributedStateVector, + DistributionMode, GpuTopology, LinkKind, PlacementPlan, PlacementPlanner, PlacementRequest, + PreparedDistributedAmplitudeEncode, ShardPlacement, ShardPolicy, +}; pub use reader::{NullHandling, handle_float64_nulls}; // Throughput/latency pipeline runner: single path using QdpEngine and encode_batch in Rust. 
@@ -52,6 +57,8 @@ use std::ffi::c_void; use std::sync::Arc; use crate::dlpack::DLManagedTensor; +use crate::gpu::HostCommunicator; +use crate::gpu::distributed::runtime; use crate::gpu::get_encoder; use cudarc::driver::CudaDevice; @@ -138,6 +145,85 @@ impl QdpEngine { }) } + /// Build a validated multi-GPU mesh for future distributed encoding paths. + /// + /// PR1 exposes mesh construction without changing the single-device engine shape. + pub fn new_distributed_mesh(device_ids: Vec) -> Result { + DeviceMesh::new(device_ids) + } + + /// Prepare a distributed amplitude encode on a validated multi-GPU mesh. + /// + /// This stage performs input validation, shard planning, and host-coordinated + /// global norm reduction, then returns metadata for the future distributed + /// state layout. It does not yet allocate per-shard buffers or launch + /// multi-GPU kernels. + #[doc(hidden)] + pub fn prepare_distributed_amplitude( + device_ids: Vec, + data: &[f64], + num_qubits: usize, + precision: Precision, + request: Option, + ) -> Result { + let request = Self::resolve_distributed_request(num_qubits, request)?; + runtime::validate_distributed_input(data, &request)?; + let mesh = Self::new_distributed_mesh(device_ids)?; + let communicator = HostCommunicator; + let (plan, inv_norm, layout) = + runtime::prepare_distributed_encode(&mesh, data, precision, request, &communicator)?; + + Ok(PreparedDistributedAmplitudeEncode { + mesh, + plan, + inv_norm, + layout, + }) + } + + /// Encode amplitude data into real per-device shard buffers for a distributed + /// state vector on a single host. + /// + /// This is the first executable distributed path: it returns a sharded state + /// object with one GPU buffer per shard. It does not expose DLPack or gather + /// semantics yet. 
+ #[doc(hidden)] + pub fn encode_distributed_amplitude_to_shards( + device_ids: Vec, + data: &[f64], + num_qubits: usize, + precision: Precision, + request: Option, + ) -> Result { + let request = Self::resolve_distributed_request(num_qubits, request)?; + runtime::validate_distributed_input(data, &request)?; + let mesh = Self::new_distributed_mesh(device_ids)?; + let communicator = HostCommunicator; + runtime::encode_distributed_to_shards(&mesh, data, precision, request, &communicator) + } + + fn resolve_distributed_request( + num_qubits: usize, + request: Option, + ) -> Result { + match request { + Some(request) => { + if request.num_qubits != num_qubits { + return Err(MahoutError::InvalidInput(format!( + "Distributed request qubit mismatch: argument specifies {} qubits but request specifies {}", + num_qubits, request.num_qubits + ))); + } + Ok(request) + } + None => Ok(PlacementRequest::new( + num_qubits, + DistributionMode::ShardedCapacity, + ShardPolicy::Equal, + )), + } + } + /// Encode classical data into quantum state /// /// Selects encoding strategy, executes on GPU, returns DLPack pointer. 
From 9086a50fd8354e572f2c26b6181410486bafc335 Mon Sep 17 00:00:00 2001 From: viiccwen Date: Mon, 4 May 2026 17:18:35 +0000 Subject: [PATCH 3/5] test(gpu): cover distributed runtime and planning --- qdp/qdp-core/tests/gpu_api_workflow.rs | 58 ++++- .../tests/gpu_distributed_amplitude_plan.rs | 140 +++++++++++ .../tests/gpu_distributed_communicator.rs | 19 ++ qdp/qdp-core/tests/gpu_distributed_engine.rs | 219 ++++++++++++++++++ qdp/qdp-core/tests/gpu_distributed_planner.rs | 139 +++++++++++ qdp/qdp-core/tests/gpu_distributed_runtime.rs | 144 ++++++++++++ qdp/qdp-core/tests/gpu_topology.rs | 79 +++++++ 7 files changed, 790 insertions(+), 8 deletions(-) create mode 100644 qdp/qdp-core/tests/gpu_distributed_amplitude_plan.rs create mode 100644 qdp/qdp-core/tests/gpu_distributed_communicator.rs create mode 100644 qdp/qdp-core/tests/gpu_distributed_engine.rs create mode 100644 qdp/qdp-core/tests/gpu_distributed_planner.rs create mode 100644 qdp/qdp-core/tests/gpu_distributed_runtime.rs create mode 100644 qdp/qdp-core/tests/gpu_topology.rs diff --git a/qdp/qdp-core/tests/gpu_api_workflow.rs b/qdp/qdp-core/tests/gpu_api_workflow.rs index 6c8e651669..cd79ac32a6 100644 --- a/qdp/qdp-core/tests/gpu_api_workflow.rs +++ b/qdp/qdp-core/tests/gpu_api_workflow.rs @@ -18,9 +18,9 @@ #[cfg(target_os = "linux")] use qdp_core::MahoutError; -use qdp_core::QdpEngine; #[cfg(target_os = "linux")] use qdp_core::gpu::pipeline::run_dual_stream_pipeline_aligned; +use qdp_core::{Precision, QdpEngine}; mod common; @@ -63,7 +63,6 @@ fn test_amplitude_encoding_workflow() { assert!(!dlpack_ptr.is_null(), "DLPack pointer should not be null"); println!("PASS: Encoding succeeded, DLPack pointer valid"); - // Simulate PyTorch behavior: manually call deleter to free GPU memory unsafe { println!("Calling deleter to free GPU memory"); common::take_deleter_and_delete(dlpack_ptr); @@ -81,7 +80,6 @@ fn test_amplitude_encoding_async_pipeline() { return; }; - // Use 200000 elements to trigger async pipeline 
path (ASYNC_THRESHOLD = 131072) let data = common::create_test_data(200000); println!("Created test data: {} elements", data.len()); @@ -109,7 +107,7 @@ fn test_angle_encoding_async_pipeline() { let num_qubits = 4; let sample_size = num_qubits; - let num_samples = 32768; // 32768 * 4 = 131072 elements (>= 1MB threshold) + let num_samples = 32768; let batch_data = common::create_test_data(num_samples * sample_size); let result = engine.encode_batch(&batch_data, num_samples, sample_size, num_qubits, "angle"); @@ -162,7 +160,6 @@ fn test_batch_dlpack_2d_shape() { return; }; - // Create batch data: 3 samples, each with 4 elements (2 qubits) let num_samples = 3; let num_qubits = 2; let sample_size = 4; @@ -248,6 +245,54 @@ fn test_single_encode_dlpack_2d_shape() { } } +#[test] +#[cfg(target_os = "linux")] +fn test_distributed_amplitude_two_gpu_smoke() { + println!("Testing distributed amplitude shard encode on two GPUs..."); + + if common::cuda_device().is_none() { + println!("SKIP: No GPU available"); + return; + } + + if cudarc::driver::CudaDevice::new(1).is_err() { + println!("SKIP: Second GPU unavailable"); + return; + } + + let data = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]; + let distributed = QdpEngine::encode_distributed_amplitude_to_shards( + vec![0, 1], + &data, + 3, + Precision::Float64, + None, + ) + .expect("Distributed amplitude shard encode should succeed"); + + assert_eq!(distributed.num_shards(), 2, "Expected two shards"); + assert_eq!(distributed.shards[0].device_id, 0); + assert_eq!(distributed.shards[1].device_id, 1); + + let shard0 = distributed.copy_shard_to_host_f64(0).unwrap(); + let shard1 = distributed.copy_shard_to_host_f64(1).unwrap(); + + assert_eq!(shard0.len(), 4); + assert_eq!(shard1.len(), 4); + + let total_prob: f64 = shard0 + .iter() + .chain(shard1.iter()) + .map(|value| value.x * value.x + value.y * value.y) + .sum(); + assert!( + (total_prob - 1.0).abs() < 1e-10, + "Distributed state should remain normalized" + ); + + println!("PASS: 
Distributed amplitude shard encode succeeded on two GPUs"); +} + #[test] #[cfg(target_os = "linux")] fn test_dlpack_device_id() { @@ -269,13 +314,11 @@ fn test_dlpack_device_id() { let managed = &*dlpack_ptr; let tensor = &managed.dl_tensor; - // Verify device_id is correctly set (0 for device 0) assert_eq!( tensor.device.device_id, 0, "device_id should be 0 for device 0" ); - // Verify device_type is CUDA (kDLCUDA = 2) use qdp_core::dlpack::DLDeviceType; match tensor.device.device_type { DLDeviceType::kDLCUDA => println!("PASS: Device type is CUDA"), @@ -287,7 +330,6 @@ fn test_dlpack_device_id() { tensor.device.device_id ); - // Free memory common::take_deleter_and_delete(dlpack_ptr); } } diff --git a/qdp/qdp-core/tests/gpu_distributed_amplitude_plan.rs b/qdp/qdp-core/tests/gpu_distributed_amplitude_plan.rs new file mode 100644 index 0000000000..702f551a77 --- /dev/null +++ b/qdp/qdp-core/tests/gpu_distributed_amplitude_plan.rs @@ -0,0 +1,140 @@ +use qdp_core::{ + DeviceMesh, DistributedAmplitudePlan, DistributedStateLayout, DistributionMode, GpuTopology, + PlacementRequest, Precision, ShardPolicy, +}; + +#[test] +fn distributed_amplitude_plan_has_expected_shard_math() { + let topology = GpuTopology::placeholder(4); + let mesh = DeviceMesh { + device_ids: vec![0, 1, 2, 3], + devices: Vec::new(), + topology, + }; + let request = PlacementRequest::new(4, DistributionMode::ShardedCapacity, ShardPolicy::Equal); + let plan = DistributedAmplitudePlan::for_request(&mesh, request).unwrap(); + assert_eq!(plan.global_len, 16); + assert_eq!(plan.shard_bits, Some(2)); + assert_eq!(plan.uniform_shard_len, Some(4)); + assert_eq!(plan.shard_range(0).unwrap(), (0, 4)); + assert_eq!(plan.shard_range(3).unwrap(), (12, 16)); +} + +#[test] +fn distributed_amplitude_plan_rejects_too_many_devices_for_qubits() { + let topology = GpuTopology::placeholder(4); + let mesh = DeviceMesh { + device_ids: vec![0, 1, 2, 3], + devices: Vec::new(), + topology, + }; + let request = 
PlacementRequest::new(1, DistributionMode::ShardedCapacity, ShardPolicy::Equal); + let err = DistributedAmplitudePlan::for_request(&mesh, request).unwrap_err(); + assert!(matches!(err, qdp_core::MahoutError::InvalidInput(_))); +} + +#[test] +fn distributed_amplitude_plan_allows_extra_global_qubit_when_shards_fit() { + let topology = GpuTopology::placeholder(2); + let mesh = DeviceMesh { + device_ids: vec![0, 1], + devices: Vec::new(), + topology, + }; + let request = PlacementRequest::new(31, DistributionMode::ShardedCapacity, ShardPolicy::Equal); + let plan = DistributedAmplitudePlan::for_request(&mesh, request).unwrap(); + assert_eq!(plan.global_len, 1usize << 31); + assert_eq!(plan.uniform_shard_len, Some(1usize << 30)); +} + +#[test] +fn distributed_amplitude_plan_rejects_global_qubits_when_local_shard_exceeds_single_gpu_limit() { + let topology = GpuTopology::placeholder(1); + let mesh = DeviceMesh { + device_ids: vec![0], + devices: Vec::new(), + topology, + }; + let request = PlacementRequest::new(31, DistributionMode::ShardedCapacity, ShardPolicy::Equal); + let err = DistributedAmplitudePlan::for_request(&mesh, request).unwrap_err(); + assert!(matches!( + err, + qdp_core::MahoutError::InvalidInput(msg) if msg.contains("per-device capacity") + )); +} + +#[test] +fn distributed_amplitude_plan_rejects_zero_qubits() { + let topology = GpuTopology::placeholder(1); + let mesh = DeviceMesh { + device_ids: vec![0], + devices: Vec::new(), + topology, + }; + let request = PlacementRequest::new(0, DistributionMode::ShardedCapacity, ShardPolicy::Equal); + let err = DistributedAmplitudePlan::for_request(&mesh, request).unwrap_err(); + assert!(matches!( + err, + qdp_core::MahoutError::InvalidInput(msg) + if msg.contains("at least 1") + )); +} + +#[test] +fn distributed_amplitude_plan_reports_out_of_range_shard_ids() { + let topology = GpuTopology::placeholder(2); + let mesh = DeviceMesh { + device_ids: vec![0, 1], + devices: Vec::new(), + topology, + }; + let request = 
PlacementRequest::new(2, DistributionMode::ShardedCapacity, ShardPolicy::Equal); + let plan = DistributedAmplitudePlan::for_request(&mesh, request).unwrap(); + let err = plan.shard_range(2).unwrap_err(); + assert!(matches!( + err, + qdp_core::MahoutError::InvalidInput(msg) + if msg.contains("out of range") + )); +} + +#[test] +fn balanced_uneven_distributed_amplitude_plan_omits_uniform_shard_metadata() { + let topology = GpuTopology::placeholder(3); + let mesh = DeviceMesh { + device_ids: vec![0, 1, 2], + devices: Vec::new(), + topology, + }; + let request = PlacementRequest::new( + 3, + DistributionMode::ShardedCapacity, + ShardPolicy::BalancedUneven, + ); + let plan = DistributedAmplitudePlan::for_request(&mesh, request).unwrap(); + assert_eq!(plan.shard_bits, None); + assert_eq!(plan.uniform_shard_len, None); + assert_eq!(plan.shard_range(2).unwrap(), (6, 8)); +} + +#[test] +fn distributed_state_layout_rejects_mesh_with_missing_device_handles() { + let topology = GpuTopology::placeholder(2); + let mesh = DeviceMesh { + device_ids: vec![0, 1], + devices: Vec::new(), + topology, + }; + let request = PlacementRequest::new(2, DistributionMode::ShardedCapacity, ShardPolicy::Equal); + let plan = DistributedAmplitudePlan::for_request(&mesh, request).unwrap(); + + let err = match DistributedStateLayout::new(&mesh, &plan, Precision::Float64) { + Ok(_) => panic!("expected malformed mesh to be rejected"), + Err(err) => err, + }; + assert!(matches!( + err, + qdp_core::MahoutError::InvalidInput(msg) + if msg.contains("device handles") + )); +} diff --git a/qdp/qdp-core/tests/gpu_distributed_communicator.rs b/qdp/qdp-core/tests/gpu_distributed_communicator.rs new file mode 100644 index 0000000000..bfcc4d4cf1 --- /dev/null +++ b/qdp/qdp-core/tests/gpu_distributed_communicator.rs @@ -0,0 +1,19 @@ +use qdp_core::gpu::{Communicator, HostCommunicator}; + +#[test] +fn host_communicator_reduce_sum_returns_total() { + let comm = HostCommunicator; + let values = vec![1.0, 2.0, 3.0]; + 
assert_eq!(comm.reduce_sum_f64(&values).unwrap(), 6.0); +} + +#[test] +fn host_communicator_reduce_sum_rejects_empty_inputs() { + let comm = HostCommunicator; + let err = comm.reduce_sum_f64(&[]).unwrap_err(); + assert!(matches!( + err, + qdp_core::MahoutError::InvalidInput(msg) + if msg.contains("at least one value") + )); +} diff --git a/qdp/qdp-core/tests/gpu_distributed_engine.rs b/qdp/qdp-core/tests/gpu_distributed_engine.rs new file mode 100644 index 0000000000..5d26b96003 --- /dev/null +++ b/qdp/qdp-core/tests/gpu_distributed_engine.rs @@ -0,0 +1,219 @@ +use qdp_core::gpu::QuantumEncoder; +use qdp_core::{Precision, QdpEngine}; + +mod common; + +#[test] +fn prepare_distributed_amplitude_returns_expected_metadata() { + #[cfg(target_os = "linux")] + if cudarc::driver::CudaDevice::new(1).is_err() { + return; + } + + let prepared = QdpEngine::prepare_distributed_amplitude( + vec![0, 1], + &[1.0, 2.0, 3.0], + 2, + Precision::Float32, + None, + ) + .unwrap(); + + assert_eq!(prepared.mesh.num_devices(), 2); + assert_eq!(prepared.plan.uniform_shard_len, Some(2)); + assert_eq!(prepared.layout.num_shards(), 2); + assert_eq!(prepared.layout.global_len, 4); + + let expected = 1.0 / 14.0f64.sqrt(); + assert!((prepared.inv_norm - expected).abs() < 1e-12); +} + +#[test] +#[cfg(target_os = "linux")] +fn encode_distributed_amplitude_to_shards_returns_real_buffers() { + let Some(device0) = common::cuda_device() else { + return; + }; + let device1 = match cudarc::driver::CudaDevice::new(1) { + Ok(device) => device, + Err(_) => return, + }; + + let _ = device1; + + let state = QdpEngine::encode_distributed_amplitude_to_shards( + vec![0, 1], + &[1.0, 2.0, 3.0], + 2, + Precision::Float64, + None, + ) + .unwrap(); + + assert_eq!(state.num_shards(), 2); + + let shard0 = state.copy_shard_to_host_f64(0).unwrap(); + let shard1 = state.copy_shard_to_host_f64(1).unwrap(); + + let expected = 1.0 / 14.0f64.sqrt(); + let shard0 = shard0.iter().map(|value| value.x).collect::>(); + let shard1 
= shard1.iter().map(|value| value.x).collect::>(); + + assert!((shard0[0] - expected).abs() < 1e-12); + assert!((shard0[1] - (2.0 * expected)).abs() < 1e-12); + assert!((shard1[0] - (3.0 * expected)).abs() < 1e-12); + assert_eq!(shard1[1], 0.0); + + let _ = device0; +} + +#[test] +#[cfg(target_os = "linux")] +fn distributed_amplitude_matches_single_gpu_reference_on_two_gpus() { + let device0 = match cudarc::driver::CudaDevice::new(0) { + Ok(device) => device, + Err(_) => return, + }; + let device1 = match cudarc::driver::CudaDevice::new(1) { + Ok(device) => device, + Err(_) => return, + }; + + let input = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]; + let encoder = qdp_core::gpu::AmplitudeEncoder; + let single_state = encoder.encode(&device0, &input, 3).unwrap(); + let single_host = single_state + .copy_to_host_f64(&device0) + .unwrap() + .into_iter() + .map(|value| value.x) + .collect::>(); + + let distributed = QdpEngine::encode_distributed_amplitude_to_shards( + vec![0, 1], + &input, + 3, + Precision::Float64, + None, + ) + .unwrap(); + + let mut distributed_host = Vec::new(); + distributed_host.extend( + distributed + .copy_shard_to_host_f64(0) + .unwrap() + .into_iter() + .map(|value| value.x), + ); + distributed_host.extend( + distributed + .copy_shard_to_host_f64(1) + .unwrap() + .into_iter() + .map(|value| value.x), + ); + + assert_eq!(distributed_host.len(), single_host.len()); + for (idx, (distributed_value, single_value)) in + distributed_host.iter().zip(single_host.iter()).enumerate() + { + assert!( + (*distributed_value - *single_value).abs() < 1e-12, + "Mismatch at amplitude {}: distributed={} single={}", + idx, + distributed_value, + single_value + ); + } + + let _ = device1; +} + +#[test] +fn prepare_distributed_amplitude_rejects_oversized_inputs() { + let err = match QdpEngine::prepare_distributed_amplitude( + vec![0], + &[1.0, 2.0, 3.0], + 1, + Precision::Float64, + None, + ) { + Ok(_) => panic!("expected oversized input to be rejected"), + Err(err) => 
err, + }; + + assert!(matches!( + err, + qdp_core::MahoutError::InvalidInput(msg) + if msg.contains("exceeds state vector size") + )); +} + +#[test] +fn prepare_distributed_amplitude_validates_input_before_building_mesh() { + let err = match QdpEngine::prepare_distributed_amplitude( + vec![9999], + &[], + 1, + Precision::Float64, + None, + ) { + Ok(_) => panic!("expected empty input to be rejected before mesh creation"), + Err(err) => err, + }; + + assert!(matches!( + err, + qdp_core::MahoutError::InvalidInput(msg) + if msg.contains("cannot be empty") + )); +} + +#[test] +#[cfg(target_os = "linux")] +fn distributed_state_rejects_out_of_range_shard_reads() { + let Some(_device0) = common::cuda_device() else { + return; + }; + + let state = QdpEngine::encode_distributed_amplitude_to_shards( + vec![0], + &[1.0, 2.0], + 1, + Precision::Float64, + None, + ) + .unwrap(); + + let err = state.copy_shard_to_host_f64(1).unwrap_err(); + assert!(matches!( + err, + qdp_core::MahoutError::InvalidInput(msg) + if msg.contains("out of range") + )); +} + +#[test] +#[cfg(target_os = "linux")] +fn distributed_state_rejects_precision_mismatched_host_reads() { + let Some(_device0) = common::cuda_device() else { + return; + }; + + let state = QdpEngine::encode_distributed_amplitude_to_shards( + vec![0], + &[1.0, 2.0], + 1, + Precision::Float32, + None, + ) + .unwrap(); + + let err = state.copy_shard_to_host_f64(0).unwrap_err(); + assert!(matches!( + err, + qdp_core::MahoutError::InvalidInput(msg) + if msg.contains("float32 data, not float64") + )); +} diff --git a/qdp/qdp-core/tests/gpu_distributed_planner.rs b/qdp/qdp-core/tests/gpu_distributed_planner.rs new file mode 100644 index 0000000000..3449629ca6 --- /dev/null +++ b/qdp/qdp-core/tests/gpu_distributed_planner.rs @@ -0,0 +1,139 @@ +use qdp_core::{ + DeviceMesh, DistributionMode, GpuTopology, LinkKind, PlacementPlanner, PlacementRequest, + ShardPolicy, +}; + +#[test] +fn placement_planner_emits_contiguous_ranges() { + let topology 
= GpuTopology::placeholder(2); + let mesh = DeviceMesh { + device_ids: vec![3, 7], + devices: Vec::new(), + topology, + }; + let request = PlacementRequest::new(3, DistributionMode::ShardedCapacity, ShardPolicy::Equal); + let plan = PlacementPlanner::plan(&mesh, &request).unwrap(); + assert_eq!(plan.gather_device_id, Some(3)); + assert_eq!(plan.placements.len(), 2); + assert_eq!(plan.placements[0].device_id, 3); + assert_eq!( + (plan.placements[0].start_idx, plan.placements[0].end_idx), + (0, 4) + ); + assert_eq!(plan.placements[1].device_id, 7); + assert_eq!( + (plan.placements[1].start_idx, plan.placements[1].end_idx), + (4, 8) + ); +} + +#[test] +fn balanced_uneven_policy_supports_non_power_of_two_device_counts() { + let topology = GpuTopology::placeholder(3); + let mesh = DeviceMesh { + device_ids: vec![0, 1, 2], + devices: Vec::new(), + topology, + }; + let request = PlacementRequest::new( + 3, + DistributionMode::ShardedCapacity, + ShardPolicy::BalancedUneven, + ); + let plan = PlacementPlanner::plan(&mesh, &request).unwrap(); + assert_eq!(plan.placements.len(), 3); + assert_eq!( + (plan.placements[0].start_idx, plan.placements[0].end_idx), + (0, 3) + ); + assert_eq!( + (plan.placements[1].start_idx, plan.placements[1].end_idx), + (3, 6) + ); + assert_eq!( + (plan.placements[2].start_idx, plan.placements[2].end_idx), + (6, 8) + ); +} + +#[test] +fn equal_policy_rejects_non_power_of_two_device_counts() { + let topology = GpuTopology::placeholder(3); + let mesh = DeviceMesh { + device_ids: vec![0, 1, 2], + devices: Vec::new(), + topology, + }; + let request = PlacementRequest::new(3, DistributionMode::ShardedCapacity, ShardPolicy::Equal); + let err = PlacementPlanner::plan(&mesh, &request).unwrap_err(); + assert!(matches!( + err, + qdp_core::MahoutError::InvalidInput(msg) + if msg.contains("power-of-two") + )); +} + +#[test] +fn single_mode_uses_only_first_device() { + let topology = GpuTopology::placeholder(3); + let mesh = DeviceMesh { + device_ids: vec![4, 
8, 15], + devices: Vec::new(), + topology, + }; + let request = PlacementRequest::new(3, DistributionMode::Single, ShardPolicy::Equal); + let plan = PlacementPlanner::plan(&mesh, &request).unwrap(); + assert_eq!(plan.placements.len(), 1); + assert_eq!(plan.gather_device_id, Some(4)); + assert_eq!(plan.placements[0].device_id, 4); + assert_eq!((plan.placements[0].start_idx, plan.placements[0].end_idx), (0, 8)); +} + +#[test] +fn replicated_mode_is_not_implemented() { + let topology = GpuTopology::placeholder(2); + let mesh = DeviceMesh { + device_ids: vec![0, 1], + devices: Vec::new(), + topology, + }; + let request = PlacementRequest::new(2, DistributionMode::Replicated, ShardPolicy::Equal); + let err = PlacementPlanner::plan(&mesh, &request).unwrap_err(); + assert!(matches!(err, qdp_core::MahoutError::NotImplemented(_))); +} + +#[test] +fn sharded_capacity_prefers_topology_recommended_device_order() { + let topology = GpuTopology { + peer_access: vec![ + vec![true, true, false], + vec![true, true, true], + vec![false, true, true], + ], + links: vec![ + vec![LinkKind::SameDevice, LinkKind::Pix, LinkKind::Unknown], + vec![LinkKind::Pix, LinkKind::SameDevice, LinkKind::Node], + vec![LinkKind::Unknown, LinkKind::Node, LinkKind::SameDevice], + ], + }; + let mesh = DeviceMesh { + device_ids: vec![10, 11, 12], + devices: Vec::new(), + topology, + }; + let request = PlacementRequest::new( + 3, + DistributionMode::ShardedCapacity, + ShardPolicy::BalancedUneven, + ); + + let plan = PlacementPlanner::plan(&mesh, &request).unwrap(); + let ordered_ids = plan + .placements + .iter() + .map(|placement| placement.device_id) + .collect::>(); + + assert_eq!(plan.gather_device_id, Some(11)); + assert_eq!(ordered_ids, vec![11, 10, 12]); +} diff --git a/qdp/qdp-core/tests/gpu_distributed_runtime.rs b/qdp/qdp-core/tests/gpu_distributed_runtime.rs new file mode 100644 index 0000000000..ca2950c7ce --- /dev/null +++ b/qdp/qdp-core/tests/gpu_distributed_runtime.rs @@ -0,0 +1,144 @@ +use 
qdp_core::{DistributionMode, PlacementRequest, Precision, QdpEngine, ShardPolicy}; + +#[test] +fn prepare_distributed_amplitude_handles_padding_tail_in_norm() { + let prepared = QdpEngine::prepare_distributed_amplitude( + vec![0], + &[1.0, 2.0, 3.0], + 2, + Precision::Float64, + None, + ) + .unwrap(); + + let expected = 1.0 / 14.0f64.sqrt(); + assert!((prepared.inv_norm - expected).abs() < 1e-12); +} + +#[test] +fn prepare_distributed_amplitude_accepts_custom_request() { + #[cfg(target_os = "linux")] + if cudarc::driver::CudaDevice::new(1).is_err() { + return; + } + + let prepared = QdpEngine::prepare_distributed_amplitude( + vec![0, 1], + &[1.0, 2.0, 3.0], + 2, + Precision::Float32, + Some(PlacementRequest::new( + 2, + DistributionMode::ShardedCapacity, + ShardPolicy::Equal, + )), + ) + .unwrap(); + + assert_eq!(prepared.plan.num_qubits, 2); + assert_eq!(prepared.plan.placement.num_devices(), 2); +} + +#[test] +fn prepare_distributed_amplitude_rejects_request_qubit_mismatch() { + let err = match QdpEngine::prepare_distributed_amplitude( + vec![0], + &[1.0, 2.0], + 2, + Precision::Float64, + Some(PlacementRequest::new( + 3, + DistributionMode::ShardedCapacity, + ShardPolicy::Equal, + )), + ) { + Ok(_) => panic!("expected qubit mismatch to be rejected"), + Err(err) => err, + }; + + assert!(matches!( + err, + qdp_core::MahoutError::InvalidInput(msg) + if msg.contains("qubit mismatch") + )); +} + +#[test] +fn prepare_distributed_amplitude_rejects_empty_input() { + let err = match QdpEngine::prepare_distributed_amplitude( + vec![0], + &[], + 1, + Precision::Float64, + None, + ) { + Ok(_) => panic!("expected empty input to be rejected"), + Err(err) => err, + }; + + assert!(matches!( + err, + qdp_core::MahoutError::InvalidInput(msg) + if msg.contains("cannot be empty") + )); +} + +#[test] +fn prepare_distributed_amplitude_rejects_zero_norm_input() { + let err = match QdpEngine::prepare_distributed_amplitude( + vec![0], + &[0.0, 0.0], + 1, + Precision::Float64, + None, + 
) { + Ok(_) => panic!("expected zero-norm input to be rejected"), + Err(err) => err, + }; + + assert!(matches!( + err, + qdp_core::MahoutError::InvalidInput(msg) + if msg.contains("zero or non-finite norm") + )); +} + +#[test] +fn prepare_distributed_amplitude_rejects_non_finite_input() { + let err = match QdpEngine::prepare_distributed_amplitude( + vec![0], + &[1.0, f64::NAN], + 1, + Precision::Float64, + None, + ) { + Ok(_) => panic!("expected non-finite input to be rejected"), + Err(err) => err, + }; + + assert!(matches!( + err, + qdp_core::MahoutError::InvalidInput(msg) + if msg.contains("NaN or Inf") + )); +} + +#[test] +fn prepare_distributed_amplitude_validates_input_before_building_mesh() { + let err = match QdpEngine::prepare_distributed_amplitude( + vec![9999], + &[], + 1, + Precision::Float64, + None, + ) { + Ok(_) => panic!("expected invalid input to be rejected before mesh creation"), + Err(err) => err, + }; + + assert!(matches!( + err, + qdp_core::MahoutError::InvalidInput(msg) + if msg.contains("cannot be empty") + )); +} diff --git a/qdp/qdp-core/tests/gpu_topology.rs b/qdp/qdp-core/tests/gpu_topology.rs new file mode 100644 index 0000000000..8b0b2427f0 --- /dev/null +++ b/qdp/qdp-core/tests/gpu_topology.rs @@ -0,0 +1,79 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +use qdp_core::{DeviceMesh, GpuTopology, LinkKind}; + +#[test] +fn placeholder_topology_marks_self_edges() { + let topology = GpuTopology::placeholder(2); + assert!(topology.peer_access[0][0]); + assert!(topology.peer_access[1][1]); + assert_eq!(topology.links[0][0], LinkKind::SameDevice); + assert_eq!(topology.links[1][1], LinkKind::SameDevice); + assert!(!topology.peer_access[0][1]); + assert_eq!(topology.links[0][1], LinkKind::Unknown); +} + +#[test] +fn duplicate_device_ids_are_rejected() { + let err = DeviceMesh::new(vec![0, 0]).err().unwrap(); + assert!(matches!(err, qdp_core::MahoutError::InvalidInput(_))); +} + +#[test] +fn power_of_two_validation_rejects_three_devices() { + let topology = GpuTopology::placeholder(3); + let mesh = DeviceMesh { + device_ids: vec![0, 1, 2], + devices: Vec::new(), + topology, + }; + let err = mesh.validate_power_of_two().unwrap_err(); + assert!(matches!(err, qdp_core::MahoutError::InvalidInput(_))); +} + +#[test] +#[cfg(target_os = "linux")] +fn from_parts_rejects_device_handle_mismatch() { + let device = match cudarc::driver::CudaDevice::new(0) { + Ok(device) => device, + Err(_) => return, + }; + let topology = GpuTopology::placeholder(1); + let err = DeviceMesh::from_parts(vec![1], vec![device], topology) + .err() + .unwrap(); + assert!(matches!(err, qdp_core::MahoutError::InvalidInput(_))); +} + +#[test] +fn preferred_gather_index_prefers_best_connected_device() { + let topology = GpuTopology { + peer_access: vec![ + vec![true, true, false], + vec![true, true, true], + vec![false, true, true], + ], + links: vec![ + vec![LinkKind::SameDevice, LinkKind::Pix, LinkKind::Unknown], + vec![LinkKind::Pix, LinkKind::SameDevice, LinkKind::Node], + vec![LinkKind::Unknown, LinkKind::Node, LinkKind::SameDevice], + ], + }; + + assert_eq!(topology.preferred_gather_index(), Some(1)); + 
assert_eq!(topology.recommended_placement_order(), vec![1, 0, 2]); +} From aca3a0f5300362c9575f4ab255d99bc59fb4d761 Mon Sep 17 00:00:00 2001 From: viiccwen Date: Tue, 5 May 2026 05:22:17 +0000 Subject: [PATCH 4/5] feat(gpu): enable q34 mpi-shaped distributed amplitude probe --- .../examples/mpi_multigpu_q34_probe.rs | 132 ++++++++++++++++++ qdp/qdp-core/src/gpu/distributed/amplitude.rs | 45 +++++- qdp/qdp-core/src/lib.rs | 61 ++++++-- .../tests/gpu_distributed_amplitude_plan.rs | 44 ++++-- qdp/qdp-core/tests/gpu_distributed_engine.rs | 42 +++++- 5 files changed, 299 insertions(+), 25 deletions(-) create mode 100644 qdp/qdp-core/examples/mpi_multigpu_q34_probe.rs diff --git a/qdp/qdp-core/examples/mpi_multigpu_q34_probe.rs b/qdp/qdp-core/examples/mpi_multigpu_q34_probe.rs new file mode 100644 index 0000000000..4d3d8cdf36 --- /dev/null +++ b/qdp/qdp-core/examples/mpi_multigpu_q34_probe.rs @@ -0,0 +1,132 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + use std::time::Instant; + + use qdp_core::{DistributionMode, MahoutError}; + use qdp_core::{HostCommunicator, PlacementRequest, Precision, QdpEngine, ShardPolicy}; + + fn gib(bytes: usize) -> f64 { + bytes as f64 / (1024.0 * 1024.0 * 1024.0) + } + + fn parse_device_ids() -> Result<Vec<usize>, MahoutError> { + let raw = std::env::var("GPU_IDS").unwrap_or_else(|_| "0,1,2,3,4,5".to_string()); + let mut ids = Vec::new(); + for piece in raw.split(',') { + let trimmed = piece.trim(); + if trimmed.is_empty() { + continue; + } + ids.push(trimmed.parse::<usize>().map_err(|err| { + MahoutError::InvalidInput(format!("Invalid GPU ID '{trimmed}': {err}")) + })?); + } + + if ids.is_empty() { + return Err(MahoutError::InvalidInput( + "GPU_IDS must contain at least one CUDA device ID".to_string(), + )); + } + + Ok(ids) + } + + fn main() -> Result<(), MahoutError> { + let num_qubits = std::env::var("QUBITS") + .ok() + .and_then(|value| value.parse::<usize>().ok()) + .unwrap_or(34); + let host_len = std::env::var("HOST_LEN") + .ok() + .and_then(|value| value.parse::<usize>().ok()) + .unwrap_or(1); + let precision = match std::env::var("PRECISION").ok().as_deref() { + Some("f64") | Some("float64") => Precision::Float64, + _ => Precision::Float32, + }; + let shard_policy = match std::env::var("SHARD_POLICY").ok().as_deref() { + Some("equal") => ShardPolicy::Equal, + _ => ShardPolicy::BalancedUneven, + }; + let device_ids = parse_device_ids()?; + let request = + PlacementRequest::new(num_qubits, DistributionMode::ShardedCapacity, shard_policy); + let communicator = HostCommunicator; + let host_data = vec![1.0f64; host_len]; + + println!( + "Starting MPI-shaped distributed amplitude probe: qubits={}, host_len={}, gpus={:?}, precision={:?}, shard_policy={:?}, communicator=host-loopback", + num_qubits, host_len, device_ids, precision, shard_policy + ); + + let prepare_start = Instant::now(); + let prepared = QdpEngine::prepare_distributed_amplitude_with_communicator( + device_ids.clone(), + &host_data, + num_qubits, 
precision, + Some(request.clone()), + &communicator, + )?; + let prepare_elapsed = prepare_start.elapsed(); + + println!( + "Prepared in {:.3}s; global_len={}; shards={}; max_local_len={}; estimated_max_shard_gib={:.2}; gather_device={:?}", + prepare_elapsed.as_secs_f64(), + prepared.plan.global_len, + prepared.layout.num_shards(), + prepared.plan.max_local_len(), + gib(prepared.plan.estimated_max_shard_bytes(precision)?), + prepared.layout.recommended_gather_device_id() + ); + + for shard in &prepared.layout.shards { + let shard_bytes = match precision { + Precision::Float32 => shard.local_len * 8, + Precision::Float64 => shard.local_len * 16, + }; + println!( + " shard {} -> cuda:{} range=[{}, {}) local_len={} (~{:.2} GiB)", + shard.shard_id, + shard.device_id, + shard.start_idx, + shard.end_idx, + shard.local_len, + gib(shard_bytes) + ); + } + + let encode_start = Instant::now(); + let state = QdpEngine::encode_distributed_amplitude_to_shards_with_communicator( + device_ids, + &host_data, + num_qubits, + precision, + Some(request), + &communicator, + )?; + let encode_elapsed = encode_start.elapsed(); + + println!( + "Encoded in {:.3}s; state_shards={}; placement={:?}", + encode_elapsed.as_secs_f64(), + state.num_shards(), + state.recommended_placement_device_ids() + ); + + Ok(()) +} diff --git a/qdp/qdp-core/src/gpu/distributed/amplitude.rs b/qdp/qdp-core/src/gpu/distributed/amplitude.rs index f5f4f94bd9..fdc41635a4 100644 --- a/qdp/qdp-core/src/gpu/distributed/amplitude.rs +++ b/qdp/qdp-core/src/gpu/distributed/amplitude.rs @@ -19,7 +19,7 @@ use crate::gpu::distributed::{ DistributionMode, PlacementPlan, PlacementPlanner, PlacementRequest, ShardPlacement, ShardPolicy, }; -use crate::gpu::encodings::MAX_QUBITS; +use crate::gpu::memory::Precision; use crate::gpu::topology::DeviceMesh; /// Shared planning math for amplitude-sharded state construction. 
@@ -68,7 +68,7 @@ impl DistributedAmplitudePlan { let num_devices = mesh.num_devices(); let placement = PlacementPlanner::plan(mesh, &request)?; - Self::validate_local_shard_capacity(request.num_qubits, &placement)?; + Self::validate_local_shard_shape(request.num_qubits, &placement)?; let global_len = placement.global_len; let num_qubits = request.num_qubits; let (shard_bits, uniform_shard_len) = match request.shard_policy { @@ -107,8 +107,20 @@ impl DistributedAmplitudePlan { Ok((placement.start_idx, placement.end_idx)) } - fn validate_local_shard_capacity(num_qubits: usize, placement: &PlacementPlan) -> Result<()> { - let max_local_len = 1usize << MAX_QUBITS; + pub fn max_local_len(&self) -> usize { + self.placement + .placements + .iter() + .map(ShardPlacement::local_len) + .max() + .unwrap_or(0) + } + + pub fn estimated_max_shard_bytes(&self, precision: Precision) -> Result { + estimated_amplitude_bytes(self.max_local_len(), precision) + } + + fn validate_local_shard_shape(num_qubits: usize, placement: &PlacementPlan) -> Result<()> { let required_local_len = placement .placements .iter() @@ -120,13 +132,32 @@ impl DistributedAmplitudePlan { ) })?; - if required_local_len > max_local_len { + if required_local_len == 0 { return Err(MahoutError::InvalidInput(format!( - "Distributed amplitude request for {} qubits still exceeds per-device capacity: largest shard has {} amplitudes, limit is 2^{} = {}. 
Add more GPUs or reduce qubits.", - num_qubits, required_local_len, MAX_QUBITS, max_local_len + "Distributed amplitude request for {} qubits produced an empty local shard", + num_qubits ))); } + let _ = estimated_amplitude_bytes(required_local_len, Precision::Float32)?; + let _ = estimated_amplitude_bytes(required_local_len, Precision::Float64)?; + Ok(()) } } + +fn estimated_amplitude_bytes(local_len: usize, precision: Precision) -> Result<usize> { + let bytes_per_amplitude = match precision { + Precision::Float32 => 8usize, + Precision::Float64 => 16usize, + }; + + local_len + .checked_mul(bytes_per_amplitude) + .ok_or_else(|| { + MahoutError::InvalidInput(format!( + "Distributed amplitude shard byte estimate overflowed for local_len={} and precision={:?}", + local_len, precision + )) + }) +} diff --git a/qdp/qdp-core/src/lib.rs b/qdp/qdp-core/src/lib.rs index 7bbb963cfe..a52d9f4fb1 100644 --- a/qdp/qdp-core/src/lib.rs +++ b/qdp/qdp-core/src/lib.rs @@ -37,9 +37,10 @@ mod profiling; pub use error::{MahoutError, Result, cuda_error_to_string}; pub use gpu::memory::Precision; pub use gpu::{ - DeviceMesh, DistributedAmplitudePlan, DistributedStateLayout, DistributedStateVector, - DistributionMode, GpuTopology, LinkKind, PlacementPlan, PlacementPlanner, PlacementRequest, - PreparedDistributedAmplitudeEncode, ShardPlacement, ShardPolicy, + Communicator, DeviceMesh, DistributedAmplitudePlan, DistributedStateLayout, + DistributedStateVector, DistributionMode, GpuTopology, HostCommunicator, LinkKind, + PlacementPlan, PlacementPlanner, PlacementRequest, PreparedDistributedAmplitudeEncode, + ShardPlacement, ShardPolicy, }; pub use reader::{NullHandling, handle_float64_nulls}; @@ -57,7 +58,6 @@ use std::ffi::c_void; use std::sync::Arc; use crate::dlpack::DLManagedTensor; -use crate::gpu::HostCommunicator; use crate::gpu::distributed::runtime; use crate::gpu::get_encoder; use cudarc::driver::CudaDevice; @@ -165,13 +165,36 @@ impl QdpEngine { num_qubits: usize, precision: Precision,
request: Option, + ) -> Result { + let communicator = HostCommunicator; + Self::prepare_distributed_amplitude_with_communicator( + device_ids, + data, + num_qubits, + precision, + request, + &communicator, + ) + } + + /// Prepare a distributed amplitude encode with an injected collective + /// implementation. This keeps the execution path compatible with future + /// MPI-backed or NCCL-backed coordination without hard-coding a specific + /// transport into the engine surface. + #[doc(hidden)] + pub fn prepare_distributed_amplitude_with_communicator( + device_ids: Vec, + data: &[f64], + num_qubits: usize, + precision: Precision, + request: Option, + communicator: &dyn Communicator, ) -> Result { let request = Self::resolve_distributed_request(num_qubits, request)?; runtime::validate_distributed_input(data, &request)?; let mesh = Self::new_distributed_mesh(device_ids)?; - let communicator = HostCommunicator; let (plan, inv_norm, layout) = - runtime::prepare_distributed_encode(&mesh, data, precision, request, &communicator)?; + runtime::prepare_distributed_encode(&mesh, data, precision, request, communicator)?; Ok(PreparedDistributedAmplitudeEncode { mesh, @@ -194,12 +217,34 @@ impl QdpEngine { num_qubits: usize, precision: Precision, request: Option, + ) -> Result { + let communicator = HostCommunicator; + Self::encode_distributed_amplitude_to_shards_with_communicator( + device_ids, + data, + num_qubits, + precision, + request, + &communicator, + ) + } + + /// Materialize a distributed amplitude state with an injected collective + /// implementation. This is the execution-oriented companion to + /// `prepare_distributed_amplitude_with_communicator`. 
+ #[doc(hidden)] + pub fn encode_distributed_amplitude_to_shards_with_communicator( + device_ids: Vec, + data: &[f64], + num_qubits: usize, + precision: Precision, + request: Option, + communicator: &dyn Communicator, ) -> Result { let request = Self::resolve_distributed_request(num_qubits, request)?; runtime::validate_distributed_input(data, &request)?; let mesh = Self::new_distributed_mesh(device_ids)?; - let communicator = HostCommunicator; - runtime::encode_distributed_to_shards(&mesh, data, precision, request, &communicator) + runtime::encode_distributed_to_shards(&mesh, data, precision, request, communicator) } fn resolve_distributed_request( diff --git a/qdp/qdp-core/tests/gpu_distributed_amplitude_plan.rs b/qdp/qdp-core/tests/gpu_distributed_amplitude_plan.rs index 702f551a77..b59101c2d7 100644 --- a/qdp/qdp-core/tests/gpu_distributed_amplitude_plan.rs +++ b/qdp/qdp-core/tests/gpu_distributed_amplitude_plan.rs @@ -1,3 +1,19 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ use qdp_core::{ DeviceMesh, DistributedAmplitudePlan, DistributedStateLayout, DistributionMode, GpuTopology, PlacementRequest, Precision, ShardPolicy, @@ -48,19 +64,29 @@ fn distributed_amplitude_plan_allows_extra_global_qubit_when_shards_fit() { } #[test] -fn distributed_amplitude_plan_rejects_global_qubits_when_local_shard_exceeds_single_gpu_limit() { - let topology = GpuTopology::placeholder(1); +fn distributed_amplitude_plan_supports_q34_with_balanced_six_gpu_shards() { + let topology = GpuTopology::placeholder(6); let mesh = DeviceMesh { - device_ids: vec![0], + device_ids: vec![0, 1, 2, 3, 4, 5], devices: Vec::new(), topology, }; - let request = PlacementRequest::new(31, DistributionMode::ShardedCapacity, ShardPolicy::Equal); - let err = DistributedAmplitudePlan::for_request(&mesh, request).unwrap_err(); - assert!(matches!( - err, - qdp_core::MahoutError::InvalidInput(msg) if msg.contains("per-device capacity") - )); + let request = PlacementRequest::new( + 34, + DistributionMode::ShardedCapacity, + ShardPolicy::BalancedUneven, + ); + let plan = DistributedAmplitudePlan::for_request(&mesh, request).unwrap(); + + assert_eq!(plan.global_len, 1usize << 34); + assert_eq!(plan.num_devices, 6); + assert_eq!(plan.shard_bits, None); + assert_eq!(plan.uniform_shard_len, None); + assert_eq!(plan.shard_range(0).unwrap(), (0, 2_863_311_531)); + assert_eq!( + plan.shard_range(5).unwrap(), + (14_316_557_654, 17_179_869_184) + ); } #[test] diff --git a/qdp/qdp-core/tests/gpu_distributed_engine.rs b/qdp/qdp-core/tests/gpu_distributed_engine.rs index 5d26b96003..b584bf0055 100644 --- a/qdp/qdp-core/tests/gpu_distributed_engine.rs +++ b/qdp/qdp-core/tests/gpu_distributed_engine.rs @@ -1,5 +1,21 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. 
+// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + use qdp_core::gpu::QuantumEncoder; -use qdp_core::{Precision, QdpEngine}; +use qdp_core::{HostCommunicator, Precision, QdpEngine}; mod common; @@ -28,6 +44,30 @@ fn prepare_distributed_amplitude_returns_expected_metadata() { assert!((prepared.inv_norm - expected).abs() < 1e-12); } +#[test] +fn prepare_distributed_amplitude_with_explicit_communicator_returns_expected_metadata() { + #[cfg(target_os = "linux")] + if cudarc::driver::CudaDevice::new(1).is_err() { + return; + } + + let communicator = HostCommunicator; + let prepared = QdpEngine::prepare_distributed_amplitude_with_communicator( + vec![0, 1], + &[1.0, 2.0, 3.0], + 2, + Precision::Float32, + None, + &communicator, + ) + .unwrap(); + + assert_eq!(prepared.mesh.num_devices(), 2); + assert_eq!(prepared.plan.uniform_shard_len, Some(2)); + assert_eq!(prepared.layout.num_shards(), 2); + assert_eq!(prepared.layout.global_len, 4); +} + #[test] #[cfg(target_os = "linux")] fn encode_distributed_amplitude_to_shards_returns_real_buffers() { From baa6a907b8d137acd97cebe11106f942097dc1f1 Mon Sep 17 00:00:00 2001 From: viiccwen Date: Tue, 5 May 2026 05:59:48 +0000 Subject: [PATCH 5/5] refactor(gpu): harden distributed execution context --- ...e.rs => distributed_multigpu_q34_probe.rs} | 23 ++-- qdp/qdp-core/src/gpu/communicator.rs | 22 ++-- qdp/qdp-core/src/gpu/distributed/context.rs | 51 ++++++++ qdp/qdp-core/src/gpu/distributed/layout.rs | 7 
+- qdp/qdp-core/src/gpu/distributed/mod.rs | 2 + qdp/qdp-core/src/gpu/distributed/planner.rs | 5 +- qdp/qdp-core/src/gpu/distributed/runtime.rs | 90 +++++++++---- qdp/qdp-core/src/gpu/mod.rs | 8 +- qdp/qdp-core/src/gpu/topology.rs | 29 +++-- qdp/qdp-core/src/lib.rs | 83 ++++++------ .../tests/gpu_distributed_amplitude_plan.rs | 5 +- .../tests/gpu_distributed_communicator.rs | 32 +++-- qdp/qdp-core/tests/gpu_distributed_engine.rs | 118 ++++++++++++++++-- qdp/qdp-core/tests/gpu_distributed_planner.rs | 7 +- qdp/qdp-core/tests/gpu_distributed_runtime.rs | 15 +-- qdp/qdp-kernels/build.rs | 12 +- 16 files changed, 369 insertions(+), 140 deletions(-) rename qdp/qdp-core/examples/{mpi_multigpu_q34_probe.rs => distributed_multigpu_q34_probe.rs} (87%) create mode 100644 qdp/qdp-core/src/gpu/distributed/context.rs diff --git a/qdp/qdp-core/examples/mpi_multigpu_q34_probe.rs b/qdp/qdp-core/examples/distributed_multigpu_q34_probe.rs similarity index 87% rename from qdp/qdp-core/examples/mpi_multigpu_q34_probe.rs rename to qdp/qdp-core/examples/distributed_multigpu_q34_probe.rs index 4d3d8cdf36..9eaa226a1a 100644 --- a/qdp/qdp-core/examples/mpi_multigpu_q34_probe.rs +++ b/qdp/qdp-core/examples/distributed_multigpu_q34_probe.rs @@ -16,8 +16,11 @@ use std::time::Instant; -use qdp_core::{DistributionMode, MahoutError}; -use qdp_core::{HostCommunicator, PlacementRequest, Precision, QdpEngine, ShardPolicy}; +use qdp_core::gpu::LocalCollectiveCommunicator; +use qdp_core::{ + DistributedExecutionContext, DistributionMode, MahoutError, PlacementRequest, Precision, + QdpEngine, ShardPolicy, +}; fn gib(bytes: usize) -> f64 { bytes as f64 / (1024.0 * 1024.0 * 1024.0) @@ -65,22 +68,23 @@ fn main() -> Result<(), MahoutError> { let device_ids = parse_device_ids()?; let request = PlacementRequest::new(num_qubits, DistributionMode::ShardedCapacity, shard_policy); - let communicator = HostCommunicator; let host_data = vec![1.0f64; host_len]; println!( - "Starting MPI-shaped distributed 
amplitude probe: qubits={}, host_len={}, gpus={:?}, precision={:?}, shard_policy={:?}, communicator=host-loopback", + "Starting distributed amplitude probe: qubits={}, host_len={}, gpus={:?}, precision={:?}, shard_policy={:?}, collectives=in-process", num_qubits, host_len, device_ids, precision, shard_policy ); + let collectives = LocalCollectiveCommunicator; + let execution = DistributedExecutionContext::single_process(device_ids.clone(), &collectives)?; + let prepare_start = Instant::now(); - let prepared = QdpEngine::prepare_distributed_amplitude_with_communicator( - device_ids.clone(), + let prepared = QdpEngine::prepare_distributed_amplitude_on( + &execution, &host_data, num_qubits, precision, Some(request.clone()), - &communicator, )?; let prepare_elapsed = prepare_start.elapsed(); @@ -111,13 +115,12 @@ fn main() -> Result<(), MahoutError> { } let encode_start = Instant::now(); - let state = QdpEngine::encode_distributed_amplitude_to_shards_with_communicator( - device_ids, + let state = QdpEngine::encode_distributed_amplitude_to_shards_on( + &execution, &host_data, num_qubits, precision, Some(request), - &communicator, )?; let encode_elapsed = encode_start.elapsed(); diff --git a/qdp/qdp-core/src/gpu/communicator.rs b/qdp/qdp-core/src/gpu/communicator.rs index c7b9776ef8..591cc81d32 100644 --- a/qdp/qdp-core/src/gpu/communicator.rs +++ b/qdp/qdp-core/src/gpu/communicator.rs @@ -16,21 +16,25 @@ use crate::error::{MahoutError, Result}; -/// Abstracts cross-device coordination. PR1 provides a host-coordinated fallback; -/// a later PR can add NCCL-backed implementations behind the same trait. -pub trait Communicator: Send + Sync { - fn reduce_sum_f64(&self, values: &[f64]) -> Result; +/// Abstracts cross-shard collective operations. +/// +/// The current implementation executes collectives inside one process. 
A future +/// MPI-backed implementation can provide the same interface while mapping the +/// partial contributions to rank-local shards and performing a real all-reduce. +pub trait CollectiveCommunicator: Send + Sync { + fn all_reduce_sum_f64(&self, values: &[f64]) -> Result<f64>; } -/// Host-coordinated reduction placeholder used for early distributed amplitude prototypes. +/// In-process collective implementation for the current single-process +/// distributed path. #[derive(Default, Debug, Clone, Copy)] -pub struct HostCommunicator; +pub struct LocalCollectiveCommunicator; -impl Communicator for HostCommunicator { - fn reduce_sum_f64(&self, values: &[f64]) -> Result<f64> { +impl CollectiveCommunicator for LocalCollectiveCommunicator { + fn all_reduce_sum_f64(&self, values: &[f64]) -> Result<f64> { if values.is_empty() { return Err(MahoutError::InvalidInput( - "Host communicator requires at least one value for reduction".to_string(), + "Collective reduction requires at least one partial contribution".to_string(), )); } diff --git a/qdp/qdp-core/src/gpu/distributed/context.rs b/qdp/qdp-core/src/gpu/distributed/context.rs new file mode 100644 index 0000000000..f103db5bc2 --- /dev/null +++ b/qdp/qdp-core/src/gpu/distributed/context.rs @@ -0,0 +1,51 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements.  See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License.  You may obtain a copy of the License at +// +//     http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and +// limitations under the License. + +use crate::error::Result; +use crate::gpu::communicator::CollectiveCommunicator; +use crate::gpu::topology::DeviceMesh; + +/// Bundles the device mesh with the collective implementation that coordinates +/// those devices. +/// +/// The current branch uses one process with one mesh covering all participating +/// devices. A future MPI implementation can construct one context per rank with +/// a rank-local mesh and an MPI-backed collective implementation. +pub struct DistributedExecutionContext<'a> { + mesh: DeviceMesh, + collectives: &'a dyn CollectiveCommunicator, +} + +impl<'a> DistributedExecutionContext<'a> { + pub fn new(mesh: DeviceMesh, collectives: &'a dyn CollectiveCommunicator) -> Self { + Self { mesh, collectives } + } + + pub fn single_process( + device_ids: Vec<usize>, + collectives: &'a dyn CollectiveCommunicator, + ) -> Result<Self> { + Ok(Self::new(DeviceMesh::new(device_ids)?, collectives)) + } + + pub fn mesh(&self) -> &DeviceMesh { + &self.mesh + } + + pub(crate) fn collectives(&self) -> &dyn CollectiveCommunicator { + self.collectives + } +} diff --git a/qdp/qdp-core/src/gpu/distributed/layout.rs b/qdp/qdp-core/src/gpu/distributed/layout.rs index 143b1637c2..feb86258a0 100644 --- a/qdp/qdp-core/src/gpu/distributed/layout.rs +++ b/qdp/qdp-core/src/gpu/distributed/layout.rs @@ -36,7 +36,8 @@ pub struct StateShardLayout { pub local_len: usize, } -/// Metadata and placement plan for a future multi-GPU state vector. +/// Metadata describing how one distributed state is mapped onto one execution +/// context.
#[derive(Clone)] pub struct DistributedStateLayout { pub num_qubits: usize, @@ -98,10 +99,10 @@ impl DistributedStateLayout { } let mut shards = Vec::with_capacity(mesh.num_devices()); - for (placement, device) in plan.placement.placements.iter().zip(mesh.devices.iter()) { + for placement in &plan.placement.placements { let (start_idx, end_idx) = (placement.start_idx, placement.end_idx); shards.push(StateShardLayout { - device: Arc::clone(device), + device: mesh.device_for_id(placement.device_id)?, device_id: placement.device_id, shard_id: placement.shard_id, start_idx, diff --git a/qdp/qdp-core/src/gpu/distributed/mod.rs b/qdp/qdp-core/src/gpu/distributed/mod.rs index acb02918f8..47249b5b64 100644 --- a/qdp/qdp-core/src/gpu/distributed/mod.rs +++ b/qdp/qdp-core/src/gpu/distributed/mod.rs @@ -15,6 +15,7 @@ // limitations under the License. mod amplitude; +mod context; mod layout; mod planner; pub(crate) mod runtime; @@ -22,6 +23,7 @@ mod shared; mod state; pub use amplitude::{DistributedAmplitudePlan, PreparedDistributedAmplitudeEncode}; +pub use context::DistributedExecutionContext; pub use layout::{DistributedStateLayout, StateShardLayout}; pub use planner::{ DistributionMode, PlacementPlan, PlacementPlanner, PlacementRequest, ShardPlacement, diff --git a/qdp/qdp-core/src/gpu/distributed/planner.rs b/qdp/qdp-core/src/gpu/distributed/planner.rs index 747edace98..060d001a80 100644 --- a/qdp/qdp-core/src/gpu/distributed/planner.rs +++ b/qdp/qdp-core/src/gpu/distributed/planner.rs @@ -17,7 +17,7 @@ use crate::error::{MahoutError, Result}; use crate::gpu::topology::DeviceMesh; -/// Runtime distribution modes for future multi-GPU state construction. +/// Runtime distribution modes for distributed state construction. #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub enum DistributionMode { Single, @@ -78,7 +78,8 @@ impl ShardPlacement { } } -/// Planner output consumed by distributed encoders and future gather/export APIs. 
+/// Planner output consumed by distributed encoders and by any later +/// gather/export layer that needs stable shard ranges. #[derive(Clone, Debug, PartialEq, Eq)] pub struct PlacementPlan { pub mode: DistributionMode, diff --git a/qdp/qdp-core/src/gpu/distributed/runtime.rs b/qdp/qdp-core/src/gpu/distributed/runtime.rs index 2d7f42370c..f7b0e7df9b 100644 --- a/qdp/qdp-core/src/gpu/distributed/runtime.rs +++ b/qdp/qdp-core/src/gpu/distributed/runtime.rs @@ -19,11 +19,12 @@ use std::sync::Arc; #[cfg(target_os = "linux")] use crate::error::cuda_error_to_string; use crate::error::{MahoutError, Result}; -use crate::gpu::communicator::Communicator; +#[cfg(target_os = "linux")] +use crate::gpu::cuda_ffi::{CUDA_SUCCESS, cudaGetDevice, cudaSetDevice}; +use crate::gpu::distributed::DistributedExecutionContext; #[cfg(target_os = "linux")] use crate::gpu::memory::{BufferStorage, GpuBufferRaw}; use crate::gpu::memory::{Precision, ensure_device_memory_available, map_allocation_error}; -use crate::gpu::topology::DeviceMesh; use crate::gpu::{ DistributedAmplitudePlan, DistributedStateLayout, DistributedStateVector, PlacementRequest, }; @@ -36,6 +37,45 @@ use qdp_kernels::{ #[cfg(target_os = "linux")] use std::ffi::c_void; +#[cfg(target_os = "linux")] +struct DistributedDeviceContextGuard { + original_device: i32, +} + +#[cfg(target_os = "linux")] +impl DistributedDeviceContextGuard { + fn switch_to(device_id: usize) -> Result<Self> { + let mut original_device = 0i32; + let get_ret = unsafe { cudaGetDevice(&mut original_device as *mut i32) }; + if get_ret != CUDA_SUCCESS { + return Err(MahoutError::Cuda(format!( + "cudaGetDevice failed before distributed shard launch: {} ({})", + get_ret, + cuda_error_to_string(get_ret) + ))); + } + + let set_ret = unsafe { cudaSetDevice(device_id as i32) }; + if set_ret != CUDA_SUCCESS { + return Err(MahoutError::Cuda(format!( + "cudaSetDevice(cuda:{}) failed before distributed shard launch: {} ({})", + device_id, + set_ret, +
cuda_error_to_string(set_ret) + ))); + } + + Ok(Self { original_device }) + } +} + +#[cfg(target_os = "linux")] +impl Drop for DistributedDeviceContextGuard { + fn drop(&mut self) { + let _ = unsafe { cudaSetDevice(self.original_device) }; + } +} + pub(crate) fn validate_distributed_input( host_data: &[f64], request: &PlacementRequest, @@ -65,12 +105,12 @@ pub(crate) fn validate_distributed_input( } pub(crate) fn plan_distributed_encode( - mesh: &DeviceMesh, + execution: &DistributedExecutionContext<'_>, host_data: &[f64], request: PlacementRequest, ) -> Result { validate_distributed_input(host_data, &request)?; - DistributedAmplitudePlan::for_request(mesh, request) + DistributedAmplitudePlan::for_request(execution.mesh(), request) } pub(crate) fn calculate_local_norm_sq( @@ -105,7 +145,7 @@ pub(crate) fn calculate_local_norm_sq( pub(crate) fn calculate_inv_norm_distributed( plan: &DistributedAmplitudePlan, host_data: &[f64], - communicator: &dyn Communicator, + execution: &DistributedExecutionContext<'_>, ) -> Result { let mut partials = Vec::with_capacity(plan.num_devices); for shard_id in 0..plan.num_devices { @@ -113,7 +153,7 @@ pub(crate) fn calculate_inv_norm_distributed( partials.push(calculate_local_norm_sq(host_data, start_idx, end_idx)?); } - let global_norm_sq = communicator.reduce_sum_f64(&partials)?; + let global_norm_sq = execution.collectives().all_reduce_sum_f64(&partials)?; if global_norm_sq <= 0.0 || !global_norm_sq.is_finite() { return Err(MahoutError::InvalidInput( "Input data has zero or non-finite norm (contains NaN, Inf, or all zeros)".to_string(), @@ -124,33 +164,33 @@ pub(crate) fn calculate_inv_norm_distributed( } pub(crate) fn prepare_distributed_encode( - mesh: &DeviceMesh, + execution: &DistributedExecutionContext<'_>, host_data: &[f64], precision: Precision, request: PlacementRequest, - communicator: &dyn Communicator, ) -> Result<(DistributedAmplitudePlan, f64, DistributedStateLayout)> { - let plan = plan_distributed_encode(mesh, 
host_data, request)?; - let inv_norm = calculate_inv_norm_distributed(&plan, host_data, communicator)?; - let layout = DistributedStateLayout::new(mesh, &plan, precision)?; + let plan = plan_distributed_encode(execution, host_data, request)?; + let inv_norm = calculate_inv_norm_distributed(&plan, host_data, execution)?; + let layout = DistributedStateLayout::new(execution.mesh(), &plan, precision)?; Ok((plan, inv_norm, layout)) } #[cfg(target_os = "linux")] pub(crate) fn encode_distributed_to_shards( - mesh: &DeviceMesh, + execution: &DistributedExecutionContext<'_>, host_data: &[f64], precision: Precision, request: PlacementRequest, - communicator: &dyn Communicator, ) -> Result { let (plan, inv_norm, layout) = - prepare_distributed_encode(mesh, host_data, precision, request, communicator)?; + prepare_distributed_encode(execution, host_data, precision, request)?; let num_qubits = plan.request.num_qubits; let mut buffers = Vec::with_capacity(plan.num_devices); - for (shard_id, device) in mesh.devices.iter().enumerate() { - let (start_idx, end_idx) = plan.shard_range(shard_id)?; + for placement in &plan.placement.placements { + let device = execution.mesh().device_for_id(placement.device_id)?; + let _device_guard = DistributedDeviceContextGuard::switch_to(placement.device_id)?; + let (start_idx, end_idx) = (placement.start_idx, placement.end_idx); let local_len = end_idx - start_idx; let slice_end = end_idx.min(host_data.len()); let present_len = slice_end.saturating_sub(start_idx); @@ -162,14 +202,14 @@ pub(crate) fn encode_distributed_to_shards( requested_bytes, "distributed amplitude shard allocation (f32)", Some(num_qubits), - Some(device.ordinal()), + Some(placement.device_id), )?; let mut state_slice = device.alloc_zeros::(local_len).map_err(|e| { map_allocation_error( requested_bytes, "distributed amplitude shard allocation (f32)", Some(num_qubits), - Some(device.ordinal()), + Some(placement.device_id), e, ) })?; @@ -200,7 +240,7 @@ pub(crate) fn 
encode_distributed_to_shards( if ret != 0 { return Err(MahoutError::KernelLaunch(format!( "Distributed amplitude shard kernel failed on cuda:{} with CUDA error code: {} ({})", - device.ordinal(), + placement.device_id, ret, cuda_error_to_string(ret) ))); @@ -209,8 +249,7 @@ pub(crate) fn encode_distributed_to_shards( device.synchronize().map_err(|e| { MahoutError::Cuda(format!( "Distributed amplitude shard synchronize failed on cuda:{}: {:?}", - device.ordinal(), - e + placement.device_id, e )) })?; } @@ -223,7 +262,7 @@ pub(crate) fn encode_distributed_to_shards( requested_bytes, "distributed amplitude shard allocation", Some(num_qubits), - Some(device.ordinal()), + Some(placement.device_id), )?; let mut state_slice = device @@ -233,7 +272,7 @@ pub(crate) fn encode_distributed_to_shards( requested_bytes, "distributed amplitude shard allocation", Some(num_qubits), - Some(device.ordinal()), + Some(placement.device_id), e, ) })?; @@ -262,7 +301,7 @@ pub(crate) fn encode_distributed_to_shards( if ret != 0 { return Err(MahoutError::KernelLaunch(format!( "Distributed amplitude shard kernel failed on cuda:{} with CUDA error code: {} ({})", - device.ordinal(), + placement.device_id, ret, cuda_error_to_string(ret) ))); @@ -271,8 +310,7 @@ pub(crate) fn encode_distributed_to_shards( device.synchronize().map_err(|e| { MahoutError::Cuda(format!( "Distributed amplitude shard synchronize failed on cuda:{}: {:?}", - device.ordinal(), - e + placement.device_id, e )) })?; } diff --git a/qdp/qdp-core/src/gpu/mod.rs b/qdp/qdp-core/src/gpu/mod.rs index eee6d2014a..ec2dff3123 100644 --- a/qdp/qdp-core/src/gpu/mod.rs +++ b/qdp/qdp-core/src/gpu/mod.rs @@ -34,11 +34,11 @@ pub(crate) mod cuda_ffi; #[cfg(target_os = "linux")] pub use buffer_pool::{PinnedBufferHandle, PinnedBufferPool}; -pub use communicator::{Communicator, HostCommunicator}; +pub use communicator::{CollectiveCommunicator, LocalCollectiveCommunicator}; pub use distributed::{ - DistributedAmplitudePlan, 
DistributedStateLayout, DistributedStateVector, DistributionMode, - PlacementPlan, PlacementPlanner, PlacementRequest, PreparedDistributedAmplitudeEncode, - ShardPlacement, ShardPolicy, StateShard, StateShardLayout, + DistributedAmplitudePlan, DistributedExecutionContext, DistributedStateLayout, + DistributedStateVector, DistributionMode, PlacementPlan, PlacementPlanner, PlacementRequest, + PreparedDistributedAmplitudeEncode, ShardPlacement, ShardPolicy, StateShard, StateShardLayout, }; pub use encodings::{AmplitudeEncoder, AngleEncoder, BasisEncoder, QuantumEncoder, get_encoder}; pub use memory::GpuStateVector; diff --git a/qdp/qdp-core/src/gpu/topology.rs b/qdp/qdp-core/src/gpu/topology.rs index 170b1200ab..f956c7bd9b 100644 --- a/qdp/qdp-core/src/gpu/topology.rs +++ b/qdp/qdp-core/src/gpu/topology.rs @@ -25,7 +25,7 @@ use crate::{ gpu::cuda_ffi::{CUDA_SUCCESS, cudaDeviceCanAccessPeer}, }; -/// Coarse-grained GPU interconnect classification used for future placement policies. +/// Coarse-grained GPU interconnect classification used by placement policies. #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub enum LinkKind { SameDevice, @@ -152,7 +152,7 @@ impl GpuTopology { } } -/// A validated collection of CUDA devices that will eventually back distributed state vectors. +/// A validated collection of CUDA devices that back one distributed execution context. #[derive(Clone)] pub struct DeviceMesh { pub device_ids: Vec, @@ -161,11 +161,11 @@ pub struct DeviceMesh { } impl DeviceMesh { - /// Build a mesh from CUDA device IDs with a placeholder topology. + /// Build a mesh from CUDA device IDs. /// - /// This intentionally keeps PR1 lightweight: it validates shape and initializes - /// CUDA devices, while a later PR can replace placeholder topology discovery with - /// explicit peer-access probing and richer link metadata. 
+ /// On Linux this probes peer-access reachability so placement policies can + /// prefer the most connected devices without changing the caller-provided + /// device set. pub fn new(device_ids: Vec) -> Result { Self::validate_device_ids(&device_ids)?; @@ -192,7 +192,8 @@ impl DeviceMesh { }) } - /// Build a mesh from explicit parts. Intended for tests and future topology injection. + /// Build a mesh from explicit parts. Intended for tests and injected + /// topology metadata. pub fn from_parts( device_ids: Vec, devices: Vec>, @@ -272,6 +273,20 @@ impl DeviceMesh { .collect() } + pub fn device_for_id(&self, device_id: usize) -> Result> { + let index = self + .device_ids + .iter() + .position(|&candidate| candidate == device_id) + .ok_or_else(|| { + MahoutError::InvalidInput(format!( + "Device mesh does not contain cuda:{}", + device_id + )) + })?; + Ok(Arc::clone(&self.devices[index])) + } + fn validate_device_ids(device_ids: &[usize]) -> Result<()> { if device_ids.is_empty() { return Err(MahoutError::InvalidInput( diff --git a/qdp/qdp-core/src/lib.rs b/qdp/qdp-core/src/lib.rs index a52d9f4fb1..fb0ff4af96 100644 --- a/qdp/qdp-core/src/lib.rs +++ b/qdp/qdp-core/src/lib.rs @@ -37,10 +37,9 @@ mod profiling; pub use error::{MahoutError, Result, cuda_error_to_string}; pub use gpu::memory::Precision; pub use gpu::{ - Communicator, DeviceMesh, DistributedAmplitudePlan, DistributedStateLayout, - DistributedStateVector, DistributionMode, GpuTopology, HostCommunicator, LinkKind, - PlacementPlan, PlacementPlanner, PlacementRequest, PreparedDistributedAmplitudeEncode, - ShardPlacement, ShardPolicy, + DeviceMesh, DistributedAmplitudePlan, DistributedExecutionContext, DistributedStateVector, + DistributionMode, GpuTopology, LinkKind, PlacementRequest, PreparedDistributedAmplitudeEncode, + ShardPolicy, }; pub use reader::{NullHandling, handle_float64_nulls}; @@ -145,19 +144,15 @@ impl QdpEngine { }) } - /// Build a validated multi-GPU mesh for future distributed encoding paths. 
- /// - /// PR1 exposes mesh construction without changing the single-device engine shape. + /// Build a validated device mesh for one distributed execution context. pub fn new_distributed_mesh(device_ids: Vec) -> Result { DeviceMesh::new(device_ids) } - /// Prepare a distributed amplitude encode on a validated multi-GPU mesh. + /// Prepare a distributed amplitude encode on a single-process device mesh. /// - /// This stage performs input validation, shard planning, and host-coordinated - /// global norm reduction, then returns metadata for the future distributed - /// state layout. It does not yet allocate per-shard buffers or launch - /// multi-GPU kernels. + /// This is a convenience wrapper over `prepare_distributed_amplitude_on` + /// using an in-process collective implementation. #[doc(hidden)] pub fn prepare_distributed_amplitude( device_ids: Vec, @@ -166,50 +161,49 @@ impl QdpEngine { precision: Precision, request: Option, ) -> Result { - let communicator = HostCommunicator; - Self::prepare_distributed_amplitude_with_communicator( - device_ids, + let request = Self::resolve_distributed_request(num_qubits, request)?; + runtime::validate_distributed_input(data, &request)?; + let collectives = crate::gpu::LocalCollectiveCommunicator; + let execution = DistributedExecutionContext::single_process(device_ids, &collectives)?; + Self::prepare_distributed_amplitude_on( + &execution, data, num_qubits, precision, - request, - &communicator, + Some(request), ) } - /// Prepare a distributed amplitude encode with an injected collective - /// implementation. This keeps the execution path compatible with future - /// MPI-backed or NCCL-backed coordination without hard-coding a specific - /// transport into the engine surface. + /// Prepare a distributed amplitude encode on an explicit execution context. + /// + /// The execution context owns the participating device mesh and the + /// collective implementation that coordinates those shards. 
#[doc(hidden)] - pub fn prepare_distributed_amplitude_with_communicator( - device_ids: Vec, + pub fn prepare_distributed_amplitude_on( + execution: &DistributedExecutionContext<'_>, data: &[f64], num_qubits: usize, precision: Precision, request: Option, - communicator: &dyn Communicator, ) -> Result { let request = Self::resolve_distributed_request(num_qubits, request)?; runtime::validate_distributed_input(data, &request)?; - let mesh = Self::new_distributed_mesh(device_ids)?; let (plan, inv_norm, layout) = - runtime::prepare_distributed_encode(&mesh, data, precision, request, communicator)?; + runtime::prepare_distributed_encode(execution, data, precision, request)?; Ok(PreparedDistributedAmplitudeEncode { - mesh, + mesh: execution.mesh().clone(), plan, inv_norm, layout, }) } - /// Encode amplitude data into real per-device shard buffers for a distributed - /// state vector on a single host. + /// Encode amplitude data into real per-device shard buffers on a + /// single-process device mesh. /// - /// This is the first executable distributed path: it returns a sharded state - /// object with one GPU buffer per shard. It does not expose DLPack or gather - /// semantics yet. + /// This is a convenience wrapper over `encode_distributed_amplitude_to_shards_on` + /// using an in-process collective implementation. 
#[doc(hidden)] pub fn encode_distributed_amplitude_to_shards( device_ids: Vec, @@ -218,33 +212,32 @@ impl QdpEngine { precision: Precision, request: Option, ) -> Result { - let communicator = HostCommunicator; - Self::encode_distributed_amplitude_to_shards_with_communicator( - device_ids, + let request = Self::resolve_distributed_request(num_qubits, request)?; + runtime::validate_distributed_input(data, &request)?; + let collectives = crate::gpu::LocalCollectiveCommunicator; + let execution = DistributedExecutionContext::single_process(device_ids, &collectives)?; + Self::encode_distributed_amplitude_to_shards_on( + &execution, data, num_qubits, precision, - request, - &communicator, + Some(request), ) } - /// Materialize a distributed amplitude state with an injected collective - /// implementation. This is the execution-oriented companion to - /// `prepare_distributed_amplitude_with_communicator`. + /// Materialize a distributed amplitude state on an explicit execution + /// context. #[doc(hidden)] - pub fn encode_distributed_amplitude_to_shards_with_communicator( - device_ids: Vec, + pub fn encode_distributed_amplitude_to_shards_on( + execution: &DistributedExecutionContext<'_>, data: &[f64], num_qubits: usize, precision: Precision, request: Option, - communicator: &dyn Communicator, ) -> Result { let request = Self::resolve_distributed_request(num_qubits, request)?; runtime::validate_distributed_input(data, &request)?; - let mesh = Self::new_distributed_mesh(device_ids)?; - runtime::encode_distributed_to_shards(&mesh, data, precision, request, communicator) + runtime::encode_distributed_to_shards(execution, data, precision, request) } fn resolve_distributed_request( diff --git a/qdp/qdp-core/tests/gpu_distributed_amplitude_plan.rs b/qdp/qdp-core/tests/gpu_distributed_amplitude_plan.rs index b59101c2d7..ba14064330 100644 --- a/qdp/qdp-core/tests/gpu_distributed_amplitude_plan.rs +++ b/qdp/qdp-core/tests/gpu_distributed_amplitude_plan.rs @@ -14,9 +14,10 @@ // See 
the License for the specific language governing permissions and // limitations under the License. -use qdp_core::{ +use qdp_core::Precision; +use qdp_core::gpu::{ DeviceMesh, DistributedAmplitudePlan, DistributedStateLayout, DistributionMode, GpuTopology, - PlacementRequest, Precision, ShardPolicy, + PlacementRequest, ShardPolicy, }; #[test] diff --git a/qdp/qdp-core/tests/gpu_distributed_communicator.rs b/qdp/qdp-core/tests/gpu_distributed_communicator.rs index bfcc4d4cf1..ffd1904671 100644 --- a/qdp/qdp-core/tests/gpu_distributed_communicator.rs +++ b/qdp/qdp-core/tests/gpu_distributed_communicator.rs @@ -1,19 +1,35 @@ -use qdp_core::gpu::{Communicator, HostCommunicator}; +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use qdp_core::gpu::{CollectiveCommunicator, LocalCollectiveCommunicator}; #[test] -fn host_communicator_reduce_sum_returns_total() { - let comm = HostCommunicator; +fn local_collective_reduce_sum_returns_total() { + let comm = LocalCollectiveCommunicator; let values = vec![1.0, 2.0, 3.0]; - assert_eq!(comm.reduce_sum_f64(&values).unwrap(), 6.0); + assert_eq!(comm.all_reduce_sum_f64(&values).unwrap(), 6.0); } #[test] -fn host_communicator_reduce_sum_rejects_empty_inputs() { - let comm = HostCommunicator; - let err = comm.reduce_sum_f64(&[]).unwrap_err(); +fn local_collective_reduce_sum_rejects_empty_inputs() { + let comm = LocalCollectiveCommunicator; + let err = comm.all_reduce_sum_f64(&[]).unwrap_err(); assert!(matches!( err, qdp_core::MahoutError::InvalidInput(msg) - if msg.contains("at least one value") + if msg.contains("at least one partial contribution") )); } diff --git a/qdp/qdp-core/tests/gpu_distributed_engine.rs b/qdp/qdp-core/tests/gpu_distributed_engine.rs index b584bf0055..3caebdd3c7 100644 --- a/qdp/qdp-core/tests/gpu_distributed_engine.rs +++ b/qdp/qdp-core/tests/gpu_distributed_engine.rs @@ -14,8 +14,12 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use qdp_core::gpu::QuantumEncoder; -use qdp_core::{HostCommunicator, Precision, QdpEngine}; +use qdp_core::gpu::{ + DeviceMesh, DistributedAmplitudePlan, DistributedExecutionContext, DistributedStateLayout, + DistributionMode, GpuTopology, LinkKind, LocalCollectiveCommunicator, PlacementRequest, + QuantumEncoder, ShardPolicy, +}; +use qdp_core::{Precision, QdpEngine}; mod common; @@ -45,20 +49,20 @@ fn prepare_distributed_amplitude_returns_expected_metadata() { } #[test] -fn prepare_distributed_amplitude_with_explicit_communicator_returns_expected_metadata() { +fn prepare_distributed_amplitude_on_execution_context_returns_expected_metadata() { #[cfg(target_os = "linux")] if cudarc::driver::CudaDevice::new(1).is_err() { return; } - let communicator = HostCommunicator; - let prepared = QdpEngine::prepare_distributed_amplitude_with_communicator( - vec![0, 1], + let collectives = LocalCollectiveCommunicator; + let execution = DistributedExecutionContext::single_process(vec![0, 1], &collectives).unwrap(); + let prepared = QdpEngine::prepare_distributed_amplitude_on( + &execution, &[1.0, 2.0, 3.0], 2, Precision::Float32, None, - &communicator, ) .unwrap(); @@ -68,6 +72,106 @@ fn prepare_distributed_amplitude_with_explicit_communicator_returns_expected_met assert_eq!(prepared.layout.global_len, 4); } +#[cfg(target_os = "linux")] +fn reordered_three_device_mesh() -> Option { + let device0 = cudarc::driver::CudaDevice::new(0).ok()?; + let device1 = cudarc::driver::CudaDevice::new(1).ok()?; + let device2 = cudarc::driver::CudaDevice::new(2).ok()?; + let topology = GpuTopology { + peer_access: vec![ + vec![true, true, false], + vec![true, true, true], + vec![false, true, true], + ], + links: vec![ + vec![LinkKind::SameDevice, LinkKind::Pix, LinkKind::Unknown], + vec![LinkKind::Pix, LinkKind::SameDevice, LinkKind::Node], + vec![LinkKind::Unknown, LinkKind::Node, LinkKind::SameDevice], + ], + }; + + DeviceMesh::from_parts(vec![0, 1, 2], vec![device0, device1, device2], 
topology).ok() +} + +#[test] +#[cfg(target_os = "linux")] +fn distributed_layout_uses_device_handles_for_reordered_placements() { + let Some(mesh) = reordered_three_device_mesh() else { + return; + }; + + let request = PlacementRequest::new( + 2, + DistributionMode::ShardedCapacity, + ShardPolicy::BalancedUneven, + ); + let plan = DistributedAmplitudePlan::for_request(&mesh, request).unwrap(); + assert_eq!(plan.placement.placements[0].device_id, 1); + + let layout = DistributedStateLayout::new(&mesh, &plan, Precision::Float32).unwrap(); + assert_eq!(layout.shards[0].device_id, 1); + assert_eq!(layout.shards[0].device.ordinal(), 1); + assert_eq!(layout.shards[1].device_id, 0); + assert_eq!(layout.shards[1].device.ordinal(), 0); + assert_eq!(layout.shards[2].device_id, 2); + assert_eq!(layout.shards[2].device.ordinal(), 2); +} + +#[test] +#[cfg(target_os = "linux")] +fn distributed_encoding_uses_device_handles_for_reordered_placements() { + let Some(mesh) = reordered_three_device_mesh() else { + return; + }; + let device0 = mesh.device_for_id(0).unwrap(); + + let collectives = LocalCollectiveCommunicator; + let execution = DistributedExecutionContext::new(mesh, &collectives); + let input = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]; + let single_state = qdp_core::gpu::AmplitudeEncoder + .encode(&device0, &input, 3) + .unwrap(); + let single_host = single_state + .copy_to_host_f64(&device0) + .unwrap() + .into_iter() + .map(|value| value.x) + .collect::>(); + + let state = QdpEngine::encode_distributed_amplitude_to_shards_on( + &execution, + &input, + 3, + Precision::Float64, + Some(PlacementRequest::new( + 3, + DistributionMode::ShardedCapacity, + ShardPolicy::BalancedUneven, + )), + ) + .unwrap(); + + assert_eq!(state.shards[0].device_id, 1); + assert_eq!(state.shards[0].device.ordinal(), 1); + assert_eq!(state.shards[1].device_id, 0); + assert_eq!(state.shards[1].device.ordinal(), 0); + assert_eq!(state.shards[2].device_id, 2); + 
assert_eq!(state.shards[2].device.ordinal(), 2); + + let mut distributed_host = Vec::new(); + for shard_id in 0..state.num_shards() { + distributed_host.extend( + state + .copy_shard_to_host_f64(shard_id) + .unwrap() + .into_iter() + .map(|value| value.x), + ); + } + + assert_eq!(distributed_host, single_host); +} + #[test] #[cfg(target_os = "linux")] fn encode_distributed_amplitude_to_shards_returns_real_buffers() { diff --git a/qdp/qdp-core/tests/gpu_distributed_planner.rs b/qdp/qdp-core/tests/gpu_distributed_planner.rs index 3449629ca6..1ac04ef632 100644 --- a/qdp/qdp-core/tests/gpu_distributed_planner.rs +++ b/qdp/qdp-core/tests/gpu_distributed_planner.rs @@ -1,4 +1,4 @@ -use qdp_core::{ +use qdp_core::gpu::{ DeviceMesh, DistributionMode, GpuTopology, LinkKind, PlacementPlanner, PlacementRequest, ShardPolicy, }; @@ -86,7 +86,10 @@ fn single_mode_uses_only_first_device() { assert_eq!(plan.placements.len(), 1); assert_eq!(plan.gather_device_id, Some(4)); assert_eq!(plan.placements[0].device_id, 4); - assert_eq!((plan.placements[0].start_idx, plan.placements[0].end_idx), (0, 8)); + assert_eq!( + (plan.placements[0].start_idx, plan.placements[0].end_idx), + (0, 8) + ); } #[test] diff --git a/qdp/qdp-core/tests/gpu_distributed_runtime.rs b/qdp/qdp-core/tests/gpu_distributed_runtime.rs index ca2950c7ce..fcb9609cd4 100644 --- a/qdp/qdp-core/tests/gpu_distributed_runtime.rs +++ b/qdp/qdp-core/tests/gpu_distributed_runtime.rs @@ -65,16 +65,11 @@ fn prepare_distributed_amplitude_rejects_request_qubit_mismatch() { #[test] fn prepare_distributed_amplitude_rejects_empty_input() { - let err = match QdpEngine::prepare_distributed_amplitude( - vec![0], - &[], - 1, - Precision::Float64, - None, - ) { - Ok(_) => panic!("expected empty input to be rejected"), - Err(err) => err, - }; + let err = + match QdpEngine::prepare_distributed_amplitude(vec![0], &[], 1, Precision::Float64, None) { + Ok(_) => panic!("expected empty input to be rejected"), + Err(err) => err, + }; 
assert!(matches!( err, diff --git a/qdp/qdp-kernels/build.rs b/qdp/qdp-kernels/build.rs index 097b57e15f..f12fdf9876 100644 --- a/qdp/qdp-kernels/build.rs +++ b/qdp/qdp-kernels/build.rs @@ -86,18 +86,20 @@ fn main() { // SM 75 = Turing (T4, RTX 2000 series) // SM 80 = Ampere (A100, RTX 3000 series) // SM 86 = Ampere (RTX 3090, A40) - // SM 89 = Ada Lovelace (RTX 4000 series) + // SM 89 = Ada Lovelace (RTX 4090 / RTX 4000 series) // SM 90 = Hopper (H100) - // Support both Turing (sm_75) and Ampere+ architectures + // SM 120 = Blackwell workstation parts + // Support the GPU generations currently present in the QDP test host. .flag("-gencode") .flag("arch=compute_75,code=sm_75") .flag("-gencode") .flag("arch=compute_80,code=sm_80") .flag("-gencode") .flag("arch=compute_86,code=sm_86") - // Optional: Add more architectures for production - // .flag("-gencode") - // .flag("arch=compute_89,code=sm_89") + .flag("-gencode") + .flag("arch=compute_89,code=sm_89") + .flag("-gencode") + .flag("arch=compute_120,code=sm_120") .file("src/amplitude.cu") .file("src/basis.cu") .file("src/angle.cu")