diff --git a/CHANGELOG b/CHANGELOG
index 115c96603..da2e55311 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -3,6 +3,10 @@ If not stated, FINUFFT is assumed (old cuFINUFFT <=1.3 is listed separately).
 
 v2.6.0-dev
 
+* Added C-level simple API to cuFINUFFT
+  (`cufinufft{,f}{1,2,3}d{1,2,3}[many]`) mirroring the CPU `finufft1d1`-style
+  one-shot wrappers. 36 new entry points that allocate a plan, set points,
+  execute, and destroy in a single call; inputs are device pointers. (Barbone)
 * Added `threadsafe_execute` regression test verifying concurrent `execute()`
   calls on the same plan produce correct results. Added sanitizer mode selection
   via `FINUFFT_USE_SANITIZERS=OFF|ON|MEMSAN|TSAN`, and extended the sanitizer
diff --git a/docs/c_gpu.rst b/docs/c_gpu.rst
index fb776cbc0..215de2867 100644
--- a/docs/c_gpu.rst
+++ b/docs/c_gpu.rst
@@ -19,6 +19,67 @@ You will also want to read the examples in ``examples/cuda`` and ``test/cuda/cuf
 *Note*: The interface to cuFINUFFT has changed between versions 1.3 and 2.2.
 Please see :ref:`Migration to cuFINUFFT v2.2<cufinufft_migration>` for details.
 
+Simple interface
+----------------
+
+For users who only need to perform a single transform per plan, cuFINUFFT
+provides one-shot wrappers that combine all four plan steps (make, setpts,
+execute, destroy) into a single call. These mirror the CPU ``finufft1d1``
+family (see :ref:`c`) and follow the same naming and argument order; the
+only difference is that all input/output array pointers refer to memory on
+the GPU. They have double (``cufinufft``) and single (``cufinufftf``)
+precision versions, which we document together.
+
+For each dimension ``d`` (1, 2, or 3) and transform type ``t`` (1, 2, or 3),
+there are four entry points::
+
+    cufinufft{d}d{t}(...);            // double precision, single transform
+    cufinufftf{d}d{t}(...);           // single precision, single transform
+    cufinufft{d}d{t}many(n_transf, ...);   // double precision, many transforms
+    cufinufftf{d}d{t}many(n_transf, ...);  // single precision, many transforms
+
+This gives 36 entry points in total. The full prototypes are declared in
+``include/cufinufft.h``. As an example, the 1D type-1 simple call in single
+precision is:
+
+.. code-block:: c
+
+    int cufinufftf1d1(int64_t nj, const float *xj, const cuFloatComplex *cj,
+                      int iflag, float eps, int64_t ms, cuFloatComplex *fk,
+                      const cufinufft_opts *opts);
+
+Inputs:
+
+* ``nj`` — number of nonuniform points
+* ``xj`` — length-``nj`` device array of NU point coordinates (in :math:`[-\pi,\pi)`,
+  values outside are folded)
+* ``cj`` — length-``nj`` device array of NU strengths
+* ``iflag`` — sign in the complex exponential (>=0 for +, <0 for -)
+* ``eps`` — requested relative tolerance
+* ``ms`` — number of Fourier modes
+* ``fk`` — length-``ms`` device array, output
+* ``opts`` — optional options pointer (NULL for defaults)
+
+Returns ``0`` on success, otherwise an error code (see ``finufft_errors.h``).
+
+A complete example using the simple interface in place of the 4-step calls
+from the :ref:`Getting started <c_gpu>` walkthrough below is:
+
+.. code-block:: c
+
+    cufinufftf1d1(M, d_x, d_c, 1, 1e-6f, N, d_f, NULL);
+
+This single call replaces the ``makeplan``/``setpts``/``execute``/``destroy``
+sequence. Type-3 calls also take frequency-target arrays (``s`` for 1D,
+``s,t`` for 2D, ``s,t,u`` for 3D), exactly as in the CPU API. The ``many``
+variants take an ``n_transf`` parameter as the first argument and use the
+same set of NU points for ``n_transf`` consecutive transforms (with input
+and output arrays sized accordingly).
+
+The simple interface is intended for cases where the plan would otherwise be
+used exactly once. When the same NU points are reused across several
+transforms, use the 4-step plan interface below for better performance.
+
 Getting started
 ---------------
 
diff --git a/include/cufinufft.h b/include/cufinufft.h
index 954ffae00..da9f419f6 100644
--- a/include/cufinufft.h
+++ b/include/cufinufft.h
@@ -36,6 +36,154 @@ FINUFFT_EXPORT int cufinufftf_execute(cufinufftf_plan d_plan, cuFloatComplex *d_
 
 FINUFFT_EXPORT int cufinufft_destroy(cufinufft_plan d_plan);
 FINUFFT_EXPORT int cufinufftf_destroy(cufinufftf_plan d_plan);
+
+// Simple (one-shot) interfaces. Pointers are device pointers. Behavior matches
+// the 4-step plan API above. 36 entry points (3 dims x 3 types x {single,many}
+// x {double,float}).
+
+// Dimension 1111111111111111111111111111111111111111111111111111111111111111
+FINUFFT_EXPORT int cufinufft1d1many(
+    int n_transf, int64_t nj, const double *xj, const cuDoubleComplex *cj, int iflag,
+    double eps, int64_t ms, cuDoubleComplex *fk, const cufinufft_opts *opts);
+FINUFFT_EXPORT int cufinufftf1d1many(
+    int n_transf, int64_t nj, const float *xj, const cuFloatComplex *cj, int iflag,
+    float eps, int64_t ms, cuFloatComplex *fk, const cufinufft_opts *opts);
+FINUFFT_EXPORT int cufinufft1d1(int64_t nj, const double *xj, const cuDoubleComplex *cj,
+                                int iflag, double eps, int64_t ms, cuDoubleComplex *fk,
+                                const cufinufft_opts *opts);
+FINUFFT_EXPORT int cufinufftf1d1(int64_t nj, const float *xj, const cuFloatComplex *cj,
+                                 int iflag, float eps, int64_t ms, cuFloatComplex *fk,
+                                 const cufinufft_opts *opts);
+
+FINUFFT_EXPORT int cufinufft1d2many(
+    int n_transf, int64_t nj, const double *xj, cuDoubleComplex *cj, int iflag,
+    double eps, int64_t ms, const cuDoubleComplex *fk, const cufinufft_opts *opts);
+FINUFFT_EXPORT int cufinufftf1d2many(
+    int n_transf, int64_t nj, const float *xj, cuFloatComplex *cj, int iflag, float eps,
+    int64_t ms, const cuFloatComplex *fk, const cufinufft_opts *opts);
+FINUFFT_EXPORT int cufinufft1d2(int64_t nj, const double *xj, cuDoubleComplex *cj,
+                                int iflag, double eps, int64_t ms,
+                                const cuDoubleComplex *fk, const cufinufft_opts *opts);
+FINUFFT_EXPORT int cufinufftf1d2(int64_t nj, const float *xj, cuFloatComplex *cj,
+                                 int iflag, float eps, int64_t ms,
+                                 const cuFloatComplex *fk, const cufinufft_opts *opts);
+
+FINUFFT_EXPORT int cufinufft1d3many(int n_transf, int64_t nj, const double *xj,
+                                    const cuDoubleComplex *cj, int iflag, double eps,
+                                    int64_t nk, const double *s, cuDoubleComplex *fk,
+                                    const cufinufft_opts *opts);
+FINUFFT_EXPORT int cufinufftf1d3many(int n_transf, int64_t nj, const float *xj,
+                                     const cuFloatComplex *cj, int iflag, float eps,
+                                     int64_t nk, const float *s, cuFloatComplex *fk,
+                                     const cufinufft_opts *opts);
+FINUFFT_EXPORT int cufinufft1d3(int64_t nj, const double *xj, const cuDoubleComplex *cj,
+                                int iflag, double eps, int64_t nk, const double *s,
+                                cuDoubleComplex *fk, const cufinufft_opts *opts);
+FINUFFT_EXPORT int cufinufftf1d3(int64_t nj, const float *xj, const cuFloatComplex *cj,
+                                 int iflag, float eps, int64_t nk, const float *s,
+                                 cuFloatComplex *fk, const cufinufft_opts *opts);
+
+// Dimension 22222222222222222222222222222222222222222222222222222222222222222
+FINUFFT_EXPORT int cufinufft2d1many(int n_transf, int64_t nj, const double *xj,
+                                    const double *yj, const cuDoubleComplex *cj,
+                                    int iflag, double eps, int64_t ms, int64_t mt,
+                                    cuDoubleComplex *fk, const cufinufft_opts *opts);
+FINUFFT_EXPORT int cufinufftf2d1many(int n_transf, int64_t nj, const float *xj,
+                                     const float *yj, const cuFloatComplex *cj, int iflag,
+                                     float eps, int64_t ms, int64_t mt,
+                                     cuFloatComplex *fk, const cufinufft_opts *opts);
+FINUFFT_EXPORT int cufinufft2d1(
+    int64_t nj, const double *xj, const double *yj, const cuDoubleComplex *cj, int iflag,
+    double eps, int64_t ms, int64_t mt, cuDoubleComplex *fk, const cufinufft_opts *opts);
+FINUFFT_EXPORT int cufinufftf2d1(
+    int64_t nj, const float *xj, const float *yj, const cuFloatComplex *cj, int iflag,
+    float eps, int64_t ms, int64_t mt, cuFloatComplex *fk, const cufinufft_opts *opts);
+
+FINUFFT_EXPORT int cufinufft2d2many(
+    int n_transf, int64_t nj, const double *xj, const double *yj, cuDoubleComplex *cj,
+    int iflag, double eps, int64_t ms, int64_t mt, const cuDoubleComplex *fk,
+    const cufinufft_opts *opts);
+FINUFFT_EXPORT int cufinufftf2d2many(
+    int n_transf, int64_t nj, const float *xj, const float *yj, cuFloatComplex *cj,
+    int iflag, float eps, int64_t ms, int64_t mt, const cuFloatComplex *fk,
+    const cufinufft_opts *opts);
+FINUFFT_EXPORT int cufinufft2d2(int64_t nj, const double *xj, const double *yj,
+                                cuDoubleComplex *cj, int iflag, double eps, int64_t ms,
+                                int64_t mt, const cuDoubleComplex *fk,
+                                const cufinufft_opts *opts);
+FINUFFT_EXPORT int cufinufftf2d2(int64_t nj, const float *xj, const float *yj,
+                                 cuFloatComplex *cj, int iflag, float eps, int64_t ms,
+                                 int64_t mt, const cuFloatComplex *fk,
+                                 const cufinufft_opts *opts);
+
+FINUFFT_EXPORT int cufinufft2d3many(
+    int n_transf, int64_t nj, const double *xj, const double *yj,
+    const cuDoubleComplex *cj, int iflag, double eps, int64_t nk, const double *s,
+    const double *t, cuDoubleComplex *fk, const cufinufft_opts *opts);
+FINUFFT_EXPORT int cufinufftf2d3many(
+    int n_transf, int64_t nj, const float *xj, const float *yj, const cuFloatComplex *cj,
+    int iflag, float eps, int64_t nk, const float *s, const float *t, cuFloatComplex *fk,
+    const cufinufft_opts *opts);
+FINUFFT_EXPORT int cufinufft2d3(int64_t nj, const double *xj, const double *yj,
+                                const cuDoubleComplex *cj, int iflag, double eps,
+                                int64_t nk, const double *s, const double *t,
+                                cuDoubleComplex *fk, const cufinufft_opts *opts);
+FINUFFT_EXPORT int cufinufftf2d3(int64_t nj, const float *xj, const float *yj,
+                                 const cuFloatComplex *cj, int iflag, float eps,
+                                 int64_t nk, const float *s, const float *t,
+                                 cuFloatComplex *fk, const cufinufft_opts *opts);
+
+// Dimension 3333333333333333333333333333333333333333333333333333333333333333
+FINUFFT_EXPORT int cufinufft3d1many(
+    int n_transf, int64_t nj, const double *xj, const double *yj, const double *zj,
+    const cuDoubleComplex *cj, int iflag, double eps, int64_t ms, int64_t mt, int64_t mu,
+    cuDoubleComplex *fk, const cufinufft_opts *opts);
+FINUFFT_EXPORT int cufinufftf3d1many(
+    int n_transf, int64_t nj, const float *xj, const float *yj, const float *zj,
+    const cuFloatComplex *cj, int iflag, float eps, int64_t ms, int64_t mt, int64_t mu,
+    cuFloatComplex *fk, const cufinufft_opts *opts);
+FINUFFT_EXPORT int cufinufft3d1(int64_t nj, const double *xj, const double *yj,
+                                const double *zj, const cuDoubleComplex *cj, int iflag,
+                                double eps, int64_t ms, int64_t mt, int64_t mu,
+                                cuDoubleComplex *fk, const cufinufft_opts *opts);
+FINUFFT_EXPORT int cufinufftf3d1(int64_t nj, const float *xj, const float *yj,
+                                 const float *zj, const cuFloatComplex *cj, int iflag,
+                                 float eps, int64_t ms, int64_t mt, int64_t mu,
+                                 cuFloatComplex *fk, const cufinufft_opts *opts);
+
+FINUFFT_EXPORT int cufinufft3d2many(
+    int n_transf, int64_t nj, const double *xj, const double *yj, const double *zj,
+    cuDoubleComplex *cj, int iflag, double eps, int64_t ms, int64_t mt, int64_t mu,
+    const cuDoubleComplex *fk, const cufinufft_opts *opts);
+FINUFFT_EXPORT int cufinufftf3d2many(
+    int n_transf, int64_t nj, const float *xj, const float *yj, const float *zj,
+    cuFloatComplex *cj, int iflag, float eps, int64_t ms, int64_t mt, int64_t mu,
+    const cuFloatComplex *fk, const cufinufft_opts *opts);
+FINUFFT_EXPORT int cufinufft3d2(int64_t nj, const double *xj, const double *yj,
+                                const double *zj, cuDoubleComplex *cj, int iflag,
+                                double eps, int64_t ms, int64_t mt, int64_t mu,
+                                const cuDoubleComplex *fk, const cufinufft_opts *opts);
+FINUFFT_EXPORT int cufinufftf3d2(int64_t nj, const float *xj, const float *yj,
+                                 const float *zj, cuFloatComplex *cj, int iflag,
+                                 float eps, int64_t ms, int64_t mt, int64_t mu,
+                                 const cuFloatComplex *fk, const cufinufft_opts *opts);
+
+FINUFFT_EXPORT int cufinufft3d3many(
+    int n_transf, int64_t nj, const double *xj, const double *yj, const double *zj,
+    const cuDoubleComplex *cj, int iflag, double eps, int64_t nk, const double *s,
+    const double *t, const double *u, cuDoubleComplex *fk, const cufinufft_opts *opts);
+FINUFFT_EXPORT int cufinufftf3d3many(
+    int n_transf, int64_t nj, const float *xj, const float *yj, const float *zj,
+    const cuFloatComplex *cj, int iflag, float eps, int64_t nk, const float *s,
+    const float *t, const float *u, cuFloatComplex *fk, const cufinufft_opts *opts);
+FINUFFT_EXPORT int cufinufft3d3(
+    int64_t nj, const double *xj, const double *yj, const double *zj,
+    const cuDoubleComplex *cj, int iflag, double eps, int64_t nk, const double *s,
+    const double *t, const double *u, cuDoubleComplex *fk, const cufinufft_opts *opts);
+FINUFFT_EXPORT int cufinufftf3d3(
+    int64_t nj, const float *xj, const float *yj, const float *zj,
+    const cuFloatComplex *cj, int iflag, float eps, int64_t nk, const float *s,
+    const float *t, const float *u, cuFloatComplex *fk, const cufinufft_opts *opts);
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/cuda/c_interface.cpp b/src/cuda/c_interface.cpp
index 8af543002..9f7e6e0df 100644
--- a/src/cuda/c_interface.cpp
+++ b/src/cuda/c_interface.cpp
@@ -1,7 +1,9 @@
+#include <array>
 #include <cmath>
 #include <cstdint>
 #include <iostream>
 #include <limits>
+#include <memory>
 
 #include <cufinufft.h>
 #include <cufinufft/cufinufft_plan_t.hpp>
@@ -170,4 +172,296 @@ void cufinufft_default_opts(cufinufft_opts *opts)
   opts->gpu_stream         = cudaStreamDefault;
   // sphinx tag (don't remove): @gpu_defopts_end
 }
+} // extern "C"
+
+/* ---------------------------------------------------------------------------
+   The 18 simple interfaces (= 3 dims * 3 types * {singlecall,many}) to
+   cuFINUFFT, mirroring the CPU c_interface.cpp layout. Pointers are device
+   pointers; behavior mirrors the 4-step plan API above.
+   --------------------------------------------------------------------------- */
+
+namespace { // helpers local to this TU
+
+using i64 = int64_t;
+using ci64 = const int64_t;
+
+template<typename T>
+int simple_guru(
+    int n_dims, int type, int n_transf, i64 nj, const std::array<const T *, 3> &xyz,
+    cuda_complex<T> *cj, int iflag, T eps, const std::array<ci64, 3> &n_modes, i64 nk,
+    const std::array<const T *, 3> &stu, cuda_complex<T> *fk, const cufinufft_opts *popts)
+// Helper layer between simple interfaces and the underlying plan methods.
+// Plan is heap-allocated (it owns CUDA streams / cuFFT plans / device buffers
+// whose destruction order matters) and managed via unique_ptr so error paths
+// still free GPU resources.
+{
+  if (n_dims < 1 || n_dims > 3) return FINUFFT_ERR_DIM_NOTVALID;
+  if (nj > std::numeric_limits<int32_t>::max()) return FINUFFT_ERR_NDATA_NOTVALID;
+  if (nk > std::numeric_limits<int32_t>::max()) return FINUFFT_ERR_NDATA_NOTVALID;
+
+  int nmodes32[3];
+  if (is_invalid_mode_array(type, n_dims, n_modes.data(), nmodes32))
+    return FINUFFT_ERR_NDATA_NOTVALID;
+
+  cufinufft_opts planopts;
+  if (popts)
+    planopts = *popts;
+  else
+    cufinufft_default_opts(&planopts);
+
+  auto plan = std::make_unique<cufinufft_plan_t<T>>(type, n_dims, nmodes32, iflag,
+                                                    n_transf, eps, planopts);
+  const int warn = plan->eps_too_small ? FINUFFT_WARN_EPS_TOO_SMALL : 0;
+  plan->setpts((int)nj, xyz[0], xyz[1], xyz[2], (int)nk, stu[0], stu[1], stu[2]);
+  plan->execute(cj, fk);
+  return warn;
+}
+
+template<typename T>
+int guru13(int n_dims, int type, int n_transf, i64 nj,
+           const std::array<const T *, 3> &xyz, const cuda_complex<T> *cj, int iflag,
+           T eps, const std::array<ci64, 3> &n_modes, i64 nk,
+           const std::array<const T *, 3> &stu, cuda_complex<T> *fk,
+           const cufinufft_opts *popts) {
+  return safe_finufft_call([&]() -> int {
+    return simple_guru<T>(n_dims, type, n_transf, nj, xyz,
+                          const_cast<cuda_complex<T> *>(cj), iflag, eps, n_modes, nk, stu,
+                          fk, popts);
+  });
+}
+
+template<typename T>
+int guru2(int n_dims, int type, int n_transf, i64 nj, const std::array<const T *, 3> &xyz,
+          cuda_complex<T> *cj, int iflag, T eps, const std::array<ci64, 3> &n_modes,
+          i64 nk, const std::array<const T *, 3> &stu, const cuda_complex<T> *fk,
+          const cufinufft_opts *popts) {
+  return safe_finufft_call([&]() -> int {
+    return simple_guru<T>(n_dims, type, n_transf, nj, xyz, cj, iflag, eps, n_modes, nk,
+                          stu, const_cast<cuda_complex<T> *>(fk), popts);
+  });
+}
+
+} // anonymous namespace
+
+extern "C" {
+
+// Dimension 1111111111111111111111111111111111111111111111111111111111111111
+
+int cufinufft1d1many(int n_transf, i64 nj, const double *xj, const cuDoubleComplex *cj,
+                     int iflag, double eps, i64 ms, cuDoubleComplex *fk,
+                     const cufinufft_opts *opts) {
+  return guru13<double>(1, 1, n_transf, nj, {xj}, cj, iflag, eps, {ms, 1, 1}, 0, {}, fk,
+                        opts);
+}
+int cufinufftf1d1many(int n_transf, i64 nj, const float *xj, const cuFloatComplex *cj,
+                      int iflag, float eps, i64 ms, cuFloatComplex *fk,
+                      const cufinufft_opts *opts) {
+  return guru13<float>(1, 1, n_transf, nj, {xj}, cj, iflag, eps, {ms, 1, 1}, 0, {}, fk,
+                       opts);
+}
+int cufinufft1d1(i64 nj, const double *xj, const cuDoubleComplex *cj, int iflag,
+                 double eps, i64 ms, cuDoubleComplex *fk, const cufinufft_opts *opts) {
+  return cufinufft1d1many(1, nj, xj, cj, iflag, eps, ms, fk, opts);
+}
+int cufinufftf1d1(i64 nj, const float *xj, const cuFloatComplex *cj, int iflag, float eps,
+                  i64 ms, cuFloatComplex *fk, const cufinufft_opts *opts) {
+  return cufinufftf1d1many(1, nj, xj, cj, iflag, eps, ms, fk, opts);
+}
+
+int cufinufft1d2many(int n_transf, i64 nj, const double *xj, cuDoubleComplex *cj,
+                     int iflag, double eps, i64 ms, const cuDoubleComplex *fk,
+                     const cufinufft_opts *opts) {
+  return guru2<double>(1, 2, n_transf, nj, {xj}, cj, iflag, eps, {ms, 1, 1}, 0, {}, fk,
+                       opts);
+}
+int cufinufftf1d2many(int n_transf, i64 nj, const float *xj, cuFloatComplex *cj,
+                      int iflag, float eps, i64 ms, const cuFloatComplex *fk,
+                      const cufinufft_opts *opts) {
+  return guru2<float>(1, 2, n_transf, nj, {xj}, cj, iflag, eps, {ms, 1, 1}, 0, {}, fk,
+                      opts);
+}
+int cufinufft1d2(i64 nj, const double *xj, cuDoubleComplex *cj, int iflag, double eps,
+                 i64 ms, const cuDoubleComplex *fk, const cufinufft_opts *opts) {
+  return cufinufft1d2many(1, nj, xj, cj, iflag, eps, ms, fk, opts);
+}
+int cufinufftf1d2(i64 nj, const float *xj, cuFloatComplex *cj, int iflag, float eps,
+                  i64 ms, const cuFloatComplex *fk, const cufinufft_opts *opts) {
+  return cufinufftf1d2many(1, nj, xj, cj, iflag, eps, ms, fk, opts);
+}
+
+int cufinufft1d3many(int n_transf, i64 nj, const double *xj, const cuDoubleComplex *cj,
+                     int iflag, double eps, i64 nk, const double *s, cuDoubleComplex *fk,
+                     const cufinufft_opts *opts) {
+  return guru13<double>(1, 3, n_transf, nj, {xj}, cj, iflag, eps, {0, 0, 0}, nk, {s}, fk,
+                        opts);
+}
+int cufinufftf1d3many(int n_transf, i64 nj, const float *xj, const cuFloatComplex *cj,
+                      int iflag, float eps, i64 nk, const float *s, cuFloatComplex *fk,
+                      const cufinufft_opts *opts) {
+  return guru13<float>(1, 3, n_transf, nj, {xj}, cj, iflag, eps, {0, 0, 0}, nk, {s}, fk,
+                       opts);
+}
+int cufinufft1d3(i64 nj, const double *xj, const cuDoubleComplex *cj, int iflag,
+                 double eps, i64 nk, const double *s, cuDoubleComplex *fk,
+                 const cufinufft_opts *opts) {
+  return cufinufft1d3many(1, nj, xj, cj, iflag, eps, nk, s, fk, opts);
 }
+int cufinufftf1d3(i64 nj, const float *xj, const cuFloatComplex *cj, int iflag, float eps,
+                  i64 nk, const float *s, cuFloatComplex *fk,
+                  const cufinufft_opts *opts) {
+  return cufinufftf1d3many(1, nj, xj, cj, iflag, eps, nk, s, fk, opts);
+}
+
+// Dimension 22222222222222222222222222222222222222222222222222222222222222222
+
+int cufinufft2d1many(int n_transf, i64 nj, const double *xj, const double *yj,
+                     const cuDoubleComplex *cj, int iflag, double eps, i64 ms, i64 mt,
+                     cuDoubleComplex *fk, const cufinufft_opts *opts) {
+  return guru13<double>(2, 1, n_transf, nj, {xj, yj}, cj, iflag, eps, {ms, mt, 1}, 0, {},
+                        fk, opts);
+}
+int cufinufftf2d1many(int n_transf, i64 nj, const float *xj, const float *yj,
+                      const cuFloatComplex *cj, int iflag, float eps, i64 ms, i64 mt,
+                      cuFloatComplex *fk, const cufinufft_opts *opts) {
+  return guru13<float>(2, 1, n_transf, nj, {xj, yj}, cj, iflag, eps, {ms, mt, 1}, 0, {},
+                       fk, opts);
+}
+int cufinufft2d1(i64 nj, const double *xj, const double *yj, const cuDoubleComplex *cj,
+                 int iflag, double eps, i64 ms, i64 mt, cuDoubleComplex *fk,
+                 const cufinufft_opts *opts) {
+  return cufinufft2d1many(1, nj, xj, yj, cj, iflag, eps, ms, mt, fk, opts);
+}
+int cufinufftf2d1(i64 nj, const float *xj, const float *yj, const cuFloatComplex *cj,
+                  int iflag, float eps, i64 ms, i64 mt, cuFloatComplex *fk,
+                  const cufinufft_opts *opts) {
+  return cufinufftf2d1many(1, nj, xj, yj, cj, iflag, eps, ms, mt, fk, opts);
+}
+
+int cufinufft2d2many(int n_transf, i64 nj, const double *xj, const double *yj,
+                     cuDoubleComplex *cj, int iflag, double eps, i64 ms, i64 mt,
+                     const cuDoubleComplex *fk, const cufinufft_opts *opts) {
+  return guru2<double>(2, 2, n_transf, nj, {xj, yj}, cj, iflag, eps, {ms, mt, 1}, 0, {},
+                       fk, opts);
+}
+int cufinufftf2d2many(int n_transf, i64 nj, const float *xj, const float *yj,
+                      cuFloatComplex *cj, int iflag, float eps, i64 ms, i64 mt,
+                      const cuFloatComplex *fk, const cufinufft_opts *opts) {
+  return guru2<float>(2, 2, n_transf, nj, {xj, yj}, cj, iflag, eps, {ms, mt, 1}, 0, {},
+                      fk, opts);
+}
+int cufinufft2d2(i64 nj, const double *xj, const double *yj, cuDoubleComplex *cj,
+                 int iflag, double eps, i64 ms, i64 mt, const cuDoubleComplex *fk,
+                 const cufinufft_opts *opts) {
+  return cufinufft2d2many(1, nj, xj, yj, cj, iflag, eps, ms, mt, fk, opts);
+}
+int cufinufftf2d2(i64 nj, const float *xj, const float *yj, cuFloatComplex *cj, int iflag,
+                  float eps, i64 ms, i64 mt, const cuFloatComplex *fk,
+                  const cufinufft_opts *opts) {
+  return cufinufftf2d2many(1, nj, xj, yj, cj, iflag, eps, ms, mt, fk, opts);
+}
+
+int cufinufft2d3many(int n_transf, i64 nj, const double *xj, const double *yj,
+                     const cuDoubleComplex *cj, int iflag, double eps, i64 nk,
+                     const double *s, const double *t, cuDoubleComplex *fk,
+                     const cufinufft_opts *opts) {
+  return guru13<double>(2, 3, n_transf, nj, {xj, yj}, cj, iflag, eps, {0, 0, 0}, nk,
+                        {s, t}, fk, opts);
+}
+int cufinufftf2d3many(int n_transf, i64 nj, const float *xj, const float *yj,
+                      const cuFloatComplex *cj, int iflag, float eps, i64 nk,
+                      const float *s, const float *t, cuFloatComplex *fk,
+                      const cufinufft_opts *opts) {
+  return guru13<float>(2, 3, n_transf, nj, {xj, yj}, cj, iflag, eps, {0, 0, 0}, nk,
+                       {s, t}, fk, opts);
+}
+int cufinufft2d3(i64 nj, const double *xj, const double *yj, const cuDoubleComplex *cj,
+                 int iflag, double eps, i64 nk, const double *s, const double *t,
+                 cuDoubleComplex *fk, const cufinufft_opts *opts) {
+  return cufinufft2d3many(1, nj, xj, yj, cj, iflag, eps, nk, s, t, fk, opts);
+}
+int cufinufftf2d3(i64 nj, const float *xj, const float *yj, const cuFloatComplex *cj,
+                  int iflag, float eps, i64 nk, const float *s, const float *t,
+                  cuFloatComplex *fk, const cufinufft_opts *opts) {
+  return cufinufftf2d3many(1, nj, xj, yj, cj, iflag, eps, nk, s, t, fk, opts);
+}
+
+// Dimension 3333333333333333333333333333333333333333333333333333333333333333
+
+int cufinufft3d1many(int n_transf, i64 nj, const double *xj, const double *yj,
+                     const double *zj, const cuDoubleComplex *cj, int iflag, double eps,
+                     i64 ms, i64 mt, i64 mu, cuDoubleComplex *fk,
+                     const cufinufft_opts *opts) {
+  return guru13<double>(3, 1, n_transf, nj, {xj, yj, zj}, cj, iflag, eps, {ms, mt, mu}, 0,
+                        {}, fk, opts);
+}
+int cufinufftf3d1many(int n_transf, i64 nj, const float *xj, const float *yj,
+                      const float *zj, const cuFloatComplex *cj, int iflag, float eps,
+                      i64 ms, i64 mt, i64 mu, cuFloatComplex *fk,
+                      const cufinufft_opts *opts) {
+  return guru13<float>(3, 1, n_transf, nj, {xj, yj, zj}, cj, iflag, eps, {ms, mt, mu}, 0,
+                       {}, fk, opts);
+}
+int cufinufft3d1(i64 nj, const double *xj, const double *yj, const double *zj,
+                 const cuDoubleComplex *cj, int iflag, double eps, i64 ms, i64 mt, i64 mu,
+                 cuDoubleComplex *fk, const cufinufft_opts *opts) {
+  return cufinufft3d1many(1, nj, xj, yj, zj, cj, iflag, eps, ms, mt, mu, fk, opts);
+}
+int cufinufftf3d1(i64 nj, const float *xj, const float *yj, const float *zj,
+                  const cuFloatComplex *cj, int iflag, float eps, i64 ms, i64 mt, i64 mu,
+                  cuFloatComplex *fk, const cufinufft_opts *opts) {
+  return cufinufftf3d1many(1, nj, xj, yj, zj, cj, iflag, eps, ms, mt, mu, fk, opts);
+}
+
+int cufinufft3d2many(int n_transf, i64 nj, const double *xj, const double *yj,
+                     const double *zj, cuDoubleComplex *cj, int iflag, double eps, i64 ms,
+                     i64 mt, i64 mu, const cuDoubleComplex *fk,
+                     const cufinufft_opts *opts) {
+  return guru2<double>(3, 2, n_transf, nj, {xj, yj, zj}, cj, iflag, eps, {ms, mt, mu}, 0,
+                       {}, fk, opts);
+}
+int cufinufftf3d2many(int n_transf, i64 nj, const float *xj, const float *yj,
+                      const float *zj, cuFloatComplex *cj, int iflag, float eps, i64 ms,
+                      i64 mt, i64 mu, const cuFloatComplex *fk,
+                      const cufinufft_opts *opts) {
+  return guru2<float>(3, 2, n_transf, nj, {xj, yj, zj}, cj, iflag, eps, {ms, mt, mu}, 0,
+                      {}, fk, opts);
+}
+int cufinufft3d2(i64 nj, const double *xj, const double *yj, const double *zj,
+                 cuDoubleComplex *cj, int iflag, double eps, i64 ms, i64 mt, i64 mu,
+                 const cuDoubleComplex *fk, const cufinufft_opts *opts) {
+  return cufinufft3d2many(1, nj, xj, yj, zj, cj, iflag, eps, ms, mt, mu, fk, opts);
+}
+int cufinufftf3d2(i64 nj, const float *xj, const float *yj, const float *zj,
+                  cuFloatComplex *cj, int iflag, float eps, i64 ms, i64 mt, i64 mu,
+                  const cuFloatComplex *fk, const cufinufft_opts *opts) {
+  return cufinufftf3d2many(1, nj, xj, yj, zj, cj, iflag, eps, ms, mt, mu, fk, opts);
+}
+
+int cufinufft3d3many(int n_transf, i64 nj, const double *xj, const double *yj,
+                     const double *zj, const cuDoubleComplex *cj, int iflag, double eps,
+                     i64 nk, const double *s, const double *t, const double *u,
+                     cuDoubleComplex *fk, const cufinufft_opts *opts) {
+  return guru13<double>(3, 3, n_transf, nj, {xj, yj, zj}, cj, iflag, eps, {0, 0, 0}, nk,
+                        {s, t, u}, fk, opts);
+}
+int cufinufftf3d3many(int n_transf, i64 nj, const float *xj, const float *yj,
+                      const float *zj, const cuFloatComplex *cj, int iflag, float eps,
+                      i64 nk, const float *s, const float *t, const float *u,
+                      cuFloatComplex *fk, const cufinufft_opts *opts) {
+  return guru13<float>(3, 3, n_transf, nj, {xj, yj, zj}, cj, iflag, eps, {0, 0, 0}, nk,
+                       {s, t, u}, fk, opts);
+}
+int cufinufft3d3(i64 nj, const double *xj, const double *yj, const double *zj,
+                 const cuDoubleComplex *cj, int iflag, double eps, i64 nk,
+                 const double *s, const double *t, const double *u, cuDoubleComplex *fk,
+                 const cufinufft_opts *opts) {
+  return cufinufft3d3many(1, nj, xj, yj, zj, cj, iflag, eps, nk, s, t, u, fk, opts);
+}
+int cufinufftf3d3(i64 nj, const float *xj, const float *yj, const float *zj,
+                  const cuFloatComplex *cj, int iflag, float eps, i64 nk, const float *s,
+                  const float *t, const float *u, cuFloatComplex *fk,
+                  const cufinufft_opts *opts) {
+  return cufinufftf3d3many(1, nj, xj, yj, zj, cj, iflag, eps, nk, s, t, u, fk, opts);
+}
+
+} // extern "C"
diff --git a/test/cuda/CMakeLists.txt b/test/cuda/CMakeLists.txt
index e76dcf068..ff82b20c1 100644
--- a/test/cuda/CMakeLists.txt
+++ b/test/cuda/CMakeLists.txt
@@ -223,6 +223,7 @@ add_test(NAME cufinufft_public_api COMMAND public_api_test)
 add_test(NAME cufinufft_makeplan COMMAND test_makeplan)
 add_test(NAME cufinufft_error_handling COMMAND cufinufft_error_handling)
 add_test(NAME cufinufft_math_test COMMAND cufinufft_math_test)
+add_test(NAME cufinufft_simple_test COMMAND cufinufft_simple_test)
 add_test(
     NAME cufinufft1dspreadinterponly_double_test
     COMMAND cufinufft1dspreadinterponly_test 1e3 1e6 1e-6 1e-5 d 2.00
diff --git a/test/cuda/cufinufft_simple_test.cu b/test/cuda/cufinufft_simple_test.cu
new file mode 100644
index 000000000..25177e872
--- /dev/null
+++ b/test/cuda/cufinufft_simple_test.cu
@@ -0,0 +1,372 @@
+// Cross-check the C-level simple cuFINUFFT API against the 4-step plan API.
+//
+// For each (dim, type) pair we run the simple call and the equivalent 4-step
+// plan path on identical inputs and require the device output to be bit-for-bit
+// identical. Both float and double precision are exercised.
+
+#include <complex>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <random>
+#include <string>
+#include <vector>
+
+#include <cuComplex.h>
+#include <cuda_runtime.h>
+
+#include <cufinufft.h>
+
+namespace {
+
+#define CUDA_CHECK(call)                                                                \
+  do {                                                                                  \
+    cudaError_t e = (call);                                                             \
+    if (e != cudaSuccess) {                                                             \
+      std::fprintf(stderr, "CUDA error %s at %s:%d\n", cudaGetErrorString(e), __FILE__, \
+                   __LINE__);                                                           \
+      return 1;                                                                         \
+    }                                                                                   \
+  } while (0)
+
+#define CHECK_RET(expr, name)                            \
+  do {                                                   \
+    int _r = (expr);                                     \
+    if (_r != 0) {                                       \
+      std::fprintf(stderr, "%s failed: %d\n", name, _r); \
+      return {};                                         \
+    }                                                    \
+  } while (0)
+
+template<typename T> struct Traits;
+template<> struct Traits<float> {
+  using cuc = cuFloatComplex;
+  static constexpr const char *label = "float";
+};
+template<> struct Traits<double> {
+  using cuc = cuDoubleComplex;
+  static constexpr const char *label = "double";
+};
+
+template<typename T> T *dev_alloc(size_t n) {
+  T *p = nullptr;
+  cudaMalloc(reinterpret_cast<void **>(&p), n * sizeof(T));
+  return p;
+}
+
+template<typename T> void dev_copy_in(T *dst, const std::vector<T> &src) {
+  cudaMemcpy(dst, src.data(), src.size() * sizeof(T), cudaMemcpyHostToDevice);
+}
+
+template<typename T> std::vector<T> dev_copy_out(const T *src, size_t n) {
+  std::vector<T> v(n);
+  cudaMemcpy(v.data(), src, n * sizeof(T), cudaMemcpyDeviceToHost);
+  return v;
+}
+
+// Approximate equality. GPU spreading uses atomicAdd which is not bit-
+// deterministic for float, so two independent runs of the same type-1 / type-3
+// transform can differ by a few ULPs even though the algorithm is the same.
+// Type-2 paths (interpolation only, no atomics) compare bit-identically; we
+// allow a small relative tolerance uniformly to keep the test simple.
+template<typename T>
+bool near_equal(const std::vector<std::complex<T>> &a,
+                const std::vector<std::complex<T>> &b, T rtol) {
+  if (a.size() != b.size()) return false;
+  T worst = 0;
+  for (size_t i = 0; i < a.size(); ++i) {
+    const T num = std::abs(a[i] - b[i]);
+    const T den = std::max<T>(std::abs(a[i]), std::abs(b[i]));
+    const T r = den > 0 ? num / den : num;
+    if (r > worst) worst = r;
+  }
+  if (worst > rtol) {
+    std::fprintf(stderr, "  worst rel error = %g (tol %g)\n", double(worst),
+                 double(rtol));
+    return false;
+  }
+  return true;
+}
+
+template<typename T> struct Fixture {
+  using cuc = typename Traits<T>::cuc;
+  int dim, type;
+  int64_t M;
+  int64_t N1, N2, N3, Nk; // Nk = nk for type 3
+  std::vector<T> x, y, z;
+  std::vector<T> s, t, u;
+  // For type 1/3 c is input strengths; for type 2 c is output.
+  std::vector<std::complex<T>> c_in;
+  std::vector<std::complex<T>> fk_in;
+
+  size_t c_count() const { return static_cast<size_t>(M); }
+  size_t fk_count() const {
+    if (type == 3) return static_cast<size_t>(Nk);
+    return static_cast<size_t>(N1) * N2 * N3;
+  }
+};
+
+template<typename T> Fixture<T> make_fixture(int dim, int type, std::mt19937 &rng) {
+  Fixture<T> f;
+  f.dim = dim;
+  f.type = type;
+  f.M = 137;
+  f.N1 = 16;
+  f.N2 = (dim >= 2) ? 12 : 1;
+  f.N3 = (dim >= 3) ? 8 : 1;
+  f.Nk = 23;
+  std::uniform_real_distribution<T> uni(T(-3.14), T(3.14));
+  std::uniform_real_distribution<T> uni_s(T(-2), T(2));
+  std::uniform_real_distribution<T> uni_c(T(-1), T(1));
+  f.x.resize(f.M);
+  if (dim >= 2) f.y.resize(f.M);
+  if (dim >= 3) f.z.resize(f.M);
+  for (int64_t j = 0; j < f.M; ++j) {
+    f.x[j] = uni(rng);
+    if (dim >= 2) f.y[j] = uni(rng);
+    if (dim >= 3) f.z[j] = uni(rng);
+  }
+  if (type == 3) {
+    f.s.resize(f.Nk);
+    if (dim >= 2) f.t.resize(f.Nk);
+    if (dim >= 3) f.u.resize(f.Nk);
+    for (int64_t k = 0; k < f.Nk; ++k) {
+      f.s[k] = uni_s(rng);
+      if (dim >= 2) f.t[k] = uni_s(rng);
+      if (dim >= 3) f.u[k] = uni_s(rng);
+    }
+  }
+  if (type == 1 || type == 3) {
+    f.c_in.resize(f.M);
+    for (auto &z : f.c_in) z = {uni_c(rng), uni_c(rng)};
+  } else {
+    f.fk_in.resize(f.fk_count());
+    for (auto &z : f.fk_in) z = {uni_c(rng), uni_c(rng)};
+  }
+  return f;
+}
+
+// 4-step path. Returns host vector of output complex values.
+template<typename T> std::vector<std::complex<T>> run_plan(const Fixture<T> &f) {
+  using cuc = typename Traits<T>::cuc;
+  T *d_x = nullptr, *d_y = nullptr, *d_z = nullptr;
+  T *d_s = nullptr, *d_t = nullptr, *d_u = nullptr;
+  cuc *d_c = nullptr, *d_fk = nullptr;
+  d_x = dev_alloc<T>(f.M);
+  dev_copy_in(d_x, f.x);
+  if (f.dim >= 2) {
+    d_y = dev_alloc<T>(f.M);
+    dev_copy_in(d_y, f.y);
+  }
+  if (f.dim >= 3) {
+    d_z = dev_alloc<T>(f.M);
+    dev_copy_in(d_z, f.z);
+  }
+  if (f.type == 3) {
+    d_s = dev_alloc<T>(f.Nk);
+    dev_copy_in(d_s, f.s);
+    if (f.dim >= 2) {
+      d_t = dev_alloc<T>(f.Nk);
+      dev_copy_in(d_t, f.t);
+    }
+    if (f.dim >= 3) {
+      d_u = dev_alloc<T>(f.Nk);
+      dev_copy_in(d_u, f.u);
+    }
+  }
+  d_c = dev_alloc<cuc>(f.c_count());
+  d_fk = dev_alloc<cuc>(f.fk_count());
+
+  if (f.type == 1 || f.type == 3) {
+    cudaMemcpy(d_c, f.c_in.data(), f.c_count() * sizeof(cuc), cudaMemcpyHostToDevice);
+  } else {
+    cudaMemcpy(d_fk, f.fk_in.data(), f.fk_count() * sizeof(cuc), cudaMemcpyHostToDevice);
+  }
+
+  const int64_t nmodes[3] = {f.N1, f.N2, f.N3};
+  const int Nk32 = (f.type == 3) ? static_cast<int>(f.Nk) : 0;
+  std::vector<std::complex<T>> out;
+  if constexpr (std::is_same_v<T, float>) {
+    cufinufftf_plan plan{};
+    CHECK_RET(cufinufftf_makeplan(f.type, f.dim, nmodes, 1, 1, T(1e-5), &plan, nullptr),
+              "makeplan");
+    CHECK_RET(cufinufftf_setpts(plan, f.M, d_x, d_y, d_z, Nk32, d_s, d_t, d_u), "setpts");
+    CHECK_RET(cufinufftf_execute(plan, d_c, d_fk), "execute");
+    CHECK_RET(cufinufftf_destroy(plan), "destroy");
+  } else {
+    cufinufft_plan plan{};
+    CHECK_RET(cufinufft_makeplan(f.type, f.dim, nmodes, 1, 1, T(1e-9), &plan, nullptr),
+              "makeplan");
+    CHECK_RET(cufinufft_setpts(plan, f.M, d_x, d_y, d_z, Nk32, d_s, d_t, d_u), "setpts");
+    CHECK_RET(cufinufft_execute(plan, d_c, d_fk), "execute");
+    CHECK_RET(cufinufft_destroy(plan), "destroy");
+  }
+
+  if (f.type == 2)
+    out = dev_copy_out(reinterpret_cast<const std::complex<T> *>(d_c), f.c_count());
+  else
+    out = dev_copy_out(reinterpret_cast<const std::complex<T> *>(d_fk), f.fk_count());
+
+  cudaFree(d_x);
+  cudaFree(d_y);
+  cudaFree(d_z);
+  cudaFree(d_s);
+  cudaFree(d_t);
+  cudaFree(d_u);
+  cudaFree(d_c);
+  cudaFree(d_fk);
+  return out;
+}
+
+// Simple-API path. Mirrors run_plan().
+template<typename T> std::vector<std::complex<T>> run_simple(const Fixture<T> &f) {
+  using cuc = typename Traits<T>::cuc;
+  T *d_x = nullptr, *d_y = nullptr, *d_z = nullptr;
+  T *d_s = nullptr, *d_t = nullptr, *d_u = nullptr;
+  cuc *d_c = nullptr, *d_fk = nullptr;
+  d_x = dev_alloc<T>(f.M);
+  dev_copy_in(d_x, f.x);
+  if (f.dim >= 2) {
+    d_y = dev_alloc<T>(f.M);
+    dev_copy_in(d_y, f.y);
+  }
+  if (f.dim >= 3) {
+    d_z = dev_alloc<T>(f.M);
+    dev_copy_in(d_z, f.z);
+  }
+  if (f.type == 3) {
+    d_s = dev_alloc<T>(f.Nk);
+    dev_copy_in(d_s, f.s);
+    if (f.dim >= 2) {
+      d_t = dev_alloc<T>(f.Nk);
+      dev_copy_in(d_t, f.t);
+    }
+    if (f.dim >= 3) {
+      d_u = dev_alloc<T>(f.Nk);
+      dev_copy_in(d_u, f.u);
+    }
+  }
+  d_c = dev_alloc<cuc>(f.c_count());
+  d_fk = dev_alloc<cuc>(f.fk_count());
+
+  if (f.type == 1 || f.type == 3) {
+    cudaMemcpy(d_c, f.c_in.data(), f.c_count() * sizeof(cuc), cudaMemcpyHostToDevice);
+  } else {
+    cudaMemcpy(d_fk, f.fk_in.data(), f.fk_count() * sizeof(cuc), cudaMemcpyHostToDevice);
+  }
+
+  const T eps = std::is_same_v<T, float> ? T(1e-5) : T(1e-9);
+  int rc = 0;
+  if constexpr (std::is_same_v<T, float>) {
+    if (f.dim == 1) {
+      if (f.type == 1)
+        rc = cufinufftf1d1(f.M, d_x, d_c, 1, eps, f.N1, d_fk, nullptr);
+      else if (f.type == 2)
+        rc = cufinufftf1d2(f.M, d_x, d_c, 1, eps, f.N1, d_fk, nullptr);
+      else
+        rc = cufinufftf1d3(f.M, d_x, d_c, 1, eps, f.Nk, d_s, d_fk, nullptr);
+    } else if (f.dim == 2) {
+      if (f.type == 1)
+        rc = cufinufftf2d1(f.M, d_x, d_y, d_c, 1, eps, f.N1, f.N2, d_fk, nullptr);
+      else if (f.type == 2)
+        rc = cufinufftf2d2(f.M, d_x, d_y, d_c, 1, eps, f.N1, f.N2, d_fk, nullptr);
+      else
+        rc = cufinufftf2d3(f.M, d_x, d_y, d_c, 1, eps, f.Nk, d_s, d_t, d_fk, nullptr);
+    } else {
+      if (f.type == 1)
+        rc = cufinufftf3d1(f.M, d_x, d_y, d_z, d_c, 1, eps, f.N1, f.N2, f.N3, d_fk,
+                           nullptr);
+      else if (f.type == 2)
+        rc = cufinufftf3d2(f.M, d_x, d_y, d_z, d_c, 1, eps, f.N1, f.N2, f.N3, d_fk,
+                           nullptr);
+      else
+        rc = cufinufftf3d3(f.M, d_x, d_y, d_z, d_c, 1, eps, f.Nk, d_s, d_t, d_u, d_fk,
+                           nullptr);
+    }
+  } else {
+    if (f.dim == 1) {
+      if (f.type == 1)
+        rc = cufinufft1d1(f.M, d_x, d_c, 1, eps, f.N1, d_fk, nullptr);
+      else if (f.type == 2)
+        rc = cufinufft1d2(f.M, d_x, d_c, 1, eps, f.N1, d_fk, nullptr);
+      else
+        rc = cufinufft1d3(f.M, d_x, d_c, 1, eps, f.Nk, d_s, d_fk, nullptr);
+    } else if (f.dim == 2) {
+      if (f.type == 1)
+        rc = cufinufft2d1(f.M, d_x, d_y, d_c, 1, eps, f.N1, f.N2, d_fk, nullptr);
+      else if (f.type == 2)
+        rc = cufinufft2d2(f.M, d_x, d_y, d_c, 1, eps, f.N1, f.N2, d_fk, nullptr);
+      else
+        rc = cufinufft2d3(f.M, d_x, d_y, d_c, 1, eps, f.Nk, d_s, d_t, d_fk, nullptr);
+    } else {
+      if (f.type == 1)
+        rc = cufinufft3d1(f.M, d_x, d_y, d_z, d_c, 1, eps, f.N1, f.N2, f.N3, d_fk,
+                          nullptr);
+      else if (f.type == 2)
+        rc = cufinufft3d2(f.M, d_x, d_y, d_z, d_c, 1, eps, f.N1, f.N2, f.N3, d_fk,
+                          nullptr);
+      else
+        rc = cufinufft3d3(f.M, d_x, d_y, d_z, d_c, 1, eps, f.Nk, d_s, d_t, d_u, d_fk,
+                          nullptr);
+    }
+  }
+  if (rc != 0) {
+    std::fprintf(stderr, "simple call failed: %d (dim=%d type=%d)\n", rc, f.dim, f.type);
+    return {};
+  }
+
+  std::vector<std::complex<T>> out;
+  if (f.type == 2)
+    out = dev_copy_out(reinterpret_cast<const std::complex<T> *>(d_c), f.c_count());
+  else
+    out = dev_copy_out(reinterpret_cast<const std::complex<T> *>(d_fk), f.fk_count());
+
+  cudaFree(d_x);
+  cudaFree(d_y);
+  cudaFree(d_z);
+  cudaFree(d_s);
+  cudaFree(d_t);
+  cudaFree(d_u);
+  cudaFree(d_c);
+  cudaFree(d_fk);
+  return out;
+}
+
+template<typename T> int compare_one(int dim, int type, std::mt19937 &rng) {
+  auto f = make_fixture<T>(dim, type, rng);
+  auto via_p = run_plan(f);
+  auto via_s = run_simple(f);
+  if (via_p.empty() || via_s.empty()) return 1;
+  // Tolerance: the algorithm is the same; only atomic-add non-determinism
+  // separates the two runs. Float type-3 chains spreading + FFT + interpolation
+  // so the error accumulates more; allow a looser bound there.
+  const T rtol = std::is_same_v<T, float> ? T(5e-3) : T(1e-9);
+  if (!near_equal(via_p, via_s, rtol)) {
+    std::fprintf(stderr, "mismatch %s dim=%d type=%d (n=%zu)\n", Traits<T>::label, dim,
+                 type, via_p.size());
+    return 1;
+  }
+  std::printf("ok %s dim=%d type=%d (n=%zu)\n", Traits<T>::label, dim, type,
+              via_p.size());
+  return 0;
+}
+
+template<typename T> int run_all() {
+  std::mt19937 rng(static_cast<unsigned>(std::is_same_v<T, float> ? 1 : 2));
+  int rc = 0;
+  for (int dim = 1; dim <= 3; ++dim) {
+    for (int type = 1; type <= 3; ++type) {
+      rc |= compare_one<T>(dim, type, rng);
+    }
+  }
+  return rc;
+}
+
+} // namespace
+
+int main() {
+  int rc = 0;
+  rc |= run_all<float>();
+  rc |= run_all<double>();
+  return rc;
+}
diff --git a/test/cuda/public_api_test.c b/test/cuda/public_api_test.c
index 33f0ab954..757eb680e 100644
--- a/test/cuda/public_api_test.c
+++ b/test/cuda/public_api_test.c
@@ -182,11 +182,59 @@ int test_double(int M, int N) {
   return 0;
 }
 
+// Smoke test for the C-level simple API: same 1D type-1 problem as test_float,
+// but using cufinufftf1d1 (one call) instead of the 4-step plan path. Verifies
+// the C ABI (this TU is compiled as C, not C++).
+int test_simple_float(int M, int N) {
+  int64_t modes[1] = {N};
+
+  float *x;
+  float _Complex *c;
+  float _Complex *f;
+
+  float *d_x;
+  cuFloatComplex *d_c, *d_f;
+
+  x = (float *)malloc(M * sizeof(float));
+  c = (float _Complex *)malloc(M * sizeof(float _Complex));
+  f = (float _Complex *)malloc(N * sizeof(float _Complex));
+
+  srand(0);
+  for (int j = 0; j < M; ++j) {
+    x[j] = 2 * PI * (((float)rand()) / RAND_MAX - 1);
+    c[j] =
+        (2 * ((float)rand()) / RAND_MAX - 1) + I * (2 * ((float)rand()) / RAND_MAX - 1);
+  }
+
+  cudaMalloc((void **)&d_x, M * sizeof(float));
+  cudaMalloc((void **)&d_c, M * sizeof(float _Complex));
+  cudaMalloc((void **)&d_f, N * sizeof(float _Complex));
+  cudaMemcpy(d_x, x, M * sizeof(float), cudaMemcpyHostToDevice);
+  cudaMemcpy(d_c, c, M * sizeof(float _Complex), cudaMemcpyHostToDevice);
+
+  int rc = cufinufftf1d1(M, d_x, d_c, 1, 1e-6f, modes[0], d_f, NULL);
+
+  cudaMemcpy(f, d_f, N * sizeof(float _Complex), cudaMemcpyDeviceToHost);
+
+  cudaFree(d_x);
+  cudaFree(d_c);
+  cudaFree(d_f);
+
+  int idx = 4 * N / 7;
+  printf("simple f[%d] = %lf + %lfi\n", idx, crealf(f[idx]), cimagf(f[idx]));
+
+  free(x);
+  free(c);
+  free(f);
+  return rc;
+}
+
 int main() {
   // Problem size: number of nonuniform points (M) and grid size (N).
   const int M = 100, N = 200;
   int errf = test_float(M, N);
-  int err  = test_double(M, N);
+  int err = test_double(M, N);
+  int errs = test_simple_float(M, N);
 
-  return (err | errf);
+  return (err | errf | errs);
 }