diff --git a/CHANGELOG b/CHANGELOG index 115c96603..da2e55311 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -3,6 +3,10 @@ If not stated, FINUFFT is assumed (old cuFINUFFT <=1.3 is listed separately). v2.6.0-dev +* Added C-level simple API to cuFINUFFT + (`cufinufft{,f}{1,2,3}d{1,2,3}[many]`) mirroring the CPU `finufft1d1`-style + one-shot wrappers. 36 new entry points that allocate a plan, set points, + execute, and destroy in a single call; inputs are device pointers. (Barbone) * Added `threadsafe_execute` regression test verifying concurrent `execute()` calls on the same plan produce correct results. Added sanitizer mode selection via `FINUFFT_USE_SANITIZERS=OFF|ON|MEMSAN|TSAN`, and extended the sanitizer diff --git a/docs/c_gpu.rst b/docs/c_gpu.rst index fb776cbc0..215de2867 100644 --- a/docs/c_gpu.rst +++ b/docs/c_gpu.rst @@ -19,6 +19,67 @@ You will also want to read the examples in ``examples/cuda`` and ``test/cuda/cuf *Note*: The interface to cuFINUFFT has changed between versions 1.3 and 2.2. Please see :ref:`Migration to cuFINUFFT v2.2` for details. +Simple interface +---------------- + +For users who only need to perform a single transform per plan, cuFINUFFT +provides one-shot wrappers that combine all four plan steps (make, setpts, +execute, destroy) into a single call. These mirror the CPU ``finufft1d1`` +family (see :ref:`c`) and follow the same naming and argument order; the +only difference is that all input/output array pointers refer to memory on +the GPU. They have double (``cufinufft``) and single (``cufinufftf``) +precision versions, which we document together. + +For each dimension ``d`` (1, 2, or 3) and transform type ``t`` (1, 2, or 3), +there are four entry points:: + + cufinufft{d}d{t}(...); // double precision, single transform + cufinufftf{d}d{t}(...); // single precision, single transform + cufinufft{d}d{t}many(n_transf, ...); // double precision, many transforms + cufinufftf{d}d{t}many(n_transf, ...); // single precision, many transforms + +This gives 36 entry points in total. The full prototypes are declared in +``include/cufinufft.h``. As an example, the 1D type-1 simple call in single +precision is: + +.. code-block:: c + + int cufinufftf1d1(int64_t nj, const float *xj, const cuFloatComplex *cj, + int iflag, float eps, int64_t ms, cuFloatComplex *fk, + const cufinufft_opts *opts); + +Inputs: + +* ``nj`` — number of nonuniform points +* ``xj`` — length-``nj`` device array of NU point coordinates (in :math:`[-\pi,\pi)`, + values outside are folded) +* ``cj`` — length-``nj`` device array of NU strengths +* ``iflag`` — sign in the complex exponential (>=0 for +, <0 for -) +* ``eps`` — requested relative tolerance +* ``ms`` — number of Fourier modes +* ``fk`` — length-``ms`` device array, output +* ``opts`` — optional options pointer (NULL for defaults) + +Returns ``0`` on success, otherwise an error code (see ``finufft_errors.h``). + +A complete example using the simple interface in place of the 4-step calls +from the :ref:`Getting started ` walkthrough below is: + +.. code-block:: c + + cufinufftf1d1(M, d_x, d_c, 1, 1e-6f, N, d_f, NULL); + +This single call replaces the ``makeplan``/``setpts``/``execute``/``destroy`` +sequence. Type-3 calls also take frequency-target arrays (``s`` for 1D, +``s,t`` for 2D, ``s,t,u`` for 3D), exactly as in the CPU API. The ``many`` +variants take an ``n_transf`` parameter as the first argument and use the +same set of NU points for ``n_transf`` consecutive transforms (with input +and output arrays sized accordingly). + +The simple interface is intended for cases where the plan would otherwise be +used exactly once. When the same NU points are reused across several +transforms, use the 4-step plan interface below for better performance. + Getting started --------------- diff --git a/include/cufinufft.h b/include/cufinufft.h index 954ffae00..da9f419f6 100644 --- a/include/cufinufft.h +++ b/include/cufinufft.h @@ -36,6 +36,154 @@ FINUFFT_EXPORT int cufinufftf_execute(cufinufftf_plan d_plan, cuFloatComplex *d_ FINUFFT_EXPORT int cufinufft_destroy(cufinufft_plan d_plan); FINUFFT_EXPORT int cufinufftf_destroy(cufinufftf_plan d_plan); + +// Simple (one-shot) interfaces. Pointers are device pointers. Behavior matches +// the 4-step plan API above. 36 entry points (3 dims x 3 types x {single,many} +// x {double,float}). + +// Dimension 1111111111111111111111111111111111111111111111111111111111111111 +FINUFFT_EXPORT int cufinufft1d1many( + int n_transf, int64_t nj, const double *xj, const cuDoubleComplex *cj, int iflag, + double eps, int64_t ms, cuDoubleComplex *fk, const cufinufft_opts *opts); +FINUFFT_EXPORT int cufinufftf1d1many( + int n_transf, int64_t nj, const float *xj, const cuFloatComplex *cj, int iflag, + float eps, int64_t ms, cuFloatComplex *fk, const cufinufft_opts *opts); +FINUFFT_EXPORT int cufinufft1d1(int64_t nj, const double *xj, const cuDoubleComplex *cj, + int iflag, double eps, int64_t ms, cuDoubleComplex *fk, + const cufinufft_opts *opts); +FINUFFT_EXPORT int cufinufftf1d1(int64_t nj, const float *xj, const cuFloatComplex *cj, + int iflag, float eps, int64_t ms, cuFloatComplex *fk, + const cufinufft_opts *opts); + +FINUFFT_EXPORT int cufinufft1d2many( + int n_transf, int64_t nj, const double *xj, cuDoubleComplex *cj, int iflag, + double eps, int64_t ms, const cuDoubleComplex *fk, const cufinufft_opts *opts); +FINUFFT_EXPORT int cufinufftf1d2many( + int n_transf, int64_t nj, const float *xj, cuFloatComplex *cj, int iflag, float eps, + int64_t ms, const cuFloatComplex *fk, const cufinufft_opts *opts); +FINUFFT_EXPORT int cufinufft1d2(int64_t nj, const double *xj, cuDoubleComplex *cj, + int iflag, double eps, int64_t ms, + const cuDoubleComplex *fk, const cufinufft_opts *opts); +FINUFFT_EXPORT int cufinufftf1d2(int64_t nj, const float *xj, cuFloatComplex *cj, + int iflag, float eps, int64_t ms, + const cuFloatComplex *fk, const cufinufft_opts *opts); + +FINUFFT_EXPORT int cufinufft1d3many(int n_transf, int64_t nj, const double *xj, + const cuDoubleComplex *cj, int iflag, double eps, + int64_t nk, const double *s, cuDoubleComplex *fk, + const cufinufft_opts *opts); +FINUFFT_EXPORT int cufinufftf1d3many(int n_transf, int64_t nj, const float *xj, + const cuFloatComplex *cj, int iflag, float eps, + int64_t nk, const float *s, cuFloatComplex *fk, + const cufinufft_opts *opts); +FINUFFT_EXPORT int cufinufft1d3(int64_t nj, const double *xj, const cuDoubleComplex *cj, + int iflag, double eps, int64_t nk, const double *s, + cuDoubleComplex *fk, const cufinufft_opts *opts); +FINUFFT_EXPORT int cufinufftf1d3(int64_t nj, const float *xj, const cuFloatComplex *cj, + int iflag, float eps, int64_t nk, const float *s, + cuFloatComplex *fk, const cufinufft_opts *opts); + +// Dimension 22222222222222222222222222222222222222222222222222222222222222222 +FINUFFT_EXPORT int cufinufft2d1many(int n_transf, int64_t nj, const double *xj, + const double *yj, const cuDoubleComplex *cj, + int iflag, double eps, int64_t ms, int64_t mt, + cuDoubleComplex *fk, const cufinufft_opts *opts); +FINUFFT_EXPORT int cufinufftf2d1many(int n_transf, int64_t nj, const float *xj, + const float *yj, const cuFloatComplex *cj, int iflag, + float eps, int64_t ms, int64_t mt, + cuFloatComplex *fk, const cufinufft_opts *opts); +FINUFFT_EXPORT int cufinufft2d1( + int64_t nj, const double *xj, const double *yj, const cuDoubleComplex *cj, int iflag, + double eps, int64_t ms, int64_t mt, cuDoubleComplex *fk, const cufinufft_opts *opts); +FINUFFT_EXPORT int cufinufftf2d1( + int64_t nj, const float *xj, const float *yj, const cuFloatComplex *cj, int iflag, + float eps, int64_t ms, int64_t mt, cuFloatComplex *fk, const cufinufft_opts *opts); + +FINUFFT_EXPORT int cufinufft2d2many( + int n_transf, int64_t nj, const double *xj, const double *yj, cuDoubleComplex *cj, + int iflag, double eps, int64_t ms, int64_t mt, const cuDoubleComplex *fk, + const cufinufft_opts *opts); +FINUFFT_EXPORT int cufinufftf2d2many( + int n_transf, int64_t nj, const float *xj, const float *yj, cuFloatComplex *cj, + int iflag, float eps, int64_t ms, int64_t mt, const cuFloatComplex *fk, + const cufinufft_opts *opts); +FINUFFT_EXPORT int cufinufft2d2(int64_t nj, const double *xj, const double *yj, + cuDoubleComplex *cj, int iflag, double eps, int64_t ms, + int64_t mt, const cuDoubleComplex *fk, + const cufinufft_opts *opts); +FINUFFT_EXPORT int cufinufftf2d2(int64_t nj, const float *xj, const float *yj, + cuFloatComplex *cj, int iflag, float eps, int64_t ms, + int64_t mt, const cuFloatComplex *fk, + const cufinufft_opts *opts); + +FINUFFT_EXPORT int cufinufft2d3many( + int n_transf, int64_t nj, const double *xj, const double *yj, + const cuDoubleComplex *cj, int iflag, double eps, int64_t nk, const double *s, + const double *t, cuDoubleComplex *fk, const cufinufft_opts *opts); +FINUFFT_EXPORT int cufinufftf2d3many( + int n_transf, int64_t nj, const float *xj, const float *yj, const cuFloatComplex *cj, + int iflag, float eps, int64_t nk, const float *s, const float *t, cuFloatComplex *fk, + const cufinufft_opts *opts); +FINUFFT_EXPORT int cufinufft2d3(int64_t nj, const double *xj, const double *yj, + const cuDoubleComplex *cj, int iflag, double eps, + int64_t nk, const double *s, const double *t, + cuDoubleComplex *fk, const cufinufft_opts *opts); +FINUFFT_EXPORT int cufinufftf2d3(int64_t nj, const float *xj, const float *yj, + const cuFloatComplex *cj, int iflag, float eps, + int64_t nk, const float *s, const float *t, + cuFloatComplex *fk, const cufinufft_opts *opts); + +// Dimension 3333333333333333333333333333333333333333333333333333333333333333 +FINUFFT_EXPORT int cufinufft3d1many( + int n_transf, int64_t nj, const double *xj, const double *yj, const double *zj, + const cuDoubleComplex *cj, int iflag, double eps, int64_t ms, int64_t mt, int64_t mu, + cuDoubleComplex *fk, const cufinufft_opts *opts); +FINUFFT_EXPORT int cufinufftf3d1many( + int n_transf, int64_t nj, const float *xj, const float *yj, const float *zj, + const cuFloatComplex *cj, int iflag, float eps, int64_t ms, int64_t mt, int64_t mu, + cuFloatComplex *fk, const cufinufft_opts *opts); +FINUFFT_EXPORT int cufinufft3d1(int64_t nj, const double *xj, const double *yj, + const double *zj, const cuDoubleComplex *cj, int iflag, + double eps, int64_t ms, int64_t mt, int64_t mu, + cuDoubleComplex *fk, const cufinufft_opts *opts); +FINUFFT_EXPORT int cufinufftf3d1(int64_t nj, const float *xj, const float *yj, + const float *zj, const cuFloatComplex *cj, int iflag, + float eps, int64_t ms, int64_t mt, int64_t mu, + cuFloatComplex *fk, const cufinufft_opts *opts); + +FINUFFT_EXPORT int cufinufft3d2many( + int n_transf, int64_t nj, const double *xj, const double *yj, const double *zj, + cuDoubleComplex *cj, int iflag, double eps, int64_t ms, int64_t mt, int64_t mu, + const cuDoubleComplex *fk, const cufinufft_opts *opts); +FINUFFT_EXPORT int cufinufftf3d2many( + int n_transf, int64_t nj, const float *xj, const float *yj, const float *zj, + cuFloatComplex *cj, int iflag, float eps, int64_t ms, int64_t mt, int64_t mu, + const cuFloatComplex *fk, const cufinufft_opts *opts); +FINUFFT_EXPORT int cufinufft3d2(int64_t nj, const double *xj, const double *yj, + const double *zj, cuDoubleComplex *cj, int iflag, + double eps, int64_t ms, int64_t mt, int64_t mu, + const cuDoubleComplex *fk, const cufinufft_opts *opts); +FINUFFT_EXPORT int cufinufftf3d2(int64_t nj, const float *xj, const float *yj, + const float *zj, cuFloatComplex *cj, int iflag, + float eps, int64_t ms, int64_t mt, int64_t mu, + const cuFloatComplex *fk, const cufinufft_opts *opts); + +FINUFFT_EXPORT int cufinufft3d3many( + int n_transf, int64_t nj, const double *xj, const double *yj, const double *zj, + const cuDoubleComplex *cj, int iflag, double eps, int64_t nk, const double *s, + const double *t, const double *u, cuDoubleComplex *fk, const cufinufft_opts *opts); +FINUFFT_EXPORT int cufinufftf3d3many( + int n_transf, int64_t nj, const float *xj, const float *yj, const float *zj, + const cuFloatComplex *cj, int iflag, float eps, int64_t nk, const float *s, + const float *t, const float *u, cuFloatComplex *fk, const cufinufft_opts *opts); +FINUFFT_EXPORT int cufinufft3d3( + int64_t nj, const double *xj, const double *yj, const double *zj, + const cuDoubleComplex *cj, int iflag, double eps, int64_t nk, const double *s, + const double *t, const double *u, cuDoubleComplex *fk, const cufinufft_opts *opts); +FINUFFT_EXPORT int cufinufftf3d3( + int64_t nj, const float *xj, const float *yj, const float *zj, + const cuFloatComplex *cj, int iflag, float eps, int64_t nk, const float *s, + const float *t, const float *u, cuFloatComplex *fk, const cufinufft_opts *opts); #ifdef __cplusplus } #endif diff --git a/src/cuda/c_interface.cpp b/src/cuda/c_interface.cpp index 8af543002..9f7e6e0df 100644 --- a/src/cuda/c_interface.cpp +++ b/src/cuda/c_interface.cpp @@ -1,7 +1,9 @@ +#include #include #include #include #include +#include #include #include @@ -170,4 +172,296 @@ void cufinufft_default_opts(cufinufft_opts *opts) opts->gpu_stream = cudaStreamDefault; // sphinx tag (don't remove): @gpu_defopts_end } +} // extern "C" + +/* --------------------------------------------------------------------------- + The 18 simple interfaces (= 3 dims * 3 types * {singlecall,many}) to + cuFINUFFT, mirroring the CPU c_interface.cpp layout. Pointers are device + pointers; behavior mirrors the 4-step plan API above. + --------------------------------------------------------------------------- */ + +namespace { // helpers local to this TU + +using i64 = int64_t; +using ci64 = const int64_t; + +template +int simple_guru( + int n_dims, int type, int n_transf, i64 nj, const std::array &xyz, + cuda_complex *cj, int iflag, T eps, const std::array &n_modes, i64 nk, + const std::array &stu, cuda_complex *fk, const cufinufft_opts *popts) +// Helper layer between simple interfaces and the underlying plan methods. +// Plan is heap-allocated (it owns CUDA streams / cuFFT plans / device buffers +// whose destruction order matters) and managed via unique_ptr so error paths +// still free GPU resources. +{ + if (n_dims < 1 || n_dims > 3) return FINUFFT_ERR_DIM_NOTVALID; + if (nj > std::numeric_limits::max()) return FINUFFT_ERR_NDATA_NOTVALID; + if (nk > std::numeric_limits::max()) return FINUFFT_ERR_NDATA_NOTVALID; + + int nmodes32[3]; + if (is_invalid_mode_array(type, n_dims, n_modes.data(), nmodes32)) + return FINUFFT_ERR_NDATA_NOTVALID; + + cufinufft_opts planopts; + if (popts) + planopts = *popts; + else + cufinufft_default_opts(&planopts); + + auto plan = std::make_unique>(type, n_dims, nmodes32, iflag, + n_transf, eps, planopts); + const int warn = plan->eps_too_small ? FINUFFT_WARN_EPS_TOO_SMALL : 0; + plan->setpts((int)nj, xyz[0], xyz[1], xyz[2], (int)nk, stu[0], stu[1], stu[2]); + plan->execute(cj, fk); + return warn; +} + +template +int guru13(int n_dims, int type, int n_transf, i64 nj, + const std::array &xyz, const cuda_complex *cj, int iflag, + T eps, const std::array &n_modes, i64 nk, + const std::array &stu, cuda_complex *fk, + const cufinufft_opts *popts) { + return safe_finufft_call([&]() -> int { + return simple_guru(n_dims, type, n_transf, nj, xyz, + const_cast *>(cj), iflag, eps, n_modes, nk, stu, + fk, popts); + }); +} + +template +int guru2(int n_dims, int type, int n_transf, i64 nj, const std::array &xyz, + cuda_complex *cj, int iflag, T eps, const std::array &n_modes, + i64 nk, const std::array &stu, const cuda_complex *fk, + const cufinufft_opts *popts) { + return safe_finufft_call([&]() -> int { + return simple_guru(n_dims, type, n_transf, nj, xyz, cj, iflag, eps, n_modes, nk, + stu, const_cast *>(fk), popts); + }); +} + +} // anonymous namespace + +extern "C" { + +// Dimension 1111111111111111111111111111111111111111111111111111111111111111 + +int cufinufft1d1many(int n_transf, i64 nj, const double *xj, const cuDoubleComplex *cj, + int iflag, double eps, i64 ms, cuDoubleComplex *fk, + const cufinufft_opts *opts) { + return guru13(1, 1, n_transf, nj, {xj}, cj, iflag, eps, {ms, 1, 1}, 0, {}, fk, + opts); +} +int cufinufftf1d1many(int n_transf, i64 nj, const float *xj, const cuFloatComplex *cj, + int iflag, float eps, i64 ms, cuFloatComplex *fk, + const cufinufft_opts *opts) { + return guru13(1, 1, n_transf, nj, {xj}, cj, iflag, eps, {ms, 1, 1}, 0, {}, fk, + opts); +} +int cufinufft1d1(i64 nj, const double *xj, const cuDoubleComplex *cj, int iflag, + double eps, i64 ms, cuDoubleComplex *fk, const cufinufft_opts *opts) { + return cufinufft1d1many(1, nj, xj, cj, iflag, eps, ms, fk, opts); +} +int cufinufftf1d1(i64 nj, const float *xj, const cuFloatComplex *cj, int iflag, float eps, + i64 ms, cuFloatComplex *fk, const cufinufft_opts *opts) { + return cufinufftf1d1many(1, nj, xj, cj, iflag, eps, ms, fk, opts); +} + +int cufinufft1d2many(int n_transf, i64 nj, const double *xj, cuDoubleComplex *cj, + int iflag, double eps, i64 ms, const cuDoubleComplex *fk, + const cufinufft_opts *opts) { + return guru2(1, 2, n_transf, nj, {xj}, cj, iflag, eps, {ms, 1, 1}, 0, {}, fk, + opts); +} +int cufinufftf1d2many(int n_transf, i64 nj, const float *xj, cuFloatComplex *cj, + int iflag, float eps, i64 ms, const cuFloatComplex *fk, + const cufinufft_opts *opts) { + return guru2(1, 2, n_transf, nj, {xj}, cj, iflag, eps, {ms, 1, 1}, 0, {}, fk, + opts); +} +int cufinufft1d2(i64 nj, const double *xj, cuDoubleComplex *cj, int iflag, double eps, + i64 ms, const cuDoubleComplex *fk, const cufinufft_opts *opts) { + return cufinufft1d2many(1, nj, xj, cj, iflag, eps, ms, fk, opts); +} +int cufinufftf1d2(i64 nj, const float *xj, cuFloatComplex *cj, int iflag, float eps, + i64 ms, const cuFloatComplex *fk, const cufinufft_opts *opts) { + return cufinufftf1d2many(1, nj, xj, cj, iflag, eps, ms, fk, opts); +} + +int cufinufft1d3many(int n_transf, i64 nj, const double *xj, const cuDoubleComplex *cj, + int iflag, double eps, i64 nk, const double *s, cuDoubleComplex *fk, + const cufinufft_opts *opts) { + return guru13(1, 3, n_transf, nj, {xj}, cj, iflag, eps, {0, 0, 0}, nk, {s}, fk, + opts); +} +int cufinufftf1d3many(int n_transf, i64 nj, const float *xj, const cuFloatComplex *cj, + int iflag, float eps, i64 nk, const float *s, cuFloatComplex *fk, + const cufinufft_opts *opts) { + return guru13(1, 3, n_transf, nj, {xj}, cj, iflag, eps, {0, 0, 0}, nk, {s}, fk, + opts); +} +int cufinufft1d3(i64 nj, const double *xj, const cuDoubleComplex *cj, int iflag, + double eps, i64 nk, const double *s, cuDoubleComplex *fk, + const cufinufft_opts *opts) { + return cufinufft1d3many(1, nj, xj, cj, iflag, eps, nk, s, fk, opts); } +int cufinufftf1d3(i64 nj, const float *xj, const cuFloatComplex *cj, int iflag, float eps, + i64 nk, const float *s, cuFloatComplex *fk, + const cufinufft_opts *opts) { + return cufinufftf1d3many(1, nj, xj, cj, iflag, eps, nk, s, fk, opts); +} + +// Dimension 22222222222222222222222222222222222222222222222222222222222222222 + +int cufinufft2d1many(int n_transf, i64 nj, const double *xj, const double *yj, + const cuDoubleComplex *cj, int iflag, double eps, i64 ms, i64 mt, + cuDoubleComplex *fk, const cufinufft_opts *opts) { + return guru13(2, 1, n_transf, nj, {xj, yj}, cj, iflag, eps, {ms, mt, 1}, 0, {}, + fk, opts); +} +int cufinufftf2d1many(int n_transf, i64 nj, const float *xj, const float *yj, + const cuFloatComplex *cj, int iflag, float eps, i64 ms, i64 mt, + cuFloatComplex *fk, const cufinufft_opts *opts) { + return guru13(2, 1, n_transf, nj, {xj, yj}, cj, iflag, eps, {ms, mt, 1}, 0, {}, + fk, opts); +} +int cufinufft2d1(i64 nj, const double *xj, const double *yj, const cuDoubleComplex *cj, + int iflag, double eps, i64 ms, i64 mt, cuDoubleComplex *fk, + const cufinufft_opts *opts) { + return cufinufft2d1many(1, nj, xj, yj, cj, iflag, eps, ms, mt, fk, opts); +} +int cufinufftf2d1(i64 nj, const float *xj, const float *yj, const cuFloatComplex *cj, + int iflag, float eps, i64 ms, i64 mt, cuFloatComplex *fk, + const cufinufft_opts *opts) { + return cufinufftf2d1many(1, nj, xj, yj, cj, iflag, eps, ms, mt, fk, opts); +} + +int cufinufft2d2many(int n_transf, i64 nj, const double *xj, const double *yj, + cuDoubleComplex *cj, int iflag, double eps, i64 ms, i64 mt, + const cuDoubleComplex *fk, const cufinufft_opts *opts) { + return guru2(2, 2, n_transf, nj, {xj, yj}, cj, iflag, eps, {ms, mt, 1}, 0, {}, + fk, opts); +} +int cufinufftf2d2many(int n_transf, i64 nj, const float *xj, const float *yj, + cuFloatComplex *cj, int iflag, float eps, i64 ms, i64 mt, + const cuFloatComplex *fk, const cufinufft_opts *opts) { + return guru2(2, 2, n_transf, nj, {xj, yj}, cj, iflag, eps, {ms, mt, 1}, 0, {}, + fk, opts); +} +int cufinufft2d2(i64 nj, const double *xj, const double *yj, cuDoubleComplex *cj, + int iflag, double eps, i64 ms, i64 mt, const cuDoubleComplex *fk, + const cufinufft_opts *opts) { + return cufinufft2d2many(1, nj, xj, yj, cj, iflag, eps, ms, mt, fk, opts); +} +int cufinufftf2d2(i64 nj, const float *xj, const float *yj, cuFloatComplex *cj, int iflag, + float eps, i64 ms, i64 mt, const cuFloatComplex *fk, + const cufinufft_opts *opts) { + return cufinufftf2d2many(1, nj, xj, yj, cj, iflag, eps, ms, mt, fk, opts); +} + +int cufinufft2d3many(int n_transf, i64 nj, const double *xj, const double *yj, + const cuDoubleComplex *cj, int iflag, double eps, i64 nk, + const double *s, const double *t, cuDoubleComplex *fk, + const cufinufft_opts *opts) { + return guru13(2, 3, n_transf, nj, {xj, yj}, cj, iflag, eps, {0, 0, 0}, nk, + {s, t}, fk, opts); +} +int cufinufftf2d3many(int n_transf, i64 nj, const float *xj, const float *yj, + const cuFloatComplex *cj, int iflag, float eps, i64 nk, + const float *s, const float *t, cuFloatComplex *fk, + const cufinufft_opts *opts) { + return guru13(2, 3, n_transf, nj, {xj, yj}, cj, iflag, eps, {0, 0, 0}, nk, + {s, t}, fk, opts); +} +int cufinufft2d3(i64 nj, const double *xj, const double *yj, const cuDoubleComplex *cj, + int iflag, double eps, i64 nk, const double *s, const double *t, + cuDoubleComplex *fk, const cufinufft_opts *opts) { + return cufinufft2d3many(1, nj, xj, yj, cj, iflag, eps, nk, s, t, fk, opts); +} +int cufinufftf2d3(i64 nj, const float *xj, const float *yj, const cuFloatComplex *cj, + int iflag, float eps, i64 nk, const float *s, const float *t, + cuFloatComplex *fk, const cufinufft_opts *opts) { + return cufinufftf2d3many(1, nj, xj, yj, cj, iflag, eps, nk, s, t, fk, opts); +} + +// Dimension 3333333333333333333333333333333333333333333333333333333333333333 + +int cufinufft3d1many(int n_transf, i64 nj, const double *xj, const double *yj, + const double *zj, const cuDoubleComplex *cj, int iflag, double eps, + i64 ms, i64 mt, i64 mu, cuDoubleComplex *fk, + const cufinufft_opts *opts) { + return guru13(3, 1, n_transf, nj, {xj, yj, zj}, cj, iflag, eps, {ms, mt, mu}, 0, + {}, fk, opts); +} +int cufinufftf3d1many(int n_transf, i64 nj, const float *xj, const float *yj, + const float *zj, const cuFloatComplex *cj, int iflag, float eps, + i64 ms, i64 mt, i64 mu, cuFloatComplex *fk, + const cufinufft_opts *opts) { + return guru13(3, 1, n_transf, nj, {xj, yj, zj}, cj, iflag, eps, {ms, mt, mu}, 0, + {}, fk, opts); +} +int cufinufft3d1(i64 nj, const double *xj, const double *yj, const double *zj, + const cuDoubleComplex *cj, int iflag, double eps, i64 ms, i64 mt, i64 mu, + cuDoubleComplex *fk, const cufinufft_opts *opts) { + return cufinufft3d1many(1, nj, xj, yj, zj, cj, iflag, eps, ms, mt, mu, fk, opts); +} +int cufinufftf3d1(i64 nj, const float *xj, const float *yj, const float *zj, + const cuFloatComplex *cj, int iflag, float eps, i64 ms, i64 mt, i64 mu, + cuFloatComplex *fk, const cufinufft_opts *opts) { + return cufinufftf3d1many(1, nj, xj, yj, zj, cj, iflag, eps, ms, mt, mu, fk, opts); +} + +int cufinufft3d2many(int n_transf, i64 nj, const double *xj, const double *yj, + const double *zj, cuDoubleComplex *cj, int iflag, double eps, i64 ms, + i64 mt, i64 mu, const cuDoubleComplex *fk, + const cufinufft_opts *opts) { + return guru2(3, 2, n_transf, nj, {xj, yj, zj}, cj, iflag, eps, {ms, mt, mu}, 0, + {}, fk, opts); +} +int cufinufftf3d2many(int n_transf, i64 nj, const float *xj, const float *yj, + const float *zj, cuFloatComplex *cj, int iflag, float eps, i64 ms, + i64 mt, i64 mu, const cuFloatComplex *fk, + const cufinufft_opts *opts) { + return guru2(3, 2, n_transf, nj, {xj, yj, zj}, cj, iflag, eps, {ms, mt, mu}, 0, + {}, fk, opts); +} +int cufinufft3d2(i64 nj, const double *xj, const double *yj, const double *zj, + cuDoubleComplex *cj, int iflag, double eps, i64 ms, i64 mt, i64 mu, + const cuDoubleComplex *fk, const cufinufft_opts *opts) { + return cufinufft3d2many(1, nj, xj, yj, zj, cj, iflag, eps, ms, mt, mu, fk, opts); +} +int cufinufftf3d2(i64 nj, const float *xj, const float *yj, const float *zj, + cuFloatComplex *cj, int iflag, float eps, i64 ms, i64 mt, i64 mu, + const cuFloatComplex *fk, const cufinufft_opts *opts) { + return cufinufftf3d2many(1, nj, xj, yj, zj, cj, iflag, eps, ms, mt, mu, fk, opts); +} + +int cufinufft3d3many(int n_transf, i64 nj, const double *xj, const double *yj, + const double *zj, const cuDoubleComplex *cj, int iflag, double eps, + i64 nk, const double *s, const double *t, const double *u, + cuDoubleComplex *fk, const cufinufft_opts *opts) { + return guru13(3, 3, n_transf, nj, {xj, yj, zj}, cj, iflag, eps, {0, 0, 0}, nk, + {s, t, u}, fk, opts); +} +int cufinufftf3d3many(int n_transf, i64 nj, const float *xj, const float *yj, + const float *zj, const cuFloatComplex *cj, int iflag, float eps, + i64 nk, const float *s, const float *t, const float *u, + cuFloatComplex *fk, const cufinufft_opts *opts) { + return guru13(3, 3, n_transf, nj, {xj, yj, zj}, cj, iflag, eps, {0, 0, 0}, nk, + {s, t, u}, fk, opts); +} +int cufinufft3d3(i64 nj, const double *xj, const double *yj, const double *zj, + const cuDoubleComplex *cj, int iflag, double eps, i64 nk, + const double *s, const double *t, const double *u, cuDoubleComplex *fk, + const cufinufft_opts *opts) { + return cufinufft3d3many(1, nj, xj, yj, zj, cj, iflag, eps, nk, s, t, u, fk, opts); +} +int cufinufftf3d3(i64 nj, const float *xj, const float *yj, const float *zj, + const cuFloatComplex *cj, int iflag, float eps, i64 nk, const float *s, + const float *t, const float *u, cuFloatComplex *fk, + const cufinufft_opts *opts) { + return cufinufftf3d3many(1, nj, xj, yj, zj, cj, iflag, eps, nk, s, t, u, fk, opts); +} + +} // extern "C" diff --git a/test/cuda/CMakeLists.txt b/test/cuda/CMakeLists.txt index e76dcf068..ff82b20c1 100644 --- a/test/cuda/CMakeLists.txt +++ b/test/cuda/CMakeLists.txt @@ -223,6 +223,7 @@ add_test(NAME cufinufft_public_api COMMAND public_api_test) add_test(NAME cufinufft_makeplan COMMAND test_makeplan) add_test(NAME cufinufft_error_handling COMMAND cufinufft_error_handling) add_test(NAME cufinufft_math_test COMMAND cufinufft_math_test) +add_test(NAME cufinufft_simple_test COMMAND cufinufft_simple_test) add_test( NAME cufinufft1dspreadinterponly_double_test COMMAND cufinufft1dspreadinterponly_test 1e3 1e6 1e-6 1e-5 d 2.00 diff --git a/test/cuda/cufinufft_simple_test.cu b/test/cuda/cufinufft_simple_test.cu new file mode 100644 index 000000000..25177e872 --- /dev/null +++ b/test/cuda/cufinufft_simple_test.cu @@ -0,0 +1,372 @@ +// Cross-check the C-level simple cuFINUFFT API against the 4-step plan API. +// +// For each (dim, type) pair we run the simple call and the equivalent 4-step +// plan path on identical inputs and require the device output to be bit-for-bit +// identical. Both float and double precision are exercised. + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +namespace { + +#define CUDA_CHECK(call) \ + do { \ + cudaError_t e = (call); \ + if (e != cudaSuccess) { \ + std::fprintf(stderr, "CUDA error %s at %s:%d\n", cudaGetErrorString(e), __FILE__, \ + __LINE__); \ + return 1; \ + } \ + } while (0) + +#define CHECK_RET(expr, name) \ + do { \ + int _r = (expr); \ + if (_r != 0) { \ + std::fprintf(stderr, "%s failed: %d\n", name, _r); \ + return {}; \ + } \ + } while (0) + +template struct Traits; +template<> struct Traits { + using cuc = cuFloatComplex; + static constexpr const char *label = "float"; +}; +template<> struct Traits { + using cuc = cuDoubleComplex; + static constexpr const char *label = "double"; +}; + +template T *dev_alloc(size_t n) { + T *p = nullptr; + cudaMalloc(reinterpret_cast(&p), n * sizeof(T)); + return p; +} + +template void dev_copy_in(T *dst, const std::vector &src) { + cudaMemcpy(dst, src.data(), src.size() * sizeof(T), cudaMemcpyHostToDevice); +} + +template std::vector dev_copy_out(const T *src, size_t n) { + std::vector v(n); + cudaMemcpy(v.data(), src, n * sizeof(T), cudaMemcpyDeviceToHost); + return v; +} + +// Approximate equality. GPU spreading uses atomicAdd which is not bit- +// deterministic for float, so two independent runs of the same type-1 / type-3 +// transform can differ by a few ULPs even though the algorithm is the same. +// Type-2 paths (interpolation only, no atomics) compare bit-identically; we +// allow a small relative tolerance uniformly to keep the test simple. +template +bool near_equal(const std::vector> &a, + const std::vector> &b, T rtol) { + if (a.size() != b.size()) return false; + T worst = 0; + for (size_t i = 0; i < a.size(); ++i) { + const T num = std::abs(a[i] - b[i]); + const T den = std::max(std::abs(a[i]), std::abs(b[i])); + const T r = den > 0 ? num / den : num; + if (r > worst) worst = r; + } + if (worst > rtol) { + std::fprintf(stderr, " worst rel error = %g (tol %g)\n", double(worst), + double(rtol)); + return false; + } + return true; +} + +template struct Fixture { + using cuc = typename Traits::cuc; + int dim, type; + int64_t M; + int64_t N1, N2, N3, Nk; // Nk = nk for type 3 + std::vector x, y, z; + std::vector s, t, u; + // For type 1/3 c is input strengths; for type 2 c is output. + std::vector> c_in; + std::vector> fk_in; + + size_t c_count() const { return static_cast(M); } + size_t fk_count() const { + if (type == 3) return static_cast(Nk); + return static_cast(N1) * N2 * N3; + } +}; + +template Fixture make_fixture(int dim, int type, std::mt19937 &rng) { + Fixture f; + f.dim = dim; + f.type = type; + f.M = 137; + f.N1 = 16; + f.N2 = (dim >= 2) ? 12 : 1; + f.N3 = (dim >= 3) ? 8 : 1; + f.Nk = 23; + std::uniform_real_distribution uni(T(-3.14), T(3.14)); + std::uniform_real_distribution uni_s(T(-2), T(2)); + std::uniform_real_distribution uni_c(T(-1), T(1)); + f.x.resize(f.M); + if (dim >= 2) f.y.resize(f.M); + if (dim >= 3) f.z.resize(f.M); + for (int64_t j = 0; j < f.M; ++j) { + f.x[j] = uni(rng); + if (dim >= 2) f.y[j] = uni(rng); + if (dim >= 3) f.z[j] = uni(rng); + } + if (type == 3) { + f.s.resize(f.Nk); + if (dim >= 2) f.t.resize(f.Nk); + if (dim >= 3) f.u.resize(f.Nk); + for (int64_t k = 0; k < f.Nk; ++k) { + f.s[k] = uni_s(rng); + if (dim >= 2) f.t[k] = uni_s(rng); + if (dim >= 3) f.u[k] = uni_s(rng); + } + } + if (type == 1 || type == 3) { + f.c_in.resize(f.M); + for (auto &z : f.c_in) z = {uni_c(rng), uni_c(rng)}; + } else { + f.fk_in.resize(f.fk_count()); + for (auto &z : f.fk_in) z = {uni_c(rng), uni_c(rng)}; + } + return f; +} + +// 4-step path. Returns host vector of output complex values. +template std::vector> run_plan(const Fixture &f) { + using cuc = typename Traits::cuc; + T *d_x = nullptr, *d_y = nullptr, *d_z = nullptr; + T *d_s = nullptr, *d_t = nullptr, *d_u = nullptr; + cuc *d_c = nullptr, *d_fk = nullptr; + d_x = dev_alloc(f.M); + dev_copy_in(d_x, f.x); + if (f.dim >= 2) { + d_y = dev_alloc(f.M); + dev_copy_in(d_y, f.y); + } + if (f.dim >= 3) { + d_z = dev_alloc(f.M); + dev_copy_in(d_z, f.z); + } + if (f.type == 3) { + d_s = dev_alloc(f.Nk); + dev_copy_in(d_s, f.s); + if (f.dim >= 2) { + d_t = dev_alloc(f.Nk); + dev_copy_in(d_t, f.t); + } + if (f.dim >= 3) { + d_u = dev_alloc(f.Nk); + dev_copy_in(d_u, f.u); + } + } + d_c = dev_alloc(f.c_count()); + d_fk = dev_alloc(f.fk_count()); + + if (f.type == 1 || f.type == 3) { + cudaMemcpy(d_c, f.c_in.data(), f.c_count() * sizeof(cuc), cudaMemcpyHostToDevice); + } else { + cudaMemcpy(d_fk, f.fk_in.data(), f.fk_count() * sizeof(cuc), cudaMemcpyHostToDevice); + } + + const int64_t nmodes[3] = {f.N1, f.N2, f.N3}; + const int Nk32 = (f.type == 3) ? static_cast(f.Nk) : 0; + std::vector> out; + if constexpr (std::is_same_v) { + cufinufftf_plan plan{}; + CHECK_RET(cufinufftf_makeplan(f.type, f.dim, nmodes, 1, 1, T(1e-5), &plan, nullptr), + "makeplan"); + CHECK_RET(cufinufftf_setpts(plan, f.M, d_x, d_y, d_z, Nk32, d_s, d_t, d_u), "setpts"); + CHECK_RET(cufinufftf_execute(plan, d_c, d_fk), "execute"); + CHECK_RET(cufinufftf_destroy(plan), "destroy"); + } else { + cufinufft_plan plan{}; + CHECK_RET(cufinufft_makeplan(f.type, f.dim, nmodes, 1, 1, T(1e-9), &plan, nullptr), + "makeplan"); + CHECK_RET(cufinufft_setpts(plan, f.M, d_x, d_y, d_z, Nk32, d_s, d_t, d_u), "setpts"); + CHECK_RET(cufinufft_execute(plan, d_c, d_fk), "execute"); + CHECK_RET(cufinufft_destroy(plan), "destroy"); + } + + if (f.type == 2) + out = dev_copy_out(reinterpret_cast *>(d_c), f.c_count()); + else + out = dev_copy_out(reinterpret_cast *>(d_fk), f.fk_count()); + + cudaFree(d_x); + cudaFree(d_y); + cudaFree(d_z); + cudaFree(d_s); + cudaFree(d_t); + cudaFree(d_u); + cudaFree(d_c); + cudaFree(d_fk); + return out; +} + +// Simple-API path. Mirrors run_plan(). +template std::vector> run_simple(const Fixture &f) { + using cuc = typename Traits::cuc; + T *d_x = nullptr, *d_y = nullptr, *d_z = nullptr; + T *d_s = nullptr, *d_t = nullptr, *d_u = nullptr; + cuc *d_c = nullptr, *d_fk = nullptr; + d_x = dev_alloc(f.M); + dev_copy_in(d_x, f.x); + if (f.dim >= 2) { + d_y = dev_alloc(f.M); + dev_copy_in(d_y, f.y); + } + if (f.dim >= 3) { + d_z = dev_alloc(f.M); + dev_copy_in(d_z, f.z); + } + if (f.type == 3) { + d_s = dev_alloc(f.Nk); + dev_copy_in(d_s, f.s); + if (f.dim >= 2) { + d_t = dev_alloc(f.Nk); + dev_copy_in(d_t, f.t); + } + if (f.dim >= 3) { + d_u = dev_alloc(f.Nk); + dev_copy_in(d_u, f.u); + } + } + d_c = dev_alloc(f.c_count()); + d_fk = dev_alloc(f.fk_count()); + + if (f.type == 1 || f.type == 3) { + cudaMemcpy(d_c, f.c_in.data(), f.c_count() * sizeof(cuc), cudaMemcpyHostToDevice); + } else { + cudaMemcpy(d_fk, f.fk_in.data(), f.fk_count() * sizeof(cuc), cudaMemcpyHostToDevice); + } + + const T eps = std::is_same_v ? T(1e-5) : T(1e-9); + int rc = 0; + if constexpr (std::is_same_v) { + if (f.dim == 1) { + if (f.type == 1) + rc = cufinufftf1d1(f.M, d_x, d_c, 1, eps, f.N1, d_fk, nullptr); + else if (f.type == 2) + rc = cufinufftf1d2(f.M, d_x, d_c, 1, eps, f.N1, d_fk, nullptr); + else + rc = cufinufftf1d3(f.M, d_x, d_c, 1, eps, f.Nk, d_s, d_fk, nullptr); + } else if (f.dim == 2) { + if (f.type == 1) + rc = cufinufftf2d1(f.M, d_x, d_y, d_c, 1, eps, f.N1, f.N2, d_fk, nullptr); + else if (f.type == 2) + rc = cufinufftf2d2(f.M, d_x, d_y, d_c, 1, eps, f.N1, f.N2, d_fk, nullptr); + else + rc = cufinufftf2d3(f.M, d_x, d_y, d_c, 1, eps, f.Nk, d_s, d_t, d_fk, nullptr); + } else { + if (f.type == 1) + rc = cufinufftf3d1(f.M, d_x, d_y, d_z, d_c, 1, eps, f.N1, f.N2, f.N3, d_fk, + nullptr); + else if (f.type == 2) + rc = cufinufftf3d2(f.M, d_x, d_y, d_z, d_c, 1, eps, f.N1, f.N2, f.N3, d_fk, + nullptr); + else + rc = cufinufftf3d3(f.M, d_x, d_y, d_z, d_c, 1, eps, f.Nk, d_s, d_t, d_u, d_fk, + nullptr); + } + } else { + if (f.dim == 1) { + if (f.type == 1) + rc = cufinufft1d1(f.M, d_x, d_c, 1, eps, f.N1, d_fk, nullptr); + else if (f.type == 2) + rc = cufinufft1d2(f.M, d_x, d_c, 1, eps, f.N1, d_fk, nullptr); + else + rc = cufinufft1d3(f.M, d_x, d_c, 1, eps, f.Nk, d_s, d_fk, nullptr); + } else if (f.dim == 2) { + if (f.type == 1) + rc = cufinufft2d1(f.M, d_x, d_y, d_c, 1, eps, f.N1, f.N2, d_fk, nullptr); + else if (f.type == 2) + rc = cufinufft2d2(f.M, d_x, d_y, d_c, 1, eps, f.N1, f.N2, d_fk, nullptr); + else + rc = cufinufft2d3(f.M, d_x, d_y, d_c, 1, eps, f.Nk, d_s, d_t, d_fk, nullptr); + } else { + if (f.type == 1) + rc = cufinufft3d1(f.M, d_x, d_y, d_z, d_c, 1, eps, f.N1, f.N2, f.N3, d_fk, + nullptr); + else if (f.type == 2) + rc = cufinufft3d2(f.M, d_x, d_y, d_z, d_c, 1, eps, f.N1, f.N2, f.N3, d_fk, + nullptr); + else + rc = cufinufft3d3(f.M, d_x, d_y, d_z, d_c, 1, eps, f.Nk, d_s, d_t, d_u, d_fk, + nullptr); + } + } + if (rc != 0) { + std::fprintf(stderr, "simple call failed: %d (dim=%d type=%d)\n", rc, f.dim, f.type); + return {}; + } + + std::vector> out; + if (f.type == 2) + out = dev_copy_out(reinterpret_cast *>(d_c), f.c_count()); + else + out = dev_copy_out(reinterpret_cast *>(d_fk), f.fk_count()); + + cudaFree(d_x); + cudaFree(d_y); + cudaFree(d_z); + cudaFree(d_s); + cudaFree(d_t); + cudaFree(d_u); + cudaFree(d_c); + cudaFree(d_fk); + return out; +} + +template int compare_one(int dim, int type, std::mt19937 &rng) { + auto f = make_fixture(dim, type, rng); + auto via_p = run_plan(f); + auto via_s = run_simple(f); + if (via_p.empty() || via_s.empty()) return 1; + // Tolerance: the algorithm is the same; only atomic-add non-determinism + // separates the two runs. Float type-3 chains spreading + FFT + interpolation + // so the error accumulates more; allow a looser bound there. + const T rtol = std::is_same_v ? T(5e-3) : T(1e-9); + if (!near_equal(via_p, via_s, rtol)) { + std::fprintf(stderr, "mismatch %s dim=%d type=%d (n=%zu)\n", Traits::label, dim, + type, via_p.size()); + return 1; + } + std::printf("ok %s dim=%d type=%d (n=%zu)\n", Traits::label, dim, type, + via_p.size()); + return 0; +} + +template int run_all() { + std::mt19937 rng(static_cast(std::is_same_v ? 1 : 2)); + int rc = 0; + for (int dim = 1; dim <= 3; ++dim) { + for (int type = 1; type <= 3; ++type) { + rc |= compare_one(dim, type, rng); + } + } + return rc; +} + +} // namespace + +int main() { + int rc = 0; + rc |= run_all(); + rc |= run_all(); + return rc; +} diff --git a/test/cuda/public_api_test.c b/test/cuda/public_api_test.c index 33f0ab954..757eb680e 100644 --- a/test/cuda/public_api_test.c +++ b/test/cuda/public_api_test.c @@ -182,11 +182,59 @@ int test_double(int M, int N) { return 0; } +// Smoke test for the C-level simple API: same 1D type-1 problem as test_float, +// but using cufinufftf1d1 (one call) instead of the 4-step plan path. Verifies +// the C ABI (this TU is compiled as C, not C++). +int test_simple_float(int M, int N) { + int64_t modes[1] = {N}; + + float *x; + float _Complex *c; + float _Complex *f; + + float *d_x; + cuFloatComplex *d_c, *d_f; + + x = (float *)malloc(M * sizeof(float)); + c = (float _Complex *)malloc(M * sizeof(float _Complex)); + f = (float _Complex *)malloc(N * sizeof(float _Complex)); + + srand(0); + for (int j = 0; j < M; ++j) { + x[j] = 2 * PI * (((float)rand()) / RAND_MAX - 1); + c[j] = + (2 * ((float)rand()) / RAND_MAX - 1) + I * (2 * ((float)rand()) / RAND_MAX - 1); + } + + cudaMalloc((void **)&d_x, M * sizeof(float)); + cudaMalloc((void **)&d_c, M * sizeof(float _Complex)); + cudaMalloc((void **)&d_f, N * sizeof(float _Complex)); + cudaMemcpy(d_x, x, M * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(d_c, c, M * sizeof(float _Complex), cudaMemcpyHostToDevice); + + int rc = cufinufftf1d1(M, d_x, d_c, 1, 1e-6f, modes[0], d_f, NULL); + + cudaMemcpy(f, d_f, N * sizeof(float _Complex), cudaMemcpyDeviceToHost); + + cudaFree(d_x); + cudaFree(d_c); + cudaFree(d_f); + + int idx = 4 * N / 7; + printf("simple f[%d] = %lf + %lfi\n", idx, crealf(f[idx]), cimagf(f[idx])); + + free(x); + free(c); + free(f); + return rc; +} + int main() { // Problem size: number of nonuniform points (M) and grid size (N). const int M = 100, N = 200; int errf = test_float(M, N); - int err = test_double(M, N); + int err = test_double(M, N); + int errs = test_simple_float(M, N); - return (err | errf); + return (err | errf | errs); }