flatironinstitute · mreineck · May 4, 2026 · May 4, 2026 · May 4, 2026 · May 4, 2026
diff --git a/docs/error.rst b/docs/error.rst
@@ -36,6 +36,7 @@ has the following meanings which are used by both CPU and GPU versions
   24 spread kernel formula type invalid
   25 unknown exception caught
   26 requested tolerance epsilon too small to achieve (hard error; tolerance must be >= machine epsilon)
+  27 iteration inside the setup code for the PSWF function evaluator failed to converge
 
 For any nonzero value of ``ier`` the transform may not have been performed and the output should not be trusted. However, we hope that the value of ``ier`` will help to narrow down the problem.
 

diff --git a/include/finufft/makeplan.hpp b/include/finufft/makeplan.hpp
@@ -76,19 +76,19 @@
   int q = (int)(2 + 3.0 * J2); // not sure why so large? (NB cannot exceed MAX_NQUAD)
   TF f[MAX_NQUAD];
   double z[2 * MAX_NQUAD], w[2 * MAX_NQUAD];
-  gaussquad(2 * q, z, w); // only half the nodes used, eg on (0,1)
+  gaussquad(2 * q, z, w);       // only half the nodes used, eg on (0,1)
   std::complex<TF> a[MAX_NQUAD];
-  for (int n = 0; n < q; ++n) {            // set up nodes z_n and vals f_n
-    z[n] *= J2;                            // rescale nodes
-                                           // vals & quadr weighs
+  for (int n = 0; n < q; ++n) { // set up nodes z_n and vals f_n
+    z[n] *= J2;                 // rescale nodes
+                                // vals & quadr weighs
     f[n] = J2 * (TF)w[n] * evaluate_kernel_runtime(TF(z[n]));
     // phase winding rates
     a[n] = -std::exp(2 * PI * std::complex<double>(0, 1) * z[n] / double(nf));
   }
   BIGINT nout = nf / 2 + 1; // how many values we're writing to
   int nt      = std::min(nout, (BIGINT)m.spopts.nthreads); // how many chunks
-  std::vector<BIGINT> brk(nt + 1);                       // start indices for each thread
-  for (int t = 0; t <= nt; ++t) // split nout mode indices btw threads
+  std::vector<BIGINT> brk(nt + 1); // start indices for each thread
+  for (int t = 0; t <= nt; ++t)    // split nout mode indices btw threads
     brk[t] = (BIGINT)(0.5 + nout * t / (double)nt);
 #pragma omp parallel num_threads(nt)
   {                                                // each thread gets own chunk to do
@@ -198,7 +198,7 @@
 
   // heuristic dir=1 chunking for nthr>>1, typical for intel i7 and skylake...
   m.spopts.max_subproblem_size = (dim == 1) ? 10000 : 100000; // todo: revisit
-  if (opts.spread_max_sp_size > 0)                             // override
+  if (opts.spread_max_sp_size > 0)                            // override
     m.spopts.max_subproblem_size = opts.spread_max_sp_size;
   // nthr above which switch OMP critical->atomic (add_wrapped..). R Blackwell's val:
   m.spopts.atomic_threshold =
@@ -238,13 +238,16 @@
   // coefficient (k=0 -> highest-degree). `horner_coeffs` was filled with
   // zeros above, so panels that need fewer coefficients leave the rest as 0.
 
+  auto kernel_lambda = kernel_definition_lambda(m.spopts);
+
   for (int j = 0; j < nspread; ++j) { // ......... loop over intervals (panels)
     // affine map of x in [-1,1] to z in jth interval [-1+2j/w,-1+2(j+1)/w]
     const TF xshiftj = TF(2 * j + 1 - nspread); // jth center in [-w,w]
     // *** explore making this lambda double always, like kernel itself:
-    const auto kernel_this_interval = [xshiftj, this, nspread](TF x) -> TF {
+    const auto kernel_this_interval = [xshiftj, this, nspread, kernel_lambda](
+                                          TF x) -> TF {
       const TF z = (x + xshiftj) / (TF)nspread;
-      return (TF)kernel_definition(m.spopts, (double)z);
+      return (TF)kernel_lambda((double)z);
     };
 
     // we're fitting in float for TF=float, *** explore always double:

diff --git a/include/finufft_common/kernel.h b/include/finufft_common/kernel.h
@@ -4,6 +4,7 @@
 #include <cmath>
 #include <cstddef>
 #include <cstdio>
+#include <functional>
 #include <vector>
 
 #include <finufft_common/constants.h>
@@ -67,7 +68,7 @@ template<class T, class F> std::vector<T> poly_fit(F &&f, int n) {
 // The spread/interp kernel phi_beta(z) on z in [-1,1]. Not performance-critical;
 // used only for polynomial interpolation (precompute_horner_coeffs). Always double.
 // Defined in src/common/kernel.cpp.
-double kernel_definition(const finufft_spread_opts &spopts, double z);
+std::function<double(double)> kernel_definition_lambda(const finufft_spread_opts &spopts);
 
 int theoretical_kernel_ns(double tol, int dim, int type, int debug,
                           const finufft_spread_opts &spopts);

diff --git a/include/finufft_common/pswf.h b/include/finufft_common/pswf.h
@@ -1,11 +1,63 @@
 #ifndef MATH_PSWF_H
 #define MATH_PSWF_H
 
+#include <array>
+#include <cmath>
+#include <finufft_errors.h>
+#include <vector>
+
 namespace finufft::common {
-/*
-normalized zeroth-order pswf
-*/
-double pswf(double c, double x);
+
+/* Class for evaluation of the prolate spheroidal wavefunction
+   of order zero (Psi_0^c) inside [-1,1], for arbitrary frequency parameter c.
+   Computation is done using a basis of Legendre polynomials.
+   This implementation is based on work by Libin Lu for FINUFFT.
+   The orignal implementation was done by Vladimir Rokhlin and
+   can be found in src/common/specialfunctions/ of the DMK repo
+   https://github.com/flatironinstitute/dmk
+   The returned function values are normalized in such a way that
+   evaluation at x=0 always returns 1.0.
+
+   CAUTION: the internal routines have been heavily tweaked compared
+   to the original versions and cannot be expected to work in a more
+   general context! */
+class PSWF0 {
+private:
+  double c;
+  std::vector<double> workdata; // Legendre coefficients
+  std::vector<std::array<double, 3>> coef;
+  double xv0;                   // factor needed for normalization
+
+  template<typename T> T eval_raw(T x) const {
+    const T xsq = x * x;
+    T pjm1      = 0;
+    T pjm2      = 1;
+    T val       = workdata[0];
+
+    size_t i = 1;
+    for (; i + 1 < coef.size(); i += 2) {
+      pjm1 = pjm2 * (xsq * coef[i][0] - coef[i][1]) - pjm1 * coef[i][2];
+      val += workdata[i] * pjm1;
+      pjm2 = pjm1 * (xsq * coef[i + 1][0] - coef[i + 1][1]) - pjm2 * coef[i + 1][2];
+      val += workdata[i + 1] * pjm2;
+    }
+    for (; i < coef.size(); ++i) {
+      T tmp = pjm2 * (xsq * coef[i][0] - coef[i][1]) - pjm1 * coef[i][2];
+      val += workdata[i] * tmp;
+      pjm1 = pjm2;
+      pjm2 = tmp;
+    }
+    return val;
+  }
+
+public:
+  PSWF0(double c_);
+
+  double operator()(double x) const {
+    if (std::abs(x) > 1) return 0.;
+    return eval_raw(x) * xv0;
+  }
+};
 
 } // namespace finufft::common
 #endif // MATH_PSWF_H
diff --git a/include/finufft_errors.h b/include/finufft_errors.h
@@ -37,6 +37,7 @@ enum {
   FINUFFT_ERR_KERFORMULA_NOTVALID                  = 24,
   FINUFFT_ERR_UNKNOWN_EXCEPTION                    = 25,
   FINUFFT_ERR_EPS_TOO_SMALL                        = 26,
+  FINUFFT_ERR_PSWF_SETUP                           = 27,
 };
 // clang-format on
 #endif
diff --git a/src/common/kernel.cpp b/src/common/kernel.cpp
@@ -12,17 +12,15 @@
 
 namespace finufft::kernel {
 
-double kernel_definition(const finufft_spread_opts &spopts, const double z) {
+std::function<double(double)> kernel_definition_lambda(
+    const finufft_spread_opts &spopts) {
   /* The spread/interp kernel phi_beta(z) function on standard interval z in [-1,1].
      This evaluation does not need to be fast; it is used *only* for polynomial
      interpolation via Horner coeffs (the interpolant is evaluated fast).
      It can thus always be double-precision. No analytic Fourier transform pair is
      needed, thanks to numerical quadrature in finufft_core:onedim*; playing with
      new kernels is thus very easy.
     Inputs:
-    z      - real ordinate on standard interval [-1,1]. Handling of edge cases
-            at or near +-1 is no longer crucial, because precompute_horner_coeffs
-            (the only user of this function) has interpolation nodes in (-1,1).
     spopts - spread_opts struct containing fields:
       beta        - shape parameter for ES, KB, or other prolate kernels
                     (a.k.a. c parameter in PSWF).
@@ -32,40 +30,62 @@ double kernel_definition(const finufft_spread_opts &spopts, const double z) {
                     kerformula also to select a parameter-choice method.)
                     Note: the default 0 (in opts.spread_kerformula) is invalid here;
                     selection of a >0 kernel type must already have happened.
-    Output: phi(z), as in the notation of original 2019 paper ([FIN] in the docs).
+    Output: lambda function that can be called with z to return phi(z),
+                    as in the notation of original 2019 paper ([FIN] in the docs).
 
     Notes: 1) no normalization of max value or integral is needed, since any
             overall factor is cancelled out in the deconvolve step. However,
             values as large as exp(beta) have caused floating-pt overflow; don't
             use them.
     Barnett rewritten 1/13/26 for double on [-1,1]; based on Barbone Dec 2025.
   */
-  if (std::abs(z) > 1.0) return 0.0;           // restrict support to [-1,1]
-  double beta = spopts.beta;                   // get shape param
-  double arg  = beta * std::sqrt(1.0 - z * z); // common argument for exp, I0, etc
+  double beta = spopts.beta; // get shape param
   int kf      = spopts.kerformula;
 
-  if (kf == 1 || kf == 2)
+  if (kf == 1 || kf == 2) {
     // ES ("exponential of semicircle" or "exp sqrt"), see [FIN] reference.
     // Used in FINUFFT 2017-2025 (up to v2.4.1). max is 1, as of v2.3.0.
-    return std::exp(arg) / std::exp(beta);
-  else if (kf == 3)
+    const double expbeta = std::exp(beta);
+    return [beta, expbeta](double z) {
+      if (std::abs(z) > 1.0) return 0.0; // restrict support to [-1,1]
+      return std::exp(beta * std::sqrt(1.0 - z * z)) / expbeta;
+    };
+  } else if (kf == 3) {
     // forwards Kaiser--Bessel (KB), normalized to max of 1.
     // std::cyl_bessel_i is from <cmath>, expects double. See src/common/utils.cpp
-    return common::cyl_bessel_i(0, arg) / common::cyl_bessel_i(0, beta);
-  else if (kf == 4)
+    const double besselbeta = common::cyl_bessel_i(0, beta);
+    return [beta, besselbeta](double z) {
+      if (std::abs(z) > 1.0) return 0.0; // restrict support to [-1,1]
+      return common::cyl_bessel_i(0, beta * std::sqrt(1.0 - z * z)) / besselbeta;
+    };
+  } else if (kf == 4) {
     // continuous (deplinthed) KB, as in Barnett SIREV 2022, normalized to max nearly 1
-    return (common::cyl_bessel_i(0, arg) - 1.0) / common::cyl_bessel_i(0, beta);
-  else if (kf == 5)
-    return std::cosh(arg) / std::cosh(beta); // normalized cosh-type of Rmk. 13 [FIN]
-  else if (kf == 6)
-    return (std::cosh(arg) - 1.0) / std::cosh(beta); // Potts-Tasche cont cosh-type
-  else if (kf >= 7 && kf <= 9)
-    return common::pswf(beta, z); // prolate (PSWF) Psi_0, normalized to 1 at z=0
-  else {
+    const double besselbeta = common::cyl_bessel_i(0, beta);
+    return [beta, besselbeta](double z) {
+      if (std::abs(z) > 1.0) return 0.0; // restrict support to [-1,1]
+      return (common::cyl_bessel_i(0, beta * std::sqrt(1.0 - z * z)) - 1.0) / besselbeta;
+    };
+  } else if (kf == 5) {
+    const double coshbeta = std::cosh(beta);
+    return [beta, coshbeta](double z) {
+      if (std::abs(z) > 1.0) return 0.0; // restrict support to [-1,1]
+      return std::cosh(beta * std::sqrt(1.0 - z * z)) / coshbeta;
+    }; // normalized cosh-type of Rmk. 13 [FIN]
+  } else if (kf == 6) {
+    const double coshbeta = std::cosh(beta);
+    return [beta, coshbeta](double z) {
+      if (std::abs(z) > 1.0) return 0.0; // restrict support to [-1,1]
+      return (std::cosh(beta * std::sqrt(1.0 - z * z)) - 1.0) / coshbeta;
+    }; // Potts-Tasche cont cosh-type
+  } else if (kf >= 7 && kf <= 9) {
+    finufft::common::PSWF0 pswf(beta);
+    return [pswf](double z) {
+      if (std::abs(z) > 1.0) return 0.0; // restrict support to [-1,1]
+      return pswf(z);
+    }; // prolate (PSWF) Psi_0, normalized to 1 at z=0
+  } else {
     fprintf(stderr, "[%s] unknown spopts.kerformula=%d\n", __func__, spopts.kerformula);
     throw finufft::exception(FINUFFT_ERR_KERFORMULA_NOTVALID);
-    return std::numeric_limits<double>::quiet_NaN(); // never gets here, non-signalling
   }
 }