diff --git a/CHANGELOG b/CHANGELOG index 057e42f3c..b56e30325 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -3,9 +3,12 @@ If not stated, FINUFFT is assumed (old cuFINUFFT <=1.3 is listed separately). v2.6.0-dev -* Added `threadsafe_execute` regression test verifying concurrent `execute()` - calls on the same plan produce correct results. Added sanitizer mode selection - via `FINUFFT_USE_SANITIZERS=OFF|ON|MEMSAN|TSAN`, and extended the sanitizer +* Fixed CPU fallback execute scratch handling to use thread-local reclaimable + buffers when caller scratch is not provided, so concurrent `execute()` calls + on the same plan no longer share internal workspace. Added unit coverage for + `ReclaimableMemory` and a `threadsafe_execute` regression test based on + direct 1D reference evaluation. Added sanitizer mode selection via + `FINUFFT_USE_SANITIZERS=OFF|ON|MEMSAN|TSAN`, and extended the sanitizer GitHub workflow to run a focused Linux TSAN job. (Barbone) * SIMD-vectorized bin sort with parallel prefix sum: uint32_t bin counts, ndims dispatch for vectorized coordinate binning, std::exclusive_scan for diff --git a/include/finufft/execute.hpp b/include/finufft/execute.hpp index 8f02faa91..0f9f1aa05 100644 --- a/include/finufft/execute.hpp +++ b/include/finufft/execute.hpp @@ -5,10 +5,11 @@ #include #include +#include #include -#include -#include #include +#include +#include /* Computational core for FINUFFT. @@ -363,11 +364,17 @@ int FINUFFT_PLAN_T::execute_internal(TC *cj, TC *fk, bool adjoint, int ntran if (opts.debug) printf("[%s] start%s ntrans=%d (%d batches, bsize=%d)...\n", "execute", adjoint ? " adjoint" : "", ntrans_actual, nbatch, batchSize); - // allocate temporary buffers + // Use caller-provided scratch, or fall back to a thread-local reclaimable + // buffer so concurrent execute() calls on the same plan do not share state. bool scratch_provided = scratch_size >= size_t(nf() * batchSize); - std::vector> fwBatch_( - scratch_provided ? 0 : nf() * batchSize); - TC *fwBatch = scratch_provided ? aligned_scratch : fwBatch_.data(); + static thread_local finufft::ReclaimableMemory tls_fwBatchBuf; + TC *fwBatch = aligned_scratch; + if (!scratch_provided) { + const size_t fwBatchBytes = size_t(nf()) * batchSize * sizeof(TC); + if (!tls_fwBatchBuf.allocate(fwBatchBytes)) + throw finufft::exception(FINUFFT_ERR_ALLOC); + fwBatch = static_cast(tls_fwBatchBuf.data()); + } for (int b = 0; b * batchSize < ntrans_actual; b++) { // .....loop b over batches // current batch is either batchSize, or possibly truncated if last one @@ -411,6 +418,11 @@ int FINUFFT_PLAN_T::execute_internal(TC *cj, TC *fk, bool adjoint, int ntran } } // ........end b loop + // Mark thread-local pages as reclaimable so the OS can reclaim physical + // memory between execute calls if under pressure. Pages stay resident + // otherwise, and virtual addresses remain stable for reuse. + if (!scratch_provided) tls_fwBatchBuf.mark_reclaimable(); + if (opts.debug) { // report total times in their natural order... if ((type == 1) != adjoint) { printf("[%s] done. tot spread:\t\t%.3g s\n", "execute", t_sprint); diff --git a/include/finufft/memory.hpp b/include/finufft/memory.hpp new file mode 100644 index 000000000..78ab9ce32 --- /dev/null +++ b/include/finufft/memory.hpp @@ -0,0 +1,147 @@ +#pragma once + +// Cross-platform RAII wrapper for large temporary buffers. +// +// Uses mmap/VirtualAlloc for large allocations with two key features: +// 1. Page-aligned allocation that keeps a stable virtual address range for +// reuse across calls (no munmap/VirtualFree between uses). +// 2. mark_reclaimable() issues MADV_FREE (Linux/macOS) or MEM_RESET (Windows) +// to tell the OS the physical pages may be reclaimed under memory pressure, +// without releasing the virtual address range. If pages are not reclaimed, +// the next use is essentially free; if they are, the OS transparently +// zero-fills them on the next fault — no user-visible error. +// +// Platform support: +// Linux / macOS — mmap + MADV_FREE +// Windows — VirtualAlloc + MEM_RESET +// Other — std::aligned_alloc fallback (no reclaimable support) +// +// This relies only on POSIX mmap (available since 4.4BSD / POSIX.1-2001) and +// Win32 VirtualAlloc — both stable OS-level APIs with decades of support. + +#include +#include +#include + +#if defined(_WIN32) +#ifndef WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN // Exclude rarely-used Windows headers to speed compilation +#endif +#ifndef NOMINMAX +#define NOMINMAX // Prevent Windows from defining min/max macros +#endif +#include +#elif defined(__unix__) || defined(__APPLE__) +#include +#include +#else +#include // memset fallback +#endif + +namespace finufft { + +// Large-buffer allocator that returns page-aligned memory and supports +// marking pages as reclaimable between uses. +class ReclaimableMemory { +public: + ReclaimableMemory() = default; + + // Non-copyable, movable + ReclaimableMemory(const ReclaimableMemory &) = delete; + ReclaimableMemory &operator=(const ReclaimableMemory &) = delete; + ReclaimableMemory(ReclaimableMemory &&o) noexcept : ptr_(o.ptr_), nbytes_(o.nbytes_) { + o.ptr_ = nullptr; + o.nbytes_ = 0; + } + ReclaimableMemory &operator=(ReclaimableMemory &&o) noexcept { + if (this != &o) { + deallocate(); + ptr_ = o.ptr_; + nbytes_ = o.nbytes_; + o.ptr_ = nullptr; + o.nbytes_ = 0; + } + return *this; + } + + ~ReclaimableMemory() { deallocate(); } + + // Allocate nbytes of memory while leaving physical pages to be faulted in on + // first use. Returns true on success. + bool allocate(size_t nbytes) { + if (nbytes == 0) return true; + if (ptr_ && nbytes_ == nbytes) return true; // already the right size + deallocate(); + nbytes_ = nbytes; +#if defined(_WIN32) + ptr_ = VirtualAlloc(nullptr, nbytes, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE); + if (!ptr_) { + nbytes_ = 0; + return false; + } +#elif defined(__linux__) + ptr_ = + mmap(nullptr, nbytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (ptr_ == MAP_FAILED) { + ptr_ = nullptr; + nbytes_ = 0; + return false; + } +#elif defined(__APPLE__) || defined(__unix__) + // macOS and other Unix: no MAP_POPULATE, use plain mmap + ptr_ = + mmap(nullptr, nbytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (ptr_ == MAP_FAILED) { + ptr_ = nullptr; + nbytes_ = 0; + return false; + } +#else + // Fallback: aligned allocation + ptr_ = std::aligned_alloc(4096, ((nbytes + 4095) / 4096) * 4096); + if (!ptr_) { + nbytes_ = 0; + return false; + } + std::memset(ptr_, 0, nbytes); +#endif + return true; + } + + // Mark pages as reclaimable by the OS. The virtual address range is kept, + // and pages may remain resident if there is no memory pressure. + // After this call, the contents are undefined until the next write. + void mark_reclaimable() { + if (!ptr_ || !nbytes_) return; +#if defined(_WIN32) + // MEM_RESET tells Windows the pages are no longer needed. + // Pages remain committed but can be discarded under pressure. + VirtualAlloc(ptr_, nbytes_, MEM_RESET, PAGE_READWRITE); +#elif defined(__linux__) || defined(__APPLE__) + madvise(ptr_, nbytes_, MADV_FREE); +#endif + // Other platforms: no-op, pages stay resident + } + + void *data() const { return ptr_; } + size_t size() const { return nbytes_; } + +private: + void deallocate() { + if (!ptr_) return; +#if defined(_WIN32) + VirtualFree(ptr_, 0, MEM_RELEASE); +#elif defined(__unix__) || defined(__APPLE__) + munmap(ptr_, nbytes_); +#else + std::free(ptr_); +#endif + ptr_ = nullptr; + nbytes_ = 0; + } + + void *ptr_ = nullptr; + size_t nbytes_ = 0; +}; + +} // namespace finufft diff --git a/include/finufft/plan.hpp b/include/finufft/plan.hpp index 05ab7e954..c0da8f51a 100644 --- a/include/finufft/plan.hpp +++ b/include/finufft/plan.hpp @@ -4,9 +4,9 @@ #include #include #include +#include #include "finufft_common/common.h" -#include "finufft_errors.h" // All indexing in library that potentially can exceed 2^31 uses 64-bit signed. // This includes all calling arguments (eg M,N) that could be huge someday. diff --git a/include/finufft_common/kernel.h b/include/finufft_common/kernel.h index 177e85564..8e5bf7fea 100644 --- a/include/finufft_common/kernel.h +++ b/include/finufft_common/kernel.h @@ -78,10 +78,12 @@ void set_kernel_shape_given_ns(finufft_spread_opts &opts, int debug); // Since for low upsampfacs, ns=16 can need only nc~12, allow such low nc here. // Note: spreadinterp.cpp compilation time grows with the gap between these bounds... inline constexpr int min_nc_given_ns(int ns) { - return std::max(common::MIN_NC, ns - 4); // note must stay in bounds from constants.h + // Parens around (std::max) prevent expansion of Windows min/max macros + // that are defined in when NOMINMAX is not set. + return (std::max)(common::MIN_NC, ns - 4); // note must stay in bounds from constants.h } inline constexpr int max_nc_given_ns(int ns) { - return std::min(common::MAX_NC, ns + 3); // " + return (std::min)(common::MAX_NC, ns + 3); // (std::min) for same Windows macro reason } template inline constexpr bool ValidKernelParams() noexcept { diff --git a/src/fft.cpp b/src/fft.cpp index b8a5254be..868cac17d 100644 --- a/src/fft.cpp +++ b/src/fft.cpp @@ -458,6 +458,12 @@ template void FINUFFT_PLAN_T::init_grid_kerFT_FFT() { if (opts.debug) printf("[%s] FFT plan (mode %d, nthr=%d):\t%.3g s\n", __func__, opts.fftw, nthr_fft, timer.elapsedsec()); + + if (opts.debug) { + const size_t fwBatchBytes = size_t(nf()) * batchSize * sizeof(TC); + printf("[%s] fwBatch alloc (%.3g GB):\tdeferred to execute\n", __func__, + fwBatchBytes / 1e9); + } } } template void FINUFFT_PLAN_T::init_grid_kerFT_FFT(); diff --git a/src/utils.cpp b/src/utils.cpp index dc2f76924..96c52dfa0 100644 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -15,6 +15,9 @@ #if defined(_WIN32) #include +#ifndef NOMINMAX +#define NOMINMAX +#endif #include #elif defined(__APPLE__) #include diff --git a/test/testutils.cpp b/test/testutils.cpp index 721c9e4e1..77760e454 100644 --- a/test/testutils.cpp +++ b/test/testutils.cpp @@ -18,8 +18,10 @@ // This switches FLT macro from double to float if SINGLE is defined, etc... +#include "finufft/memory.hpp" #include "finufft/utils.hpp" #include "utils/norms.hpp" +#include #include namespace finufft::common { @@ -93,6 +95,24 @@ int main(int argc, char *argv[]) { if (std::abs(errtwonorm(M, &a[0], &b[0]) - 1.0) > relerr) return 1; if (std::abs(std::sqrt((FLT)M) * relerrtwonorm(M, &a[0], &b[0]) - 1.0) > relerr) return 1; + // test reclaimable workspace allocator... + finufft::ReclaimableMemory buf; + buf.mark_reclaimable(); // no-op before allocation + if (!buf.allocate(0) || buf.size() != 0) return 1; + + constexpr size_t nbytes = 8192; + if (!buf.allocate(nbytes) || buf.data() == nullptr || buf.size() != nbytes) return 1; + if ((reinterpret_cast(buf.data()) & 4095u) != 0u) return 1; + + void *ptr = buf.data(); + if (!buf.allocate(nbytes) || buf.data() != ptr) return 1; // same-size reuse + buf.mark_reclaimable(); // should be safe after alloc + + finufft::ReclaimableMemory moved = std::move(buf); + if (buf.data() != nullptr || buf.size() != 0) return 1; + if (moved.data() != ptr || moved.size() != nbytes) return 1; + moved.mark_reclaimable(); + #if defined(__cpp_lib_math_special_functions) // std::cyl_bessel_i present: compare std vs custom series for (double x = 0.0; x <= 42.0; x += 0.5) { diff --git a/test/threadsafe_execute.cpp b/test/threadsafe_execute.cpp index 107c4aa84..60488d4d7 100644 --- a/test/threadsafe_execute.cpp +++ b/test/threadsafe_execute.cpp @@ -1,3 +1,14 @@ +/* Regression test for thread-safe concurrent execute() on the same plan. + + Creates a single 1D type-1 plan and then runs finufft_execute from + multiple threads simultaneously, each into its own output array. + Correctness is verified against a direct (slow) Fourier transform. + This catches data races in internal scratch workspace allocation. + + Usage: ./threadsafe_execute (exit 0 = pass, >0 = fail) + Barbone, Mar 2026. +*/ + #include #include #include