Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions CHANGELOG
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,12 @@ If not stated, FINUFFT is assumed (old cuFINUFFT <=1.3 is listed separately).

v2.6.0-dev

* Added `threadsafe_execute` regression test verifying concurrent `execute()`
calls on the same plan produce correct results. Added sanitizer mode selection
via `FINUFFT_USE_SANITIZERS=OFF|ON|MEMSAN|TSAN`, and extended the sanitizer
* Fixed CPU fallback execute scratch handling to use thread-local reclaimable
buffers when caller scratch is not provided, so concurrent `execute()` calls
on the same plan no longer share internal workspace. Added unit coverage for
`ReclaimableMemory` and a `threadsafe_execute` regression test based on
direct 1D reference evaluation. Added sanitizer mode selection via
`FINUFFT_USE_SANITIZERS=OFF|ON|MEMSAN|TSAN`, and extended the sanitizer
GitHub workflow to run a focused Linux TSAN job. (Barbone)
* SIMD-vectorized bin sort with parallel prefix sum: uint32_t bin counts,
ndims dispatch for vectorized coordinate binning, std::exclusive_scan for
Expand Down
24 changes: 18 additions & 6 deletions include/finufft/execute.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,11 @@
#include <cstdio>
#include <vector>

#include <finufft/memory.hpp>
#include <finufft/plan.hpp>
#include <finufft/utils.hpp>
#include <finufft/spreadinterp.hpp>
#include <finufft/simd.hpp>
#include <finufft/spreadinterp.hpp>
#include <finufft/utils.hpp>

/* Computational core for FINUFFT.

Expand Down Expand Up @@ -363,11 +364,17 @@ int FINUFFT_PLAN_T<TF>::execute_internal(TC *cj, TC *fk, bool adjoint, int ntran
if (opts.debug)
printf("[%s] start%s ntrans=%d (%d batches, bsize=%d)...\n", "execute",
adjoint ? " adjoint" : "", ntrans_actual, nbatch, batchSize);
// allocate temporary buffers
// Use caller-provided scratch, or fall back to a thread-local reclaimable
// buffer so concurrent execute() calls on the same plan do not share state.
bool scratch_provided = scratch_size >= size_t(nf() * batchSize);
std::vector<TC, xsimd::aligned_allocator<TC, 64>> fwBatch_(
scratch_provided ? 0 : nf() * batchSize);
TC *fwBatch = scratch_provided ? aligned_scratch : fwBatch_.data();
static thread_local finufft::ReclaimableMemory tls_fwBatchBuf;
TC *fwBatch = aligned_scratch;
if (!scratch_provided) {
const size_t fwBatchBytes = size_t(nf()) * batchSize * sizeof(TC);
if (!tls_fwBatchBuf.allocate(fwBatchBytes))
throw finufft::exception(FINUFFT_ERR_ALLOC);
fwBatch = static_cast<TC *>(tls_fwBatchBuf.data());
}
for (int b = 0; b * batchSize < ntrans_actual; b++) { // .....loop b over batches

// current batch is either batchSize, or possibly truncated if last one
Expand Down Expand Up @@ -411,6 +418,11 @@ int FINUFFT_PLAN_T<TF>::execute_internal(TC *cj, TC *fk, bool adjoint, int ntran
}
} // ........end b loop

// Mark thread-local pages as reclaimable so the OS can reclaim physical
// memory between execute calls if under pressure. Pages stay resident
// otherwise, and virtual addresses remain stable for reuse.
if (!scratch_provided) tls_fwBatchBuf.mark_reclaimable();

if (opts.debug) { // report total times in their natural order...
if ((type == 1) != adjoint) {
printf("[%s] done. tot spread:\t\t%.3g s\n", "execute", t_sprint);
Expand Down
147 changes: 147 additions & 0 deletions include/finufft/memory.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
#pragma once

// Cross-platform RAII wrapper for large temporary buffers.
//
// Uses mmap/VirtualAlloc for large allocations with two key features:
// 1. Page-aligned allocation that keeps a stable virtual address range for
// reuse across calls (no munmap/VirtualFree between uses).
// 2. mark_reclaimable() issues MADV_FREE (Linux/macOS) or MEM_RESET (Windows)
// to tell the OS the physical pages may be reclaimed under memory pressure,
// without releasing the virtual address range. If pages are not reclaimed,
// the next use is essentially free; if they are, the OS transparently
// zero-fills them on the next fault — no user-visible error.
//
// Platform support:
// Linux / macOS — mmap + MADV_FREE
// Windows — VirtualAlloc + MEM_RESET
// Other — std::aligned_alloc fallback (no reclaimable support)
//
// This relies only on POSIX mmap (available since 4.4BSD / POSIX.1-2001) and
// Win32 VirtualAlloc — both stable OS-level APIs with decades of support.

#include <cstddef>
#include <cstdio>
#include <cstdlib>

#if defined(_WIN32)
#ifndef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN // Exclude rarely-used Windows headers to speed compilation
#endif
#ifndef NOMINMAX
#define NOMINMAX // Prevent Windows <windows.h> from defining min/max macros
#endif
#include <windows.h>
#elif defined(__unix__) || defined(__APPLE__)
#include <sys/mman.h>
#include <unistd.h>
#else
#include <cstring> // memset fallback
#endif

namespace finufft {

// Large-buffer allocator that returns page-aligned memory and supports
// marking pages as reclaimable between uses.
class ReclaimableMemory {
public:
ReclaimableMemory() = default;

// Non-copyable, movable
ReclaimableMemory(const ReclaimableMemory &) = delete;
ReclaimableMemory &operator=(const ReclaimableMemory &) = delete;
ReclaimableMemory(ReclaimableMemory &&o) noexcept : ptr_(o.ptr_), nbytes_(o.nbytes_) {
o.ptr_ = nullptr;
o.nbytes_ = 0;
}
ReclaimableMemory &operator=(ReclaimableMemory &&o) noexcept {
if (this != &o) {
deallocate();
ptr_ = o.ptr_;
nbytes_ = o.nbytes_;
o.ptr_ = nullptr;
o.nbytes_ = 0;
}
return *this;
}

~ReclaimableMemory() { deallocate(); }

// Allocate nbytes of memory while leaving physical pages to be faulted in on
// first use. Returns true on success.
bool allocate(size_t nbytes) {
if (nbytes == 0) return true;
if (ptr_ && nbytes_ == nbytes) return true; // already the right size
deallocate();
nbytes_ = nbytes;
#if defined(_WIN32)
ptr_ = VirtualAlloc(nullptr, nbytes, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE);
if (!ptr_) {
nbytes_ = 0;
return false;
}
#elif defined(__linux__)
ptr_ =
mmap(nullptr, nbytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (ptr_ == MAP_FAILED) {
ptr_ = nullptr;
nbytes_ = 0;
return false;
}
#elif defined(__APPLE__) || defined(__unix__)
// macOS and other Unix: no MAP_POPULATE, use plain mmap
ptr_ =
mmap(nullptr, nbytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (ptr_ == MAP_FAILED) {
ptr_ = nullptr;
nbytes_ = 0;
return false;
}
#else
// Fallback: aligned allocation
ptr_ = std::aligned_alloc(4096, ((nbytes + 4095) / 4096) * 4096);
if (!ptr_) {
nbytes_ = 0;
return false;
}
std::memset(ptr_, 0, nbytes);
#endif
return true;
}

// Mark pages as reclaimable by the OS. The virtual address range is kept,
// and pages may remain resident if there is no memory pressure.
// After this call, the contents are undefined until the next write.
void mark_reclaimable() {
if (!ptr_ || !nbytes_) return;
#if defined(_WIN32)
// MEM_RESET tells Windows the pages are no longer needed.
// Pages remain committed but can be discarded under pressure.
VirtualAlloc(ptr_, nbytes_, MEM_RESET, PAGE_READWRITE);
#elif defined(__linux__) || defined(__APPLE__)
madvise(ptr_, nbytes_, MADV_FREE);
#endif
// Other platforms: no-op, pages stay resident
}

void *data() const { return ptr_; }
size_t size() const { return nbytes_; }

private:
void deallocate() {
if (!ptr_) return;
#if defined(_WIN32)
VirtualFree(ptr_, 0, MEM_RELEASE);
#elif defined(__unix__) || defined(__APPLE__)
munmap(ptr_, nbytes_);
#else
std::free(ptr_);
#endif
ptr_ = nullptr;
nbytes_ = 0;
}

void *ptr_ = nullptr;
size_t nbytes_ = 0;
};

} // namespace finufft
2 changes: 1 addition & 1 deletion include/finufft/plan.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@
#include <complex>
#include <cstdint>
#include <memory>
#include <vector>

#include "finufft_common/common.h"
#include "finufft_errors.h"

// All indexing in library that potentially can exceed 2^31 uses 64-bit signed.
// This includes all calling arguments (eg M,N) that could be huge someday.
Expand Down
6 changes: 4 additions & 2 deletions include/finufft_common/kernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -78,10 +78,12 @@ void set_kernel_shape_given_ns(finufft_spread_opts &opts, int debug);
// Since for low upsampfacs, ns=16 can need only nc~12, allow such low nc here.
// Note: spreadinterp.cpp compilation time grows with the gap between these bounds...
inline constexpr int min_nc_given_ns(int ns) {
return std::max(common::MIN_NC, ns - 4); // note must stay in bounds from constants.h
// Parens around (std::max) prevent expansion of Windows min/max macros
// that are defined in <windows.h> when NOMINMAX is not set.
return (std::max)(common::MIN_NC, ns - 4); // note must stay in bounds from constants.h
}
inline constexpr int max_nc_given_ns(int ns) {
return std::min(common::MAX_NC, ns + 3); // "
return (std::min)(common::MAX_NC, ns + 3); // (std::min) for same Windows macro reason
}

template<int NS, int NC> inline constexpr bool ValidKernelParams() noexcept {
Expand Down
6 changes: 6 additions & 0 deletions src/fft.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -458,6 +458,12 @@ template<typename TF> void FINUFFT_PLAN_T<TF>::init_grid_kerFT_FFT() {
if (opts.debug)
printf("[%s] FFT plan (mode %d, nthr=%d):\t%.3g s\n", __func__, opts.fftw, nthr_fft,
timer.elapsedsec());

if (opts.debug) {
const size_t fwBatchBytes = size_t(nf()) * batchSize * sizeof(TC);
printf("[%s] fwBatch alloc (%.3g GB):\tdeferred to execute\n", __func__,
fwBatchBytes / 1e9);
}
}
}
template void FINUFFT_PLAN_T<float>::init_grid_kerFT_FFT();
Expand Down
3 changes: 3 additions & 0 deletions src/utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@

#if defined(_WIN32)
#include <vector>
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h>
#elif defined(__APPLE__)
#include <sys/sysctl.h>
Expand Down
20 changes: 20 additions & 0 deletions test/testutils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,10 @@

// This switches FLT macro from double to float if SINGLE is defined, etc...

#include "finufft/memory.hpp"
#include "finufft/utils.hpp"
#include "utils/norms.hpp"
#include <cstdint>
#include <finufft/test_defs.hpp>

namespace finufft::common {
Expand Down Expand Up @@ -93,6 +95,24 @@ int main(int argc, char *argv[]) {
if (std::abs(errtwonorm(M, &a[0], &b[0]) - 1.0) > relerr) return 1;
if (std::abs(std::sqrt((FLT)M) * relerrtwonorm(M, &a[0], &b[0]) - 1.0) > relerr) return 1;

// test reclaimable workspace allocator...
finufft::ReclaimableMemory buf;
buf.mark_reclaimable(); // no-op before allocation
if (!buf.allocate(0) || buf.size() != 0) return 1;

constexpr size_t nbytes = 8192;
if (!buf.allocate(nbytes) || buf.data() == nullptr || buf.size() != nbytes) return 1;
if ((reinterpret_cast<std::uintptr_t>(buf.data()) & 4095u) != 0u) return 1;

void *ptr = buf.data();
if (!buf.allocate(nbytes) || buf.data() != ptr) return 1; // same-size reuse
buf.mark_reclaimable(); // should be safe after alloc

finufft::ReclaimableMemory moved = std::move(buf);
if (buf.data() != nullptr || buf.size() != 0) return 1;
if (moved.data() != ptr || moved.size() != nbytes) return 1;
moved.mark_reclaimable();

#if defined(__cpp_lib_math_special_functions)
// std::cyl_bessel_i present: compare std vs custom series
for (double x = 0.0; x <= 42.0; x += 0.5) {
Expand Down
11 changes: 11 additions & 0 deletions test/threadsafe_execute.cpp
Original file line number Diff line number Diff line change
@@ -1,3 +1,14 @@
/* Regression test for thread-safe concurrent execute() on the same plan.

Creates a single 1D type-1 plan and then runs finufft_execute from
multiple threads simultaneously, each into its own output array.
Correctness is verified against a direct (slow) Fourier transform.
This catches data races in internal scratch workspace allocation.

Usage: ./threadsafe_execute (exit 0 = pass, >0 = fail)
Barbone, Mar 2026.
*/

#include <finufft.h>
#include <finufft_common/constants.h>
#include <finufft_opts.h>
Expand Down
Loading