31 changes: 29 additions & 2 deletions docs/source/api/data_transfer.rst
@@ -12,14 +12,18 @@ Data Transfers
From memory:

+---------------------------------------+----------------------------------------------------+
| :cpp:func:`load` | load values from memory (optionally masked) |
| :cpp:func:`load` | load values from memory (optionally masked) [#m]_ |
+---------------------------------------+----------------------------------------------------+
| :cpp:func:`load_aligned` | load values from aligned memory |
+---------------------------------------+----------------------------------------------------+
| :cpp:func:`load_unaligned` | load values from unaligned memory |
+---------------------------------------+----------------------------------------------------+
| :cpp:func:`load_as` | load values, forcing a type conversion |
+---------------------------------------+----------------------------------------------------+
| :cpp:func:`load_head` | load the first ``n`` contiguous elements [#h]_ |
+---------------------------------------+----------------------------------------------------+
| :cpp:func:`load_tail` | load the last ``n`` contiguous elements [#h]_ |
+---------------------------------------+----------------------------------------------------+

From a scalar:

@@ -32,14 +36,18 @@ From a scalar:
To memory:

+---------------------------------------+----------------------------------------------------+
| :cpp:func:`store` | store values to memory (optionally masked) |
| :cpp:func:`store` | store values to memory (optionally masked) [#m]_ |
+---------------------------------------+----------------------------------------------------+
| :cpp:func:`store_aligned` | store values to aligned memory |
+---------------------------------------+----------------------------------------------------+
| :cpp:func:`store_unaligned` | store values to unaligned memory |
+---------------------------------------+----------------------------------------------------+
| :cpp:func:`store_as` | store values, forcing a type conversion |
+---------------------------------------+----------------------------------------------------+
| :cpp:func:`store_head` | store the first ``n`` contiguous elements [#h]_ |
+---------------------------------------+----------------------------------------------------+
| :cpp:func:`store_tail` | store the last ``n`` contiguous elements [#h]_ |
+---------------------------------------+----------------------------------------------------+

In place:

@@ -84,3 +92,22 @@ The following empty types are used for tag dispatching:

.. doxygenstruct:: xsimd::unaligned_mode
:project: xsimd

.. rubric:: Footnotes

.. [#m] Masked ``load`` / ``store`` come in two flavours. The
:cpp:class:`batch_bool_constant` overload encodes the mask in the type, is
resolved at compile time and is always efficient. The runtime
:cpp:class:`batch_bool` overload, by contrast, falls back to a per-lane
scalar loop on architectures without a native masked load/store
instruction — SSE2 through SSE4.2, NEON/NEON64, VSX, S390x, and WASM.
AVX, AVX2, AVX-512, SVE and RVV use native masked instructions and pay no
such penalty. Prefer the compile-time mask whenever the selection is known
at compile time, and avoid runtime-mask loads/stores in hot inner loops on
the affected architectures.

.. [#h] ``load_head`` / ``store_head`` / ``load_tail`` / ``store_tail``
take a runtime element count ``n`` instead of a constructed mask;
they are sugar for the runtime-mask ``load`` / ``store`` with a
contiguous-prefix or contiguous-suffix mask, and inherit its
contract and per-arch codegen.
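
.. rubric:: Example

A minimal sketch contrasting the two mask flavours. The mask-construction
calls mirror what the implementation uses; how the mask is then handed to
``load`` / ``store`` is assumed from the table above, so check the installed
headers for the exact overloads.

.. code-block:: cpp

    #include <xsimd/xsimd.hpp>

    using batch_t = xsimd::batch<float, xsimd::avx>;
    using mask_t  = xsimd::batch_bool<float, xsimd::avx>;

    // Compile-time mask: lane selection is encoded in the type and resolved
    // at compile time (eight lanes, matching AVX floats).
    xsimd::batch_bool_constant<float, xsimd::avx,
                               true, true, true, false,
                               false, false, false, false>
        head3 {};

    // Runtime mask: built from a bit pattern at run time; falls back to a
    // per-lane scalar loop on architectures without native masked accesses.
    mask_t first3 = mask_t::from_mask((1ull << 3) - 1);

    // Hypothetical calls through the documented entry points (signatures assumed):
    //   batch_t x = batch_t::load(ptr, first3);  // runtime mask
    //   x.store(ptr, head3);                     // compile-time mask
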
83 changes: 83 additions & 0 deletions include/xsimd/arch/common/xsimd_common_memory.hpp
@@ -15,6 +15,7 @@
#include <algorithm>
#include <array>
#include <complex>
#include <cstdint>

#include "../../types/xsimd_batch_constant.hpp"
#include "./xsimd_common_details.hpp"
@@ -374,6 +375,21 @@ namespace xsimd
return batch<T_out, A>::load(buffer.data(), aligned_mode {});
}

template <class A, class T, class Mode>
XSIMD_INLINE batch<T, A>
load_masked(T const* mem, batch_bool<T, A> mask, convert<T>, Mode, requires_arch<common>) noexcept
{
// Per-lane validity contract: only active lanes are read.
// Arches with hardware predicated loads override this.
constexpr std::size_t size = batch<T, A>::size;
alignas(A::alignment()) std::array<T, size> buffer {};

Contributor comment: to make it worse, building a mask is not always a single operation depending on the target...

Contributor comment (on the zero-initialized buffer above): this array assignment forces everything to zero, while some stores are not needed, and the compiler is not able to optimize this away in the generic case.

const uint64_t bits = mask.mask();
for (std::size_t i = 0; i < size; ++i)
if ((bits >> i) & uint64_t(1))
buffer[i] = mem[i];
return batch<T, A>::load_aligned(buffer.data());
}

template <class A, class T_in, class T_out, bool... Values, class alignment>
XSIMD_INLINE void
store_masked(T_out* mem, batch<T_in, A> const& src, batch_bool_constant<T_in, A, Values...>, alignment, requires_arch<common>) noexcept
@@ -388,6 +404,73 @@
}
}

template <class A, class T, class Mode>
XSIMD_INLINE void
store_masked(T* mem, batch<T, A> const& src, batch_bool<T, A> mask, Mode, requires_arch<common>) noexcept
{
// Per-lane validity contract: only active lanes are written.
// Arches with hardware predicated stores override this.
constexpr std::size_t size = batch<T, A>::size;
alignas(A::alignment()) std::array<T, size> src_buf;
src.store_aligned(src_buf.data());
const uint64_t bits = mask.mask();
for (std::size_t i = 0; i < size; ++i)
if ((bits >> i) & uint64_t(1))
mem[i] = src_buf[i];
}

// Head/tail forward to the runtime-mask path. ``tail`` offsets
// the base pointer back by ``(size - n)`` so the active high-``n``
// lanes land at ``[mem, mem + n)``; the offset goes through
// ``uintptr_t`` to dodge ``-Warray-bounds`` on small buffers.
namespace detail
{
template <class T>
XSIMD_INLINE T const* offset_back(T const* p, std::size_t k) noexcept
{
return reinterpret_cast<T const*>(reinterpret_cast<std::uintptr_t>(p) - k * sizeof(T));
}
template <class T>
XSIMD_INLINE T* offset_back(T* p, std::size_t k) noexcept
{
return reinterpret_cast<T*>(reinterpret_cast<std::uintptr_t>(p) - k * sizeof(T));
}
}

template <class A, class T, class Mode>
XSIMD_INLINE batch<T, A>
load_head(T const* mem, std::size_t n, Mode, requires_arch<common>) noexcept
{
const auto mask = batch_bool<T, A>::from_mask(::xsimd::details::full_mask(n));
return load_masked<A>(mem, mask, convert<T> {}, unaligned_mode {}, A {});
}

template <class A, class T, class Mode>
XSIMD_INLINE void
store_head(T* mem, std::size_t n, batch<T, A> const& src, Mode, requires_arch<common>) noexcept
{
const auto mask = batch_bool<T, A>::from_mask(::xsimd::details::full_mask(n));
store_masked<A>(mem, src, mask, unaligned_mode {}, A {});
}

template <class A, class T, class Mode>
XSIMD_INLINE batch<T, A>
load_tail(T const* mem, std::size_t n, Mode, requires_arch<common>) noexcept
{
constexpr std::size_t size = batch<T, A>::size;
const auto mask = batch_bool<T, A>::from_mask(::xsimd::details::full_mask(n) << (size - n));
return load_masked<A>(detail::offset_back(mem, size - n), mask, convert<T> {}, unaligned_mode {}, A {});
}

template <class A, class T, class Mode>
XSIMD_INLINE void
store_tail(T* mem, std::size_t n, batch<T, A> const& src, Mode, requires_arch<common>) noexcept
{
constexpr std::size_t size = batch<T, A>::size;
const auto mask = batch_bool<T, A>::from_mask(::xsimd::details::full_mask(n) << (size - n));
store_masked<A>(detail::offset_back(mem, size - n), src, mask, unaligned_mode {}, A {});
}
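
// Usage sketch: the head/tail entry points above exist for loop remainders.
// Assuming public wrappers shaped like ``xsimd::load_head<batch>(ptr, n)``
// and ``xsimd::store_head(ptr, n, value)`` (names taken from the
// documentation table, exact signatures assumed), a scaling pass over
// ``len`` floats can finish its remainder in one masked step:
//
//     using b = xsimd::batch<float>;
//     std::size_t i = 0;
//     for (; i + b::size <= len; i += b::size)
//         (b::load_unaligned(src + i) * 2.0f).store_unaligned(dst + i);
//     if (std::size_t rem = len - i)
//     {
//         b x = xsimd::load_head<b>(src + i, rem);   // assumed signature
//         xsimd::store_head(dst + i, rem, x * 2.0f); // assumed signature
//     }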

template <class A, bool... Values, class Mode>
XSIMD_INLINE batch<int32_t, A> load_masked(int32_t const* mem, batch_bool_constant<int32_t, A, Values...>, convert<int32_t>, Mode, requires_arch<A>) noexcept
{
33 changes: 33 additions & 0 deletions include/xsimd/arch/xsimd_avx.hpp
@@ -1015,6 +1015,23 @@ namespace xsimd
}
}

// Runtime-mask load for float/double on AVX. Both aligned_mode and
// unaligned_mode map to _mm256_maskload_* — the intrinsic does not fault
// on masked-off lanes, so partial loads across page boundaries are safe.
template <class A, class Mode>
XSIMD_INLINE batch<float, A>
load_masked(float const* mem, batch_bool<float, A> mask, convert<float>, Mode, requires_arch<avx>) noexcept
{
return _mm256_maskload_ps(mem, _mm256_castps_si256(mask));
}

template <class A, class Mode>
XSIMD_INLINE batch<double, A>
load_masked(double const* mem, batch_bool<double, A> mask, convert<double>, Mode, requires_arch<avx>) noexcept
{
return _mm256_maskload_pd(mem, _mm256_castpd_si256(mask));
}

// store_masked
namespace detail
{
@@ -1031,6 +1048,22 @@
}
}

// Runtime-mask store for float/double on AVX. Same fault-suppression
// semantics as the masked loads above; alignment mode is irrelevant.
template <class A, class Mode>
XSIMD_INLINE void
store_masked(float* mem, batch<float, A> const& src, batch_bool<float, A> mask, Mode, requires_arch<avx>) noexcept
{
_mm256_maskstore_ps(mem, _mm256_castps_si256(mask), src);
}

template <class A, class Mode>
XSIMD_INLINE void
store_masked(double* mem, batch<double, A> const& src, batch_bool<double, A> mask, Mode, requires_arch<avx>) noexcept
{
_mm256_maskstore_pd(mem, _mm256_castpd_si256(mask), src);
}

template <class A, class T, bool... Values, class Mode>
XSIMD_INLINE void store_masked(T* mem, batch<T, A> const& src, batch_bool_constant<T, A, Values...> mask, Mode, requires_arch<avx>) noexcept
{
33 changes: 28 additions & 5 deletions include/xsimd/arch/xsimd_avx2.hpp
@@ -119,7 +119,6 @@ namespace xsimd
}

// load_masked
// AVX2 low-level helpers (operate on raw SIMD registers)
namespace detail
{
XSIMD_INLINE __m256i maskload(const int32_t* mem, __m256i mask) noexcept
Expand All @@ -138,14 +137,12 @@ namespace xsimd
}
}

// single templated implementation for integer masked loads (32/64-bit)
template <class A, class T, bool... Values, class Mode>
XSIMD_INLINE std::enable_if_t<std::is_integral<T>::value && (sizeof(T) >= 4), batch<T, A>>
load_masked(T const* mem, batch_bool_constant<T, A, Values...> mask, convert<T>, Mode, requires_arch<avx2>) noexcept
{
static_assert(sizeof(T) == 4 || sizeof(T) == 8, "load_masked supports only 32/64-bit integers on AVX2");
using int_t = std::conditional_t<sizeof(T) == 4, int32_t, long long>;
// Use the raw register-level maskload helpers for the remaining cases.
return detail::maskload(reinterpret_cast<const int_t*>(mem), mask.as_batch());
}

@@ -175,6 +172,20 @@
return bitwise_cast<uint64_t>(r);
}

// Runtime-mask load for 32/64-bit integers on AVX2. 8/16-bit integers
// fall back to the scalar common path: AVX2 has no native maskload for
// those widths, and a load-then-blend would break fault-suppression at
// page boundaries (the main reason callers ask for a masked load).
// Both aligned_mode and unaligned_mode route to the same intrinsic —
// masked-off lanes do not fault regardless of alignment.
template <class A, class T, class Mode>
XSIMD_INLINE std::enable_if_t<std::is_integral<T>::value && (sizeof(T) == 4 || sizeof(T) == 8), batch<T, A>>
load_masked(T const* mem, batch_bool<T, A> mask, convert<T>, Mode, requires_arch<avx2>) noexcept
{
using int_t = std::conditional_t<sizeof(T) == 4, int32_t, long long>;

Contributor comment: why long long and not int64_t? There's no guarantee that sizeof(long long) == 8.

return detail::maskload(reinterpret_cast<const int_t*>(mem), __m256i(mask));
}

// store_masked
namespace detail
{
@@ -196,14 +207,12 @@
{
constexpr size_t lanes_per_half = batch<T, A>::size / 2;

// confined to lower 128-bit half → forward to SSE
XSIMD_IF_CONSTEXPR(mask.countl_zero() >= lanes_per_half)
{
constexpr auto mlo = ::xsimd::detail::lower_half<sse4_2>(mask);
const auto lo = detail::lower_half(src);
store_masked<sse4_2>(mem, lo, mlo, Mode {}, sse4_2 {});
}
// confined to upper 128-bit half → forward to SSE
else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= lanes_per_half)
{
constexpr auto mhi = ::xsimd::detail::upper_half<sse4_2>(mask);
@@ -230,6 +239,20 @@
store_masked<A>(reinterpret_cast<int64_t*>(mem), s64, batch_bool_constant<int64_t, A, Values...> {}, Mode {}, avx2 {});
}

template <class A, class T, class Mode>
XSIMD_INLINE std::enable_if_t<std::is_integral<T>::value && (sizeof(T) == 4 || sizeof(T) == 8), void>
store_masked(T* mem, batch<T, A> const& src, batch_bool<T, A> mask, Mode, requires_arch<avx2>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
{
_mm256_maskstore_epi32(reinterpret_cast<int*>(mem), __m256i(mask), __m256i(src));
}
else
{
_mm256_maskstore_epi64(reinterpret_cast<long long*>(mem), __m256i(mask), __m256i(src));

Contributor comment: ok, I guess that's a constraint of the Intel intrinsic; at least static_assert that sizeof(long long) == 8 and sizeof(int) == 4 if you're using this to distinguish between the two?

}
}
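
// A minimal sketch of the static_assert suggested in the review above, making
// the width assumptions behind the ``int`` / ``long long`` casts explicit:
//
//     static_assert(sizeof(int) == 4 && sizeof(long long) == 8,
//                   "AVX2 maskstore intrinsics assume 32-bit int and 64-bit long long");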

// load_stream
template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value, void>>
XSIMD_INLINE batch<T, A> load_stream(T const* mem, convert<T>, requires_arch<avx2>) noexcept
37 changes: 37 additions & 0 deletions include/xsimd/arch/xsimd_avx2_128.hpp
@@ -133,6 +133,43 @@ namespace xsimd
return _mm_maskstore_epi64((int64_t*)mem, mask.as_batch(), src);
}

// Runtime-mask load for 32/64-bit integers on AVX2-128. 8/16-bit
// integers fall back to the scalar common path: there is no native
// _mm_maskload for those widths, and a load-then-blend would break
// fault-suppression at page boundaries (the main reason callers ask
// for a masked load). Both aligned_mode and unaligned_mode route to
// the same intrinsic — masked-off lanes do not fault regardless of
// alignment.
template <class A, class T, class Mode>
XSIMD_INLINE std::enable_if_t<std::is_integral<T>::value && (sizeof(T) == 4 || sizeof(T) == 8), batch<T, A>>
load_masked(T const* mem, batch_bool<T, A> mask, convert<T>, Mode, requires_arch<avx2_128>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
{
return _mm_maskload_epi32(reinterpret_cast<const int*>(mem), __m128i(mask));
}
else
{
return _mm_maskload_epi64(reinterpret_cast<const long long*>(mem), __m128i(mask));
}
}

// Runtime-mask store for 32/64-bit integers on AVX2-128. Same
// fault-suppression semantics as the masked loads above.
template <class A, class T, class Mode>
XSIMD_INLINE std::enable_if_t<std::is_integral<T>::value && (sizeof(T) == 4 || sizeof(T) == 8), void>
store_masked(T* mem, batch<T, A> const& src, batch_bool<T, A> mask, Mode, requires_arch<avx2_128>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
{
_mm_maskstore_epi32(reinterpret_cast<int*>(mem), __m128i(mask), __m128i(src));
}
else
{
_mm_maskstore_epi64(reinterpret_cast<long long*>(mem), __m128i(mask), __m128i(src));
}
}

// gather
template <class T, class A, class U, detail::enable_sized_integral_t<T, 4> = 0, detail::enable_sized_integral_t<U, 4> = 0>
XSIMD_INLINE batch<T, A> gather(batch<T, A> const&, T const* src, batch<U, A> const& index,
31 changes: 31 additions & 0 deletions include/xsimd/arch/xsimd_avx_128.hpp
@@ -115,6 +115,22 @@
return _mm_maskload_pd(mem, mask.as_batch());
}

// Runtime-mask load for float/double on AVX-128. Both aligned_mode and
// unaligned_mode map to _mm_maskload_* — the intrinsic does not fault
// on masked-off lanes, so partial loads across page boundaries are safe.
template <class A, class Mode>
XSIMD_INLINE batch<float, A>
load_masked(float const* mem, batch_bool<float, A> mask, convert<float>, Mode, requires_arch<avx_128>) noexcept
{
return _mm_maskload_ps(mem, _mm_castps_si128(mask));
}
template <class A, class Mode>
XSIMD_INLINE batch<double, A>
load_masked(double const* mem, batch_bool<double, A> mask, convert<double>, Mode, requires_arch<avx_128>) noexcept
{
return _mm_maskload_pd(mem, _mm_castpd_si128(mask));
}

// store_masked
template <class A, bool... Values, class Mode>
XSIMD_INLINE void store_masked(float* mem, batch<float, A> const& src, batch_bool_constant<float, A, Values...> mask, Mode, requires_arch<avx_128>) noexcept
@@ -128,6 +144,21 @@
return _mm_maskstore_pd(mem, mask.as_batch(), src);
}

// Runtime-mask store for float/double on AVX-128. Same fault-suppression
// semantics as the masked loads above; alignment mode is irrelevant.
template <class A, class Mode>
XSIMD_INLINE void
store_masked(float* mem, batch<float, A> const& src, batch_bool<float, A> mask, Mode, requires_arch<avx_128>) noexcept
{
_mm_maskstore_ps(mem, _mm_castps_si128(mask), src);
}
template <class A, class Mode>
XSIMD_INLINE void
store_masked(double* mem, batch<double, A> const& src, batch_bool<double, A> mask, Mode, requires_arch<avx_128>) noexcept
{
_mm_maskstore_pd(mem, _mm_castpd_si128(mask), src);
}

// swizzle (dynamic mask)
template <class A, class T, class ITy, class = std::enable_if_t<std::is_floating_point<T>::value && sizeof(T) == sizeof(ITy)>>
XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self, batch<ITy, A> mask, requires_arch<avx_128>) noexcept