diff --git a/LICENSES/XSIMD_LICENSE b/LICENSES/XSIMD_LICENSE
new file mode 100644
index 0000000000000..eee7a54bc956b
--- /dev/null
+++ b/LICENSES/XSIMD_LICENSE
@@ -0,0 +1,29 @@
+Copyright (c) 2016, Johan Mabille, Sylvain Corlay, Wolf Vollprecht and Martin Renou
+Copyright (c) 2016, QuantStack
+Copyright (c) 2018, Serge Guelton
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+* Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/meson.build b/meson.build
index d2874e85cc3a7..23d2717e237dd 100644
--- a/meson.build
+++ b/meson.build
@@ -10,7 +10,7 @@ project(
     default_options: [
         'buildtype=release',
         'c_std=c17',
-        'cpp_std=c++17',
+        'cpp_std=c++20',
         'warning_level=2',
     ],
 )
@@ -37,6 +37,7 @@ add_project_arguments(
 )
 
 cc = meson.get_compiler('c')
+cxx = meson.get_compiler('cpp')
 if cc.get_id() == 'msvc'
     # Tracking issue: https://github.com/pandas-dev/pandas/issues/63701
     # Ignore some MSVC specific warnings:
@@ -44,8 +45,9 @@ if cc.get_id() == 'msvc'
     # C4267: conversion from `size_t` to smaller type.
     # C4551: occurs due to Cython generating code with (void)func.
     #        https://github.com/cython/cython/issues/3579
+    # C4146: unary minus operator applied to unsigned type. Occurs in xsimd.
     add_project_arguments(
-        ['/wd4244', '/wd4267', '/wd4551'],
+        ['/wd4244', '/wd4267', '/wd4551', '/wd4146'],
         language: ['c', 'cpp'],
     )
 endif
diff --git a/pandas/_libs/algos.pxd b/pandas/_libs/algos.pxd
index e47f7d56ba343..5cb09973983c2 100644
--- a/pandas/_libs/algos.pxd
+++ b/pandas/_libs/algos.pxd
@@ -1,4 +1,5 @@
 cimport cython
+from cython cimport size_t
 from libc.math cimport (
     NAN,
     sqrt,
@@ -6,6 +7,7 @@ from libc.math cimport (
 from numpy cimport (
     float64_t,
     int64_t,
+    uint8_t,
 )
 
 from pandas._libs.dtypes cimport (
@@ -50,6 +52,22 @@ cdef inline void moments_add_value(
     mean[0] += delta_n
 
 
+cdef extern from "pandas/moments.h":
+    ctypedef struct Moments:
+        float64_t mean
+        float64_t m2
+        float64_t m3
+        float64_t m4
+        size_t n
+    # C entry point; pass mask=NULL when there is no mask array.
+    Moments moments_reduce(
+            const double *values,
+            size_t n,
+            bint skipna,
+            const uint8_t *mask,
+            int max_moment) noexcept nogil
+
+
 @cython.cdivision(True)
 cdef inline float64_t calc_skew(
     int64_t nobs, float64_t m2, float64_t m3
diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx
index 97aa64593a3fc..0cc8ef75baab1 100644
--- a/pandas/_libs/algos.pyx
+++ b/pandas/_libs/algos.pyx
@@ -1603,9 +1603,9 @@ def diff_2d(
 @cython.boundscheck(False)
 @cython.wraparound(False)
 cdef void accumulate_moments_scalar(
-    const float64_t[:] values,
+    const float64_t[::1] values,
     bint skipna,
-    const uint8_t[:] mask,
+    const uint8_t[::1] mask,
     int64_t* nobs,
     float64_t* mean,
     float64_t* m2,
@@ -1614,17 +1614,20 @@ cdef void accumulate_moments_scalar(
     int max_moment,
 ) noexcept nogil:
     cdef:
-        Py_ssize_t i, n = len(values)
-        bint uses_mask = mask is not None
-        float64_t val
+        Moments moments
+        const float64_t* values_ptr = &values[0]
+        const uint8_t* mask_ptr = &mask[0] if mask is not None else NULL
+        size_t n = <size_t>values.shape[0]
 
-    for i in range(n):
-        val = values[i]
-        if uses_mask and mask[i]:
-            val = NaN
-        if skipna and isnan(val):
-            continue
-        moments_add_value(val, nobs, mean, m2, m3, m4, max_moment)
+    moments = moments_reduce(values_ptr, n, skipna, mask_ptr, max_moment)
+    if max_moment >= 4:
+        m4[0] = moments.m4
+    if max_moment >= 3:
+        m3[0] = moments.m3
+
+    m2[0] = moments.m2
+    mean[0] = moments.mean
+    nobs[0] = <int64_t>moments.n
 
 
 @cython.boundscheck(False)
@@ -1676,9 +1679,9 @@ cdef void accumulate_moments_axis(
 @cython.boundscheck(False)
 @cython.wraparound(False)
 def scalar_skew(
-    const float64_t[:] values,
+    const float64_t[::1] values,
     bint skipna,
-    const uint8_t[:] mask,
+    const uint8_t[::1] mask,
 ) -> float:
     cdef:
         int64_t nobs = 0
@@ -1693,9 +1696,9 @@ def scalar_skew(
 @cython.boundscheck(False)
 @cython.wraparound(False)
 def scalar_kurt(
-    const float64_t[:] values,
+    const float64_t[::1] values,
     bint skipna,
-    const uint8_t[:] mask,
+    const uint8_t[::1] mask,
 ) -> float:
     cdef:
         int64_t nobs = 0
diff --git a/pandas/_libs/include/pandas/moments.h b/pandas/_libs/include/pandas/moments.h
new file mode 100644
index 0000000000000..98f1f1a1e31f8
--- /dev/null
+++ b/pandas/_libs/include/pandas/moments.h
@@ -0,0 +1,33 @@
+/*
+Copyright (c) 2026, PyData Development Team
+All rights reserved.
+
+Distributed under the terms of the BSD Simplified License.
+
+The full license is in the LICENSE file, distributed with this software.
+*/
+
+#pragma once
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+  double mean; // running mean of the accumulated samples
+  double m2;   // sum of squared deviations from the mean
+  double m3;   // sum of cubed deviations (updated when max_moment >= 3)
+  double m4;   // sum of 4th-power deviations (updated when max_moment >= 4)
+  size_t n;    // number of samples counted
+} Moments;
+
+/// Central moments of `values[0..n)` up to `max_moment`; `mask` may be NULL.
+Moments moments_reduce(const double *values, size_t n, bool skipna,
+                       const uint8_t *mask, int max_moment);
+#ifdef __cplusplus
+}
+#endif
diff --git a/pandas/_libs/include/pandas/simd/moments_simd.hpp b/pandas/_libs/include/pandas/simd/moments_simd.hpp
new file mode 100644
index 0000000000000..fbedeea0dd932
--- /dev/null
+++ b/pandas/_libs/include/pandas/simd/moments_simd.hpp
@@ -0,0 +1,388 @@
+/*
+Copyright (c) 2026, PyData Development Team
+All rights reserved.
+
+Distributed under the terms of the BSD Simplified License.
+
+The full license is in the LICENSE file, distributed with this software.
+*/
+
+#pragma once
+
+#include "pandas/moments.h"
+#include "pandas_simd_config.h"
+#include "xsimd/xsimd.hpp"
+#include <cassert>
+#include <cmath>
+#include <optional>
+#include <span>
+
+namespace pandas::moments {
+
+namespace detail {
+/// Fold one sample `val` into the running `moments` in a single pass.
+static inline void moments_add_value(Moments &moments, double val,
+                                     int max_moment) {
+  const auto delta = val - moments.mean;
+  moments.n++;
+  const auto n = static_cast<double>(moments.n);
+  const auto delta_n = delta / n;
+  const auto term1 = delta * delta_n * (n - 1.0);
+  // m4 and m3 are updated first: they read m3/m2 before those change below.
+  if (max_moment >= 4) {
+    const auto m3_term = -4.0 * moments.m3;
+    const auto m2_term = 6.0 * moments.m2;
+    const auto m0_term = (n * (n - 3.0)) + 3.0;
+    moments.m4 +=
+        delta_n * (m3_term + (delta_n * (m2_term + (term1 * m0_term))));
+  }
+  if (max_moment >= 3) {
+    const auto m2_term = -3.0 * moments.m2;
+    const auto m0_term = n - 2.0;
+    moments.m3 += delta_n * (m2_term + (term1 * m0_term));
+  }
+  moments.m2 += term1;
+  moments.mean += delta_n;
+}
+/// Batch form of moments_add_value: lanes set in `nan_mask` get a zero delta.
+template <class batch_type>
+void update_moments_batch(batch_type &mean, batch_type &m2, batch_type &m3,
+                          batch_type &m4, batch_type &nobs, batch_type val,
+                          typename batch_type::batch_bool_type nan_mask,
+                          int max_moment) {
+  const batch_type zero(0.0);
+  const batch_type one(1.0);
+  const batch_type three(3.0);
+
+  const auto nobs_increment = xsimd::select(nan_mask, zero, one);
+  nobs += nobs_increment;
+  // Clamp the divisor so lanes that have seen no value do not divide by zero.
+  const auto n_nonzero = xsimd::max(nobs, one);
+  const auto delta = xsimd::select(nan_mask, zero, val - mean);
+  const auto delta_n = delta / n_nonzero;
+  const auto delta_n2 = delta * delta_n;
+  const auto term1 = delta_n2 * (nobs - one);
+
+  if (max_moment >= 4) {
+    const auto m3_term = batch_type(-4.0) * m3;
+    const auto m2_term = batch_type(6.0) * m2;
+    const auto m0_term = (nobs * (nobs - three)) + three;
+    m4 += delta_n * (m3_term + (delta_n * (m2_term + (term1 * m0_term))));
+  }
+
+  if (max_moment >= 3) {
+    const auto m2_term = three * m2;
+    const auto m0_term = nobs - batch_type(2.0);
+    m3 += delta_n * ((term1 * m0_term) - m2_term);
+  }
+
+  m2 += term1;
+  mean += delta_n;
+}
+
+/// Merge the per-lane moment accumulators into a single result.
+/// It uses the formula for merging central moments:
+/// $M_{p; N} = \sum_{k=1}^l \sum_{j=0}^p
+/// \binom{p}{j} M_{p-j; k} (-\frac{\delta_k}{N})^j$
+/// where $\delta_k = \sum_{j=1}^l n_j * (\bar{x}_j - \bar{x}_k)$.
+template <class batch_type>
+Moments merge_batches(batch_type &nobs, batch_type &mean, batch_type &m2,
+                      batch_type &m3, batch_type &m4, int max_moment) {
+  constexpr std::size_t step = batch_type::size;
+  Moments result{};
+
+  const auto total_n = xsimd::reduce_add(nobs);
+  assert(total_n >= 0);
+  result.n = static_cast<std::size_t>(total_n);
+
+  if (result.n == 0) {
+    return result;
+  }
+  // Rotating step-1 times pairs every lane k with each other lane j once.
+  const auto mean_orig = mean;
+  batch_type delta(0.0);
+  for (std::size_t i = 0; i + 1 < step; ++i) {
+    mean = xsimd::rotate_left<1>(mean);
+    nobs = xsimd::rotate_left<1>(nobs);
+    delta += nobs * (mean - mean_orig);
+  }
+
+  // Restore mean and nobs to their original lane positions
+  mean = xsimd::rotate_left<1>(mean);
+  nobs = xsimd::rotate_left<1>(nobs);
+  // delta / N is the offset from each lane's mean to the combined mean.
+  const batch_type total_n_v(total_n);
+  const auto delta_n = delta / total_n_v;
+  const auto delta2_n = delta_n * delta_n;
+
+  if (max_moment >= 4) {
+    const auto m3_term = batch_type(-4.0) * m3;
+    const auto m2_term = batch_type(6.0) * m2;
+    const auto m4_acc =
+        m4 + (delta_n * (m3_term + (delta_n * (m2_term + (delta2_n * nobs)))));
+    result.m4 = xsimd::reduce_add(m4_acc);
+  }
+
+  if (max_moment >= 3) {
+    const auto m2_term = batch_type(-3.0) * m2;
+    const auto m3_acc = m3 + (delta_n * (m2_term - (nobs * delta2_n)));
+    result.m3 = xsimd::reduce_add(m3_acc);
+  }
+
+  const auto m2_acc = m2 + (nobs * delta2_n);
+  result.m2 = xsimd::reduce_add(m2_acc);
+  // mean + delta_n is the combined mean in every lane; read the first one.
+  const auto mean_v = mean + delta_n;
+  result.mean = mean_v.first();
+
+  [[maybe_unused]] constexpr double rtol = 1e-12;
+  [[maybe_unused]] constexpr double atol = 1e-8;
+  assert((xsimd::all(xsimd::isnan(mean_v)) ||
+          xsimd::all(xsimd::abs(mean_v - result.mean) <
+                     ((rtol * xsimd::abs(result.mean)) + atol))) &&
+         "mean lanes aren't homogeneous after merge");
+
+  return result;
+}
+/// Merge the moments accumulated in `src` into `acc`. Formulas from:
+/// https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Higher-order_statistics
+static inline void moments_merge(Moments &acc, const Moments &src,
+                                 int max_moment) {
+  if (acc.n == 0) {
+    acc = src;
+    return;
+  }
+  if (src.n == 0) {
+    return;
+  }
+  // Take both counts as doubles before acc.n becomes the combined count.
+  const auto n_a = static_cast<double>(acc.n);
+  const auto n_b = static_cast<double>(src.n);
+  acc.n += src.n;
+
+  const auto delta = src.mean - acc.mean;
+  const auto delta_n = delta / static_cast<double>(acc.n);
+  const auto term1 = delta * delta_n * n_a * n_b;
+  // m4, then m3: each needs acc's lower moments before they are updated.
+  if (max_moment >= 4) {
+    const auto m3_term = 4.0 * ((n_a * src.m3) - (n_b * acc.m3));
+    const auto m2_term = 6.0 * ((n_a * n_a * src.m2) + (n_b * n_b * acc.m2));
+    const auto m0_term = (n_a * n_a) - (n_a * n_b) + (n_b * n_b);
+    acc.m4 += src.m4 +
+              (delta_n * (m3_term + (delta_n * (m2_term + (term1 * m0_term)))));
+  }
+
+  if (max_moment >= 3) {
+    const auto m2_term = 3.0 * ((n_a * src.m2) - (n_b * acc.m2));
+    const auto m0_term = n_a - n_b;
+    acc.m3 += src.m3 + (delta_n * (m2_term + (term1 * m0_term)));
+  }
+
+  acc.m2 += src.m2 + term1;
+  acc.mean += delta_n * n_b;
+}
+/// Poison all moments with NaN and set every lane's count to `total_n`.
+template <class batch_type>
+static inline void set_moments_nan(batch_type &mean, batch_type &m2,
+                                   batch_type &m3, batch_type &m4,
+                                   batch_type &nobs, std::size_t total_n) {
+  nobs = batch_type(static_cast<double>(total_n));
+  mean = batch_type(NAN);
+  m2 = batch_type(NAN);
+  m3 = batch_type(NAN);
+  m4 = batch_type(NAN);
+}
+/// Fold `values`, a whole number of batches, into the per-lane accumulators.
+template <class Arch>
+void accumulate_moments_simd_impl(xsimd::batch<double, Arch> &nobs,
+                                  xsimd::batch<double, Arch> &mean,
+                                  xsimd::batch<double, Arch> &m2,
+                                  xsimd::batch<double, Arch> &m3,
+                                  xsimd::batch<double, Arch> &m4,
+                                  int max_moment,
+                                  std::span<const double> values, bool skipna) {
+  using batch_type = xsimd::batch<double, Arch>;
+  constexpr std::size_t step = batch_type::size;
+  assert(values.size() % step == 0);
+
+  for (std::size_t i = 0; i < values.size(); i += step) {
+    auto val = xsimd::load_unaligned<Arch>(&values[i]);
+    auto nan_mask = xsimd::isnan(val);
+    // Without skipna the first NaN decides the result, so stop right there.
+    if (!skipna && xsimd::any(nan_mask)) [[unlikely]] {
+      const std::size_t nobs_per_lane = values.size() / step;
+      set_moments_nan(mean, m2, m3, m4, nobs, nobs_per_lane);
+      return;
+    }
+
+    detail::update_moments_batch(mean, m2, m3, m4, nobs, val, nan_mask,
+                                 max_moment);
+  }
+}
+/// Masked variant: a non-zero `mask` byte marks the matching value missing.
+template <class A>
+void accumulate_moments_simd_masked_impl(
+    xsimd::batch<double, A> &nobs, xsimd::batch<double, A> &mean,
+    xsimd::batch<double, A> &m2, xsimd::batch<double, A> &m3,
+    xsimd::batch<double, A> &m4, int max_moment, std::span<const double> values,
+    std::span<const uint8_t> mask, bool skipna) {
+  using mask_batch_type = xsimd::batch<uint8_t, A>;
+  using value_batch_type = xsimd::batch<double, A>;
+  constexpr std::size_t mask_step = mask_batch_type::size;
+  constexpr std::size_t val_step = value_batch_type::size;
+
+  assert(values.size() == mask.size());
+  assert(mask.size() % mask_step == 0);
+  // `left` is the start of the pending run of blocks without masked entries.
+  std::size_t left = 0;
+  for (std::size_t right = 0; right < mask.size(); right += mask_step) {
+    const mask_batch_type mask_batch = xsimd::load_unaligned<A>(&mask[right]);
+    const auto is_masked = mask_batch != mask_batch_type(0U);
+    // Blocks with no masked entry are deferred and flushed in bulk later.
+    if (!xsimd::any(is_masked)) {
+      continue;
+    }
+    // A masked entry without skipna makes the whole result NaN.
+    if (!skipna) {
+      const std::size_t nobs_per_lane = values.size() / val_step;
+      set_moments_nan(mean, m2, m3, m4, nobs, nobs_per_lane);
+      return;
+    }
+
+    // Flush the pending run; NaN values aren't skipped when there is a mask
+    accumulate_moments_simd_impl<A>(nobs, mean, m2, m3, m4, max_moment,
+                                    values.subspan(left, right - left),
+                                    /*skipna=*/false);
+
+    const std::uint64_t is_masked_bitmask = is_masked.mask();
+    static_assert(mask_step % val_step == 0);
+    // Build the group mask in 64 bits instead of shifting a plain `int`.
+    const std::uint64_t lane_values_mask = (std::uint64_t{1} << val_step) - 1;
+    for (std::size_t i = 0; i < mask_step; i += val_step) {
+      const std::uint64_t lane_mask_bits =
+          (is_masked_bitmask >> i) & lane_values_mask;
+      const auto isna_pd =
+          xsimd::batch_bool<double, A>::from_mask(lane_mask_bits);
+
+      const value_batch_type val = xsimd::load_unaligned<A>(&values[right + i]);
+
+      detail::update_moments_batch(mean, m2, m3, m4, nobs, val, isna_pd,
+                                   max_moment);
+    }
+
+    left = right + mask_step;
+  }
+  // Flush whatever unmasked run is still pending after the last block.
+  accumulate_moments_simd_impl<A>(nobs, mean, m2, m3, m4, max_moment,
+                                  values.last(values.size() - left), false);
+}
+} // namespace detail
+/// Functor for xsimd::dispatch; the `Arch` tag selects the implementation.
+struct accumulate_moments_simd {
+  template <class Arch>
+  Moments operator()(Arch, std::span<const double> values, bool skipna,
+                     std::optional<std::span<const uint8_t>> mask,
+                     int max_moment) noexcept;
+};
+/// Scalar fallback: plain element-by-element accumulation, no SIMD batches.
+template <>
+inline Moments accumulate_moments_simd::operator()<xsimd::common>(
+    xsimd::common, std::span<const double> values, bool skipna,
+    std::optional<std::span<const uint8_t>> mask, int max_moment) noexcept {
+  Moments acc{};
+  for (std::size_t i = 0; i < values.size(); i++) {
+    const auto val = values[i];
+    const auto isna_entry =
+        mask.has_value() ? (*mask)[i] != 0 : std::isnan(val);
+    // With a mask, only the mask decides missingness; NaN values pass through.
+    if (skipna && isna_entry) {
+      continue;
+    }
+    if (isna_entry) [[unlikely]] {
+      return {.mean = NAN, .m2 = NAN, .m3 = NAN, .m4 = NAN, .n = values.size()};
+    }
+    detail::moments_add_value(acc, val, max_moment);
+  }
+  return acc;
+}
+/// SIMD path: whole batches go through `Arch`, the leftover tail is scalar.
+template <class Arch>
+Moments accumulate_moments_simd::operator()(
+    Arch, std::span<const double> values, bool skipna,
+    std::optional<std::span<const uint8_t>> mask, int max_moment) noexcept {
+  using values_batch_type = xsimd::batch<double, Arch>;
+  values_batch_type mean(0.0);
+  values_batch_type m2(0.0);
+  values_batch_type m3(0.0);
+  values_batch_type m4(0.0);
+  values_batch_type nobs(0.0);
+
+  std::size_t vec_size;
+  std::size_t tail_size;
+  // Block size: uint8 batch width with a mask, double batch width without.
+  if (mask.has_value()) {
+    using mask_batch_type = xsimd::batch<uint8_t, Arch>;
+    constexpr std::size_t batch_size = mask_batch_type::size;
+    tail_size = values.size() % batch_size;
+    vec_size = values.size() - tail_size;
+
+    detail::accumulate_moments_simd_masked_impl<Arch>(
+        nobs, mean, m2, m3, m4, max_moment, values.first(vec_size),
+        mask->first(vec_size), skipna);
+  } else {
+    constexpr std::size_t batch_size = values_batch_type::size;
+    tail_size = values.size() % batch_size;
+    vec_size = values.size() - tail_size;
+
+    detail::accumulate_moments_simd_impl<Arch>(
+        nobs, mean, m2, m3, m4, max_moment, values.first(vec_size), skipna);
+  }
+  // Collapse the lanes into one result, then merge the scalar-processed tail.
+  Moments moments_acc =
+      detail::merge_batches(nobs, mean, m2, m3, m4, max_moment);
+
+  auto values_tail = values.last(tail_size);
+  std::optional<std::span<const uint8_t>> mask_tail{};
+  if (mask.has_value()) {
+    mask_tail = mask->last(tail_size);
+  }
+
+  Moments tail = accumulate_moments_simd{}(xsimd::common{}, values_tail, skipna,
+                                           mask_tail, max_moment);
+  detail::moments_merge(moments_acc, tail, max_moment);
+
+  return moments_acc;
+}
+// Defined in moments_inst_<arch>.cpp, each compiled with that arch's flags.
+extern template Moments accumulate_moments_simd::operator()<xsimd::avx512cd>(
+    xsimd::avx512cd, std::span<const double>, bool,
+    std::optional<std::span<const uint8_t>>, int) noexcept;
+extern template Moments accumulate_moments_simd::operator()<xsimd::avx2>(
+    xsimd::avx2, std::span<const double>, bool,
+    std::optional<std::span<const uint8_t>>, int) noexcept;
+extern template Moments accumulate_moments_simd::operator()<xsimd::sse2>(
+    xsimd::sse2, std::span<const double>, bool,
+    std::optional<std::span<const uint8_t>>, int) noexcept;
+extern template Moments accumulate_moments_simd::operator()<xsimd::neon64>(
+    xsimd::neon64, std::span<const double>, bool,
+    std::optional<std::span<const uint8_t>>, int) noexcept;
+// Candidate architectures for xsimd::dispatch, gated by pandas_simd_config.h.
+// TODO: remove macro
+using arch_list = xsimd::arch_list<>
+#if PANDAS_HAVE_AVX512CD
+    ::add<xsimd::avx512cd>
+#endif
+#if PANDAS_HAVE_AVX2
+    ::add<xsimd::avx2>
+#endif
+#if PANDAS_HAVE_SSE2
+    ::add<xsimd::sse2>
+#endif
+#if PANDAS_HAVE_NEON
+    ::add<xsimd::neon64>
+#endif
+#if PANDAS_HAVE_SCALAR
+    ::add<xsimd::common>
+#endif
+    ;
+} // namespace pandas::moments
diff --git a/pandas/_libs/meson.build b/pandas/_libs/meson.build
index 56ff2a01b450c..fd758c787eb7e 100644
--- a/pandas/_libs/meson.build
+++ b/pandas/_libs/meson.build
@@ -53,6 +53,9 @@ _khash_primitive_helper_dep = declare_dependency(
 m_dep = cc.find_library('m', required: false)
 fast_float = subproject('fast_float')
 fast_float_dep = fast_float.get_variable('fast_float_dep')
+xsimd_dep = dependency('xsimd', version: '>=14.2')
+
+subdir('simd')
 
 subdir('tslibs')
 
@@ -61,7 +64,7 @@ libs_sources = {
     # numpy include dir is implicitly included
     'algos': {
         'sources': ['algos.pyx', _algos_common_helper, _algos_take_helper],
-        'deps': [_khash_primitive_helper_dep, m_dep],
+        'deps': [_khash_primitive_helper_dep, m_dep, moments_simd_dep],
     },
     'arrays': {'sources': ['arrays.pyx']},
     'groupby': {'sources': ['groupby.pyx'], 'deps': [m_dep]},
diff --git a/pandas/_libs/simd/meson.build b/pandas/_libs/simd/meson.build
new file mode 100644
index 0000000000000..b59e1c25efaa7
--- /dev/null
+++ b/pandas/_libs/simd/meson.build
@@ -0,0 +1,55 @@
+# All architectures we might support
+# Key is the architecture name used in file suffixes and macros
+is_msvc_syntax = cxx.get_argument_syntax() == 'msvc'
+simd_x86_flags = {
+    'sse2': is_msvc_syntax ? ['/arch:SSE2'] : ['-msse2'],
+    'avx2': is_msvc_syntax ? ['/arch:AVX2'] : ['-mavx2'],
+    'avx512cd': is_msvc_syntax ? ['/arch:AVX512'] : ['-mavx512cd'],
+}
+
+simd_config = configuration_data()
+supported_simd_archs = {}
+if host_machine.cpu_family() == 'aarch64'
+    supported_simd_archs += {'neon': []}
+    simd_config.set('PANDAS_HAVE_NEON', 1)
+elif host_machine.cpu_family() in ['x86', 'x86_64']
+    foreach name, flags : simd_x86_flags
+        if cxx.has_multi_arguments(flags)
+            supported_simd_archs += {name: flags}
+            simd_config.set('PANDAS_HAVE_@0@'.format(name.to_upper()), 1)
+        endif
+    endforeach
+endif
+
+# Ensure scalar version on all architectures for now...
+simd_config.set('PANDAS_HAVE_SCALAR', 1)
+
+configure_file(
+    output: 'pandas_simd_config.h',
+    configuration: simd_config,
+)
+
+simd_config_inc = include_directories('.')
+# One static library per SIMD arch so only that code uses the arch's flags.
+moments_libs = []
+foreach arch_name, arch_flags : supported_simd_archs
+    moments_libs += static_library(
+        'moments_simd_@0@'.format(arch_name),
+        'moments_inst_@0@.cpp'.format(arch_name),
+        include_directories: [inc_pd, simd_config_inc],
+        dependencies: [xsimd_dep],
+        cpp_args: arch_flags,
+    )
+endforeach
+# Dispatcher and C entry point; built without any arch-specific flags.
+moments_libs += static_library(
+    'moments_base',
+    'moments.cpp',
+    include_directories: [inc_pd, simd_config_inc],
+    dependencies: [xsimd_dep],
+)
+
+moments_simd_dep = declare_dependency(
+    link_with: moments_libs,
+    include_directories: [inc_pd],
+)
diff --git a/pandas/_libs/simd/moments.cpp b/pandas/_libs/simd/moments.cpp
new file mode 100644
index 0000000000000..008873cba0226
--- /dev/null
+++ b/pandas/_libs/simd/moments.cpp
@@ -0,0 +1,24 @@
+/*
+Copyright (c) 2026, PyData Development Team
+All rights reserved.
+
+Distributed under the terms of the BSD Simplified License.
+
+The full license is in the LICENSE file, distributed with this software.
+*/
+
+#include "pandas/moments.h"
+#include "pandas/simd/moments_simd.hpp"
+#include <optional>
+#include <span>
+/// C entry point: wraps the raw pointers in spans for xsimd::dispatch.
+Moments moments_reduce(const double *values, size_t n, bool skipna,
+                       const uint8_t *mask, int max_moment) {
+  auto moments_dispatch = xsimd::dispatch<pandas::moments::arch_list>(
+      pandas::moments::accumulate_moments_simd{});
+  std::span<const double> values_span(values, n);
+  std::optional<std::span<const uint8_t>> mask_span{}; // empty == no mask
+  if (mask != nullptr)
+    mask_span = std::span(mask, n);
+  return moments_dispatch(values_span, skipna, mask_span, max_moment);
+}
diff --git a/pandas/_libs/simd/moments_inst_avx2.cpp b/pandas/_libs/simd/moments_inst_avx2.cpp
new file mode 100644
index 0000000000000..974060c5d960d
--- /dev/null
+++ b/pandas/_libs/simd/moments_inst_avx2.cpp
@@ -0,0 +1,18 @@
+/*
+Copyright (c) 2026, PyData Development Team
+All rights reserved.
+
+Distributed under the terms of the BSD Simplified License.
+
+The full license is in the LICENSE file, distributed with this software.
+*/
+
+#include "pandas/simd/moments_simd.hpp"
+
+namespace pandas::moments {
+// Explicit instantiation; this file is compiled with the AVX2 flags.
+template Moments accumulate_moments_simd::operator()<xsimd::avx2>(
+    xsimd::avx2, std::span<const double>, bool,
+    std::optional<std::span<const uint8_t>>, int) noexcept;
+
+} // namespace pandas::moments
diff --git a/pandas/_libs/simd/moments_inst_avx512cd.cpp b/pandas/_libs/simd/moments_inst_avx512cd.cpp
new file mode 100644
index 0000000000000..cf5ad0a8d08fc
--- /dev/null
+++ b/pandas/_libs/simd/moments_inst_avx512cd.cpp
@@ -0,0 +1,18 @@
+/*
+Copyright (c) 2026, PyData Development Team
+All rights reserved.
+
+Distributed under the terms of the BSD Simplified License.
+
+The full license is in the LICENSE file, distributed with this software.
+*/
+
+#include "pandas/simd/moments_simd.hpp"
+
+namespace pandas::moments {
+// Explicit instantiation; this file is compiled with the AVX-512 flags.
+template Moments accumulate_moments_simd::operator()<xsimd::avx512cd>(
+    xsimd::avx512cd, std::span<const double>, bool,
+    std::optional<std::span<const uint8_t>>, int) noexcept;
+
+} // namespace pandas::moments
diff --git a/pandas/_libs/simd/moments_inst_neon.cpp b/pandas/_libs/simd/moments_inst_neon.cpp
new file mode 100644
index 0000000000000..34072425e0d44
--- /dev/null
+++ b/pandas/_libs/simd/moments_inst_neon.cpp
@@ -0,0 +1,18 @@
+/*
+Copyright (c) 2026, PyData Development Team
+All rights reserved.
+
+Distributed under the terms of the BSD Simplified License.
+
+The full license is in the LICENSE file, distributed with this software.
+*/
+
+#include "pandas/simd/moments_simd.hpp"
+
+namespace pandas::moments {
+// Explicit instantiation; the aarch64 build passes no extra flags for it.
+template Moments accumulate_moments_simd::operator()<xsimd::neon64>(
+    xsimd::neon64, std::span<const double>, bool,
+    std::optional<std::span<const uint8_t>>, int) noexcept;
+
+} // namespace pandas::moments
diff --git a/pandas/_libs/simd/moments_inst_sse2.cpp b/pandas/_libs/simd/moments_inst_sse2.cpp
new file mode 100644
index 0000000000000..240bc54706664
--- /dev/null
+++ b/pandas/_libs/simd/moments_inst_sse2.cpp
@@ -0,0 +1,18 @@
+/*
+Copyright (c) 2026, PyData Development Team
+All rights reserved.
+
+Distributed under the terms of the BSD Simplified License.
+
+The full license is in the LICENSE file, distributed with this software.
+*/
+
+#include "pandas/simd/moments_simd.hpp"
+
+namespace pandas::moments {
+// Explicit instantiation; this file is compiled with the SSE2 flags.
+template Moments accumulate_moments_simd::operator()<xsimd::sse2>(
+    xsimd::sse2, std::span<const double>, bool,
+    std::optional<std::span<const uint8_t>>, int) noexcept;
+
+} // namespace pandas::moments
diff --git a/pandas/tests/arrays/floating/test_function.py b/pandas/tests/arrays/floating/test_function.py
index a5938b2aa93e8..7e4da94cc7919 100644
--- a/pandas/tests/arrays/floating/test_function.py
+++ b/pandas/tests/arrays/floating/test_function.py
@@ -116,7 +116,7 @@ def test_stat_method(pandasmethname, kwargs):
     s2 = pd.Series(data=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6], dtype="float64")
     pandasmeth = getattr(s2, pandasmethname)
     expected = pandasmeth(**kwargs)
-    assert expected == result
+    tm.assert_almost_equal(result, expected)
 
 
 def test_value_counts_na():
diff --git a/pyproject.toml b/pyproject.toml
index 9160526cf0f1a..40cfca5ac638b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -38,6 +38,7 @@ license-files = [
   "LICENSES/PYUPGRADE_LICENSE", # MIT
   "LICENSES/SAS7BDAT_LICENSE", # MIT
   "LICENSES/ULTRAJSON_LICENSE", # BSD-3-Clause AND TCL
+  "LICENSES/XSIMD_LICENSE", # BSD-3-Clause
   "subprojects/fast_float-*/LICENSE-APACHE", # Apache-2.0
   "subprojects/fast_float-*/LICENSE-BOOST", # BSL
   "subprojects/fast_float-*/LICENSE-MIT", # MIT
diff --git a/subprojects/packagefiles/xsimd/meson.build b/subprojects/packagefiles/xsimd/meson.build
new file mode 100644
index 0000000000000..595fb6122ef51
--- /dev/null
+++ b/subprojects/packagefiles/xsimd/meson.build
@@ -0,0 +1,12 @@
+project(
+    'xsimd',
+    'cpp',
+    meson_version: '>=0.58.0',
+    license: 'BSD-3-Clause',
+    version: '14.2.0',
+)
+
+xsimd_inc = include_directories('include')
+
+xsimd_dep = declare_dependency(include_directories: xsimd_inc)
+meson.override_dependency('xsimd', xsimd_dep)
diff --git a/subprojects/xsimd.wrap b/subprojects/xsimd.wrap
new file mode 100644
index 0000000000000..39706456be925
--- /dev/null
+++ b/subprojects/xsimd.wrap
@@ -0,0 +1,9 @@
+[wrap-file]
+directory = xsimd-14.2.0
+source_url = https://github.com/xtensor-stack/xsimd/archive/refs/tags/14.2.0.tar.gz
+source_filename = xsimd-14.2.0.tar.gz
+source_hash = 21e841ab684b05331e81e7f782431753a029ef7b7d9d6d3ddab837e7782a40ee
+patch_directory = xsimd
+
+[provide]
+dependency_names = xsimd