Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions LICENSES/XSIMD_LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
Copyright (c) 2016, Johan Mabille, Sylvain Corlay, Wolf Vollprecht and Martin Renou
Copyright (c) 2016, QuantStack
Copyright (c) 2018, Serge Guelton
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.

* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
4 changes: 3 additions & 1 deletion meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -37,15 +37,17 @@ add_project_arguments(
)

cc = meson.get_compiler('c')
cxx = meson.get_compiler('cpp')
if cc.get_id() == 'msvc'
# Tracking issue: https://github.com/pandas-dev/pandas/issues/63701
# Ignore some MSVC specific warnings:
# C4244: possible loss of data in conversion. Reproducible with `-Wconversion`.
# C4267: conversion from `size_t` to smaller type.
# C4551: occurs due to Cython generating code with (void)func.
# https://github.com/cython/cython/issues/3579
# C4146: unary minus operator applied to unsigned type. Occurs in xsimd.
add_project_arguments(
['/wd4244', '/wd4267', '/wd4551'],
['/wd4244', '/wd4267', '/wd4551', '/wd4146'],
language: ['c', 'cpp'],
)
endif
Expand Down
40 changes: 40 additions & 0 deletions pandas/_libs/include/pandas/parser/simd_scan.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
/*
Copyright (c) 2026, PyData Development Team
All rights reserved.

Distributed under the terms of the BSD Simplified License.

The full license is in the LICENSE file, distributed with this software.
*/

#pragma once

#include <stddef.h>

#ifdef __cplusplus
extern "C" {
#endif

// Smallest input length worth handing to pd_scanner_scan. The scanner
// only examines whole SIMD chunks and a chunk is at least this many bytes
// wide, so a shorter input always reports 0 bytes scanned. Callers should
// fall through to the scalar path when fewer bytes remain.
#define PD_SCAN_MIN_BYTES 16

// Opaque scanner handle; create with pd_scanner_create and release with
// pd_scanner_destroy.
typedef struct pd_scanner pd_scanner;

// Build a scanner that halts on any of `n` special bytes. Supported
// values for `n` are 2 (quoted-field scan) and 6 (unquoted-field scan).
// `chars` must point to at least `n` readable bytes; repeated values are
// fine. The bytes are copied into the scanner, so `chars` does not need
// to outlive the call.
// Returns NULL on allocation failure or unsupported `n`. A non-NULL
// result must be released with pd_scanner_destroy.
pd_scanner *pd_scanner_create(const char *chars, int n);

// Free a scanner returned by pd_scanner_create. Accepts NULL, in which
// case nothing happens.
void pd_scanner_destroy(pd_scanner *scanner);

// Scans data[0..len) one whole SIMD chunk at a time and returns the byte
// offset of the first special char found. If no chunk contains a special
// char, returns the number of bytes covered by whole chunks, which can be
// smaller than `len`: the trailing partial chunk is never examined, so the
// caller's scalar fallback must handle it. For a scanner built by
// pd_scanner_create, every byte before the returned offset is non-special.
// `scanner` must not be NULL.
size_t pd_scanner_scan(const pd_scanner *scanner, const char *data, size_t len);

#ifdef __cplusplus
}
#endif
12 changes: 9 additions & 3 deletions pandas/_libs/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@ _khash_primitive_helper_dep = declare_dependency(
m_dep = cc.find_library('m', required: false)
fast_float = subproject('fast_float')
fast_float_dep = fast_float.get_variable('fast_float_dep')
xsimd_dep = dependency('xsimd', version: '>=14.2')

subdir('simd')

subdir('tslibs')

Expand Down Expand Up @@ -93,8 +96,9 @@ libs_sources = {
'lib.pyx',
'src/parser/tokenizer.c',
'src/parser/fast_float_strtod.cpp',
'src/parser/simd_scan.cpp',
],
'deps': [fast_float_dep],
'deps': [fast_float_dep, xsimd_dep],
},
'missing': {'sources': ['missing.pyx']},
'pandas_datetime': {
Expand All @@ -109,19 +113,21 @@ libs_sources = {
'sources': [
'src/parser/tokenizer.c',
'src/parser/fast_float_strtod.cpp',
'src/parser/simd_scan.cpp',
'src/parser/io.c',
'src/parser/pd_parser.c',
],
'deps': [fast_float_dep],
'deps': [fast_float_dep, xsimd_dep],
},
'parsers': {
'sources': [
'parsers.pyx',
'src/parser/tokenizer.c',
'src/parser/fast_float_strtod.cpp',
'src/parser/simd_scan.cpp',
'src/parser/io.c',
],
'deps': [fast_float_dep, _khash_primitive_helper_dep],
'deps': [fast_float_dep, xsimd_dep, _khash_primitive_helper_dep],
},
'_ujson': {
'sources': [
Expand Down
32 changes: 32 additions & 0 deletions pandas/_libs/simd/meson.build
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# All architectures we might support
# Key is the architecture name used in file suffixes and macros; the value is
# the list of compiler flags that enables that instruction set, chosen to
# match the argument syntax of the C++ compiler (`cxx`).
is_msvc_syntax = cxx.get_argument_syntax() == 'msvc'
simd_x86_flags = {
'sse2': is_msvc_syntax ? ['/arch:SSE2'] : ['-msse2'],
'avx2': is_msvc_syntax ? ['/arch:AVX2'] : ['-mavx2'],
'avx512cd': is_msvc_syntax ? ['/arch:AVX512'] : ['-mavx512cd'],
}

# `supported_simd_archs` maps each usable architecture name to its extra
# compiler flags; `simd_config` collects a PANDAS_HAVE_<NAME> define for each
# of them.
simd_config = configuration_data()
supported_simd_archs = {}
if host_machine.cpu_family() == 'aarch64'
# NEON is registered with an empty flag list.
supported_simd_archs += {'neon': []}
simd_config.set('PANDAS_HAVE_NEON', 1)
elif host_machine.cpu_family() in ['x86', 'x86_64']
# Keep only the x86 levels whose flags the C++ compiler accepts.
foreach name, flags : simd_x86_flags
if cxx.has_multi_arguments(flags)
supported_simd_archs += {name: flags}
simd_config.set('PANDAS_HAVE_@0@'.format(name.to_upper()), 1)
endif
endforeach
endif

# Ensure scalar version on all architectures for now...
simd_config.set('PANDAS_HAVE_SCALAR', 1)

# Write the collected defines to the generated header pandas_simd_config.h.
configure_file(
output: 'pandas_simd_config.h',
configuration: simd_config,
)

# Include path for this directory; presumably what lets sources include the
# generated header -- confirm against the targets that use it.
simd_config_inc = include_directories('.')
92 changes: 92 additions & 0 deletions pandas/_libs/src/parser/simd_scan.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
/*
Copyright (c) 2026, PyData Development Team
All rights reserved.

Distributed under the terms of the BSD Simplified License.

The full license is in the LICENSE file, distributed with this software.
*/

#include "pandas/parser/simd_scan.h"

#include <xsimd/xsimd.hpp>

#include <cstdint>
#include <new>

#if defined(_MSC_VER)
# include <intrin.h>
#endif

namespace {

// Vector of unsigned bytes using xsimd's default architecture for this
// translation unit.
using batch_u8 = xsimd::batch<std::uint8_t>;
// Number of bytes handled per scan iteration: one per vector lane.
constexpr std::size_t kStep = batch_u8::size;

// A chunk must be at least PD_SCAN_MIN_BYTES wide; narrower vectors are
// rejected at compile time.
static_assert(kStep >= PD_SCAN_MIN_BYTES,
"xsimd batch<uint8_t> must be at least 16 lanes wide");

// Count the trailing zero bits of `value`, i.e. the index of its lowest set
// bit. `value` must be non-zero: both the MSVC bit-scan intrinsics and
// __builtin_ctzll leave the result undefined for 0.
static inline unsigned ctz64(std::uint64_t value) {
#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_ARM64))
  unsigned long index;
  _BitScanForward64(&index, value);
  return static_cast<unsigned>(index);
#elif defined(_MSC_VER)
  // _BitScanForward64 only exists on 64-bit MSVC targets, so on 32-bit
  // builds scan the low half first and fall back to the high half.
  unsigned long index;
  const unsigned long low = static_cast<unsigned long>(value & 0xFFFFFFFFu);
  if (low != 0) {
    _BitScanForward(&index, low);
    return static_cast<unsigned>(index);
  }
  _BitScanForward(&index, static_cast<unsigned long>(value >> 32));
  return static_cast<unsigned>(index) + 32u;
#else
  return static_cast<unsigned>(__builtin_ctzll(value));
#endif
}

// Scan `data` in kStep-byte chunks for any of the first N broadcast bytes
// in `v`. Each chunk is loaded unaligned and compared against every v[j];
// the per-lane results are OR-ed into one mask. On the first chunk with a
// hit, returns the chunk offset plus the index of the lowest set bit of the
// mask. If no whole chunk contains a hit, returns the number of bytes
// covered by whole chunks; the trailing bytes that do not fill a chunk are
// not examined.
template <int N>
static inline std::size_t scan_impl(const batch_u8 *v, const char *data,
std::size_t len) {
const auto *p = reinterpret_cast<const std::uint8_t *>(data);
std::size_t i = 0;
for (; i + kStep <= len; i += kStep) {
const auto chunk = batch_u8::load_unaligned(p + i);
auto mask = (chunk == v[0]);
for (int j = 1; j < N; ++j) {
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Alvaro-Kothe is going through a similar exercise on his SIMD PR and I gave the feedback that this type of looping + bit fiddling is extremely inefficient. nanoarrow has built-in functionality to do this better and it sounds like @Alvaro-Kothe is researching something with xsimd (?) - let's keep tabs on that

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think this was a problem with the non-xsimd variant? That felt lower-touch to me and I marginally prefer it.

mask = mask | (chunk == v[j]);
}
if (xsimd::any(mask)) {
return i + ctz64(mask.mask());
}
}
return i;
}

} // namespace

// Definition behind the opaque pd_scanner handle: one broadcast vector per
// special byte the scan should stop on.
struct pd_scanner {
// Broadcast special bytes; pd_scanner_create fills only the first `n`.
batch_u8 v[6];
// Number of special bytes in use; pd_scanner_create accepts 2 or 6.
int n;
};

extern "C" {

// Allocate a scanner for the `n` special bytes in `chars`.
//
// Only n == 2 and n == 6 are supported; any other count yields nullptr, as
// does an allocation failure. Each byte is broadcast across a full vector
// once here so the scan loop can compare whole chunks against it.
pd_scanner *pd_scanner_create(const char *chars, int n) {
  const bool supported = (n == 2 || n == 6);
  if (!supported) {
    return nullptr;
  }

  pd_scanner *result = new (std::nothrow) pd_scanner;
  if (result != nullptr) {
    result->n = n;
    for (int lane = 0; lane < n; ++lane) {
      const auto byte = static_cast<std::uint8_t>(chars[lane]);
      result->v[lane] = batch_u8::broadcast(byte);
    }
  }
  return result;
}

// Release a scanner allocated by pd_scanner_create; deleting a null pointer
// is a no-op, so NULL is accepted.
void pd_scanner_destroy(pd_scanner *scanner) { delete scanner; }

// Dispatch to the scan specialised for this scanner's special-byte count.
//
// Returns the offset of the first special byte found within whole SIMD
// chunks of data[0..len), or the number of bytes those chunks cover when
// none of them contains a special byte.
//
// Returns 0 for a NULL scanner or an unexpected byte count. Callers treat
// every byte before the returned offset as ordinary and copy it verbatim,
// so the only safe answer when nothing was actually scanned is "0 bytes
// verified"; the caller's scalar path then handles the input.
size_t pd_scanner_scan(const pd_scanner *scanner, const char *data,
                       size_t len) {
  if (scanner == nullptr) {
    return 0;
  }
  switch (scanner->n) {
  case 2:
    return scan_impl<2>(scanner->v, data, len);
  case 6:
    return scan_impl<6>(scanner->v, data, len);
  default:
    // pd_scanner_create only produces n == 2 or n == 6; stay safe if that
    // ever changes without a matching case being added here.
    return 0;
  }
}

} // extern "C"
61 changes: 57 additions & 4 deletions pandas/_libs/src/parser/tokenizer.c
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ GitHub. See Python Software Foundation License and BSD licenses for these.
#include <stdbool.h>
#include <stdlib.h>

#include "pandas/parser/simd_scan.h"
#include "pandas/portable.h"
#include "pandas/vendored/klib/khash.h" // for kh_int64_t, kh_destroy_int64

Expand Down Expand Up @@ -583,6 +584,8 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes,
((!delim_whitespace && c == delimiter) || (delim_whitespace && isblank(c)))

// Release both SIMD scanners and write the local stream length and read
// position back to the parser. The macro expands in place, so it relies on
// `self`, `unquoted_scanner`, `quoted_scanner`, `slen` and `i` being in
// scope at the point of use.
#define _TOKEN_CLEANUP() \
pd_scanner_destroy(unquoted_scanner); \
pd_scanner_destroy(quoted_scanner); \
self->stream_len = slen; \
self->datapos = i;

Expand Down Expand Up @@ -633,7 +636,27 @@ static int tokenize_bytes(parser_t *self, uint64_t line_limit,
const bool has_skip = (self->skipfunc != NULL || self->skipset != NULL ||
self->skip_first_N_rows >= 0);

// Build SIMD scanners over the chars that halt a bulk scan. Disabled
// features alias to lineterminator so the scanners always see 6/2 chars
// and the call sites stay branch-free.
const char unquoted_chars[6] = {
delimiter,
lineterminator,
has_carriage ? carriage_symbol : lineterminator,
(self->quoting != QUOTE_NONE) ? self->quotechar : lineterminator,
has_escape ? escape_symbol : lineterminator,
has_comment ? comment_symbol : lineterminator,
};
const char quoted_chars[2] = {
(self->quoting != QUOTE_NONE) ? self->quotechar : lineterminator,
has_escape ? escape_symbol : lineterminator,
};
pd_scanner *unquoted_scanner = pd_scanner_create(unquoted_chars, 6);
pd_scanner *quoted_scanner = pd_scanner_create(quoted_chars, 2);

if (make_stream_space(self, self->datalen - self->datapos) < 0) {
pd_scanner_destroy(unquoted_scanner);
pd_scanner_destroy(quoted_scanner);
const size_t bufsize = 100;
self->error_msg = malloc(bufsize);
snprintf(self->error_msg, bufsize, "out of memory");
Expand Down Expand Up @@ -922,8 +945,23 @@ static int tokenize_bytes(parser_t *self, uint64_t line_limit,
// normal character - save in field
PUSH_CHAR(c);

// Bulk scan: copy remaining ordinary characters directly,
// bypassing the per-char state machine overhead.
// SIMD bulk scan: process a full SIMD chunk at a time, copying
// normal characters directly without state-machine overhead.
if (unquoted_scanner && !self->delim_whitespace) {
size_t remaining = self->datalen - (i + 1);
Comment on lines +947 to +951
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just to better understand how the specific simd code is used (didn't look in detail at the surrounding code): this is adding some new logic, but doesn't seem to replace another line of code that would otherwise run now? (or put differently, there is no fallback?)

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

claude: the preceding PUSH_CHAR(c) already handled the current byte. The SIMD block then scans ahead and bulk-copies any subsequent normal characters. If there are fewer than 16 bytes remaining or skip is 0, it does nothing and the byte-at-a-time loop continues normally. The fallback is the existing code

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add some comments to clarify this for future readers? (like the existing comment for the "Scalar bulk scan fallback", but then for the simd code)

if (remaining >= PD_SCAN_MIN_BYTES) {
size_t skip = pd_scanner_scan(unquoted_scanner, buf, remaining);
if (skip > 0) {
memcpy(stream, buf, skip);
stream += skip;
slen += skip;
buf += skip;
i += skip;
}
}
}
// Scalar bulk scan fallback: copy remaining ordinary characters
// directly, bypassing the per-char state machine overhead.
while (i + 1 < self->datalen &&
!(breaks_field_scan[(uint8_t)*buf] & 0x1)) {
*stream++ = *buf++;
Expand All @@ -950,8 +988,23 @@ static int tokenize_bytes(parser_t *self, uint64_t line_limit,
// normal character - save in field
PUSH_CHAR(c);

// Bulk scan: copy remaining ordinary characters directly,
// bypassing the per-char state machine overhead.
// SIMD bulk scan for quoted fields: only quote and escape
// chars are special, so use a lighter scan.
if (quoted_scanner) {
size_t remaining = self->datalen - (i + 1);
if (remaining >= PD_SCAN_MIN_BYTES) {
size_t skip = pd_scanner_scan(quoted_scanner, buf, remaining);
if (skip > 0) {
memcpy(stream, buf, skip);
stream += skip;
slen += skip;
buf += skip;
i += skip;
}
}
}
// Scalar bulk scan fallback: copy remaining ordinary characters
// directly, bypassing the per-char state machine overhead.
while (i + 1 < self->datalen &&
!(breaks_field_scan[(uint8_t)*buf] & 0x2)) {
*stream++ = *buf++;
Expand Down
3 changes: 2 additions & 1 deletion pandas/_libs/tslibs/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,9 @@ tslibs_sources = {
'parsing.pyx',
'../src/parser/tokenizer.c',
'../src/parser/fast_float_strtod.cpp',
'../src/parser/simd_scan.cpp',
],
'deps': [fast_float_dep],
'deps': [fast_float_dep, xsimd_dep],
},
'period': {'sources': ['period.pyx']},
'strptime': {'sources': ['strptime.pyx']},
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ license-files = [
"LICENSES/PYUPGRADE_LICENSE", # MIT
"LICENSES/SAS7BDAT_LICENSE", # MIT
"LICENSES/ULTRAJSON_LICENSE", # BSD-3-Clause AND TCL
"LICENSES/XSIMD_LICENSE", # BSD-3-Clause
"subprojects/fast_float-*/LICENSE-APACHE", # Apache-2.0
"subprojects/fast_float-*/LICENSE-BOOST", # BSL
"subprojects/fast_float-*/LICENSE-MIT", # MIT
Expand Down
12 changes: 12 additions & 0 deletions subprojects/packagefiles/xsimd/meson.build
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Meson build definition for the xsimd subproject. It only exposes xsimd's
# `include` directory as a dependency; no targets are compiled here.
project(
'xsimd',
'cpp',
meson_version: '>=0.58.0',
license: 'BSD-3-Clause',
version: '14.2.0',
)

xsimd_inc = include_directories('include')

# Register the dependency under the name `xsimd` so that a
# `dependency('xsimd')` lookup can resolve to this subproject.
xsimd_dep = declare_dependency(include_directories: xsimd_inc)
meson.override_dependency('xsimd', xsimd_dep)
9 changes: 9 additions & 0 deletions subprojects/xsimd.wrap
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
[wrap-file]
directory = xsimd-14.2.0
source_url = https://github.com/xtensor-stack/xsimd/archive/refs/tags/14.2.0.tar.gz
source_filename = xsimd-14.2.0.tar.gz
source_hash = 21e841ab684b05331e81e7f782431753a029ef7b7d9d6d3ddab837e7782a40ee
patch_directory = xsimd

[provide]
dependency_names = xsimd
Loading