diff --git a/bindings/python/Cargo.toml b/bindings/python/Cargo.toml index de219240f8..ecc17378d6 100644 --- a/bindings/python/Cargo.toml +++ b/bindings/python/Cargo.toml @@ -35,3 +35,4 @@ pyo3 = { version = "0.28.2", features = ["auto-initialize"] } [features] default = ["ext-module"] ext-module = ["pyo3/extension-module"] +pcre2 = ["tokenizers/pcre2"] diff --git a/tokenizers/Cargo.toml b/tokenizers/Cargo.toml index 40b273ac4a..3a00df382b 100644 --- a/tokenizers/Cargo.toml +++ b/tokenizers/Cargo.toml @@ -16,7 +16,7 @@ with a focus on performances and versatility. exclude = [ "rust-toolchain", "target/*", "Cargo.lock", "benches/*.txt", "benches/*.json", "data/*" ] [package.metadata.docs.rs] -all-features = true +features = ["http"] [lib] name = "tokenizers" @@ -77,6 +77,7 @@ monostate = "0.1.12" ahash = { version = "0.8.11", features = ["serde"] } dary_heap = { version = "0.3.6", features = ["serde"] } compact_str = { version = "0.9", features = ["serde"] } +pcre2 = { version = "0.2.11", optional = true } [features] default = ["progressbar", "onig", "esaxx_fast"] diff --git a/tokenizers/README.md b/tokenizers/README.md index 173e0bc065..8482c2a5cd 100644 --- a/tokenizers/README.md +++ b/tokenizers/README.md @@ -141,3 +141,7 @@ fn main() -> Result<()> { - **http**: This feature enables downloading the tokenizer via HTTP. It is disabled by default. With this feature enabled, `Tokenizer::from_pretrained` becomes accessible. + +- **pcre2**: Use PCRE2 as the regex backend instead of the default Oniguruma. PCRE2 with JIT + can offer up to 16% faster encoding for models using BPE or Llama3 tokenizer. + diff --git a/tokenizers/src/lib.rs b/tokenizers/src/lib.rs index 1233a8bd2a..5465fa85c9 100644 --- a/tokenizers/src/lib.rs +++ b/tokenizers/src/lib.rs @@ -130,6 +130,10 @@ //! //! - **http**: This feature enables downloading the tokenizer via HTTP. It is disabled by default. //! With this feature enabled, `Tokenizer::from_pretrained` becomes accessible. +//! +//! - **pcre2**: Use PCRE2 as the regex backend instead of the default Oniguruma. PCRE2 with JIT +//! can offer up to 16% faster encoding for models using BPE or Llama3 tokenizer. +//! #[macro_use] extern crate log; diff --git a/tokenizers/src/utils/mod.rs b/tokenizers/src/utils/mod.rs index c9450b3222..b039518f71 100644 --- a/tokenizers/src/utils/mod.rs +++ b/tokenizers/src/utils/mod.rs @@ -2,17 +2,21 @@ pub(crate) mod cache; #[cfg(feature = "http")] pub(crate) mod from_pretrained; -#[cfg(all(feature = "fancy-regex", not(feature = "onig")))] +#[cfg(all(feature = "fancy-regex", not(feature = "onig"), not(feature = "pcre2")))] mod fancy; -#[cfg(all(feature = "fancy-regex", not(feature = "onig")))] +#[cfg(all(feature = "fancy-regex", not(feature = "onig"), not(feature = "pcre2")))] pub use fancy::SysRegex; -#[cfg(feature = "onig")] +#[cfg(all(feature = "onig", not(feature = "pcre2")))] mod onig; -#[cfg(feature = "onig")] +#[cfg(all(feature = "onig", not(feature = "pcre2")))] pub use crate::utils::onig::SysRegex; +#[cfg(feature = "pcre2")] +mod pcre2; +#[cfg(feature = "pcre2")] +pub use crate::utils::pcre2::SysRegex; -#[cfg(not(any(feature = "onig", feature = "fancy-regex")))] -compile_error!("One of the `onig`, or `fancy-regex` features must be enabled"); +#[cfg(not(any(feature = "onig", feature = "fancy-regex", feature = "pcre2")))] +compile_error!("One of the `onig`, `fancy-regex`, or `pcre2` features must be enabled"); pub mod iter; pub mod padding; diff --git a/tokenizers/src/utils/pcre2.rs b/tokenizers/src/utils/pcre2.rs new file mode 100644 index 0000000000..564349ba85 --- /dev/null +++ b/tokenizers/src/utils/pcre2.rs @@ -0,0 +1,62 @@ +//! PCRE2 regex engine backend for tokenizers. +//! +//! This module provides a PCRE2-based backend as an alternative to the default Oniguruma regex engine. +//! +//! # Features +//! +//! Enable the `pcre2` feature to use this backend instead of the default Oniguruma engine: +//! +//! ```toml +//! [dependencies] +//! tokenizers = { version = "...", features = ["pcre2"] } +//! ``` +//! +//! # Performance +//! +//! PCRE2 with JIT can offer ~16% faster encoding compared to the default +//! Oniguruma backend for models using BPE pre-tokenization. +//! +//! +//! # Build +//! +//! The underlying `pcre2-sys` crate will use a system-installed libpcre2 via pkg-config if +//! available, otherwise it builds PCRE2 from bundled source automatically. No external +//! dependencies are required. +//! + +use pcre2::bytes::RegexBuilder; + +#[derive(Debug)] +pub struct SysRegex { + regex: pcre2::bytes::Regex, +} + +impl SysRegex { + pub fn new( + regex_str: &str, + ) -> std::result::Result> { + let regex = RegexBuilder::new() + .jit_if_available(true) + .utf(true) + .ucp(true) + .build(regex_str)?; + Ok(Self { regex }) + } + + pub fn find_iter<'r, 't>(&'r self, inside: &'t str) -> Matches<'r, 't> { + Matches(self.regex.find_iter(inside.as_bytes())) + } +} + +pub struct Matches<'r, 't>(pcre2::bytes::Matches<'r, 't>); + +impl Iterator for Matches<'_, '_> { + type Item = (usize, usize); + + fn next(&mut self) -> Option { + match self.0.next() { + Some(Ok(mat)) => Some((mat.start(), mat.end())), + None | Some(Err(_)) => None, + } + } +}