From 5075b120257a4905326c00950b8298dc4e5fdd68 Mon Sep 17 00:00:00 2001
From: wheynelau
Date: Mon, 23 Feb 2026 17:32:41 +0800
Subject: [PATCH 1/5] docs: add http feature for docs and docsrs

---
 tokenizers/Cargo.toml           | 3 +++
 tokenizers/src/lib.rs           | 1 +
 tokenizers/src/tokenizer/mod.rs | 2 ++
 3 files changed, 6 insertions(+)

diff --git a/tokenizers/Cargo.toml b/tokenizers/Cargo.toml
index 9ea3e690f5..e0af57cdc6 100644
--- a/tokenizers/Cargo.toml
+++ b/tokenizers/Cargo.toml
@@ -15,6 +15,9 @@ with a focus on performances and versatility.
 """
 exclude = [ "rust-toolchain", "target/*", "Cargo.lock", "benches/*.txt", "benches/*.json", "data/*" ]
 
+[package.metadata.docs.rs]
+features = ["http"]
+
 [lib]
 name = "tokenizers"
 path = "src/lib.rs"
diff --git a/tokenizers/src/lib.rs b/tokenizers/src/lib.rs
index 7841314d05..789ae13cd6 100644
--- a/tokenizers/src/lib.rs
+++ b/tokenizers/src/lib.rs
@@ -1,3 +1,4 @@
+#![cfg_attr(docsrs, feature(doc_cfg))]
 #![warn(clippy::all)]
 #![allow(clippy::upper_case_acronyms)]
 #![doc(html_favicon_url = "https://huggingface.co/favicon.ico")]
diff --git a/tokenizers/src/tokenizer/mod.rs b/tokenizers/src/tokenizer/mod.rs
index cedabeebc1..6e88c5e023 100644
--- a/tokenizers/src/tokenizer/mod.rs
+++ b/tokenizers/src/tokenizer/mod.rs
@@ -445,6 +445,7 @@ impl Tokenizer {
         Ok(tokenizer)
     }
     #[cfg(feature = "http")]
+    #[cfg_attr(docsrs, doc(cfg(feature = "http")))]
     pub fn from_pretrained<S: AsRef<str>>(
         identifier: S,
         params: Option<FromPretrainedParameters>,
@@ -1539,6 +1540,7 @@ where
     note = "Users should download the file separately using https://github.com/huggingface/hf-hub instead, which splits concerns of accessing the web, and should use the new cache layout"
 )]
 #[cfg(feature = "http")]
+#[cfg_attr(docsrs, doc(cfg(feature = "http")))]
 /// Instantiate a new Tokenizer from a file hosted on the Hugging Face Hub.
 /// It expects the `identifier` of a model that includes a `tokenizer.json` file.
 pub fn from_pretrained<S: AsRef<str>>(

From b7b337c533b9b85fd058bf7f8de7c9eceb9a654d Mon Sep 17 00:00:00 2001
From: wheynelau
Date: Mon, 2 Mar 2026 20:26:54 +0800
Subject: [PATCH 2/5] feat: add pcre2 as feature

---
 tokenizers/Cargo.toml         |  1 +
 tokenizers/src/utils/mod.rs   | 16 ++++++++++------
 tokenizers/src/utils/pcre2.rs | 36 +++++++++++++++++++++++++++++++++++
 3 files changed, 47 insertions(+), 6 deletions(-)
 create mode 100644 tokenizers/src/utils/pcre2.rs

diff --git a/tokenizers/Cargo.toml b/tokenizers/Cargo.toml
index 40b273ac4a..a7ab9b634d 100644
--- a/tokenizers/Cargo.toml
+++ b/tokenizers/Cargo.toml
@@ -77,6 +77,7 @@ monostate = "0.1.12"
 ahash = { version = "0.8.11", features = ["serde"] }
 dary_heap = { version = "0.3.6", features = ["serde"] }
 compact_str = { version = "0.9", features = ["serde"] }
+pcre2 = { version = "0.2.11", optional = true }
 
 [features]
 default = ["progressbar", "onig", "esaxx_fast"]
diff --git a/tokenizers/src/utils/mod.rs b/tokenizers/src/utils/mod.rs
index 636bee660d..405d8eac8a 100644
--- a/tokenizers/src/utils/mod.rs
+++ b/tokenizers/src/utils/mod.rs
@@ -2,17 +2,21 @@ pub(crate) mod cache;
 #[cfg(feature = "http")]
 pub(crate) mod from_pretrained;
 
-#[cfg(all(feature = "fancy-regex", not(feature = "onig")))]
+#[cfg(all(feature = "fancy-regex", not(feature = "onig"), not(feature = "pcre2")))]
 mod fancy;
-#[cfg(all(feature = "fancy-regex", not(feature = "onig")))]
+#[cfg(all(feature = "fancy-regex", not(feature = "onig"), not(feature = "pcre2")))]
 pub use fancy::SysRegex;
-#[cfg(feature = "onig")]
+#[cfg(all(feature = "onig", not(feature = "pcre2")))]
 mod onig;
-#[cfg(feature = "onig")]
+#[cfg(all(feature = "onig", not(feature = "pcre2")))]
 pub use crate::utils::onig::SysRegex;
+#[cfg(feature = "pcre2")]
+mod pcre2;
+#[cfg(feature = "pcre2")]
+pub use crate::utils::pcre2::SysRegex;
 
-#[cfg(not(any(feature = "onig", feature = "fancy-regex")))]
-compile_error!("One of the `onig`, or `fancy-regex` features must be enabled");
+#[cfg(not(any(feature = "onig", feature = "fancy-regex", feature = "pcre2")))]
+compile_error!("One of the `onig`, `fancy-regex`, or `pcre2` features must be enabled");
 
 pub mod iter;
 pub mod padding;
diff --git a/tokenizers/src/utils/pcre2.rs b/tokenizers/src/utils/pcre2.rs
new file mode 100644
index 0000000000..53edb4f580
--- /dev/null
+++ b/tokenizers/src/utils/pcre2.rs
@@ -0,0 +1,36 @@
+use pcre2::bytes::RegexBuilder;
+
+#[derive(Debug)]
+pub struct SysRegex {
+    regex: pcre2::bytes::Regex,
+}
+
+impl SysRegex {
+    pub fn new(
+        regex_str: &str,
+    ) -> std::result::Result<Self, Box<dyn std::error::Error + Send + Sync + 'static>> {
+        let regex = RegexBuilder::new()
+            .jit_if_available(true)
+            .utf(true)
+            .ucp(true)
+            .build(regex_str)?;
+        Ok(Self { regex })
+    }
+
+    pub fn find_iter<'r, 't>(&'r self, inside: &'t str) -> Matches<'r, 't> {
+        Matches(self.regex.find_iter(inside.as_bytes()))
+    }
+}
+
+pub struct Matches<'r, 't>(pcre2::bytes::Matches<'r, 't>);
+
+impl Iterator for Matches<'_, '_> {
+    type Item = (usize, usize);
+
+    fn next(&mut self) -> Option<Self::Item> {
+        match self.0.next() {
+            Some(Ok(mat)) => Some((mat.start(), mat.end())),
+            None | Some(Err(_)) => None,
+        }
+    }
+}

From ef25a68ea1bdebe363b9d94fe4df415c5dd3dd31 Mon Sep 17 00:00:00 2001
From: wheynelau
Date: Fri, 27 Mar 2026 10:40:14 +0800
Subject: [PATCH 3/5] docs: add notes on pcre2 docs on lib and module level

---
 tokenizers/src/lib.rs         |  4 ++++
 tokenizers/src/utils/pcre2.rs | 26 ++++++++++++++++++++++++++
 2 files changed, 30 insertions(+)

diff --git a/tokenizers/src/lib.rs b/tokenizers/src/lib.rs
index 1233a8bd2a..5465fa85c9 100644
--- a/tokenizers/src/lib.rs
+++ b/tokenizers/src/lib.rs
@@ -130,6 +130,10 @@
 //!
 //! - **http**: This feature enables downloading the tokenizer via HTTP. It is disabled by default.
 //!   With this feature enabled, `Tokenizer::from_pretrained` becomes accessible.
+//!
+//! - **pcre2**: Use PCRE2 as the regex backend instead of the default Oniguruma. PCRE2 with JIT
+//!   can offer up to 16% faster encoding for models using BPE or the Llama3 tokenizer.
+//!
 
 #[macro_use]
 extern crate log;
diff --git a/tokenizers/src/utils/pcre2.rs b/tokenizers/src/utils/pcre2.rs
index 53edb4f580..564349ba85 100644
--- a/tokenizers/src/utils/pcre2.rs
+++ b/tokenizers/src/utils/pcre2.rs
@@ -1,3 +1,29 @@
+//! PCRE2 regex engine backend for tokenizers.
+//!
+//! This module provides a PCRE2-based backend as an alternative to the default Oniguruma regex engine.
+//!
+//! # Features
+//!
+//! Enable the `pcre2` feature to use this backend instead of the default Oniguruma engine:
+//!
+//! ```toml
+//! [dependencies]
+//! tokenizers = { version = "...", features = ["pcre2"] }
+//! ```
+//!
+//! # Performance
+//!
+//! PCRE2 with JIT can offer ~16% faster encoding compared to the default
+//! Oniguruma backend for models using BPE pre-tokenization.
+//!
+//!
+//! # Build
+//!
+//! The underlying `pcre2-sys` crate will use a system-installed libpcre2 via pkg-config if
+//! available, otherwise it builds PCRE2 from bundled source automatically. No external
+//! dependencies are required.
+//!
+
 use pcre2::bytes::RegexBuilder;
 
 #[derive(Debug)]

From 73125f9f60eac59615f427d78265eeef88bc80e1 Mon Sep 17 00:00:00 2001
From: wheynelau
Date: Thu, 2 Apr 2026 01:53:23 +0800
Subject: [PATCH 4/5] feat: add pcre2 for maturin

---
 bindings/python/Cargo.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/bindings/python/Cargo.toml b/bindings/python/Cargo.toml
index de219240f8..ecc17378d6 100644
--- a/bindings/python/Cargo.toml
+++ b/bindings/python/Cargo.toml
@@ -35,3 +35,4 @@ pyo3 = { version = "0.28.2", features = ["auto-initialize"] }
 [features]
 default = ["ext-module"]
 ext-module = ["pyo3/extension-module"]
+pcre2 = ["tokenizers/pcre2"]

From d3164515ad36dfe83df037cf2e5ef921f3cbd114 Mon Sep 17 00:00:00 2001
From: wheynelau
Date: Fri, 3 Apr 2026 08:48:31 +0800
Subject: [PATCH 5/5] docs: update README.md with cargo readme

---
 tokenizers/README.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tokenizers/README.md b/tokenizers/README.md
index 173e0bc065..8482c2a5cd 100644
--- a/tokenizers/README.md
+++ b/tokenizers/README.md
@@ -141,3 +141,7 @@ fn main() -> Result<()> {
 
 - **http**: This feature enables downloading the tokenizer via HTTP. It is disabled by default.
   With this feature enabled, `Tokenizer::from_pretrained` becomes accessible.
+
+- **pcre2**: Use PCRE2 as the regex backend instead of the default Oniguruma. PCRE2 with JIT
+  can offer up to 16% faster encoding for models using BPE or the Llama3 tokenizer.
+