Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions bindings/python/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,3 +35,4 @@ pyo3 = { version = "0.28.2", features = ["auto-initialize"] }
[features]
default = ["ext-module"]
ext-module = ["pyo3/extension-module"]
pcre2 = ["tokenizers/pcre2"]
3 changes: 2 additions & 1 deletion tokenizers/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ with a focus on performances and versatility.
exclude = [ "rust-toolchain", "target/*", "Cargo.lock", "benches/*.txt", "benches/*.json", "data/*" ]

[package.metadata.docs.rs]
all-features = true
features = ["http"]

[lib]
name = "tokenizers"
Expand Down Expand Up @@ -77,6 +77,7 @@ monostate = "0.1.12"
ahash = { version = "0.8.11", features = ["serde"] }
dary_heap = { version = "0.3.6", features = ["serde"] }
compact_str = { version = "0.9", features = ["serde"] }
pcre2 = { version = "0.2.11", optional = true }

[features]
default = ["progressbar", "onig", "esaxx_fast"]
Expand Down
4 changes: 4 additions & 0 deletions tokenizers/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -141,3 +141,7 @@ fn main() -> Result<()> {

- **http**: This feature enables downloading the tokenizer via HTTP. It is disabled by default.
With this feature enabled, `Tokenizer::from_pretrained` becomes accessible.

- **pcre2**: Use PCRE2 as the regex backend instead of the default Oniguruma. PCRE2 with JIT
can offer up to 16% faster encoding for models using BPE or the Llama3 tokenizer. When enabled,
it takes precedence over the `onig` and `fancy-regex` backends.

4 changes: 4 additions & 0 deletions tokenizers/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,10 @@
//!
//! - **http**: This feature enables downloading the tokenizer via HTTP. It is disabled by default.
//! With this feature enabled, `Tokenizer::from_pretrained` becomes accessible.
//!
//! - **pcre2**: Use PCRE2 as the regex backend instead of the default Oniguruma. PCRE2 with JIT
//! can offer up to 16% faster encoding for models using BPE or the Llama3 tokenizer. When enabled,
//! it takes precedence over the `onig` and `fancy-regex` backends.
//!

#[macro_use]
extern crate log;
Expand Down
16 changes: 10 additions & 6 deletions tokenizers/src/utils/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,21 @@ pub(crate) mod cache;
#[cfg(feature = "http")]
pub(crate) mod from_pretrained;

#[cfg(all(feature = "fancy-regex", not(feature = "onig")))]
#[cfg(all(feature = "fancy-regex", not(feature = "onig"), not(feature = "pcre2")))]
mod fancy;
#[cfg(all(feature = "fancy-regex", not(feature = "onig")))]
#[cfg(all(feature = "fancy-regex", not(feature = "onig"), not(feature = "pcre2")))]
pub use fancy::SysRegex;
#[cfg(feature = "onig")]
#[cfg(all(feature = "onig", not(feature = "pcre2")))]
mod onig;
#[cfg(feature = "onig")]
#[cfg(all(feature = "onig", not(feature = "pcre2")))]
pub use crate::utils::onig::SysRegex;
#[cfg(feature = "pcre2")]
mod pcre2;
#[cfg(feature = "pcre2")]
pub use crate::utils::pcre2::SysRegex;

#[cfg(not(any(feature = "onig", feature = "fancy-regex")))]
compile_error!("One of the `onig`, or `fancy-regex` features must be enabled");
#[cfg(not(any(feature = "onig", feature = "fancy-regex", feature = "pcre2")))]
compile_error!("One of the `onig`, `fancy-regex`, or `pcre2` features must be enabled");

pub mod iter;
pub mod padding;
Expand Down
62 changes: 62 additions & 0 deletions tokenizers/src/utils/pcre2.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
//! PCRE2 regex engine backend for tokenizers.
//!
//! This module provides a PCRE2-based backend as an alternative to the default Oniguruma regex engine.
//!
//! # Features
//!
//! Enable the `pcre2` feature to use this backend instead of the default Oniguruma engine:
//!
//! ```toml
//! [dependencies]
//! tokenizers = { version = "...", features = ["pcre2"] }
//! ```
//!
//! # Performance
//!
//! PCRE2 with JIT can offer up to 16% faster encoding than the default
//! Oniguruma backend for models using BPE pre-tokenization.
//!
//! # Build
//!
//! The underlying `pcre2-sys` crate uses a system-installed libpcre2 found via pkg-config when
//! one is available; otherwise it automatically builds PCRE2 from its bundled source, so no
//! separately installed PCRE2 library is required.
//!

use pcre2::bytes::RegexBuilder;

/// Compiled regular expression backed by the PCRE2 engine.
///
/// Wraps a byte-oriented `pcre2::bytes::Regex`; searches run over the UTF-8 bytes of the
/// input text.
#[derive(Debug)]
pub struct SysRegex {
    regex: pcre2::bytes::Regex,
}

impl SysRegex {
    /// Compiles `regex_str` into a PCRE2 pattern.
    ///
    /// The pattern is built with UTF mode and Unicode character properties (UCP) switched on,
    /// and with JIT compilation requested whenever it is available.
    ///
    /// # Errors
    ///
    /// Returns the boxed builder error when `regex_str` cannot be compiled.
    pub fn new(
        regex_str: &str,
    ) -> std::result::Result<Self, Box<dyn std::error::Error + Send + Sync + 'static>> {
        let mut builder = RegexBuilder::new();
        builder.jit_if_available(true).utf(true).ucp(true);

        match builder.build(regex_str) {
            Ok(regex) => Ok(Self { regex }),
            Err(err) => Err(err.into()),
        }
    }

    /// Returns an iterator over the `(start, end)` positions of the successive matches
    /// of this pattern in `inside`.
    pub fn find_iter<'r, 't>(&'r self, inside: &'t str) -> Matches<'r, 't> {
        let haystack = inside.as_bytes();
        Matches(self.regex.find_iter(haystack))
    }
}

/// Iterator over the matches produced by [`SysRegex::find_iter`].
///
/// Adapts the fallible `pcre2::bytes::Matches` iterator into one that yields plain
/// `(start, end)` pairs.
pub struct Matches<'r, 't>(pcre2::bytes::Matches<'r, 't>);

impl Iterator for Matches<'_, '_> {
    type Item = (usize, usize);

    /// Yields the `(start, end)` positions of the next match.
    ///
    /// Returns `None` both when the underlying iterator is exhausted and when it reports
    /// a matching error; the error itself is discarded.
    fn next(&mut self) -> Option<Self::Item> {
        // The first `?` handles exhaustion; `.ok()?` makes a matching error end the
        // iteration in the same way, since `Item` has no room to carry the error.
        let found = self.0.next()?.ok()?;
        Some((found.start(), found.end()))
    }
}