diff --git a/tokenizers/benches/ci_benchmark.rs b/tokenizers/benches/ci_benchmark.rs
index 593d12a1f..92c610dbb 100644
--- a/tokenizers/benches/ci_benchmark.rs
+++ b/tokenizers/benches/ci_benchmark.rs
@@ -233,6 +233,24 @@ fn bench_serialization(c: &mut Criterion) {
         b.iter(|| black_box(serde_json::from_str::<Tokenizer>(&llama3_json).unwrap()))
     });

+    // Deserialize with 100k added tokens + an NFKC normalizer. This stresses
+    // the normalize path during add_tokens/refresh_added_tokens.
+    {
+        use tokenizers::normalizers::NFKC;
+        let mut tok = Tokenizer::from_file("data/roberta.json").unwrap();
+        let _ = tok.with_normalizer(Some(NFKC));
+        let tokens: Vec<_> = (0..100_000)
+            .map(|i| AddedToken::from(format!("tok{i}"), false))
+            .collect();
+        let _ = tok.add_tokens(&tokens);
+        let path = std::env::temp_dir().join("bench_100k_nfkc.json");
+        tok.save(&path, false).unwrap();
+        group.bench_function("deserialize-100k-nfkc", |b| {
+            b.iter(|| black_box(Tokenizer::from_file(&path).unwrap()))
+        });
+        std::fs::remove_file(&path).ok();
+    }
+
     group.finish();
 }

diff --git a/tokenizers/src/tokenizer/added_vocabulary.rs b/tokenizers/src/tokenizer/added_vocabulary.rs
index ca7bae558..f804f13e8 100644
--- a/tokenizers/src/tokenizer/added_vocabulary.rs
+++ b/tokenizers/src/tokenizer/added_vocabulary.rs
@@ -562,6 +562,39 @@ impl AddedVocabulary {

         pretokenized
     }
+
+    /// Like [`extract_and_normalize`] but uses [`Normalizer::normalize_str`]
+    /// instead of [`Normalizer::normalize`], skipping alignment tracking.
+    ///
+    /// This is used by `encode_fast`, where offsets are not needed. Normalizing
+    /// via `normalize_str` avoids building per-byte alignment vectors, which
+    /// saves O(n) allocations per split.
+    pub fn extract_and_normalize_fast<N: Normalizer>(
+        &self,
+        normalizer: Option<&N>,
+        sequence: &str,
+    ) -> PreTokenizedString {
+        let mut pretokenized: PreTokenizedString = sequence.into();
+
+        // 1. Extract non-normalized tokens from the raw string.
+        pretokenized
+            .split(|_, sequence| Ok(self.split_with_indices(sequence, &self.split_trie)))
+            .expect("AddedVocabulary bad split");
+
+        // 2. Normalize the remaining pieces via normalize_str (no alignment
+        //    tracking) and extract normalized tokens.
+        pretokenized
+            .split(|_, mut sequence| {
+                if let Some(n) = normalizer {
+                    let normed = n.normalize_str(sequence.get())?;
+                    sequence.set_normalized(normed);
+                }
+                Ok(self.split_with_indices(sequence, &self.split_normalized_trie))
+            })
+            .expect("AddedVocabulary bad split");
+
+        pretokenized
+    }
 }

 impl Default for AddedVocabulary {

diff --git a/tokenizers/src/tokenizer/mod.rs b/tokenizers/src/tokenizer/mod.rs
index 8e282fba2..2ddbf20b5 100644
--- a/tokenizers/src/tokenizer/mod.rs
+++ b/tokenizers/src/tokenizer/mod.rs
@@ -55,6 +55,16 @@ pub type Offsets = (usize, usize);
 /// Takes care of pre-processing strings.
 pub trait Normalizer: Sync {
     fn normalize(&self, normalized: &mut NormalizedString) -> Result<()>;
+
+    /// Normalize a plain string without tracking alignments.
+    ///
+    /// The default implementation allocates a full [`NormalizedString`].
+    /// Normalizers that can produce their output more cheaply should override this.
+    fn normalize_str(&self, s: &str) -> Result<String> {
+        let mut n = NormalizedString::from(s);
+        self.normalize(&mut n)?;
+        Ok(n.get().to_owned())
+    }
 }

 /// The `PreTokenizer` is in charge of doing the pre-segmentation step. It splits the given string
@@ -731,10 +741,15 @@ where
         type_id: u32,
         offsets_type: OffsetType,
     ) -> Result<Encoding> {
+        let fast = matches!(offsets_type, OffsetType::None);
         let encode = |is_pre_tokenized, subseq_idx, subseq| -> Result<Encoding> {
-            let normalized = self
-                .added_vocabulary
-                .extract_and_normalize(self.normalizer.as_ref(), subseq);
+            let normalized = if fast {
+                self.added_vocabulary
+                    .extract_and_normalize_fast(self.normalizer.as_ref(), subseq)
+            } else {
+                self.added_vocabulary
+                    .extract_and_normalize(self.normalizer.as_ref(), subseq)
+            };
             let pre_tokenized = self.do_pre_tokenize(normalized)?;
             let subseq_encoding = self.do_tokenize(
                 pre_tokenized,

diff --git a/tokenizers/src/tokenizer/normalizer.rs b/tokenizers/src/tokenizer/normalizer.rs
index 5bebd5f7b..fab640ae3 100644
--- a/tokenizers/src/tokenizer/normalizer.rs
+++ b/tokenizers/src/tokenizer/normalizer.rs
@@ -136,6 +136,18 @@ impl NormalizedString {
         &self.normalized
     }
+
+    /// Replace the normalized content without tracking alignments.
+    ///
+    /// This is significantly cheaper than going through `transform()`, since it
+    /// skips the per-byte alignment bookkeeping. Use it when offset tracking
+    /// is not needed (e.g. `encode_fast`).
+    pub fn set_normalized(&mut self, new: String) {
+        // Build trivial 1:1 alignments so that slice() still works for
+        // splitting, but no real offset mapping is preserved.
+        self.alignments = new.as_bytes().iter().enumerate().map(|(i, _)| (i, i + 1)).collect();
+        self.normalized = new;
+    }

     /// Return the original string
     pub fn get_original(&self) -> &str {
         &self.original
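
For reviewers, a minimal sketch of what the new `normalize_str` hook enables, assuming this patch is applied. `LowercaseExample` is an illustrative normalizer, not one shipped by the crate; the point is that the override can produce the normalized text directly instead of routing through a `NormalizedString`:

use tokenizers::{NormalizedString, Normalizer, Result};

// Hypothetical normalizer, for illustration only.
struct LowercaseExample;

impl Normalizer for LowercaseExample {
    // Alignment-tracking path, unchanged by this patch.
    fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
        normalized.lowercase();
        Ok(())
    }

    // Fast path added by this patch: no NormalizedString, no per-byte
    // alignment vectors, just a plain String out.
    fn normalize_str(&self, s: &str) -> Result<String> {
        Ok(s.to_lowercase())
    }
}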
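A quick illustration of the `set_normalized` contract, under the same assumption: the content is replaced and lengths stay internally consistent so later splits work, but offsets no longer map back into the original bytes:

use tokenizers::NormalizedString;

fn main() {
    let mut n = NormalizedString::from("Ｈｅｌｌｏ"); // fullwidth, 3 bytes per char
    n.set_normalized("Hello".to_string()); // e.g. what NFKC's normalize_str yields
    assert_eq!(n.get(), "Hello");
    // Alignments are now the trivial (i, i + 1) pairs over the new bytes;
    // they keep slicing/splitting working but carry no real offset mapping.
}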
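Finally, the call-site shape the `OffsetType::None` branch implies. This assumes `encode_fast` is the public entry point that passes `OffsetType::None` down to `encode_single_sequence`, as the new doc comments suggest; the diff itself does not show that wiring:

use tokenizers::Tokenizer;

fn main() -> tokenizers::Result<()> {
    let tokenizer = Tokenizer::from_file("data/roberta.json")?;
    // Offset-free path: routes through extract_and_normalize_fast, so the
    // encoding's offsets are not meaningful here.
    let enc = tokenizer.encode_fast("Hello, world!", false)?;
    println!("ids: {:?}", enc.get_ids());
    Ok(())
}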