18 changes: 18 additions & 0 deletions tokenizers/benches/ci_benchmark.rs
@@ -233,6 +233,24 @@ fn bench_serialization(c: &mut Criterion) {
b.iter(|| black_box(serde_json::from_str::<Tokenizer>(&llama3_json).unwrap()))
});

// Deserialize with 100k added tokens + NFKC normalizer
// This stresses the normalize path during add_tokens/refresh_added_tokens.
{
use tokenizers::normalizers::NFKC;
let mut tok = Tokenizer::from_file("data/roberta.json").unwrap();
let _ = tok.with_normalizer(Some(NFKC));
let tokens: Vec<_> = (0..100_000)
.map(|i| AddedToken::from(format!("tok{i}"), false))
.collect();
let _ = tok.add_tokens(tokens);
let path = std::env::temp_dir().join("bench_100k_nfkc.json");
tok.save(&path, false).unwrap();
group.bench_function("deserialize-100k-nfkc", |b| {
b.iter(|| black_box(Tokenizer::from_file(&path).unwrap()))
});
std::fs::remove_file(&path).ok();
}

group.finish();
}

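Context for the new case: on `from_file`, every stored added token is re-inserted into the vocabulary, and each insertion normalizes the token content, so 100k tokens under NFKC exercise the normalizer heavily. A minimal sketch of that hot loop, with hypothetical names (`normalize` stands in for the crate's internal per-token normalization pass):

```rust
// Hypothetical sketch of the per-token work repeated during deserialization.
// `normalize` stands in for a Normalizer pass with full alignment tracking.
fn refresh_added_tokens_sketch(
    contents: &[String],
    normalize: impl Fn(&str) -> String,
) -> Vec<String> {
    // One normalizer pass per added token: 100_000 passes for this benchmark.
    contents.iter().map(|c| normalize(c)).collect()
}
```

The case runs alongside the rest of the suite via `cargo bench`.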
33 changes: 33 additions & 0 deletions tokenizers/src/tokenizer/added_vocabulary.rs
@@ -562,6 +562,39 @@ impl AddedVocabulary {

pretokenized
}

/// Like [`extract_and_normalize`] but uses [`Normalizer::normalize_str`]
/// instead of [`Normalizer::normalize`], skipping alignment tracking.
///
/// This is used by `encode_fast`, where offsets are not needed. The
/// normalization step skips tracking alignments through each transformation,
/// avoiding O(n) bookkeeping per split (only a trivial 1:1 alignment is
/// kept so that slicing still works).
pub fn extract_and_normalize_fast<N: Normalizer>(
&self,
normalizer: Option<&N>,
sequence: &str,
) -> PreTokenizedString {
let mut pretokenized: PreTokenizedString = sequence.into();

// 1. Extract non-normalized tokens from the raw string
pretokenized
.split(|_, sequence| Ok(self.split_with_indices(sequence, &self.split_trie)))
.expect("AddedVocabulary bad split");

// 2. Normalize remaining pieces via normalize_str (no alignment tracking)
// and extract normalized tokens
pretokenized
.split(|_, mut sequence| {
if let Some(n) = normalizer {
let normed = n.normalize_str(sequence.get())?;
sequence.set_normalized(normed);
}
Ok(self.split_with_indices(sequence, &self.split_normalized_trie))
})
.expect("AddedVocabulary bad split");

pretokenized
}
}

impl Default for AddedVocabulary {
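A hedged usage sketch of the new method (it assumes an `AddedVocabulary` already populated by a tokenizer and the crate's `NFKC` normalizer; not code from this PR):

```rust
use tokenizers::normalizers::NFKC;

// `vocab: &AddedVocabulary` is assumed to come from an existing tokenizer.
let normalizer = NFKC;
let pretok = vocab.extract_and_normalize_fast(Some(&normalizer), "ｆｕｌｌｗｉｄｔｈ text");
// The returned PreTokenizedString carries normalized splits but only trivial
// offset mappings, so it should only feed paths that discard offsets.
```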
21 changes: 18 additions & 3 deletions tokenizers/src/tokenizer/mod.rs
@@ -55,6 +55,16 @@ pub type Offsets = (usize, usize);
/// Takes care of pre-processing strings.
pub trait Normalizer: Sync {
fn normalize(&self, normalized: &mut NormalizedString) -> Result<()>;

/// Normalize a plain string without tracking alignments.
///
/// The default allocates a full [`NormalizedString`]. Normalizers that can
/// produce their output more cheaply should override this.
fn normalize_str(&self, s: &str) -> Result<String> {
let mut n = NormalizedString::from(s);
self.normalize(&mut n)?;
Ok(n.get().to_owned())
}
}

/// The `PreTokenizer` is in charge of doing the pre-segmentation step. It splits the given string
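The default body round-trips through a `NormalizedString`; a normalizer whose output can be computed directly should override `normalize_str`. A sketch with a hypothetical lowercasing normalizer (not part of the crate):

```rust
struct FastLowercase;

impl Normalizer for FastLowercase {
    fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
        // Full path: keeps per-character alignments up to date.
        normalized.lowercase();
        Ok(())
    }

    fn normalize_str(&self, s: &str) -> Result<String> {
        // Fast path: a single allocation, no alignment bookkeeping.
        Ok(s.to_lowercase())
    }
}
```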
@@ -731,10 +741,15 @@
type_id: u32,
offsets_type: OffsetType,
) -> Result<Encoding> {
let fast = matches!(offsets_type, OffsetType::None);
let encode = |is_pre_tokenized, subseq_idx, subseq| -> Result<Encoding> {
let normalized = self
.added_vocabulary
.extract_and_normalize(self.normalizer.as_ref(), subseq);
let normalized = if fast {
self.added_vocabulary
.extract_and_normalize_fast(self.normalizer.as_ref(), subseq)
} else {
self.added_vocabulary
.extract_and_normalize(self.normalizer.as_ref(), subseq)
};
let pre_tokenized = self.do_pre_tokenize(normalized)?;
let subseq_encoding = self.do_tokenize(
pre_tokenized,
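The branch is driven entirely by the requested offset type: `encode_fast` passes `OffsetType::None` and gets the `normalize_str` path, while `encode` keeps full alignment tracking. A hedged caller-side illustration (assuming `encode_fast` mirrors `encode`'s signature, as in the crate's public API):

```rust
// Offsets preserved: added-token extraction uses Normalizer::normalize.
let full = tokenizer.encode("Hello there", false)?;
// Offsets discarded (OffsetType::None): uses extract_and_normalize_fast.
let fast = tokenizer.encode_fast("Hello there", false)?;
assert_eq!(full.get_ids(), fast.get_ids());
```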
12 changes: 12 additions & 0 deletions tokenizers/src/tokenizer/normalizer.rs
@@ -136,6 +136,18 @@ impl NormalizedString {
&self.normalized
}

/// Replace the normalized content without tracking alignments.
///
/// This is significantly cheaper than going through `transform()`: only a
/// trivial 1:1 alignment is stored (so slicing keeps working), and no
/// per-transformation offset bookkeeping is done. Use this only when offset
/// tracking is not needed (e.g. `encode_fast`).
pub fn set_normalized(&mut self, new: String) {
// Build trivial 1:1 alignments so that slice() still works for
// splitting, but no real offset mapping is preserved.
self.alignments = (0..new.len()).map(|i| (i, i + 1)).collect();
self.normalized = new;
}

/// Return the original string
pub fn get_original(&self) -> &str {
&self.original
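A short standalone sketch of the trade-off `set_normalized` makes (assuming the crate's public `NormalizedString`):

```rust
use tokenizers::NormalizedString;

let mut n = NormalizedString::from("ＡＢＣ");
n.set_normalized("ABC".to_string());
assert_eq!(n.get(), "ABC");
// The alignments are now a trivial 1:1 mapping, not a real offset map back
// into "ＡＢＣ": fine for encode_fast, wrong wherever original offsets matter.
```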