diff --git a/tokenizers/src/normalizers/unicode.rs b/tokenizers/src/normalizers/unicode.rs
index 502b4239b..607087464 100644
--- a/tokenizers/src/normalizers/unicode.rs
+++ b/tokenizers/src/normalizers/unicode.rs
@@ -26,6 +26,13 @@ impl Normalizer for NFKD {
 pub struct NFC;
 impl Normalizer for NFC {
     fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
+        // ASCII strings are NFC by definition (U+0000..=U+007F have no
+        // decomposition or composition mappings), so we can skip the
+        // per-`char` Unicode pass and the alignments rebuild it triggers.
+        // Any non-ASCII byte falls through to the original path unchanged.
+        if normalized.get().is_ascii() {
+            return Ok(());
+        }
         normalized.nfc();
         Ok(())
     }
@@ -100,4 +107,27 @@ mod tests {
 
         assert_eq!(n.alignments_original(), vec![(0, 2), (0, 2), (0, 2)]);
     }
+
+    #[test]
+    fn nfc_ascii_fast_path_is_no_op() {
+        // After an NFKD step expands a ligature, `normalized` is all-ASCII but
+        // `alignments` is non-trivial (each output byte still maps back to the
+        // 3-byte ligature). NFC over ASCII must leave every field untouched.
+        let mut n = NormalizedString::from("\u{fb00}");
+        n.nfkd();
+        assert!(n.get().is_ascii());
+
+        let before = n.clone();
+        NFC.normalize(&mut n).unwrap();
+        assert_eq!(n, before);
+    }
+
+    #[test]
+    fn nfc_non_ascii_still_runs_unicode_path() {
+        // A combining-mark sequence ("e" + COMBINING ACUTE) must still be
+        // composed to "é" by the original NFC path; the gate must not skip it.
+        let mut n = NormalizedString::from("e\u{0301}");
+        NFC.normalize(&mut n).unwrap();
+        assert_eq!(n.get(), "\u{00e9}");
+    }
 }