From b8cddf5332ae058483269f3c53d5764f4c118688 Mon Sep 17 00:00:00 2001 From: Kim Yang Date: Mon, 27 Apr 2026 00:41:41 +0800 Subject: [PATCH] NFC: skip Unicode pass for all-ASCII inputs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All-ASCII text (U+0000..=U+007F) is already in NFC by Unicode invariant — none of those code points have a `Decomposition_Mapping`, no two of them compose with each other (an ASCII base only composes with a non-ASCII combining mark), and none are the target of any composition. Running the full `unicode-normalization-alignments` pass on an all-ASCII `NormalizedString` therefore rebuilds `normalized` and `alignments` to the exact same bytes and tuples it already had. We can return early and save the iterator allocation, the per-`char` UTF-8 decode loop, and the `transform` rebuild. The gate is conservative: any non-ASCII byte in the input falls through to the original code path with zero changes, so combining-mark sequences, CJK, Arabic, Cyrillic, Vietnamese, etc. are unaffected. Two unit tests pin the contract: - `nfc_ascii_fast_path_is_no_op` — runs NFKD on `ﬀ` (U+FB00, LATIN SMALL LIGATURE FF), producing all-ASCII text with non-trivial alignments, and asserts NFC leaves the entire `NormalizedString` byte-identical. - `nfc_non_ascii_still_runs_unicode_path` — checks that "e" + combining acute is still composed to "é". 
Co-Authored-By: Claude Opus 4.7 (1M context) --- tokenizers/src/normalizers/unicode.rs | 30 +++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/tokenizers/src/normalizers/unicode.rs b/tokenizers/src/normalizers/unicode.rs index 502b4239b..607087464 100644 --- a/tokenizers/src/normalizers/unicode.rs +++ b/tokenizers/src/normalizers/unicode.rs @@ -26,6 +26,13 @@ impl Normalizer for NFKD { pub struct NFC; impl Normalizer for NFC { fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> { + // An all-ASCII string is already in NFC: U+0000..=U+007F have no + // decomposition mappings and never compose with one another, so skip + // the per-`char` Unicode pass and the alignments rebuild it triggers. + // Any non-ASCII byte falls through to the original path unchanged. + if normalized.get().is_ascii() { + return Ok(()); + } normalized.nfc(); Ok(()) } @@ -100,4 +107,27 @@ mod tests { assert_eq!(n.alignments_original(), vec![(0, 2), (0, 2), (0, 2)]); } + + #[test] + fn nfc_ascii_fast_path_is_no_op() { + // After an NFKD step expands a ligature, `normalized` is all-ASCII but + // `alignments` is non-trivial (each output byte still maps back to the + // 3-byte ligature). NFC over ASCII must leave every field untouched. + let mut n = NormalizedString::from("\u{fb00}"); + n.nfkd(); + assert!(n.get().is_ascii()); + + let before = n.clone(); + NFC.normalize(&mut n).unwrap(); + assert_eq!(n, before); + } + + #[test] + fn nfc_non_ascii_still_runs_unicode_path() { + // A combining-mark sequence ("e" + COMBINING ACUTE) must still be + // composed to "é" by the original NFC path; the gate must not skip it. + let mut n = NormalizedString::from("e\u{0301}"); + NFC.normalize(&mut n).unwrap(); + assert_eq!(n.get(), "\u{00e9}"); + } }