From b8cddf5332ae058483269f3c53d5764f4c118688 Mon Sep 17 00:00:00 2001
From: Kim Yang <kimy@nvidia.com>
Date: Mon, 27 Apr 2026 00:41:41 +0800
Subject: [PATCH] NFC: skip Unicode pass for all-ASCII inputs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

ASCII (U+0000..=U+007F) is NFC by Unicode invariant — none of those code
points have a `Decomposition_Mapping`, no two of them compose with each
other (composing always requires a non-ASCII code point, such as the
combining acute in "e" + U+0301), and none are the target of any
composition. Running the full
`unicode-normalization-alignments` pass on an all-ASCII `NormalizedString`
therefore rebuilds `normalized` and `alignments` to the exact same bytes
and tuples it already had. We can return early and save the iterator
allocation, the per-`char` UTF-8 decode loop, and the `transform` rebuild.

The gate is conservative: any non-ASCII byte in the input falls through
to the original code path with zero changes, so combining-mark sequences,
CJK, Arabic, Cyrillic, Vietnamese, etc. are unaffected. Two unit tests
pin the contract:

  - `nfc_ascii_fast_path_is_no_op` — runs NFKD on `ﬀ` (producing all-ASCII
    text with non-trivial alignments) and asserts NFC leaves the entire
    `NormalizedString` byte-identical.
  - `nfc_non_ascii_still_runs_unicode_path` — checks that "e" + combining
    acute is still composed to "é".

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 tokenizers/src/normalizers/unicode.rs | 30 +++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/tokenizers/src/normalizers/unicode.rs b/tokenizers/src/normalizers/unicode.rs
index 502b4239b..607087464 100644
--- a/tokenizers/src/normalizers/unicode.rs
+++ b/tokenizers/src/normalizers/unicode.rs
@@ -26,6 +26,13 @@ impl Normalizer for NFKD {
 pub struct NFC;
 impl Normalizer for NFC {
     fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
+        // Fast path: an all-ASCII string is already in NFC. No code point in
+        // U+0000..=U+007F decomposes, and no two of them compose with each
+        // other, so skip the per-`char` Unicode pass and alignments rebuild.
+        // Any non-ASCII byte falls through to the original path unchanged.
+        if normalized.get().is_ascii() {
+            return Ok(());
+        }
         normalized.nfc();
         Ok(())
     }
@@ -100,4 +107,27 @@ mod tests {
 
         assert_eq!(n.alignments_original(), vec![(0, 2), (0, 2), (0, 2)]);
     }
+
+    #[test]
+    fn nfc_ascii_fast_path_is_no_op() {
+        // NFKD expands the ligature U+FB00 to ASCII text, so `normalized` is
+        // all-ASCII while `alignments` still refers to the 3-byte original.
+        // NFC over that input must leave `n` equal to its pre-call clone.
+        let mut n = NormalizedString::from("\u{fb00}");
+        n.nfkd();
+        assert!(n.get().is_ascii());
+
+        let before = n.clone();
+        NFC.normalize(&mut n).unwrap();
+        assert_eq!(n, before);
+    }
+
+    #[test]
+    fn nfc_non_ascii_still_runs_unicode_path() {
+        // "e" followed by U+0301 COMBINING ACUTE ACCENT is not all-ASCII, so it
+        // must bypass the fast path and still be composed to U+00E9 ("é").
+        let mut n = NormalizedString::from("e\u{0301}");
+        NFC.normalize(&mut n).unwrap();
+        assert_eq!(n.get(), "\u{00e9}");
+    }
 }