Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions tokenizers/src/normalizers/unicode.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,13 @@ impl Normalizer for NFKD {
pub struct NFC;
impl Normalizer for NFC {
fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
// ASCII strings are NFC by definition (U+0000..=U+007F have no
// decomposition or composition mappings), so we can skip the
// per-`char` Unicode pass and the alignments rebuild it triggers.
// Any non-ASCII byte falls through to the original path unchanged.
if normalized.get().is_ascii() {
return Ok(());
}
normalized.nfc();
Ok(())
}
Expand Down Expand Up @@ -100,4 +107,27 @@ mod tests {

assert_eq!(n.alignments_original(), vec![(0, 2), (0, 2), (0, 2)]);
}

#[test]
fn nfc_ascii_fast_path_is_no_op() {
// After an NFKD step expands a ligature, `normalized` is all-ASCII but
// `alignments` is non-trivial (each output byte still maps back to the
// 3-byte ligature). NFC over ASCII must leave every field untouched.
let mut n = NormalizedString::from("\u{fb00}");
n.nfkd();
assert!(n.get().is_ascii());

let before = n.clone();
NFC.normalize(&mut n).unwrap();
assert_eq!(n, before);
}

#[test]
fn nfc_non_ascii_still_runs_unicode_path() {
// A combining-mark sequence ("e" + COMBINING ACUTE) must still be
// composed to "é" by the original NFC path; the gate must not skip it.
let mut n = NormalizedString::from("e\u{0301}");
NFC.normalize(&mut n).unwrap();
assert_eq!(n.get(), "\u{00e9}");
}
}