18 changes: 18 additions & 0 deletions tokenizers/benches/ci_benchmark.rs
@@ -233,6 +233,24 @@ fn bench_serialization(c: &mut Criterion) {
b.iter(|| black_box(serde_json::from_str::<Tokenizer>(&llama3_json).unwrap()))
});

// Deserialize with 100k added tokens + NFKC normalizer
// This stresses the normalize path during add_tokens/refresh_added_tokens.
{
use tokenizers::normalizers::NFKC;
let mut tok = Tokenizer::from_file("data/roberta.json").unwrap();
let _ = tok.with_normalizer(Some(NFKC));
let tokens: Vec<_> = (0..100_000)
.map(|i| AddedToken::from(format!("tok{i}"), false))
.collect();
let _ = tok.add_tokens(tokens);
let path = std::env::temp_dir().join("bench_100k_nfkc.json");
tok.save(&path, false).unwrap();
group.bench_function("deserialize-100k-nfkc", |b| {
b.iter(|| black_box(Tokenizer::from_file(&path).unwrap()))
});
std::fs::remove_file(&path).ok();
}

group.finish();
}

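Context for the new case: on `from_file`, every stored added token is re-inserted into the vocabulary, and each insertion normalizes the token content, so 100k tokens under NFKC exercise the normalizer heavily. A minimal sketch of that hot loop, with hypothetical names (`normalize` stands in for the crate's internal per-token normalization pass):

```rust
// Hypothetical sketch of the per-token work repeated during deserialization.
// `normalize` stands in for a Normalizer pass with full alignment tracking.
fn refresh_added_tokens_sketch(
    contents: &[String],
    normalize: impl Fn(&str) -> String,
) -> Vec<String> {
    // One normalizer pass per added token: 100_000 passes for this benchmark.
    contents.iter().map(|c| normalize(c)).collect()
}
```

The case runs alongside the rest of the suite via `cargo bench`.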
33 changes: 33 additions & 0 deletions tokenizers/src/tokenizer/added_vocabulary.rs
@@ -562,6 +562,39 @@ impl AddedVocabulary {

pretokenized
}

/// Like [`extract_and_normalize`] but uses [`Normalizer::normalize_str`]
/// instead of [`Normalizer::normalize`], skipping alignment tracking.
///
/// This is used by `encode_fast`, where offsets are not needed. The
/// normalization step skips tracking alignments through each transformation,
/// avoiding O(n) bookkeeping per split (only a trivial 1:1 alignment is
/// kept so that slicing still works).
pub fn extract_and_normalize_fast<N: Normalizer>(
&self,
normalizer: Option<&N>,
sequence: &str,
) -> PreTokenizedString {
let mut pretokenized: PreTokenizedString = sequence.into();

// 1. Extract non-normalized tokens from the raw string
pretokenized
.split(|_, sequence| Ok(self.split_with_indices(sequence, &self.split_trie)))
.expect("AddedVocabulary bad split");

// 2. Normalize remaining pieces via normalize_str (no alignment tracking)
// and extract normalized tokens
pretokenized
.split(|_, mut sequence| {
if let Some(n) = normalizer {
let normed = n.normalize_str(sequence.get())?;
sequence.set_normalized(normed);
}
Ok(self.split_with_indices(sequence, &self.split_normalized_trie))
})
.expect("AddedVocabulary bad split");

pretokenized
}
}

impl Default for AddedVocabulary {
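A hedged usage sketch of the new method (it assumes an `AddedVocabulary` already populated by a tokenizer and the crate's `NFKC` normalizer; not code from this PR):

```rust
use tokenizers::normalizers::NFKC;

// `vocab: &AddedVocabulary` is assumed to come from an existing tokenizer.
let normalizer = NFKC;
let pretok = vocab.extract_and_normalize_fast(Some(&normalizer), "ｆｕｌｌｗｉｄｔｈ text");
// The returned PreTokenizedString carries normalized splits but only trivial
// offset mappings, so it should only feed paths that discard offsets.
```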
21 changes: 18 additions & 3 deletions tokenizers/src/tokenizer/mod.rs
@@ -55,6 +55,16 @@ pub type Offsets = (usize, usize);
/// Takes care of pre-processing strings.
pub trait Normalizer: Sync {
fn normalize(&self, normalized: &mut NormalizedString) -> Result<()>;

/// Normalize a plain string without tracking alignments.
///
/// The default allocates a full [`NormalizedString`]. Normalizers that can
/// produce their output more cheaply should override this.
fn normalize_str(&self, s: &str) -> Result<String> {
let mut n = NormalizedString::from(s);
self.normalize(&mut n)?;
Ok(n.get().to_owned())
}
}

/// The `PreTokenizer` is in charge of doing the pre-segmentation step. It splits the given string
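The default body round-trips through a `NormalizedString`; a normalizer whose output can be computed directly should override `normalize_str`. A sketch with a hypothetical lowercasing normalizer (not part of the crate):

```rust
struct FastLowercase;

impl Normalizer for FastLowercase {
    fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
        // Full path: keeps per-character alignments up to date.
        normalized.lowercase();
        Ok(())
    }

    fn normalize_str(&self, s: &str) -> Result<String> {
        // Fast path: a single allocation, no alignment bookkeeping.
        Ok(s.to_lowercase())
    }
}
```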
@@ -731,10 +741,15 @@
type_id: u32,
offsets_type: OffsetType,
) -> Result<Encoding> {
let fast = matches!(offsets_type, OffsetType::None);
let encode = |is_pre_tokenized, subseq_idx, subseq| -> Result<Encoding> {
let normalized = self
.added_vocabulary
.extract_and_normalize(self.normalizer.as_ref(), subseq);
let normalized = if fast {
self.added_vocabulary
.extract_and_normalize_fast(self.normalizer.as_ref(), subseq)
} else {
self.added_vocabulary
.extract_and_normalize(self.normalizer.as_ref(), subseq)
};
let pre_tokenized = self.do_pre_tokenize(normalized)?;
let subseq_encoding = self.do_tokenize(
pre_tokenized,
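The branch is driven entirely by the requested offset type: `encode_fast` passes `OffsetType::None` and gets the `normalize_str` path, while `encode` keeps full alignment tracking. A hedged caller-side illustration (assuming `encode_fast` mirrors `encode`'s signature, as in the crate's public API):

```rust
// Offsets preserved: added-token extraction uses Normalizer::normalize.
let full = tokenizer.encode("Hello there", false)?;
// Offsets discarded (OffsetType::None): uses extract_and_normalize_fast.
let fast = tokenizer.encode_fast("Hello there", false)?;
assert_eq!(full.get_ids(), fast.get_ids());
```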
12 changes: 12 additions & 0 deletions tokenizers/src/tokenizer/normalizer.rs
@@ -136,6 +136,18 @@ impl NormalizedString {
&self.normalized
}

/// Replace the normalized content without tracking alignments.
///
/// This is significantly cheaper than going through `transform()`: only a
/// trivial 1:1 alignment is stored (so slicing keeps working), and no
/// per-transformation offset bookkeeping is done. Use this only when offset
/// tracking is not needed (e.g. `encode_fast`).
pub fn set_normalized(&mut self, new: String) {
// Build trivial 1:1 alignments so that slice() still works for
// splitting, but no real offset mapping is preserved.
self.alignments = (0..new.len()).map(|i| (i, i + 1)).collect();
self.normalized = new;
}

/// Return the original string
pub fn get_original(&self) -> &str {
&self.original
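A short standalone sketch of the trade-off `set_normalized` makes (assuming the crate's public `NormalizedString`):

```rust
use tokenizers::NormalizedString;

let mut n = NormalizedString::from("ＡＢＣ");
n.set_normalized("ABC".to_string());
assert_eq!(n.get(), "ABC");
// The alignments are now a trivial 1:1 mapping, not a real offset map back
// into "ＡＢＣ": fine for encode_fast, wrong wherever original offsets matter.
```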