Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion tokenizers/src/models/unigram/model.rs
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,14 @@ impl Unigram {
byte_fallback: bool,
) -> Result<Self> {
let n = vocab.len();
let mut token_to_ids: TokenMap = AHashMap::new();
// Pre-size the map: we already know exactly how many entries
// are coming. Without this hint, AHashMap::new() starts at 0
// capacity and grows by doubling, triggering ~log2(n) rehashes
// that each allocate a fresh table and copy every existing
// entry. For a 500k-vocab tokenizer (e.g. multilingual models)
// this churn dominates loader memory — measured at tens of MB
// of redundant transient allocations.
let mut token_to_ids: TokenMap = AHashMap::with_capacity(n);
let mut builder = TrieBuilder::default();

if let Some(unk_id) = unk_id {
Expand Down
40 changes: 35 additions & 5 deletions tokenizers/src/models/unigram/trie.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
use ahash::AHashMap;
use std::hash::Hash;

#[derive(Default)]
Expand All @@ -25,7 +24,29 @@ impl<Label: Eq + Hash + Copy> Trie<Label> {
/// Inserts `element` into the trie, creating intermediate nodes as
/// needed and marking the terminal node as a leaf.
///
/// Children are stored as a flat `Vec<(Label, Node)>` rather than a
/// hash map: Unigram trie nodes have tiny fan-out (usually 1-4
/// children, with only the root approaching alphabet size), so a
/// linear scan over a packed Vec beats a hash map on both memory
/// (no per-node hash-table overhead multiplied across millions of
/// nodes) and lookup latency (predictable, cache-resident).
///
/// NOTE(review): the fan-out figures above come from the PR
/// discussion, not from code visible here — confirm against real
/// vocabularies before relying on them.
pub fn push(&mut self, element: &[Label]) {
    let mut node = &mut self.root;
    for label in element.iter() {
        // Linear search over the packed children list. Using
        // `position` (an index) rather than `iter_mut().find`
        // sidesteps the borrow-checker conflict between holding a
        // child reference and conditionally pushing a new child.
        let pos = node.children.iter().position(|(l, _)| *l == *label);
        node = match pos {
            Some(idx) => &mut node.children[idx].1,
            None => {
                node.children.push((*label, Node::default()));
                let last = node.children.len() - 1;
                &mut node.children[last].1
            }
        };
    }
    node.is_leaf = true;
}
Expand Down Expand Up @@ -58,7 +79,13 @@ where
loop {
let label = self.iterator.next()?;
self.prefix.push(label);
let child = self.node.children.get(&label)?;
// Linear find — same fan-out argument as in `push`.
let child = self
.node
.children
.iter()
.find(|(l, _)| *l == label)
.map(|(_, n)| n)?;
self.node = child;
if self.node.is_leaf {
return Some(self.prefix.clone());
Expand All @@ -78,14 +105,17 @@ impl<Label> Default for Trie<Label> {
/// One node of the Unigram prefix trie.
#[derive(Clone)]
pub struct Node<Label> {
    /// True when some pushed element ends exactly at this node.
    is_leaf: bool,
    /// Packed list of children. An empty `Vec` (no heap allocation)
    /// when the node has no children — the dominant case in deep trie
    /// branches. Fan-out is otherwise tiny, so lookups are a short
    /// linear scan rather than a hash-map probe, avoiding a per-node
    /// hash-table allocation across millions of nodes.
    children: Vec<(Label, Node<Label>)>,
}

impl<Label> Default for Node<Label> {
fn default() -> Self {
Self {
is_leaf: false,
children: AHashMap::new(),
children: Vec::new(),
}
}
}