Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion tokenizers/src/models/unigram/model.rs
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,14 @@ impl Unigram {
byte_fallback: bool,
) -> Result<Self> {
let n = vocab.len();
let mut token_to_ids: TokenMap = AHashMap::new();
// Pre-size the map: we already know exactly how many entries
// are coming. Without this hint, AHashMap::new() starts at 0
// capacity and grows by doubling, triggering ~log2(n) rehashes
// that each allocate a fresh table and copy every existing
// entry. For a 500k-vocab tokenizer (e.g. multilingual models)
// this churn dominates loader memory — measured at tens of MB
// of redundant transient allocations.
let mut token_to_ids: TokenMap = AHashMap::with_capacity(n);
let mut builder = TrieBuilder::default();

if let Some(unk_id) = unk_id {
Expand Down
40 changes: 35 additions & 5 deletions tokenizers/src/models/unigram/trie.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
use ahash::AHashMap;
use std::hash::Hash;

#[derive(Default)]
Expand All @@ -25,7 +24,29 @@ impl<Label: Eq + Hash + Copy> Trie<Label> {
/// Inserts `element` into the trie, creating intermediate nodes as
/// needed and marking the terminal node as a leaf.
///
/// Children are stored as a flat `Vec<(Label, Node)>` rather than a
/// hash map: Unigram trie nodes have tiny fan-out (usually 1-4
/// children, with only the root approaching alphabet size), so a
/// linear scan over a packed Vec beats a hash map on both memory
/// (no per-node hash-table overhead multiplied across millions of
/// nodes) and lookup latency (predictable, cache-resident).
///
/// NOTE(review): the fan-out figures above come from the PR
/// discussion, not from code visible here — confirm against real
/// vocabularies before relying on them.
pub fn push(&mut self, element: &[Label]) {
    let mut node = &mut self.root;
    for label in element.iter() {
        // Linear search over the packed children list. Using
        // `position` (an index) rather than `iter_mut().find`
        // sidesteps the borrow-checker conflict between holding a
        // child reference and conditionally pushing a new child.
        let pos = node.children.iter().position(|(l, _)| *l == *label);
        node = match pos {
            Some(idx) => &mut node.children[idx].1,
            None => {
                node.children.push((*label, Node::default()));
                let last = node.children.len() - 1;
                &mut node.children[last].1
            }
        };
    }
    node.is_leaf = true;
}
Expand Down Expand Up @@ -58,7 +79,13 @@ where
loop {
let label = self.iterator.next()?;
self.prefix.push(label);
let child = self.node.children.get(&label)?;
// Linear find — same fan-out argument as in `push`.
let child = self
.node
.children
.iter()
.find(|(l, _)| *l == label)
.map(|(_, n)| n)?;
self.node = child;
if self.node.is_leaf {
return Some(self.prefix.clone());
Expand All @@ -78,14 +105,17 @@ impl<Label> Default for Trie<Label> {
/// One node of the Unigram prefix trie.
#[derive(Clone)]
pub struct Node<Label> {
    /// True when some pushed element ends exactly at this node.
    is_leaf: bool,
    /// Packed list of children. An empty `Vec` (no heap allocation)
    /// when the node has no children — the dominant case in deep trie
    /// branches. Fan-out is otherwise tiny, so lookups are a short
    /// linear scan rather than a hash-map probe, avoiding a per-node
    /// hash-table allocation across millions of nodes.
    children: Vec<(Label, Node<Label>)>,
}

impl<Label> Default for Node<Label> {
fn default() -> Self {
Self {
is_leaf: false,
children: AHashMap::new(),
children: Vec::new(),
}
}
}