From b1436b3db3894d9165b3c00622f37856f08905a7 Mon Sep 17 00:00:00 2001 From: Taeyun Jang Date: Mon, 27 Apr 2026 02:29:42 +0900 Subject: [PATCH] perf(unigram): pre-size token map and replace per-node HashMap with Vec MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While profiling Unigram::from for the 500,353-vocab minishlab/potion-multilingual-128M tokenizer, two allocation sites showed up as dominant in a dhat heap profile. PR #1799 already swapped the std HashMaps in this code path to ahash::AHashMap, which addressed hasher cost; the remaining heap pressure is structural. 1) models/unigram/model.rs: AHashMap::new() built without a capacity hint despite vocab.len() being known. Replaced with AHashMap::with_capacity(n) so the 500k-vocab map skips ~17 doubling rehashes on load. 2) models/unigram/trie.rs: Node::children switched from AHashMap to Vec<(Label, Node)>. Trie nodes typically have 1–4 children; even the root maxes out at the alphabet size (≤256 for byte-level tries). At those fan-outs a packed Vec with linear scan is smaller and faster than a hashbrown table — and crucially, the empty Vec costs zero allocations vs ~48 B of hashbrown header per node × millions of nodes. Measured on a 1500-fact wiki search bench using minishlab/potion-multilingual-128M (decode 20 queries, embed each), v0.22.2 base vs both fixes applied: Heap peak 515.2 MB → 315.3 MB -39% RSS peak 840.8 MB → 554.0 MB -34% phys_footprint 708.3 MB → 421.4 MB -41% CPU user time 1673 ms → 1447 ms -14% p50 latency 23.27 ms → 23.27 ms no regression p95 latency 59.59 ms → 59.32 ms noise dhat At t-gmax 540.2 MB → 330.6 MB -39% The CPU win is incidental — fewer allocator round-trips through the global allocator, plus tighter inner loops in the trie scan. Public API unchanged; cargo test -p tokenizers --lib unigram (20 tests) passes. Encoding determinism verified externally via Model2Vec round-trip: cosine(encode_baseline, encode_patched) > 0.9999 on real text. 
--- tokenizers/src/models/unigram/model.rs | 9 +++++- tokenizers/src/models/unigram/trie.rs | 40 ++++++++++++++++++++++---- 2 files changed, 43 insertions(+), 6 deletions(-) diff --git a/tokenizers/src/models/unigram/model.rs b/tokenizers/src/models/unigram/model.rs index 3a9a6bddbd..5b153de404 100644 --- a/tokenizers/src/models/unigram/model.rs +++ b/tokenizers/src/models/unigram/model.rs @@ -107,7 +107,14 @@ impl Unigram { byte_fallback: bool, ) -> Result<Self> { let n = vocab.len(); - let mut token_to_ids: TokenMap = AHashMap::new(); + // Pre-size the map: we already know exactly how many entries + // are coming. Without this hint, AHashMap::new() starts at 0 + // capacity and grows by doubling, triggering ~log2(n) rehashes + // that each allocate a fresh table and copy every existing + // entry. For a 500k-vocab tokenizer (e.g. multilingual models) + // this churn dominates loader memory — measured at tens of MB + // of redundant transient allocations. + let mut token_to_ids: TokenMap = AHashMap::with_capacity(n); let mut builder = TrieBuilder::default(); if let Some(unk_id) = unk_id { diff --git a/tokenizers/src/models/unigram/trie.rs b/tokenizers/src/models/unigram/trie.rs index 7c7149d00a..11e601dac3 100644 --- a/tokenizers/src/models/unigram/trie.rs +++ b/tokenizers/src/models/unigram/trie.rs @@ -1,4 +1,3 @@ -use ahash::AHashMap; use std::hash::Hash; #[derive(Default)] @@ -25,7 +24,29 @@ impl Trie