From b1436b3db3894d9165b3c00622f37856f08905a7 Mon Sep 17 00:00:00 2001 From: Taeyun Jang Date: Mon, 27 Apr 2026 02:29:42 +0900 Subject: [PATCH] perf(unigram): pre-size token map and replace per-node HashMap with Vec MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While profiling Unigram::from for the 500,353-vocab minishlab/potion-multilingual-128M tokenizer, two allocation sites showed up as dominant in a dhat heap profile. PR #1799 already swapped the std HashMaps in this code path to ahash::AHashMap, which addressed hasher cost; the remaining heap pressure is structural. 1) models/unigram/model.rs: AHashMap::new() built without a capacity hint despite vocab.len() being known. Replaced with AHashMap::with_capacity(n) so the 500k-vocab map skips ~17 doubling rehashes on load. 2) models/unigram/trie.rs: Node::children switched from AHashMap to Vec<(Label, Node)>. Trie nodes typically have 1–4 children; even the root maxes out at the alphabet size (≤256 for byte-level tries). At those fan-outs a packed Vec with linear scan is smaller and faster than a hashbrown table — and crucially, the empty Vec costs zero allocations vs ~48 B of hashbrown header per node × millions of nodes. Measured on a 1500-fact wiki search bench using minishlab/potion-multilingual-128M (decode 20 queries, embed each), v0.22.2 base vs both fixes applied: Heap peak 515.2 MB → 315.3 MB -39% RSS peak 840.8 MB → 554.0 MB -34% phys_footprint 708.3 MB → 421.4 MB -41% CPU user time 1673 ms → 1447 ms -14% p50 latency 23.27 ms → 23.27 ms no regression p95 latency 59.59 ms → 59.32 ms noise dhat At t-gmax 540.2 MB → 330.6 MB -39% The CPU win is incidental — fewer allocator round-trips through the global allocator, plus tighter inner loops in the trie scan. Public API unchanged; cargo test -p tokenizers --lib unigram (20 tests) passes. Encoding determinism verified externally via Model2Vec round-trip: cosine(encode_baseline, encode_patched) > 0.9999 on real text. 
--- tokenizers/src/models/unigram/model.rs | 9 +++++- tokenizers/src/models/unigram/trie.rs | 40 ++++++++++++++++++++++---- 2 files changed, 43 insertions(+), 6 deletions(-) diff --git a/tokenizers/src/models/unigram/model.rs b/tokenizers/src/models/unigram/model.rs index 3a9a6bddbd..5b153de404 100644 --- a/tokenizers/src/models/unigram/model.rs +++ b/tokenizers/src/models/unigram/model.rs @@ -107,7 +107,14 @@ impl Unigram { byte_fallback: bool, ) -> Result<Self> { let n = vocab.len(); - let mut token_to_ids: TokenMap = AHashMap::new(); + // Pre-size the map: we already know exactly how many entries + // are coming. Without this hint, AHashMap::new() starts at 0 + // capacity and grows by doubling, triggering ~log2(n) rehashes + // that each allocate a fresh table and copy every existing + // entry. For a 500k-vocab tokenizer (e.g. multilingual models) + // this churn dominates loader memory — measured at tens of MB + // of redundant transient allocations. + let mut token_to_ids: TokenMap = AHashMap::with_capacity(n); let mut builder = TrieBuilder::default(); if let Some(unk_id) = unk_id { diff --git a/tokenizers/src/models/unigram/trie.rs b/tokenizers/src/models/unigram/trie.rs index 7c7149d00a..11e601dac3 100644 --- a/tokenizers/src/models/unigram/trie.rs +++ b/tokenizers/src/models/unigram/trie.rs @@ -1,4 +1,3 @@ -use ahash::AHashMap; use std::hash::Hash; #[derive(Default)] @@ -25,7 +24,29 @@ impl Trie