From b2ae556199fd10c2f9b686a086901bdd11e34833 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Thu, 23 Apr 2026 07:11:10 +0000 Subject: [PATCH] =?UTF-8?q?benches:=20real-world=20(batch=20=C3=97=20input?= =?UTF-8?q?=5Flength)=20matrix=20+=20cross-library=20leaderboard?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rewrites the tiktoken comparison bench into a standardized (batch_size × input_length) sweep mirroring the knobs used by fastokens' `examples/ablation.sh` and wordchipper's fineweb batch bench. Samples are pulled from `zai-org/LongBench-v2` and truncated/repeated per prompt to hit exact token lengths (same helper as fastokens' `_adjust_tokens`). **Python side — `bindings/python/benches/test_tiktoken.py`** - Five backends on a uniform encode/decode API, skipped gracefully if unavailable: `tokenizers`, `tiktoken`, `wordchipper` (https://github.com/zspacelabs/wordchipper), `iree.tokenizer` (https://github.com/iree-org/iree-tokenizer-py), `bpe` via `bpe-openai` (https://github.com/github/rust-gems). - Accepts OpenAI encoding names (`cl100k_base`, `o200k_base`, `gpt2`, `llama3`) or any HF repo id. `--hf-models` iterates a list and prints a cross-model leaderboard. - Both encode and decode are timed (best of warmup+iters); `rich` renders live colored tables with per-row winner and a geo-mean summary. - Cross-backend correctness probe before timing. - Fairness preflight (CPU model, load avg, pinned CPUs, governor) with an optional `--strict-fairness` abort above 50% nproc. - `--save-json` / `--save-md` serialize full results + a markdown leaderboard. **Rust side — `tokenizers/benches/matrix_benchmark.rs`** - New Criterion bench that sweeps the same (batch, input_length) matrix and measures `encode_batch`, `encode_batch_fast`, and **`decode_batch`** — the prior suite (`ci_benchmark`, `llama3_benchmark`) had no decode coverage and no parametric matrix. - Matrix is env-configurable via `MATRIX_BATCH_SIZES`, `MATRIX_INPUT_LENGTHS`. - Registered in `tokenizers/Cargo.toml`. --- bindings/python/benches/test_tiktoken.py | 1066 ++++++++++++++++++++-- tokenizers/Cargo.toml | 4 + tokenizers/benches/matrix_benchmark.rs | 159 ++++ 3 files changed, 1135 insertions(+), 94 deletions(-) create mode 100644 tokenizers/benches/matrix_benchmark.rs diff --git a/bindings/python/benches/test_tiktoken.py b/bindings/python/benches/test_tiktoken.py index f88c18e0f..d59154525 100755 --- a/bindings/python/benches/test_tiktoken.py +++ b/bindings/python/benches/test_tiktoken.py @@ -1,128 +1,1006 @@ +"""Real-world tokenizer throughput benchmark. + +Compares up to five backends head-to-head on a standardized +``(batch_size, input_length_tokens)`` matrix — the same knobs fastokens' and +wordchipper's ablation scripts sweep over: + + * ``tokenizers`` (this repo) + * ``tiktoken`` (OpenAI) + * ``wordchipper`` (https://github.com/zspacelabs/wordchipper) + * ``iree.tokenizer`` (https://github.com/iree-org/iree-tokenizer-py) + * ``bpe`` / ``bpe-openai`` (https://github.com/github/rust-gems/tree/main/crates/bpe) + +Real prompts are sourced from ``zai-org/LongBench-v2`` (same dataset used by +fastokens' ``simple_bench.rs``), tokenized once, then truncated/repeated per +prompt to reach exactly ``input_length`` tokens — mirroring fastokens' +``_adjust_tokens`` helper and wordchipper's fineweb batch approach. + +Both **encode** and **decode** are benchmarked over the same matrix. Results +stream live into colored ``rich`` tables. 
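+
+Example invocations (from the repo root; all flags are defined in ``main`` below)::
+
+    python bindings/python/benches/test_tiktoken.py -e cl100k_base --backends tokenizers tiktoken
+    python bindings/python/benches/test_tiktoken.py --hf-models --save-md leaderboard.md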
+ +Backends that are not installed are gracefully skipped. +""" + +import argparse +import datetime +import json import os +import platform +import statistics +import subprocess import time -import argparse -from datasets import load_dataset -from tiktoken.load import load_tiktoken_bpe # type: ignore[import] -import tiktoken # type: ignore[import] -from tokenizers import Tokenizer +from multiprocessing import Process, Queue +from typing import Any, Callable, Dict, List, Optional, Tuple + from huggingface_hub import hf_hub_download -from typing import Tuple, List -from multiprocessing import Process +from rich import box +from rich.console import Console +from rich.live import Live +from rich.panel import Panel +from rich.table import Table + +# Backends whose encode outputs must agree for a given encoding. +ENCODINGS: Dict[str, Dict[str, str]] = { + "cl100k_base": {"hf_repo": "Xenova/text-embedding-ada-002"}, + "o200k_base": {"hf_repo": "Xenova/gpt-4o"}, + "gpt2": {"hf_repo": "Xenova/gpt2"}, + "llama3": {"hf_repo": "meta-llama/Llama-3.2-1B"}, +} +DEFAULT_ENCODING = "cl100k_base" +DATASET_REPO = "zai-org/LongBench-v2" +DATASET_FILE = "data.json" + +# Matrix follows fastokens' ``ablation.sh`` (batches) and the token-length +# buckets from ``dynamo_speed.py``. +DEFAULT_BATCH_SIZES = [1, 8, 32, 128, 512] +DEFAULT_INPUT_LENGTHS = [128, 512, 2048, 8192, 32768] +ALL_BACKENDS = ["tokenizers", "tiktoken", "wordchipper", "iree", "bpe"] + +LLAMA_PAT = ( + r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}|" + r" ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+" +) -MODEL_ID = "meta-llama/Llama-3.2-1B" -DATASET = "facebook/xnli" -DATASET_CONFIG = "all_languages" -DEFAULT_THREADS = [2**i for i in range(8) if 2**i] +# --------------------------------------------------------------------------- +# Backend adapters +# --------------------------------------------------------------------------- -def format_byte_size(num_bytes: int) -> Tuple[str, str]: - """Convert bytes to a human-readable format (KB, MB, GB).""" - num_bytes_f = float(num_bytes) - for unit in ["B", "KB", "MB", "GB", "TB"]: - if num_bytes_f < 1024: - return f"{num_bytes_f:.2f} {unit}", unit - num_bytes_f /= 1024 - return f"{num_bytes_f:.2f} PB", "PB" +class Backend: + """Uniform encode/decode API across the four tokenizers under test.""" + def __init__(self, name: str, encode_batch: Callable, decode_batch: Callable) -> None: + self.name = name + self.encode_batch = encode_batch + self.decode_batch = decode_batch -def benchmark_batch(model: str, documents: list[str], num_threads: int, document_length: float) -> None: + +def _resolve_hf_repo(model: str) -> str: + """Map a short encoding name to an HF repo, or pass through as-is.""" + if model in ENCODINGS: + return ENCODINGS[model]["hf_repo"] + return model + + +def _load_hf(model: str, num_threads: int) -> Optional[Backend]: + try: + from tokenizers import Tokenizer + except ImportError: + return None os.environ["RAYON_NUM_THREADS"] = str(num_threads) - num_bytes = sum(map(len, map(str.encode, documents))) - readable_size, unit = format_byte_size(num_bytes) - print(f"==============") - print( - f"num_threads: {num_threads}, data size: {readable_size}, documents: {len(documents)} Avg Length: {document_length:.0f}" + try: + tok = Tokenizer.from_pretrained(_resolve_hf_repo(model)) + except Exception: + return None + + def encode_batch(texts: List[str]) -> List[List[int]]: + out = tok.encode_batch_fast(texts, add_special_tokens=False) + return [e.ids for e in out] + + def 
decode_batch(ids_list: List[List[int]]) -> List[str]: + return tok.decode_batch(ids_list, skip_special_tokens=False) + + return Backend("tokenizers", encode_batch, decode_batch) + + +def _load_tiktoken(model: str, num_threads: int) -> Optional[Backend]: + try: + import tiktoken + except ImportError: + return None + + if model in ("cl100k_base", "o200k_base", "gpt2", "r50k_base", "p50k_base"): + enc = tiktoken.get_encoding(model) + else: + # Try to load a tiktoken-format ``original/tokenizer.model`` from the + # HF repo. This only works for llama-3-style BPE models. + from tiktoken.load import load_tiktoken_bpe + repo = _resolve_hf_repo(model) + try: + path = hf_hub_download(repo, "original/tokenizer.model") + except Exception: + return None + try: + ranks = load_tiktoken_bpe(path) + except Exception: + return None + specials = [ + "<|begin_of_text|>", "<|end_of_text|>", + "<|start_header_id|>", "<|end_header_id|>", "<|eot_id|>", + ] + specials_map = {t: len(ranks) + i for i, t in enumerate(specials)} + enc = tiktoken.Encoding( + name=model.replace("/", "_"), + pat_str=LLAMA_PAT, + mergeable_ranks=ranks, + special_tokens=specials_map, + ) + + def encode_batch(texts: List[str]) -> List[List[int]]: + return enc.encode_ordinary_batch(texts, num_threads=num_threads) + + def decode_batch(ids_list: List[List[int]]) -> List[str]: + return enc.decode_batch(ids_list, num_threads=num_threads) + + return Backend("tiktoken", encode_batch, decode_batch) + + +def _load_wordchipper(model: str, num_threads: int) -> Optional[Backend]: + try: + import wordchipper as wc + except ImportError: + return None + # wordchipper only ships OpenAI encodings. + if model not in ("gpt2", "cl100k_base", "r50k_base", "p50k_base", "o200k_base"): + return None + + options = wc.TokenizerOptions.default() + if hasattr(options, "set_parallel"): + options.set_parallel(num_threads > 1) + if hasattr(options, "set_accelerated_lexers"): + options.set_accelerated_lexers(True) + tok = wc.Tokenizer.from_pretrained(model, options) + + def encode_batch(texts: List[str]) -> List[List[int]]: + return tok.encode_batch(texts) + + def decode_batch(ids_list: List[List[int]]) -> List[str]: + return tok.decode_batch(ids_list) + + return Backend("wordchipper", encode_batch, decode_batch) + + +def _load_iree(model: str, num_threads: int) -> Optional[Backend]: + try: + from iree.tokenizer import Tokenizer as IreeTokenizer + except ImportError: + return None + try: + path = hf_hub_download(_resolve_hf_repo(model), "tokenizer.json") + except Exception: + return None + try: + tok = IreeTokenizer.from_file(path) + except Exception: + return None + + def encode_batch(texts: List[str]) -> List[List[int]]: + return tok.encode_batch(texts) + + def decode_batch(ids_list: List[List[int]]) -> List[str]: + return tok.decode_batch(ids_list) + + return Backend("iree", encode_batch, decode_batch) + + +def _load_bpe(model: str, num_threads: int) -> Optional[Backend]: + """github/rust-gems ``bpe-openai`` crate (Python wheel ``bpe-openai``). + + Only ships OpenAI-compatible encodings. Returns ``None`` for arbitrary HF + repos, matching wordchipper's scope. 
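+
+    Every ``bpe_openai`` call below is probed defensively; an import error or
+    an unknown encoding name is reported as "backend unavailable" rather than
+    raised.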
+ """ + try: + import bpe_openai + except ImportError: + return None + try: + names = set(bpe_openai.list_encoding_names()) + except Exception: + return None + if model not in names: + return None + try: + enc = bpe_openai.get_encoding(model) + except Exception: + return None + + def encode_batch(texts: List[str]) -> List[List[int]]: + return enc.encode_ordinary_batch(texts, num_threads=num_threads) + + def decode_batch(ids_list: List[List[int]]) -> List[str]: + return enc.decode_batch(ids_list, num_threads=num_threads) + + return Backend("bpe", encode_batch, decode_batch) + + +BACKEND_LOADERS: Dict[str, Callable[[str, int], Optional[Backend]]] = { + "tokenizers": _load_hf, + "tiktoken": _load_tiktoken, + "wordchipper": _load_wordchipper, + "iree": _load_iree, + "bpe": _load_bpe, +} + + +# --------------------------------------------------------------------------- +# Data / corpus +# --------------------------------------------------------------------------- + + +def _load_prompts(num_prompts: int) -> List[str]: + path = hf_hub_download(DATASET_REPO, DATASET_FILE, repo_type="dataset") + with open(path) as f: + data = json.load(f) + out: List[str] = [] + for item in data: + ctx = (item.get("context") or "").strip() + if ctx: + out.append(ctx) + if len(out) >= num_prompts: + break + return out + + +def _adjust_tokens(ids: List[int], target: int) -> List[int]: + if len(ids) >= target: + return ids[:target] + reps = (target // len(ids)) + 1 + return (ids * reps)[:target] + + +def _build_sample_pool( + reference: Backend, prompts: List[str], input_lengths: List[int], +) -> Dict[int, List[str]]: + """Pre-encode prompts once, then derive per-length texts. + + For each target length ``L`` we produce ``len(prompts)`` strings; at bench + time we slice the first ``batch_size`` of them. + """ + ids_pool = [reference.encode_batch([p])[0] for p in prompts] + ids_pool = [ids for ids in ids_pool if ids] + # Decode adjusted ids back to text using the *reference* backend, so the + # text round-trips exactly to the target length for every backend that + # agrees with the reference encoding. + pool: Dict[int, List[str]] = {} + for L in input_lengths: + adjusted = [_adjust_tokens(ids, L) for ids in ids_pool] + decoded = reference.decode_batch(adjusted) + pool[L] = decoded + return pool + + +# --------------------------------------------------------------------------- +# Timing +# --------------------------------------------------------------------------- + + +def _time(fn: Callable, warmup: int = 1, iters: int = 3) -> float: + for _ in range(warmup): + fn() + best = float("inf") + for _ in range(iters): + t0 = time.perf_counter_ns() + fn() + dt = time.perf_counter_ns() - t0 + if dt < best: + best = dt + return best / 1e9 + + +# --------------------------------------------------------------------------- +# Worker +# --------------------------------------------------------------------------- + + +def _worker( + model: str, + backends_wanted: List[str], + combos: List[Tuple[int, int]], + num_prompts: int, + num_threads: int, + iters: int, + warmup: int, + q: "Queue", +) -> None: + try: + # Order matters: we prefer to use a reference backend for sample + # generation. tokenizers > tiktoken > wordchipper > iree. 
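+        # The first backend that loads successfully becomes the reference
+        # (dict insertion order): it generates the sample pool and anchors
+        # the correctness probe below.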
+ loaded: Dict[str, Backend] = {} + for name in backends_wanted: + loader = BACKEND_LOADERS[name] + try: + b = loader(model, num_threads) + except Exception as e: # noqa: BLE001 + q.put({"log": f"[yellow]skip[/yellow] {name}: {e!r}"}) + b = None + if b is None: + q.put({"log": f"[yellow]skip[/yellow] {name} (not available for {model})"}) + else: + loaded[name] = b + q.put({"log": f"[green]ok[/green] {name} loaded"}) + + if not loaded: + q.put({"error": "no backend could be loaded"}) + return + + reference = next(iter(loaded.values())) + q.put({"log": f"[dim]reference backend for sample generation: {reference.name}[/dim]"}) + + # Cross-backend correctness on a canonical short input. + probe = "Hello world, this is a test of tokenizer equivalence." + ref_ids = reference.encode_batch([probe])[0] + for name, b in loaded.items(): + if b is reference: + continue + ids = b.encode_batch([probe])[0] + if ids != ref_ids: + q.put({"log": f"[red]mismatch[/red] {name} vs {reference.name} on probe"}) + else: + q.put({"log": f"[dim]agree[/dim] {name} == {reference.name}"}) + + prompts = _load_prompts(num_prompts) + input_lengths = sorted({L for _, L in combos}) + text_pool = _build_sample_pool(reference, prompts, input_lengths) + + for batch_size, input_length in combos: + texts_all = text_pool[input_length] + texts = [texts_all[i % len(texts_all)] for i in range(batch_size)] + total_bytes = sum(len(t.encode("utf-8")) for t in texts) + + # Encode + for name, b in loaded.items(): + sec = _time(lambda b=b: b.encode_batch(texts), warmup=warmup, iters=iters) + q.put({ + "phase": "encode", + "backend": name, + "batch_size": batch_size, + "input_length": input_length, + "total_bytes": total_bytes, + "sec": sec, + }) + + # Decode: pre-encode with each backend so every backend decodes its + # own ids (avoids cross-backend id-space drift on llama3). 
+ for name, b in loaded.items(): + ids_list = b.encode_batch(texts) + tok_count = sum(len(ids) for ids in ids_list) + sec = _time(lambda b=b, ids_list=ids_list: b.decode_batch(ids_list), + warmup=warmup, iters=iters) + q.put({ + "phase": "decode", + "backend": name, + "batch_size": batch_size, + "input_length": input_length, + "total_bytes": total_bytes, + "total_tokens": tok_count, + "sec": sec, + }) + except Exception as e: # noqa: BLE001 + q.put({"error": repr(e)}) + finally: + q.put(None) + + +# --------------------------------------------------------------------------- +# Table rendering +# --------------------------------------------------------------------------- + + +def _human_bytes(n: float) -> str: + for unit in ("B", "KB", "MB", "GB", "TB"): + if n < 1024: + return f"{n:.2f} {unit}" + n /= 1024 + return f"{n:.2f} PB" + + +BACKEND_STYLE = { + "tokenizers": "bold green", + "tiktoken": "bold yellow", + "wordchipper": "bold magenta", + "iree": "bold cyan", + "bpe": "bold blue", +} + + +def _color_speedup(ratio: float) -> str: + if ratio >= 1.10: + return f"[bold green]{ratio:.2f}×[/bold green]" + if ratio <= 0.90: + return f"[bold red]{ratio:.2f}×[/bold red]" + return f"[white]{ratio:.2f}×[/white]" + + +def _build_matrix_table( + phase: str, rows: List[dict], backend_order: List[str], throughput_unit: str, +) -> Table: + """One row per (batch_size, input_length); one ms column and one throughput column per backend.""" + grouped: Dict[Tuple[int, int], Dict[str, dict]] = {} + for r in rows: + if r["phase"] != phase: + continue + grouped.setdefault((r["batch_size"], r["input_length"]), {})[r["backend"]] = r + + t = Table( + title=f"[bold]{phase} throughput — {throughput_unit}[/bold]", + box=box.ROUNDED, + header_style="bold cyan", + title_justify="left", + expand=False, ) - filename = hf_hub_download(MODEL_ID, "original/tokenizer.model") - mergeable_ranks = load_tiktoken_bpe(filename) - pat_str = r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+" - num_reserved_special_tokens = 256 - special_tokens = [ - "<|begin_of_text|>", - "<|end_of_text|>", - "<|reserved_special_token_0|>", - "<|reserved_special_token_1|>", - "<|reserved_special_token_2|>", - "<|reserved_special_token_3|>", - "<|start_header_id|>", - "<|end_header_id|>", - "<|reserved_special_token_4|>", - "<|eot_id|>", # end of turn - ] + [f"<|reserved_special_token_{i}|>" for i in range(5, num_reserved_special_tokens - 5)] - num_base_tokens = len(mergeable_ranks) - special_tokens = {token: num_base_tokens + i for i, token in enumerate(special_tokens)} - enc = tiktoken.Encoding( - name=model, - pat_str=pat_str, - mergeable_ranks=mergeable_ranks, - special_tokens=special_tokens, + t.add_column("batch", justify="right", style="cyan", no_wrap=True) + t.add_column("input_len", justify="right", style="cyan", no_wrap=True) + for name in backend_order: + style = BACKEND_STYLE.get(name, "white") + t.add_column(f"{name}\nms", justify="right", style=style, no_wrap=True) + t.add_column(f"{name}\n{throughput_unit}", justify="right", style=style, no_wrap=True) + t.add_column("winner", justify="center", no_wrap=True) + + last_bs = None + for (bs, L) in sorted(grouped.keys()): + cell = grouped[(bs, L)] + if last_bs is not None and bs != last_bs: + t.add_section() + last_bs = bs + + row: List[str] = [str(bs), str(L)] + throughputs: Dict[str, float] = {} + for name in backend_order: + r = cell.get(name) + if r is None: + row.extend(["-", "-"]) + continue + ms = r["sec"] * 1000 + 
if phase == "encode": + through = r["total_bytes"] / r["sec"] / 1e6 # MB/s + else: + through = r.get("total_tokens", 0) / r["sec"] / 1e6 # Mtok/s + throughputs[name] = through + row.append(f"{ms:,.2f}") + row.append(f"{through:,.1f}") + + if throughputs: + winner = max(throughputs, key=throughputs.get) + style = BACKEND_STYLE.get(winner, "white") + row.append(f"[{style}]{winner}[/{style}]") + else: + row.append("-") + t.add_row(*row) + return t + + +def _summary_panel(rows: List[dict], backend_order: List[str], phase: str) -> Panel: + """Geo-mean speedup of tokenizers over each competitor for the given phase.""" + if "tokenizers" not in backend_order: + return Panel("[dim]tokenizers backend missing — no relative summary[/dim]", box=box.ROUNDED) + + per_combo: Dict[Tuple[int, int], Dict[str, float]] = {} + for r in rows: + if r["phase"] != phase: + continue + per_combo.setdefault((r["batch_size"], r["input_length"]), {})[r["backend"]] = r["sec"] + + parts: List[str] = [] + for other in backend_order: + if other == "tokenizers": + continue + ratios = [] + for combo, backends in per_combo.items(): + if "tokenizers" in backends and other in backends: + if backends["tokenizers"] > 0 and backends[other] > 0: + ratios.append(backends[other] / backends["tokenizers"]) + if not ratios: + continue + gmean = statistics.geometric_mean(ratios) + best = max(ratios) + worst = min(ratios) + parts.append( + f"vs [{BACKEND_STYLE.get(other, 'white')}]{other}[/]: " + f"geo-mean {_color_speedup(gmean)} " + f"range [{worst:.2f}× .. {best:.2f}×]" + ) + body = "\n".join(parts) if parts else "[dim]no pairwise data[/dim]" + return Panel(body, title=f"{phase} summary (higher is better for tokenizers)", box=box.ROUNDED) + + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- + + +def run( + model: str, + backends_wanted: List[str], + batch_sizes: List[int], + input_lengths: List[int], + num_threads: int, + num_prompts: int, + iters: int, + warmup: int, + console: Optional[Console] = None, + print_tables: bool = True, +) -> Tuple[List[dict], List[str]]: + """Run the (batch × input_length) matrix for a single model. + + Returns ``(rows, backend_order)`` so callers (e.g., multi-model suites) + can aggregate without re-parsing printed output. 
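+
+    Minimal programmatic use (a 2×2 matrix over two backends)::
+
+        rows, order = run(
+            "cl100k_base", ["tokenizers", "tiktoken"],
+            batch_sizes=[1, 8], input_lengths=[128, 512],
+            num_threads=8, num_prompts=4, iters=3, warmup=1,
+        )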
+ """ + console = console or Console() + combos = [(b, L) for b in batch_sizes for L in input_lengths] + + console.print(Panel( + f"[bold]model:[/bold] [cyan]{model}[/cyan] " + f"[bold]backends:[/bold] {backends_wanted}\n" + f"[bold]batch sizes:[/bold] {batch_sizes}\n" + f"[bold]input lengths:[/bold] {input_lengths}\n" + f"[bold]threads:[/bold] {num_threads} " + f"[bold]iters:[/bold] {iters} (warmup {warmup}) " + f"[bold]combos:[/bold] {len(combos)}", + title="benchmark configuration", box=box.ROUNDED, title_align="left", + )) + + q: "Queue" = Queue() + p = Process( + target=_worker, + args=(model, backends_wanted, combos, num_prompts, + num_threads, iters, warmup, q), ) - out = enc.encode("This is a test") + p.start() + + rows: List[dict] = [] + error: Optional[str] = None + ordered: List[str] = [] + expected = len(backends_wanted) * len(combos) * 2 + + def _status(done: int, current: Optional[dict]) -> Panel: + bar = "█" * int(40 * done / max(expected, 1)) + bar = bar.ljust(40, "·") + line = f"[cyan]{bar}[/cyan] {done}/{expected}" + if current is not None: + line += ( + f" [dim]{current['phase']} " + f"{current['backend']} bs={current['batch_size']} " + f"len={current['input_length']} " + f"{current['sec']*1000:.2f}ms[/dim]" + ) + return Panel(line, title="benchmark progress", box=box.ROUNDED, title_align="left") + + last_item: Optional[dict] = None + with Live(_status(0, None), console=console, refresh_per_second=8) as live: + while True: + item = q.get() + if item is None: + break + if "log" in item: + live.console.log(item["log"]) + continue + if "error" in item: + error = item["error"] + continue + if item["backend"] not in ordered: + ordered.append(item["backend"]) + rows.append(item) + last_item = item + live.update(_status(len(rows), last_item)) - hf_enc = Tokenizer.from_pretrained(model) - out2 = hf_enc.encode("This is a test", add_special_tokens=False).ids + p.join() - assert out == out2, "sanity check" + if print_tables: + console.print() + console.print(_build_matrix_table("encode", rows, ordered, "MB/s")) + console.print(_summary_panel(rows, ordered, "encode")) + console.print() + console.print(_build_matrix_table("decode", rows, ordered, "Mtok/s")) + console.print(_summary_panel(rows, ordered, "decode")) - start = time.perf_counter_ns() - enc.encode_ordinary_batch(documents, num_threads=num_threads) - end = time.perf_counter_ns() + if error: + console.print(f"[bold red]worker error for {model}:[/bold red] {error}") - readable_size, unit = format_byte_size(num_bytes / (end - start) * 1e9) - print(f"tiktoken \t{readable_size} / s") + return rows, ordered - start = time.perf_counter_ns() - hf_enc.encode_batch_fast(documents) - end = time.perf_counter_ns() - readable_size, unit = format_byte_size(num_bytes / (end - start) * 1e9) - print(f"huggingface \t{readable_size} / s") +# --------------------------------------------------------------------------- +# Multi-model suite +# --------------------------------------------------------------------------- -def test(model: str, dataset: str, dataset_config: str, threads: List[int]): - dataset_xnli = load_dataset(dataset, dataset_config) - input_lengths = [(10, False, True), (10_000, False, True), (10_000, False, False)] +def _best_throughput( + rows: List[dict], backend: str, phase: str, kind: str, +) -> Optional[float]: + """Return the peak throughput (MB/s for encode, Mtok/s for decode) over all combos.""" + best = None + for r in rows: + if r["backend"] != backend or r["phase"] != phase: + continue + if r["sec"] <= 0: + continue + 
if kind == "MB/s": + v = r["total_bytes"] / r["sec"] / 1e6 + else: + v = r.get("total_tokens", 0) / r["sec"] / 1e6 + if best is None or v > best: + best = v + return best - for num_threads in threads: - for length, fuse, long in input_lengths: - documents = [] - for i, item in enumerate(dataset_xnli["train"]): - if i >= length: - break - if long: - documents.append("".join(item["premise"].values())) + +def _build_leaderboard( + per_model: Dict[str, Tuple[List[dict], List[str]]], + phase: str, + unit: str, +) -> Table: + """Cross-model leaderboard: one row per model, one column per backend.""" + backends_seen: List[str] = [] + for _, order in per_model.values(): + for b in order: + if b not in backends_seen: + backends_seen.append(b) + + t = Table( + title=f"[bold]{phase} peak throughput per model ({unit})[/bold]", + box=box.ROUNDED, + header_style="bold cyan", + title_justify="left", + expand=False, + ) + t.add_column("model", style="cyan", no_wrap=True) + for name in backends_seen: + style = BACKEND_STYLE.get(name, "white") + t.add_column(name, justify="right", style=style, no_wrap=True) + if "tokenizers" in backends_seen and len(backends_seen) > 1: + t.add_column("best ×\ntokenizers", justify="right", no_wrap=True) + + for model, (rows, _order) in per_model.items(): + vals: Dict[str, Optional[float]] = { + b: _best_throughput(rows, b, phase, unit) + for b in backends_seen + } + row = [model] + for b in backends_seen: + v = vals[b] + row.append(f"{v:,.1f}" if v is not None else "-") + if "tokenizers" in backends_seen and len(backends_seen) > 1: + tk = vals.get("tokenizers") or 0.0 + best_other_name, best_other_val = None, 0.0 + for b in backends_seen: + if b == "tokenizers": + continue + v = vals.get(b) + if v is not None and v > best_other_val: + best_other_val, best_other_name = v, b + if tk > 0 and best_other_val > 0 and best_other_name: + ratio = best_other_val / tk + style = BACKEND_STYLE.get(best_other_name, "white") + marker = _color_speedup(ratio) + row.append(f"{marker} [{style}]{best_other_name}[/{style}]") + else: + row.append("-") + t.add_row(*row) + return t + + +def _machine_state() -> Dict[str, Any]: + """Snapshot of machine state for fairness reporting.""" + info: Dict[str, Any] = { + "timestamp": datetime.datetime.now(datetime.timezone.utc).isoformat(), + "platform": platform.platform(), + "python": platform.python_version(), + "nproc": os.cpu_count(), + } + try: + with open("/proc/loadavg") as f: + info["loadavg"] = f.read().split()[:3] + except OSError: + info["loadavg"] = None + try: + info["pinned_cpus"] = sorted(os.sched_getaffinity(0)) + except (AttributeError, OSError): + info["pinned_cpus"] = None + try: + with open("/sys/devices/system/cpu/cpu0/cpufreq/scaling_governor") as f: + info["cpu_governor"] = f.read().strip() + except OSError: + info["cpu_governor"] = None + try: + out = subprocess.check_output( + ["lscpu"], stderr=subprocess.DEVNULL, text=True, timeout=5 + ) + for line in out.splitlines(): + if line.startswith("Model name:"): + info["cpu_model"] = line.split(":", 1)[1].strip() + break + except (subprocess.SubprocessError, FileNotFoundError): + info["cpu_model"] = None + return info + + +def _preflight(console: Console, fail_on_load: bool = False) -> Dict[str, Any]: + """Print a fairness preflight; optionally abort on heavy load.""" + state = _machine_state() + loadavg = state.get("loadavg") or ["?", "?", "?"] + pinned = state.get("pinned_cpus") + pinned_str = ( + f"{len(pinned)} ({pinned[0]}..{pinned[-1]})" + if pinned and len(pinned) > 4 + else 
str(pinned) + ) + lines = [ + f"[bold]host:[/bold] {state.get('cpu_model') or 'unknown CPU'} " + f"[bold]nproc:[/bold] {state.get('nproc')}", + f"[bold]load avg (1/5/15):[/bold] {loadavg[0]} / {loadavg[1]} / {loadavg[2]}", + f"[bold]pinned cpus:[/bold] {pinned_str}", + f"[bold]governor:[/bold] {state.get('cpu_governor') or 'unknown'}", + ] + console.print(Panel("\n".join(lines), title="fairness preflight", box=box.ROUNDED, title_align="left")) + + nproc = state.get("nproc") or 1 + try: + one_min = float(loadavg[0]) + except (TypeError, ValueError): + one_min = 0.0 + # Warn if load > 50% of cores; abort only if --strict requested. + if one_min > nproc * 0.5: + msg = f"[bold red]high system load ({one_min} over {nproc} cores)[/bold red]" + console.print(msg) + if fail_on_load: + raise SystemExit(2) + return state + + +def _serialize(per_model: Dict[str, Tuple[List[dict], List[str]]], + state: Dict[str, Any], + config: Dict[str, Any]) -> Dict[str, Any]: + out: Dict[str, Any] = { + "state": state, + "config": config, + "models": {}, + } + for model, (rows, order) in per_model.items(): + peaks: Dict[str, Dict[str, float]] = {} + for b in order: + peaks[b] = { + "encode_mb_s": _best_throughput(rows, b, "encode", "MB/s") or 0.0, + "decode_mtok_s": _best_throughput(rows, b, "decode", "Mtok/s") or 0.0, + } + out["models"][model] = {"backends": order, "peaks": peaks, "rows": rows} + return out + + +def _markdown_leaderboard(per_model: Dict[str, Tuple[List[dict], List[str]]], + state: Dict[str, Any], + config: Dict[str, Any]) -> str: + backends_seen: List[str] = [] + for _, order in per_model.values(): + for b in order: + if b not in backends_seen: + backends_seen.append(b) + + parts: List[str] = [] + parts.append(f"# Tokenizer benchmark results\n") + parts.append(f"- **Timestamp**: {state.get('timestamp')}") + parts.append(f"- **CPU**: {state.get('cpu_model') or 'unknown'} (nproc: {state.get('nproc')})") + parts.append(f"- **Load avg (1/5/15)**: {' / '.join(map(str, state.get('loadavg') or []))}") + pinned = state.get('pinned_cpus') or [] + parts.append(f"- **Pinned CPUs**: {len(pinned)} ({min(pinned)}..{max(pinned)})" + if pinned else "- **Pinned CPUs**: n/a") + parts.append(f"- **Governor**: {state.get('cpu_governor') or 'unknown'}") + parts.append(f"- **Config**: batches={config['batch_sizes']} lengths={config['input_lengths']} " + f"threads={config['num_threads']} iters={config['iters']} warmup={config['warmup']}") + parts.append("") + + for phase, unit in (("encode", "MB/s"), ("decode", "Mtok/s")): + parts.append(f"## {phase} peak throughput ({unit})\n") + header = ["model"] + backends_seen + (["best × tokenizers"] if "tokenizers" in backends_seen else []) + parts.append("| " + " | ".join(header) + " |") + parts.append("|" + "|".join(["---"] * len(header)) + "|") + for model, (rows, _order) in per_model.items(): + cells = [model] + vals: Dict[str, Optional[float]] = {} + for b in backends_seen: + v = _best_throughput(rows, b, phase, unit) + vals[b] = v + cells.append(f"{v:,.1f}" if v is not None else "–") + if "tokenizers" in backends_seen: + tk = vals.get("tokenizers") or 0.0 + best_name, best_val = None, 0.0 + for b in backends_seen: + if b == "tokenizers": + continue + v = vals.get(b) + if v is not None and v > best_val: + best_val, best_name = v, b + if tk > 0 and best_val > 0 and best_name: + cells.append(f"{best_val/tk:.2f}× {best_name}") else: - documents.append(item["premise"]["en"]) - if fuse: - documents = ["".join(documents)] + cells.append("–") + parts.append("| " + " | 
".join(cells) + " |") + parts.append("") + return "\n".join(parts) - document_length = sum(len(d) for d in documents) / len(documents) - # Rayon thread pool is global to a process, we need to launch - # separate processes in order to accurately use the correct number of threads. - # Otherwise, we're simply running tokenizers in whatever tests comes first. - # tokenizers does NOT provide a method to change the number of threads during - # runtime. - p = Process(target=benchmark_batch, args=(model, documents, num_threads, document_length)) - p.start() - p.join() +def run_suite( + models: List[str], + backends_wanted: List[str], + batch_sizes: List[int], + input_lengths: List[int], + num_threads: int, + num_prompts: int, + iters: int, + warmup: int, + save_json: Optional[str] = None, + save_md: Optional[str] = None, + strict_fairness: bool = False, +) -> None: + console = Console() + state = _preflight(console, fail_on_load=strict_fairness) - # benchmark_batch(model, documents, num_threads) + per_model: Dict[str, Tuple[List[dict], List[str]]] = {} + for idx, model in enumerate(models, 1): + console.rule(f"[bold cyan]{idx}/{len(models)} {model}[/bold cyan]") + # 2-second settle between runs to let thermals and caches reset. + if idx > 1: + time.sleep(2) + rows, order = run( + model=model, + backends_wanted=backends_wanted, + batch_sizes=batch_sizes, + input_lengths=input_lengths, + num_threads=num_threads, + num_prompts=num_prompts, + iters=iters, + warmup=warmup, + console=console, + print_tables=True, + ) + per_model[model] = (rows, order) -def main(): + console.rule("[bold]cross-model leaderboard[/bold]") + console.print() + console.print(_build_leaderboard(per_model, "encode", "MB/s")) + console.print() + console.print(_build_leaderboard(per_model, "decode", "Mtok/s")) + + config = dict( + batch_sizes=batch_sizes, + input_lengths=input_lengths, + num_threads=num_threads, + num_prompts=num_prompts, + iters=iters, + warmup=warmup, + backends=backends_wanted, + ) + + if save_json: + payload = _serialize(per_model, state, config) + os.makedirs(os.path.dirname(save_json) or ".", exist_ok=True) + with open(save_json, "w") as f: + json.dump(payload, f, indent=2, default=str) + console.print(f"[dim]wrote {save_json}[/dim]") + if save_md: + md = _markdown_leaderboard(per_model, state, config) + os.makedirs(os.path.dirname(save_md) or ".", exist_ok=True) + with open(save_md, "w") as f: + f.write(md) + console.print(f"[dim]wrote {save_md}[/dim]") + + +# --------------------------------------------------------------------------- +# CLI +# --------------------------------------------------------------------------- + + +# Curated non-OpenAI BPE models for the default --hf-models sweep. 
+DEFAULT_HF_MODELS = [ + "meta-llama/Llama-3.2-1B", + "Qwen/Qwen2.5-7B", + "Qwen/Qwen3-8B", + "deepseek-ai/DeepSeek-V3", + "zai-org/GLM-4.5-Air", + "mistralai/Mistral-Nemo-Instruct-2407", + "01-ai/Yi-1.5-9B", + "bigcode/starcoder2-7b", + "EleutherAI/gpt-neox-20b", + "tiiuae/falcon-7b", +] + + +def main() -> None: parser = argparse.ArgumentParser( prog="bench_tokenizer", - description="Getting a feel for speed when tokenizing", + description="Cross-library tokenizer benchmark " + "(tokenizers vs tiktoken vs wordchipper vs iree).", + ) + parser.add_argument( + "-e", "--encoding", default=None, + help="Run a single OpenAI encoding (gpt2/cl100k_base/o200k_base/llama3) " + "or any HF repo id.", + ) + parser.add_argument( + "--hf-models", nargs="*", default=None, + help="Run the matrix for multiple HF repo ids and print a leaderboard. " + f"If given with no args, uses the default list: {DEFAULT_HF_MODELS}.", + ) + parser.add_argument( + "--backends", nargs="+", default=ALL_BACKENDS, choices=ALL_BACKENDS, ) - parser.add_argument("-m", "--model", default=MODEL_ID, type=str) - parser.add_argument("-d", "--dataset", default=DATASET, type=str) - parser.add_argument("-ds", "--dataset-config", default=DATASET_CONFIG, type=str) - parser.add_argument("-t", "--threads", nargs="+", default=DEFAULT_THREADS, type=int) + parser.add_argument( + "-b", "--batch-sizes", nargs="+", default=DEFAULT_BATCH_SIZES, type=int, + ) + parser.add_argument( + "-l", "--input-lengths", nargs="+", default=DEFAULT_INPUT_LENGTHS, type=int, + ) + parser.add_argument("-t", "--threads", default=os.cpu_count() or 8, type=int) + parser.add_argument("-p", "--num-prompts", default=16, type=int) + parser.add_argument("--iters", default=3, type=int) + parser.add_argument("--warmup", default=1, type=int) + parser.add_argument("--save-json", default=None, type=str, + help="Write full results (rows + peaks + machine state) to this path.") + parser.add_argument("--save-md", default=None, type=str, + help="Write a markdown leaderboard to this path.") + parser.add_argument("--strict-fairness", action="store_true", + help="Abort when system load exceeds 50% of nproc.") args = parser.parse_args() - test(args.model, args.dataset, args.dataset_config, args.threads) + + if args.hf_models is not None: + models = args.hf_models or DEFAULT_HF_MODELS + run_suite( + models=models, + backends_wanted=args.backends, + batch_sizes=args.batch_sizes, + input_lengths=args.input_lengths, + num_threads=args.threads, + num_prompts=args.num_prompts, + iters=args.iters, + warmup=args.warmup, + save_json=args.save_json, + save_md=args.save_md, + strict_fairness=args.strict_fairness, + ) + else: + console = Console() + state = _preflight(console, fail_on_load=args.strict_fairness) + rows, order = run( + model=args.encoding or DEFAULT_ENCODING, + backends_wanted=args.backends, + batch_sizes=args.batch_sizes, + input_lengths=args.input_lengths, + num_threads=args.threads, + num_prompts=args.num_prompts, + iters=args.iters, + warmup=args.warmup, + ) + if args.save_json or args.save_md: + per_model = {args.encoding or DEFAULT_ENCODING: (rows, order)} + config = dict( + batch_sizes=args.batch_sizes, + input_lengths=args.input_lengths, + num_threads=args.threads, + num_prompts=args.num_prompts, + iters=args.iters, + warmup=args.warmup, + backends=args.backends, + ) + if args.save_json: + payload = _serialize(per_model, state, config) + os.makedirs(os.path.dirname(args.save_json) or ".", exist_ok=True) + with open(args.save_json, "w") as f: + json.dump(payload, f, indent=2, 
default=str) + if args.save_md: + md = _markdown_leaderboard(per_model, state, config) + os.makedirs(os.path.dirname(args.save_md) or ".", exist_ok=True) + with open(args.save_md, "w") as f: + f.write(md) -# Call the function to run the benchmark if __name__ == "__main__": main() diff --git a/tokenizers/Cargo.toml b/tokenizers/Cargo.toml index 0e937f3cc..b102f74ff 100644 --- a/tokenizers/Cargo.toml +++ b/tokenizers/Cargo.toml @@ -66,6 +66,10 @@ harness = false name = "ci_benchmark" harness = false +[[bench]] +name = "matrix_benchmark" +harness = false + [dependencies] rand = "0.9" onig = { version = "6.5.1", default-features = false, optional = true } diff --git a/tokenizers/benches/matrix_benchmark.rs b/tokenizers/benches/matrix_benchmark.rs new file mode 100644 index 000000000..fc625702d --- /dev/null +++ b/tokenizers/benches/matrix_benchmark.rs @@ -0,0 +1,159 @@ +//! Standardized ``(batch_size × input_length)`` encode + decode matrix. +//! +//! Mirrors the knobs used by fastokens' ``examples/ablation.sh`` and +//! wordchipper's batch bench: per-sample token length is controlled directly +//! (not document length), and both encode and decode are measured on the same +//! samples so throughput numbers are comparable side-by-side. +//! +//! Samples are built from ``data/big.txt``: the file is tokenized once, the +//! resulting ids are truncated/repeated to the target length (following +//! fastokens' ``_adjust_tokens``) and decoded back to text. +//! +//! Run with: +//! +//! cargo bench --bench matrix_benchmark +//! +//! Local tuning of the sweep via env vars: +//! +//! MATRIX_BATCH_SIZES=1,8,32 MATRIX_INPUT_LENGTHS=128,1024,8192 cargo bench --bench matrix_benchmark + +#[macro_use] +extern crate criterion; + +use criterion::{BenchmarkId, Criterion, Throughput}; +use std::hint::black_box; +use tokenizers::Tokenizer; + +const DEFAULT_BATCH_SIZES: &[usize] = &[1, 8, 32, 128]; +const DEFAULT_INPUT_LENGTHS: &[usize] = &[128, 512, 2048, 8192]; + +fn parse_env_list(name: &str, default: &[usize]) -> Vec { + match std::env::var(name) { + Ok(s) => s + .split(',') + .filter_map(|x| x.trim().parse::().ok()) + .collect(), + Err(_) => default.to_vec(), + } +} + +/// Truncate or cyclically repeat ``ids`` to reach exactly ``target`` length. +/// Mirrors fastokens' ``_adjust_tokens`` helper. +fn adjust_tokens(ids: &[u32], target: usize) -> Vec { + if ids.len() >= target { + return ids[..target].to_vec(); + } + let mut out = Vec::with_capacity(target); + while out.len() < target { + let remaining = target - out.len(); + out.extend_from_slice(&ids[..remaining.min(ids.len())]); + } + out +} + +/// Build a fixed-length sample at the text level. +/// +/// We tokenize ``source`` once, reshape the ids to ``input_length`` tokens via +/// ``adjust_tokens``, then decode back to text. Re-encoding the decoded text +/// may not produce exactly ``input_length`` ids for every tokenizer, but the +/// byte length is stable and representative of real inputs at that scale. 
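+///
+/// For example, with ``input_length = 4`` and a source that tokenizes to
+/// ``[5, 6, 7]``, the ids are reshaped to ``[5, 6, 7, 5]`` before decoding.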
+fn build_sample(tok: &Tokenizer, source: &str, input_length: usize) -> String { + let encoded = tok.encode(source, false).unwrap(); + let adjusted = adjust_tokens(encoded.get_ids(), input_length); + tok.decode(&adjusted, false).unwrap() +} + +fn bench_matrix(c: &mut Criterion) { + let tokenizer = Tokenizer::from_file("data/llama-3-tokenizer.json").unwrap(); + let corpus = std::fs::read_to_string("data/big.txt").unwrap(); + + let batch_sizes = parse_env_list("MATRIX_BATCH_SIZES", DEFAULT_BATCH_SIZES); + let input_lengths = parse_env_list("MATRIX_INPUT_LENGTHS", DEFAULT_INPUT_LENGTHS); + + // Pre-build one sample per input_length (reused across batch sizes). + let samples: Vec<(usize, String)> = input_lengths + .iter() + .map(|&l| (l, build_sample(&tokenizer, &corpus, l))) + .collect(); + + // ------- ENCODE (batch, offsets tracked) ------- + let mut group = c.benchmark_group("matrix/encode-batch"); + for &bs in &batch_sizes { + for (input_length, sample) in &samples { + let batch: Vec<&str> = (0..bs).map(|_| sample.as_str()).collect(); + let total_bytes: u64 = batch.iter().map(|s| s.len() as u64).sum(); + group.throughput(Throughput::Bytes(total_bytes)); + group.bench_with_input( + BenchmarkId::new(format!("len{input_length}"), bs), + &(batch.clone()), + |b, batch| { + b.iter(|| { + black_box( + tokenizer + .encode_batch(black_box(batch.clone()), false) + .unwrap(), + ) + }) + }, + ); + } + } + group.finish(); + + // ------- ENCODE (batch, offset-free fast path) ------- + let mut group = c.benchmark_group("matrix/encode-batch-fast"); + for &bs in &batch_sizes { + for (input_length, sample) in &samples { + let batch: Vec<&str> = (0..bs).map(|_| sample.as_str()).collect(); + let total_bytes: u64 = batch.iter().map(|s| s.len() as u64).sum(); + group.throughput(Throughput::Bytes(total_bytes)); + group.bench_with_input( + BenchmarkId::new(format!("len{input_length}"), bs), + &(batch.clone()), + |b, batch| { + b.iter(|| { + black_box( + tokenizer + .encode_batch_fast(black_box(batch.clone()), false) + .unwrap(), + ) + }) + }, + ); + } + } + group.finish(); + + // ------- DECODE (batch) ------- + let mut group = c.benchmark_group("matrix/decode-batch"); + for &bs in &batch_sizes { + for (input_length, sample) in &samples { + let batch: Vec<&str> = (0..bs).map(|_| sample.as_str()).collect(); + let encs = tokenizer.encode_batch_fast(batch.clone(), false).unwrap(); + let ids_per_sample: Vec> = + encs.iter().map(|e| e.get_ids().to_vec()).collect(); + let total_tokens: u64 = ids_per_sample.iter().map(|v| v.len() as u64).sum(); + group.throughput(Throughput::Elements(total_tokens)); + group.bench_with_input( + BenchmarkId::new(format!("len{input_length}"), bs), + &ids_per_sample, + |b, ids_per_sample| { + b.iter(|| { + // decode_batch takes Vec> by value in this API; clone per iter + let slices: Vec<&[u32]> = + ids_per_sample.iter().map(|v| v.as_slice()).collect(); + black_box(tokenizer.decode_batch(&slices, false).unwrap()) + }) + }, + ); + } + } + group.finish(); +} + +criterion_group! { + name = matrix; + config = Criterion::default().sample_size(15); + targets = bench_matrix +} +criterion_main!(matrix);