diff --git a/examples/benchmark/index.html b/examples/benchmark/index.html
index fb1a5d4..80e932b 100644
--- a/examples/benchmark/index.html
+++ b/examples/benchmark/index.html
@@ -370,7 +370,7 @@
Comparison Charts
},
flare: {
id: 'smollm2-135m-flare',
- url: 'https://huggingface.co/HuggingFaceTB/smollm2-135M-instruct-GGUF/resolve/main/smollm2-135m-instruct-q8_0.gguf',
+ url: 'https://huggingface.co/bartowski/SmolLM2-135M-Instruct-GGUF/resolve/main/SmolLM2-135M-Instruct-Q8_0.gguf',
},
},
'llama-3.2-1b': {
@@ -520,7 +520,7 @@ Comparison Charts
async function runTransformersInference(prompt, opts) {
const t0 = performance.now();
let firstTokenTime = null;
- let outputTokenCount = 0;
+ let prevTokenCount = 0;
const messages = [{ role: 'user', content: prompt }];
let input;
@@ -533,33 +533,44 @@ Comparison Charts
input = prompt;
}
- const inputTokens = transformersPipeline.tokenizer(input);
- const inputLen = inputTokens?.input_ids?.dims?.[1] || input.length / 4;
-
const result = await transformersPipeline(input, {
max_new_tokens: opts.maxTokens,
temperature: opts.temperature || 0.001,
do_sample: opts.temperature > 0,
- callback_function: (beams) => {
- const generated = beams?.[0]?.output_token_ids?.length || 0;
- if (firstTokenTime === null && generated > inputLen) {
+ callback_function: (output) => {
+ // output is [{generated_text}] (ignored: no token ids) or beams with output_token_ids
+ const ids = output?.[0]?.output_token_ids;
+ const currentLen = ids?.length || ids?.size || 0;
+ if (firstTokenTime === null && currentLen > prevTokenCount) {
firstTokenTime = performance.now() - t0;
}
- outputTokenCount = Math.max(0, generated - inputLen);
+ if (currentLen > 0) prevTokenCount = currentLen;
},
});
const totalTime = performance.now() - t0;
const output = result[0]?.generated_text || '';
const generatedPart = typeof output === 'string' ? output.slice(input.length) : '';
- const decodeTime = totalTime - (firstTokenTime || totalTime);
+ // Count output tokens by tokenizing the generated part
+ let outputTokenCount = 0;
+ try {
+ const genTokens = transformersPipeline.tokenizer(generatedPart);
+ outputTokenCount = genTokens?.input_ids?.dims?.[1] || genTokens?.input_ids?.length || 0;
+ } catch {
+ // Rough fallback: ~4 chars per token
+ outputTokenCount = Math.max(1, Math.round(generatedPart.length / 4));
+ }
+
+ // Transformers.js generates all tokens in a single batch call (no true
+ // streaming), so firstTokenTime ≈ totalTime. Use tokens/totalTime for
+ // throughput since there's no separate decode phase.
return {
output: generatedPart,
ttft: firstTokenTime || totalTime,
totalTime,
tokenCount: outputTokenCount,
- tokensPerSec: (outputTokenCount > 1 && decodeTime > 0) ? ((outputTokenCount - 1) / (decodeTime / 1000)) : 0,
+ tokensPerSec: outputTokenCount > 0 ? (outputTokenCount / (totalTime / 1000)) : 0,
};
}
@@ -575,9 +586,38 @@ Comparison Charts
if (!flareLib) {
log('Loading @sauravpanda/flare WASM from CDN...', 'info');
const CDN = 'https://cdn.jsdelivr.net/npm/@sauravpanda/flare@0.2.0/pkg';
- flareLib = await import(`${CDN}/flare_web.js`);
+ const wasmUrl = `${CDN}/flare_web_bg.wasm`;
+
+ // Fetch the JS module source, patch the WASM URL, and load via blob
+ // to avoid cross-origin ES module import restrictions.
+ const jsResp = await fetch(`${CDN}/flare_web.js`, { cache: 'no-cache' });
+ if (!jsResp.ok) throw new Error(`HTTP ${jsResp.status} fetching flare_web.js`);
+ let jsSrc = await jsResp.text();
+ log(`Fetched flare_web.js (${(jsSrc.length / 1024).toFixed(0)} KB)`, 'info');
+
+ // Patch import.meta.url references so the WASM file resolves to CDN
+ jsSrc = jsSrc.replaceAll('import.meta.url', `'${CDN}/flare_web.js'`);
+
+ // Fix wasm-pack codegen bug: a JSDoc block contains "/* done */"
+ // which prematurely closes the outer /** */ comment, causing
+ // "Unexpected token '*'" parse error. Neutralize the nested "*/" marker.
+ jsSrc = jsSrc.replaceAll('/* done */', '/* done -/');
+
+ const blob = new Blob([jsSrc], { type: 'application/javascript' });
+ const blobUrl = URL.createObjectURL(blob);
+ try {
+ flareLib = await import(/* webpackIgnore: true */ blobUrl);
+ } catch (importErr) {
+ URL.revokeObjectURL(blobUrl);
+ log(`Blob import failed: ${importErr.message}`, 'error');
+ // Fallback: try direct CDN import
+ log('Trying direct CDN import...', 'info');
+ flareLib = await import(`${CDN}/flare_web.js`);
+ }
+ URL.revokeObjectURL(blobUrl);
+
// Initialize the WASM module
- await flareLib.default(`${CDN}/flare_web_bg.wasm`);
+ await flareLib.default(wasmUrl);
log('Flare WASM initialized.', 'success');
}
diff --git a/src/config/models/flare-models.json b/src/config/models/flare-models.json
index 3bdb899..a276690 100644
--- a/src/config/models/flare-models.json
+++ b/src/config/models/flare-models.json
@@ -3,8 +3,8 @@
"engine": "flare",
"modelName": "SmolLM2-135M-Instruct",
"modelType": "text-generation",
- "repo": "HuggingFaceTB/smollm2-135M-instruct-GGUF",
- "url": "https://huggingface.co/HuggingFaceTB/smollm2-135M-instruct-GGUF/resolve/main/smollm2-135m-instruct-q8_0.gguf",
+ "repo": "bartowski/SmolLM2-135M-Instruct-GGUF",
+ "url": "https://huggingface.co/bartowski/SmolLM2-135M-Instruct-GGUF/resolve/main/SmolLM2-135M-Instruct-Q8_0.gguf",
"pipeline": "text-generation",
"defaultQuantization": "Q8_0",
"quantizations": ["Q8_0"],
@@ -24,8 +24,8 @@
"engine": "flare",
"modelName": "SmolLM2-135M-Instruct-Q4",
"modelType": "text-generation",
- "repo": "HuggingFaceTB/smollm2-135M-instruct-GGUF",
- "url": "https://huggingface.co/HuggingFaceTB/smollm2-135M-instruct-GGUF/resolve/main/smollm2-135m-instruct-q4_k_m.gguf",
+ "repo": "bartowski/SmolLM2-135M-Instruct-GGUF",
+ "url": "https://huggingface.co/bartowski/SmolLM2-135M-Instruct-GGUF/resolve/main/SmolLM2-135M-Instruct-Q4_K_M.gguf",
"pipeline": "text-generation",
"defaultQuantization": "Q4_K_M",
"quantizations": ["Q4_K_M"],
@@ -45,8 +45,8 @@
"engine": "flare",
"modelName": "SmolLM2-360M-Instruct",
"modelType": "text-generation",
- "repo": "HuggingFaceTB/smollm2-360M-instruct-GGUF",
- "url": "https://huggingface.co/HuggingFaceTB/smollm2-360M-instruct-GGUF/resolve/main/smollm2-360m-instruct-q8_0.gguf",
+ "repo": "bartowski/SmolLM2-360M-Instruct-GGUF",
+ "url": "https://huggingface.co/bartowski/SmolLM2-360M-Instruct-GGUF/resolve/main/SmolLM2-360M-Instruct-Q8_0.gguf",
"pipeline": "text-generation",
"defaultQuantization": "Q8_0",
"quantizations": ["Q8_0"],