diff --git a/examples/benchmark/flare-worker.js b/examples/benchmark/flare-worker.js
new file mode 100644
index 0000000..c9dadad
--- /dev/null
+++ b/examples/benchmark/flare-worker.js
@@ -0,0 +1,145 @@
+// Dedicated Web Worker that owns the Flare FlareEngine.
+//
+// The main thread talks to this worker via structured `{id, type, args}`
+// messages and receives back either `{id, result}`, `{id, error}`, or
+// (for streaming calls) multiple `{id, stream}` messages followed by a
+// final `{id, result}`.
+//
+// Loading Flare on the main thread blocks the UI for the full duration
+// of the 138 MB GGUF parse + GPU buffer upload. Moving it here keeps
+// the tab responsive — matches how MLC and Transformers.js already work.
+
+let flareLib = null;
+let engine = null;
+
+function reply(id, result) {
+ self.postMessage({ id, result });
+}
+
+function replyError(id, err) {
+ self.postMessage({
+ id,
+ error: { message: (err && err.message) || String(err), stack: err && err.stack },
+ });
+}
+
+function replyStream(id, tokenText, tokenId) {
+ self.postMessage({ id, stream: { tokenText, tokenId } });
+}
+
+self.addEventListener('message', async (e) => {
+ const { id, type, args } = e.data || {};
+ try {
+ await dispatch(id, type, args || {});
+ } catch (err) {
+ replyError(id, err);
+ }
+});
+
+async function dispatch(id, type, args) {
+ switch (type) {
+ case 'init': {
+      // Fetch the CDN-hosted ES module source and import it through a blob
+      // URL so cross-origin ES-module import restrictions don't apply and so
+      // import.meta.url resolves to the correct wasm file.
+ const { jsUrl, wasmUrl, patchImportMeta } = args;
+ const resp = await fetch(jsUrl, { cache: 'no-cache' });
+ if (!resp.ok) throw new Error(`HTTP ${resp.status} fetching ${jsUrl}`);
+ let src = await resp.text();
+ if (patchImportMeta) {
+ src = src.replaceAll('import.meta.url', JSON.stringify(jsUrl));
+ // Workaround for wasm-pack codegen: a JSDoc block contains "/* done */"
+ // which prematurely closes the outer /** */ comment.
+ src = src.replaceAll('/* done */', '/* done -/');
+ }
+ const blob = new Blob([src], { type: 'application/javascript' });
+ const blobUrl = URL.createObjectURL(blob);
+ try {
+ flareLib = await import(blobUrl);
+ } finally {
+ URL.revokeObjectURL(blobUrl);
+ }
+ await flareLib.default(wasmUrl);
+ return reply(id, { ok: true });
+ }
+
+ case 'load': {
+ if (!flareLib) throw new Error('init must be called before load');
+ // `args.bytes` is transferred from the main thread so this is a move,
+ // not a copy.
+ engine = flareLib.FlareEngine.load(args.bytes);
+ return reply(id, { architecture: engine.architecture });
+ }
+
+ case 'init_gpu': {
+ if (!engine) throw new Error('load must be called before init_gpu');
+ const gpuOk = await engine.init_gpu();
+ return reply(id, {
+ gpuOk,
+ backendInfo: JSON.parse(engine.backend_info()),
+ });
+ }
+
+ case 'enable_prefill_profiling':
+ engine.enable_prefill_profiling();
+ return reply(id, null);
+
+ case 'disable_prefill_profiling':
+ engine.disable_prefill_profiling();
+ return reply(id, null);
+
+ case 'prefill_profile_json':
+ return reply(id, engine.prefill_profile_json());
+
+ case 'apply_chat_template':
+ return reply(id, engine.apply_chat_template(args.userMsg || '', args.systemMsg || ''));
+
+ case 'encode_text':
+ return reply(id, Array.from(engine.encode_text(args.text)));
+
+ case 'reset':
+ engine.reset();
+ return reply(id, null);
+
+ case 'stream': {
+      // One-shot streaming: the worker drains every token in a tight loop,
+      // posting each decoded chunk immediately. The loop does occupy this
+      // worker's event loop, which is fine — it has nothing else to do.
+ const {
+ promptTokens,
+ maxTokens,
+ temperature,
+ topP,
+ topK,
+ repeatPenalty,
+ minP,
+ } = args;
+ engine.reset();
+ engine.begin_stream_with_params(
+ new Uint32Array(promptTokens),
+ maxTokens,
+ temperature,
+ topP,
+ topK,
+ repeatPenalty,
+ minP,
+ );
+ let count = 0;
+ while (!engine.stream_done) {
+ const tokId = engine.next_token();
+ if (tokId === undefined || tokId === null) break;
+ const text = engine.decode_token_chunk(tokId);
+ replyStream(id, text, tokId);
+ count += 1;
+ }
+ return reply(id, { done: true, completionTokens: count });
+ }
+
+ case 'dispose':
+ engine = null;
+ return reply(id, null);
+
+ default:
+ throw new Error(`unknown message type: ${type}`);
+ }
+}
diff --git a/examples/benchmark/index.html b/examples/benchmark/index.html
index 96c1d6e..669349e 100644
--- a/examples/benchmark/index.html
+++ b/examples/benchmark/index.html
@@ -578,53 +578,69 @@
Comparison Charts
transformersPipeline = null;
}
- // ── Flare Engine (via @sauravpanda/flare WASM) ──
- let flareEngine = null;
- let flareLib = null;
+ // ── Flare Engine (via @sauravpanda/flare WASM in a dedicated Worker) ──
+ //
+ // Flare's FlareEngine.load() is a synchronous WASM call that blocks the
+ // main thread for the full 138MB GGUF parse + GPU buffer upload. We
+ // run it in a dedicated Web Worker to keep the UI responsive — matches
+ // MLC and Transformers.js which do the same internally.
+ let flareWorker = null;
+ let flareArch = null;
+ let flareRpcSeq = 0;
+ const flareRpcPending = new Map();
+
+ function ensureFlareWorker() {
+ if (flareWorker) return flareWorker;
+ flareWorker = new Worker(new URL('./flare-worker.js', import.meta.url), {
+ type: 'module',
+ });
+ flareWorker.addEventListener('message', (e) => {
+ const { id, result, error, stream } = e.data;
+ const slot = flareRpcPending.get(id);
+ if (!slot) return;
+ if (error) {
+ slot.reject(new Error(error.message || 'flare worker error'));
+ flareRpcPending.delete(id);
+ return;
+ }
+ if (stream) {
+ try { slot.onStream && slot.onStream(stream); } catch (err) { console.error(err); }
+ return;
+ }
+ slot.resolve(result);
+ flareRpcPending.delete(id);
+ });
+ flareWorker.addEventListener('error', (e) => {
+ console.error('[flare-worker] error:', e);
+ log(`Flare worker error: ${e.message || e}`, 'error');
+ });
+ return flareWorker;
+ }
+
+ function flareCall(type, args = {}, transfer = [], onStream = null) {
+ const id = ++flareRpcSeq;
+ return new Promise((resolve, reject) => {
+ flareRpcPending.set(id, { resolve, reject, onStream });
+ flareWorker.postMessage({ id, type, args }, transfer);
+ });
+ }
async function loadFlareEngine(config) {
- if (!flareLib) {
- log('Loading @sauravpanda/flare WASM from CDN...', 'info');
+ if (!flareWorker) {
+ log('Starting Flare worker...', 'info');
+ ensureFlareWorker();
const CDN = 'https://cdn.jsdelivr.net/npm/@sauravpanda/flare@0.2.11/pkg';
- const wasmUrl = `${CDN}/flare_web_bg.wasm`;
-
- // Fetch the JS module source, patch the WASM URL, and load via blob
- // to avoid cross-origin ES module import restrictions.
- const jsResp = await fetch(`${CDN}/flare_web.js`, { cache: 'no-cache' });
- if (!jsResp.ok) throw new Error(`HTTP ${jsResp.status} fetching flare_web.js`);
- let jsSrc = await jsResp.text();
- log(`Fetched flare_web.js (${(jsSrc.length / 1024).toFixed(0)} KB)`, 'info');
-
- // Patch import.meta.url references so the WASM file resolves to CDN
- jsSrc = jsSrc.replaceAll('import.meta.url', `'${CDN}/flare_web.js'`);
-
- // Fix wasm-pack codegen bug: a JSDoc block contains "/* done */"
- // which prematurely closes the outer /** */ comment, causing
- // "Unexpected token '*'" parse error. Remove the nested comment.
- jsSrc = jsSrc.replaceAll('/* done */', '/* done -/');
-
- const blob = new Blob([jsSrc], { type: 'application/javascript' });
- const blobUrl = URL.createObjectURL(blob);
- try {
- flareLib = await import(/* webpackIgnore: true */ blobUrl);
- } catch (importErr) {
- URL.revokeObjectURL(blobUrl);
- log(`Blob import failed: ${importErr.message}`, 'error');
- // Fallback: try direct CDN import
- log('Trying direct CDN import...', 'info');
- flareLib = await import(`${CDN}/flare_web.js`);
- }
- URL.revokeObjectURL(blobUrl);
-
- // Initialize the WASM module
- await flareLib.default(wasmUrl);
- log('Flare WASM initialized.', 'success');
+ await flareCall('init', {
+ jsUrl: `${CDN}/flare_web.js`,
+ wasmUrl: `${CDN}/flare_web_bg.wasm`,
+ patchImportMeta: true,
+ });
+ log('Flare WASM initialized in worker.', 'success');
}
log(`Downloading GGUF model: ${config.url.split('/').pop()}`, 'info');
const t0 = performance.now();
- // Download the GGUF file
const resp = await fetch(config.url);
if (!resp.ok) throw new Error(`HTTP ${resp.status} fetching model`);
const contentLength = parseInt(resp.headers.get('content-length') || '0', 10);
@@ -642,7 +658,6 @@ Comparison Charts
}
}
- // Concatenate chunks
const bytes = new Uint8Array(received);
let offset = 0;
for (const chunk of chunks) {
@@ -650,88 +665,97 @@ Comparison Charts
offset += chunk.length;
}
- log(`Downloaded ${(received / 1024 / 1024).toFixed(1)} MB, parsing GGUF...`, 'info');
+ log(`Downloaded ${(received / 1024 / 1024).toFixed(1)} MB, parsing GGUF in worker...`, 'info');
- // Load into Flare engine
- flareEngine = flareLib.FlareEngine.load(bytes);
+          // Transfer the GGUF bytes to the worker — zero-copy. After this,
+          // `bytes.buffer` is detached on the main thread, so `bytes` must
+          // not be touched again after this call.
+ const { architecture } = await flareCall('load', { bytes }, [bytes.buffer]);
+ flareArch = architecture;
- // Try WebGPU acceleration
+ // Try WebGPU acceleration in the worker
try {
- const gpuOk = await flareEngine.init_gpu();
- const info = JSON.parse(flareEngine.backend_info());
- console.log('[Flare] backend_info:', info);
- log(`Flare backend: ${JSON.stringify(info)}`, gpuOk ? 'success' : 'info');
- // Prefill profiling — captures per-layer timing
- flareEngine.enable_prefill_profiling();
- const profile = JSON.parse(flareEngine.prefill_profile_json());
- console.log('[Flare] prefill profile:', profile);
- log(`Flare prefill profile: ${JSON.stringify(profile)}`, 'info');
- flareEngine.disable_prefill_profiling();
+ const { gpuOk, backendInfo } = await flareCall('init_gpu');
+ console.log('[Flare] backend_info:', backendInfo);
+ log(`Flare backend: ${JSON.stringify(backendInfo)}`, gpuOk ? 'success' : 'info');
+ await flareCall('enable_prefill_profiling');
} catch (e) {
console.warn('[Flare] GPU init failed:', e);
log('Flare GPU init failed — using CPU SIMD', 'info');
}
const loadTime = performance.now() - t0;
- log(`Flare model loaded in ${(loadTime / 1000).toFixed(1)}s (arch: ${flareEngine.architecture})`, 'success');
+ log(`Flare model loaded in ${(loadTime / 1000).toFixed(1)}s (arch: ${architecture})`, 'success');
return loadTime;
}
async function runFlareInference(prompt, opts) {
const t0 = performance.now();
- // Encode prompt using the embedded GGUF tokenizer
- const promptIds = flareEngine.encode_text(prompt);
- if (!promptIds || promptIds.length === 0) {
+ const promptTokens = await flareCall('encode_text', { text: prompt });
+ if (!promptTokens || promptTokens.length === 0) {
throw new Error('Flare tokenizer failed to encode prompt');
}
- // Start streaming
- flareEngine.reset();
- flareEngine.begin_stream_with_params(
- promptIds,
- opts.maxTokens,
- opts.temperature || 0.001, // temperature
- 1.0, // top_p
- 40, // top_k
- 1.0, // repeat_penalty
- 0.0, // min_p
- );
+ await flareCall('reset');
- const firstTokenTime = performance.now() - t0; // prefill done at begin_stream
+ let firstTokenTime = 0;
let tokenCount = 0;
let output = '';
- // Decode tokens
- while (!flareEngine.stream_done) {
- const id = flareEngine.next_token();
- if (id === undefined) break;
- tokenCount++;
- output += flareEngine.decode_token_chunk(id);
+ await flareCall(
+ 'stream',
+ {
+ promptTokens,
+ maxTokens: opts.maxTokens,
+ temperature: opts.temperature || 0.001,
+ topP: 1.0,
+ topK: 40,
+ repeatPenalty: 1.0,
+ minP: 0.0,
+ },
+ [],
+ (chunk) => {
+ if (tokenCount === 0) firstTokenTime = performance.now() - t0;
+ output += chunk.tokenText;
+ tokenCount += 1;
+ },
+ );
+
+ // Read per-phase profile from the completed prefill once per page load.
+ if (!window.__flareProfileLogged) {
+ try {
+ const profileStr = await flareCall('prefill_profile_json');
+ const profile = JSON.parse(profileStr);
+ if (profile && profile.seq_len > 0) {
+ console.log('[Flare] prefill profile:', profile);
+ log(`Flare prefill profile: ${JSON.stringify(profile)}`, 'info');
+ window.__flareProfileLogged = true;
+ }
+ } catch (e) {
+ console.warn('[Flare] prefill profile read failed:', e);
+ }
}
const totalTime = performance.now() - t0;
const decodeTime = totalTime - firstTokenTime;
- // Try to get perf summary from engine
- let engineTps = null;
- try {
- const summary = JSON.parse(flareEngine.performance_summary());
- engineTps = summary.tokens_per_second;
- } catch {}
-
return {
output,
ttft: firstTokenTime,
totalTime,
tokenCount,
- tokensPerSec: engineTps || ((tokenCount > 1 && decodeTime > 0) ? ((tokenCount - 1) / (decodeTime / 1000)) : 0),
+ tokensPerSec: (tokenCount > 1 && decodeTime > 0) ? ((tokenCount - 1) / (decodeTime / 1000)) : 0,
};
}
function disposeFlare() {
- flareEngine = null;
- // Flare doesn't have explicit dispose — GC handles it
+ if (flareWorker) {
+ flareWorker.terminate();
+ flareWorker = null;
+ }
+ flareRpcPending.clear();
+ flareArch = null;
}
// ── Benchmark runner ──
diff --git a/src/core/llm/browserai.test.ts b/src/core/llm/browserai.test.ts
index 87f2b5b..23a4e7b 100644
--- a/src/core/llm/browserai.test.ts
+++ b/src/core/llm/browserai.test.ts
@@ -128,9 +128,7 @@ describe('BrowserAI', () => {
test('loadAdapter() before loadModel() throws with Flare-specific error', async () => {
const ai = new BrowserAI();
- await expect(
- ai.loadAdapter({ url: 'https://example.com/adapter.safetensors' }),
- ).rejects.toThrow(/Flare engine/i);
+ await expect(ai.loadAdapter({ url: 'https://example.com/adapter.safetensors' })).rejects.toThrow(/Flare engine/i);
});
test('isFlareModelCached() returns false when no engine is loaded', async () => {
@@ -185,14 +183,13 @@ describe('BrowserAI — engine registry sanity', () => {
// This is a smoke test that no two registries have collided and broken
// the spread order — if, say, demucs-models.json has a key that shadows
// an mlc-models.json key, loading the older entry would silently break.
-
+
const mlc = require('../../config/models/mlc-models.json') as Record;
-
- const transformers =
- require('../../config/models/transformers-models.json') as Record;
-
+
+ const transformers = require('../../config/models/transformers-models.json') as Record;
+
const demucs = require('../../config/models/demucs-models.json') as Record;
-
+
const flare = require('../../config/models/flare-models.json') as Record;
const mlcKeys = new Set(Object.keys(mlc));
@@ -215,7 +212,6 @@ describe('BrowserAI — engine registry sanity', () => {
});
test('every Flare model config has engine: "flare"', () => {
-
const flare = require('../../config/models/flare-models.json') as Record;
for (const cfg of Object.values(flare)) {
expect(cfg.engine).toBe('flare');
@@ -223,9 +219,7 @@ describe('BrowserAI — engine registry sanity', () => {
});
test('every Demucs model config has engine: "demucs"', () => {
-
- const demucs =
- require('../../config/models/demucs-models.json') as Record;
+ const demucs = require('../../config/models/demucs-models.json') as Record;
for (const cfg of Object.values(demucs)) {
expect(cfg.engine).toBe('demucs');
}
diff --git a/src/engines/flare-engine-wrapper.test.ts b/src/engines/flare-engine-wrapper.test.ts
index d9f47cb..7b740e2 100644
--- a/src/engines/flare-engine-wrapper.test.ts
+++ b/src/engines/flare-engine-wrapper.test.ts
@@ -30,9 +30,9 @@ describe('FlareEngineWrapper', () => {
test('loadAdapter() before loadModel() throws', async () => {
const engine = new FlareEngineWrapper();
- await expect(
- engine.loadAdapter({ url: 'https://example.com/adapter.safetensors' }),
- ).rejects.toThrow(/No model loaded/i);
+ await expect(engine.loadAdapter({ url: 'https://example.com/adapter.safetensors' })).rejects.toThrow(
+ /No model loaded/i,
+ );
});
test('isCached() returns false on a never-loaded engine', async () => {
@@ -101,9 +101,9 @@ describe('flare-models.json', () => {
const flareKeys = new Set(Object.keys(flareModels));
// Intentionally import lazily to avoid the cross-engine tests depending
// on a specific MLC registry shape.
-
+
const mlcModels = require('../config/models/mlc-models.json') as Record;
-
+
const demucsModels = require('../config/models/demucs-models.json') as Record;
for (const k of flareKeys) {
expect(mlcModels[k]).toBeUndefined();
diff --git a/src/engines/flare-engine-wrapper.ts b/src/engines/flare-engine-wrapper.ts
index 0278ee7..54a7586 100644
--- a/src/engines/flare-engine-wrapper.ts
+++ b/src/engines/flare-engine-wrapper.ts
@@ -264,6 +264,11 @@ export class FlareEngineWrapper {
private systemPrompt = '';
private modelCacheKey = '';
private gpuEnabled = false;
+ /**
+ * Process-wide latch: log the first real prefill profile once per page load.
+ * Prevents spamming the console on every `generateText` call.
+ */
+ private static profileLogged = false;
// -------------------------------------------------------------------------
// Lifecycle
@@ -315,10 +320,10 @@ export class FlareEngineWrapper {
try {
this.gpuEnabled = await this.engine.init_gpu();
console.log('[Flare] backend_info:', JSON.parse(this.engine.backend_info()));
- // Prefill profiling — captures per-layer timing for the first run
+ // Turn profiling on now; the first generateText call reads the JSON
+ // after prefill completes. Overhead when active is a single
+ // function-pointer check per phase boundary.
this.engine.enable_prefill_profiling();
- console.log('[Flare] prefill profile:', JSON.parse(this.engine.prefill_profile_json()));
- this.engine.disable_prefill_profiling();
if (!this.gpuEnabled) {
console.info('[Flare] WebGPU unavailable — using CPU SIMD path');
}
@@ -382,6 +387,21 @@ export class FlareEngineWrapper {
// Streaming path — call onToken per decoded token
this.engine.begin_stream_with_params(promptTokens, maxTokens, temperature, topP, topK, repeatPenalty, minP);
+ // First-run prefill profile snapshot (if profiling is enabled).
+ if (!FlareEngineWrapper.profileLogged) {
+ try {
+ const profile = JSON.parse(
+ (this.engine as unknown as { prefill_profile_json(): string }).prefill_profile_json(),
+ );
+ if (profile && profile.seq_len > 0) {
+ console.log('[Flare] prefill profile:', profile);
+ FlareEngineWrapper.profileLogged = true;
+ }
+ } catch {
+ // prefill_profile_json is only present on flare-web >= 0.2.10
+ }
+ }
+
while (!this.engine.stream_done) {
const tokenId = this.engine.next_token();
if (tokenId === undefined) break;