From 624b9c10f1a6926d23d842c10a9a66f290219d7d Mon Sep 17 00:00:00 2001
From: Saurav Panda
Date: Mon, 20 Apr 2026 12:36:00 -0700
Subject: [PATCH 1/2] Move Flare benchmark to a dedicated Web Worker
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

FlareEngine.load() is a synchronous WASM call that blocks the main
thread for the full 138 MB GGUF parse + GPU buffer upload. Running it on
the main thread made the tab unresponsive for the duration of the load
(2-10s depending on cache state), and was responsible for the "page
freezes after clicking Run Benchmark" reports.

This change moves Flare into a dedicated Web Worker. The main thread
sends RPC-style {id, type, args} messages; the worker owns the single
FlareEngine instance and streams decoded tokens back via postMessage.
Matches how MLC and Transformers.js already work internally.

Key details:
- New examples/benchmark/flare-worker.js owns the FlareEngine
- index.html talks to the worker via a small Promise-based RPC helper
- GGUF bytes cross as a transferable (zero-copy)
- Streaming: worker posts one message per decoded token; main-thread
  client accumulates into the output string
- Prefill profile read is deferred until after the first prefill
  (previously called before any inference, always returned seq_len=0)
- src/engines/flare-engine-wrapper.ts: fix same enable/read/disable
  ordering bug so `generateText` logs a real profile snapshot

Library-level worker migration for FlareEngineWrapper is a follow-up —
this PR is scoped to fixing the benchmark's freeze.
--- examples/benchmark/flare-worker.js | 145 ++++++++++++++++++++ examples/benchmark/index.html | 196 ++++++++++++++++------------ src/engines/flare-engine-wrapper.ts | 24 +++- 3 files changed, 276 insertions(+), 89 deletions(-) create mode 100644 examples/benchmark/flare-worker.js diff --git a/examples/benchmark/flare-worker.js b/examples/benchmark/flare-worker.js new file mode 100644 index 0000000..c9dadad --- /dev/null +++ b/examples/benchmark/flare-worker.js @@ -0,0 +1,145 @@ +// Dedicated Web Worker that owns the Flare FlareEngine. +// +// The main thread talks to this worker via structured `{id, type, args}` +// messages and receives back either `{id, result}`, `{id, error}`, or +// (for streaming calls) multiple `{id, stream}` messages followed by a +// final `{id, result}`. +// +// Loading Flare on the main thread blocks the UI for the full duration +// of the 138 MB GGUF parse + GPU buffer upload. Moving it here keeps +// the tab responsive — matches how MLC and Transformers.js already work. + +let flareLib = null; +let engine = null; + +function reply(id, result) { + self.postMessage({ id, result }); +} + +function replyError(id, err) { + self.postMessage({ + id, + error: { message: (err && err.message) || String(err), stack: err && err.stack }, + }); +} + +function replyStream(id, tokenText, tokenId) { + self.postMessage({ id, stream: { tokenText, tokenId } }); +} + +self.addEventListener('message', async (e) => { + const { id, type, args } = e.data || {}; + try { + await dispatch(id, type, args || {}); + } catch (err) { + replyError(id, err); + } +}); + +async function dispatch(id, type, args) { + switch (type) { + case 'init': { + // Mirrors the main-thread blob-URL trick so the CDN-hosted ES module + // can be imported without CORS problems and so import.meta.url resolves + // to the correct wasm file. 
+ const { jsUrl, wasmUrl, patchImportMeta } = args; + const resp = await fetch(jsUrl, { cache: 'no-cache' }); + if (!resp.ok) throw new Error(`HTTP ${resp.status} fetching ${jsUrl}`); + let src = await resp.text(); + if (patchImportMeta) { + src = src.replaceAll('import.meta.url', JSON.stringify(jsUrl)); + // Workaround for wasm-pack codegen: a JSDoc block contains "/* done */" + // which prematurely closes the outer /** */ comment. + src = src.replaceAll('/* done */', '/* done -/'); + } + const blob = new Blob([src], { type: 'application/javascript' }); + const blobUrl = URL.createObjectURL(blob); + try { + flareLib = await import(blobUrl); + } finally { + URL.revokeObjectURL(blobUrl); + } + await flareLib.default(wasmUrl); + return reply(id, { ok: true }); + } + + case 'load': { + if (!flareLib) throw new Error('init must be called before load'); + // `args.bytes` is transferred from the main thread so this is a move, + // not a copy. + engine = flareLib.FlareEngine.load(args.bytes); + return reply(id, { architecture: engine.architecture }); + } + + case 'init_gpu': { + if (!engine) throw new Error('load must be called before init_gpu'); + const gpuOk = await engine.init_gpu(); + return reply(id, { + gpuOk, + backendInfo: JSON.parse(engine.backend_info()), + }); + } + + case 'enable_prefill_profiling': + engine.enable_prefill_profiling(); + return reply(id, null); + + case 'disable_prefill_profiling': + engine.disable_prefill_profiling(); + return reply(id, null); + + case 'prefill_profile_json': + return reply(id, engine.prefill_profile_json()); + + case 'apply_chat_template': + return reply(id, engine.apply_chat_template(args.userMsg || '', args.systemMsg || '')); + + case 'encode_text': + return reply(id, Array.from(engine.encode_text(args.text))); + + case 'reset': + engine.reset(); + return reply(id, null); + + case 'stream': { + // One-shot streaming: the worker drains every token in a tight loop, + // posting each decoded chunk immediately. 
Since the worker has nothing + // else to do between tokens, the event loop stays unblocked. + const { + promptTokens, + maxTokens, + temperature, + topP, + topK, + repeatPenalty, + minP, + } = args; + engine.reset(); + engine.begin_stream_with_params( + new Uint32Array(promptTokens), + maxTokens, + temperature, + topP, + topK, + repeatPenalty, + minP, + ); + let count = 0; + while (!engine.stream_done) { + const tokId = engine.next_token(); + if (tokId === undefined || tokId === null) break; + const text = engine.decode_token_chunk(tokId); + replyStream(id, text, tokId); + count += 1; + } + return reply(id, { done: true, completionTokens: count }); + } + + case 'dispose': + engine = null; + return reply(id, null); + + default: + throw new Error(`unknown message type: ${type}`); + } +} diff --git a/examples/benchmark/index.html b/examples/benchmark/index.html index 96c1d6e..669349e 100644 --- a/examples/benchmark/index.html +++ b/examples/benchmark/index.html @@ -578,53 +578,69 @@

Comparison Charts

transformersPipeline = null; } - // ── Flare Engine (via @sauravpanda/flare WASM) ── - let flareEngine = null; - let flareLib = null; + // ── Flare Engine (via @sauravpanda/flare WASM in a dedicated Worker) ── + // + // Flare's FlareEngine.load() is a synchronous WASM call that blocks the + // main thread for the full 138MB GGUF parse + GPU buffer upload. We + // run it in a dedicated Web Worker to keep the UI responsive — matches + // MLC and Transformers.js which do the same internally. + let flareWorker = null; + let flareArch = null; + let flareRpcSeq = 0; + const flareRpcPending = new Map(); + + function ensureFlareWorker() { + if (flareWorker) return flareWorker; + flareWorker = new Worker(new URL('./flare-worker.js', import.meta.url), { + type: 'module', + }); + flareWorker.addEventListener('message', (e) => { + const { id, result, error, stream } = e.data; + const slot = flareRpcPending.get(id); + if (!slot) return; + if (error) { + slot.reject(new Error(error.message || 'flare worker error')); + flareRpcPending.delete(id); + return; + } + if (stream) { + try { slot.onStream && slot.onStream(stream); } catch (err) { console.error(err); } + return; + } + slot.resolve(result); + flareRpcPending.delete(id); + }); + flareWorker.addEventListener('error', (e) => { + console.error('[flare-worker] error:', e); + log(`Flare worker error: ${e.message || e}`, 'error'); + }); + return flareWorker; + } + + function flareCall(type, args = {}, transfer = [], onStream = null) { + const id = ++flareRpcSeq; + return new Promise((resolve, reject) => { + flareRpcPending.set(id, { resolve, reject, onStream }); + flareWorker.postMessage({ id, type, args }, transfer); + }); + } async function loadFlareEngine(config) { - if (!flareLib) { - log('Loading @sauravpanda/flare WASM from CDN...', 'info'); + if (!flareWorker) { + log('Starting Flare worker...', 'info'); + ensureFlareWorker(); const CDN = 'https://cdn.jsdelivr.net/npm/@sauravpanda/flare@0.2.11/pkg'; - const wasmUrl = 
`${CDN}/flare_web_bg.wasm`; - - // Fetch the JS module source, patch the WASM URL, and load via blob - // to avoid cross-origin ES module import restrictions. - const jsResp = await fetch(`${CDN}/flare_web.js`, { cache: 'no-cache' }); - if (!jsResp.ok) throw new Error(`HTTP ${jsResp.status} fetching flare_web.js`); - let jsSrc = await jsResp.text(); - log(`Fetched flare_web.js (${(jsSrc.length / 1024).toFixed(0)} KB)`, 'info'); - - // Patch import.meta.url references so the WASM file resolves to CDN - jsSrc = jsSrc.replaceAll('import.meta.url', `'${CDN}/flare_web.js'`); - - // Fix wasm-pack codegen bug: a JSDoc block contains "/* done */" - // which prematurely closes the outer /** */ comment, causing - // "Unexpected token '*'" parse error. Remove the nested comment. - jsSrc = jsSrc.replaceAll('/* done */', '/* done -/'); - - const blob = new Blob([jsSrc], { type: 'application/javascript' }); - const blobUrl = URL.createObjectURL(blob); - try { - flareLib = await import(/* webpackIgnore: true */ blobUrl); - } catch (importErr) { - URL.revokeObjectURL(blobUrl); - log(`Blob import failed: ${importErr.message}`, 'error'); - // Fallback: try direct CDN import - log('Trying direct CDN import...', 'info'); - flareLib = await import(`${CDN}/flare_web.js`); - } - URL.revokeObjectURL(blobUrl); - - // Initialize the WASM module - await flareLib.default(wasmUrl); - log('Flare WASM initialized.', 'success'); + await flareCall('init', { + jsUrl: `${CDN}/flare_web.js`, + wasmUrl: `${CDN}/flare_web_bg.wasm`, + patchImportMeta: true, + }); + log('Flare WASM initialized in worker.', 'success'); } log(`Downloading GGUF model: ${config.url.split('/').pop()}`, 'info'); const t0 = performance.now(); - // Download the GGUF file const resp = await fetch(config.url); if (!resp.ok) throw new Error(`HTTP ${resp.status} fetching model`); const contentLength = parseInt(resp.headers.get('content-length') || '0', 10); @@ -642,7 +658,6 @@

Comparison Charts

} } - // Concatenate chunks const bytes = new Uint8Array(received); let offset = 0; for (const chunk of chunks) { @@ -650,88 +665,97 @@

Comparison Charts

offset += chunk.length; } - log(`Downloaded ${(received / 1024 / 1024).toFixed(1)} MB, parsing GGUF...`, 'info'); + log(`Downloaded ${(received / 1024 / 1024).toFixed(1)} MB, parsing GGUF in worker...`, 'info'); - // Load into Flare engine - flareEngine = flareLib.FlareEngine.load(bytes); + // Transfer the GGUF bytes to the worker — zero-copy. After this, + // `bytes.buffer` is detached on the main thread, so we drop the + // reference immediately. + const { architecture } = await flareCall('load', { bytes }, [bytes.buffer]); + flareArch = architecture; - // Try WebGPU acceleration + // Try WebGPU acceleration in the worker try { - const gpuOk = await flareEngine.init_gpu(); - const info = JSON.parse(flareEngine.backend_info()); - console.log('[Flare] backend_info:', info); - log(`Flare backend: ${JSON.stringify(info)}`, gpuOk ? 'success' : 'info'); - // Prefill profiling — captures per-layer timing - flareEngine.enable_prefill_profiling(); - const profile = JSON.parse(flareEngine.prefill_profile_json()); - console.log('[Flare] prefill profile:', profile); - log(`Flare prefill profile: ${JSON.stringify(profile)}`, 'info'); - flareEngine.disable_prefill_profiling(); + const { gpuOk, backendInfo } = await flareCall('init_gpu'); + console.log('[Flare] backend_info:', backendInfo); + log(`Flare backend: ${JSON.stringify(backendInfo)}`, gpuOk ? 
'success' : 'info'); + await flareCall('enable_prefill_profiling'); } catch (e) { console.warn('[Flare] GPU init failed:', e); log('Flare GPU init failed — using CPU SIMD', 'info'); } const loadTime = performance.now() - t0; - log(`Flare model loaded in ${(loadTime / 1000).toFixed(1)}s (arch: ${flareEngine.architecture})`, 'success'); + log(`Flare model loaded in ${(loadTime / 1000).toFixed(1)}s (arch: ${architecture})`, 'success'); return loadTime; } async function runFlareInference(prompt, opts) { const t0 = performance.now(); - // Encode prompt using the embedded GGUF tokenizer - const promptIds = flareEngine.encode_text(prompt); - if (!promptIds || promptIds.length === 0) { + const promptTokens = await flareCall('encode_text', { text: prompt }); + if (!promptTokens || promptTokens.length === 0) { throw new Error('Flare tokenizer failed to encode prompt'); } - // Start streaming - flareEngine.reset(); - flareEngine.begin_stream_with_params( - promptIds, - opts.maxTokens, - opts.temperature || 0.001, // temperature - 1.0, // top_p - 40, // top_k - 1.0, // repeat_penalty - 0.0, // min_p - ); + await flareCall('reset'); - const firstTokenTime = performance.now() - t0; // prefill done at begin_stream + let firstTokenTime = 0; let tokenCount = 0; let output = ''; - // Decode tokens - while (!flareEngine.stream_done) { - const id = flareEngine.next_token(); - if (id === undefined) break; - tokenCount++; - output += flareEngine.decode_token_chunk(id); + await flareCall( + 'stream', + { + promptTokens, + maxTokens: opts.maxTokens, + temperature: opts.temperature || 0.001, + topP: 1.0, + topK: 40, + repeatPenalty: 1.0, + minP: 0.0, + }, + [], + (chunk) => { + if (tokenCount === 0) firstTokenTime = performance.now() - t0; + output += chunk.tokenText; + tokenCount += 1; + }, + ); + + // Read per-phase profile from the completed prefill once per page load. 
+ if (!window.__flareProfileLogged) { + try { + const profileStr = await flareCall('prefill_profile_json'); + const profile = JSON.parse(profileStr); + if (profile && profile.seq_len > 0) { + console.log('[Flare] prefill profile:', profile); + log(`Flare prefill profile: ${JSON.stringify(profile)}`, 'info'); + window.__flareProfileLogged = true; + } + } catch (e) { + console.warn('[Flare] prefill profile read failed:', e); + } } const totalTime = performance.now() - t0; const decodeTime = totalTime - firstTokenTime; - // Try to get perf summary from engine - let engineTps = null; - try { - const summary = JSON.parse(flareEngine.performance_summary()); - engineTps = summary.tokens_per_second; - } catch {} - return { output, ttft: firstTokenTime, totalTime, tokenCount, - tokensPerSec: engineTps || ((tokenCount > 1 && decodeTime > 0) ? ((tokenCount - 1) / (decodeTime / 1000)) : 0), + tokensPerSec: (tokenCount > 1 && decodeTime > 0) ? ((tokenCount - 1) / (decodeTime / 1000)) : 0, }; } function disposeFlare() { - flareEngine = null; - // Flare doesn't have explicit dispose — GC handles it + if (flareWorker) { + flareWorker.terminate(); + flareWorker = null; + } + flareRpcPending.clear(); + flareArch = null; } // ── Benchmark runner ── diff --git a/src/engines/flare-engine-wrapper.ts b/src/engines/flare-engine-wrapper.ts index 0278ee7..1f801ce 100644 --- a/src/engines/flare-engine-wrapper.ts +++ b/src/engines/flare-engine-wrapper.ts @@ -264,6 +264,11 @@ export class FlareEngineWrapper { private systemPrompt = ''; private modelCacheKey = ''; private gpuEnabled = false; + /** + * Process-wide latch: log the first real prefill profile once per page load. + * Prevents spamming the console on every `generateText` call. 
+ */ + private static profileLogged = false; // ------------------------------------------------------------------------- // Lifecycle @@ -315,10 +320,10 @@ export class FlareEngineWrapper { try { this.gpuEnabled = await this.engine.init_gpu(); console.log('[Flare] backend_info:', JSON.parse(this.engine.backend_info())); - // Prefill profiling — captures per-layer timing for the first run + // Turn profiling on now; the first generateText call reads the JSON + // after prefill completes. Overhead when active is a single + // function-pointer check per phase boundary. this.engine.enable_prefill_profiling(); - console.log('[Flare] prefill profile:', JSON.parse(this.engine.prefill_profile_json())); - this.engine.disable_prefill_profiling(); if (!this.gpuEnabled) { console.info('[Flare] WebGPU unavailable — using CPU SIMD path'); } @@ -382,6 +387,19 @@ export class FlareEngineWrapper { // Streaming path — call onToken per decoded token this.engine.begin_stream_with_params(promptTokens, maxTokens, temperature, topP, topK, repeatPenalty, minP); + // First-run prefill profile snapshot (if profiling is enabled). 
+ if (!FlareEngineWrapper.profileLogged) { + try { + const profile = JSON.parse((this.engine as unknown as { prefill_profile_json(): string }).prefill_profile_json()); + if (profile && profile.seq_len > 0) { + console.log('[Flare] prefill profile:', profile); + FlareEngineWrapper.profileLogged = true; + } + } catch { + // prefill_profile_json is only present on flare-web >= 0.2.10 + } + } + while (!this.engine.stream_done) { const tokenId = this.engine.next_token(); if (tokenId === undefined) break; From 8370f0e16d6ce29bfacfe587d45264f75097f810 Mon Sep 17 00:00:00 2001 From: Saurav Panda Date: Mon, 20 Apr 2026 12:41:39 -0700 Subject: [PATCH 2/2] Apply prettier formatting --- src/core/llm/browserai.test.ts | 20 +++++++------------- src/engines/flare-engine-wrapper.test.ts | 10 +++++----- src/engines/flare-engine-wrapper.ts | 4 +++- 3 files changed, 15 insertions(+), 19 deletions(-) diff --git a/src/core/llm/browserai.test.ts b/src/core/llm/browserai.test.ts index 87f2b5b..23a4e7b 100644 --- a/src/core/llm/browserai.test.ts +++ b/src/core/llm/browserai.test.ts @@ -128,9 +128,7 @@ describe('BrowserAI', () => { test('loadAdapter() before loadModel() throws with Flare-specific error', async () => { const ai = new BrowserAI(); - await expect( - ai.loadAdapter({ url: 'https://example.com/adapter.safetensors' }), - ).rejects.toThrow(/Flare engine/i); + await expect(ai.loadAdapter({ url: 'https://example.com/adapter.safetensors' })).rejects.toThrow(/Flare engine/i); }); test('isFlareModelCached() returns false when no engine is loaded', async () => { @@ -185,14 +183,13 @@ describe('BrowserAI — engine registry sanity', () => { // This is a smoke test that no two registries have collided and broken // the spread order — if, say, demucs-models.json has a key that shadows // an mlc-models.json key, loading the older entry would silently break. 
- + const mlc = require('../../config/models/mlc-models.json') as Record; - - const transformers = - require('../../config/models/transformers-models.json') as Record; - + + const transformers = require('../../config/models/transformers-models.json') as Record; + const demucs = require('../../config/models/demucs-models.json') as Record; - + const flare = require('../../config/models/flare-models.json') as Record; const mlcKeys = new Set(Object.keys(mlc)); @@ -215,7 +212,6 @@ describe('BrowserAI — engine registry sanity', () => { }); test('every Flare model config has engine: "flare"', () => { - const flare = require('../../config/models/flare-models.json') as Record; for (const cfg of Object.values(flare)) { expect(cfg.engine).toBe('flare'); @@ -223,9 +219,7 @@ describe('BrowserAI — engine registry sanity', () => { }); test('every Demucs model config has engine: "demucs"', () => { - - const demucs = - require('../../config/models/demucs-models.json') as Record; + const demucs = require('../../config/models/demucs-models.json') as Record; for (const cfg of Object.values(demucs)) { expect(cfg.engine).toBe('demucs'); } diff --git a/src/engines/flare-engine-wrapper.test.ts b/src/engines/flare-engine-wrapper.test.ts index d9f47cb..7b740e2 100644 --- a/src/engines/flare-engine-wrapper.test.ts +++ b/src/engines/flare-engine-wrapper.test.ts @@ -30,9 +30,9 @@ describe('FlareEngineWrapper', () => { test('loadAdapter() before loadModel() throws', async () => { const engine = new FlareEngineWrapper(); - await expect( - engine.loadAdapter({ url: 'https://example.com/adapter.safetensors' }), - ).rejects.toThrow(/No model loaded/i); + await expect(engine.loadAdapter({ url: 'https://example.com/adapter.safetensors' })).rejects.toThrow( + /No model loaded/i, + ); }); test('isCached() returns false on a never-loaded engine', async () => { @@ -101,9 +101,9 @@ describe('flare-models.json', () => { const flareKeys = new Set(Object.keys(flareModels)); // Intentionally import lazily to 
avoid the cross-engine tests depending // on a specific MLC registry shape. - + const mlcModels = require('../config/models/mlc-models.json') as Record; - + const demucsModels = require('../config/models/demucs-models.json') as Record; for (const k of flareKeys) { expect(mlcModels[k]).toBeUndefined(); diff --git a/src/engines/flare-engine-wrapper.ts b/src/engines/flare-engine-wrapper.ts index 1f801ce..54a7586 100644 --- a/src/engines/flare-engine-wrapper.ts +++ b/src/engines/flare-engine-wrapper.ts @@ -390,7 +390,9 @@ export class FlareEngineWrapper { // First-run prefill profile snapshot (if profiling is enabled). if (!FlareEngineWrapper.profileLogged) { try { - const profile = JSON.parse((this.engine as unknown as { prefill_profile_json(): string }).prefill_profile_json()); + const profile = JSON.parse( + (this.engine as unknown as { prefill_profile_json(): string }).prefill_profile_json(), + ); if (profile && profile.seq_len > 0) { console.log('[Flare] prefill profile:', profile); FlareEngineWrapper.profileLogged = true;