diff --git a/examples/benchmark/flare-worker.js b/examples/benchmark/flare-worker.js deleted file mode 100644 index c9dadad..0000000 --- a/examples/benchmark/flare-worker.js +++ /dev/null @@ -1,145 +0,0 @@ -// Dedicated Web Worker that owns the Flare FlareEngine. -// -// The main thread talks to this worker via structured `{id, type, args}` -// messages and receives back either `{id, result}`, `{id, error}`, or -// (for streaming calls) multiple `{id, stream}` messages followed by a -// final `{id, result}`. -// -// Loading Flare on the main thread blocks the UI for the full duration -// of the 138 MB GGUF parse + GPU buffer upload. Moving it here keeps -// the tab responsive — matches how MLC and Transformers.js already work. - -let flareLib = null; -let engine = null; - -function reply(id, result) { - self.postMessage({ id, result }); -} - -function replyError(id, err) { - self.postMessage({ - id, - error: { message: (err && err.message) || String(err), stack: err && err.stack }, - }); -} - -function replyStream(id, tokenText, tokenId) { - self.postMessage({ id, stream: { tokenText, tokenId } }); -} - -self.addEventListener('message', async (e) => { - const { id, type, args } = e.data || {}; - try { - await dispatch(id, type, args || {}); - } catch (err) { - replyError(id, err); - } -}); - -async function dispatch(id, type, args) { - switch (type) { - case 'init': { - // Mirrors the main-thread blob-URL trick so the CDN-hosted ES module - // can be imported without CORS problems and so import.meta.url resolves - // to the correct wasm file. - const { jsUrl, wasmUrl, patchImportMeta } = args; - const resp = await fetch(jsUrl, { cache: 'no-cache' }); - if (!resp.ok) throw new Error(`HTTP ${resp.status} fetching ${jsUrl}`); - let src = await resp.text(); - if (patchImportMeta) { - src = src.replaceAll('import.meta.url', JSON.stringify(jsUrl)); - // Workaround for wasm-pack codegen: a JSDoc block contains "/* done */" - // which prematurely closes the outer /** */ comment. - src = src.replaceAll('/* done */', '/* done -/'); - } - const blob = new Blob([src], { type: 'application/javascript' }); - const blobUrl = URL.createObjectURL(blob); - try { - flareLib = await import(blobUrl); - } finally { - URL.revokeObjectURL(blobUrl); - } - await flareLib.default(wasmUrl); - return reply(id, { ok: true }); - } - - case 'load': { - if (!flareLib) throw new Error('init must be called before load'); - // `args.bytes` is transferred from the main thread so this is a move, - // not a copy. - engine = flareLib.FlareEngine.load(args.bytes); - return reply(id, { architecture: engine.architecture }); - } - - case 'init_gpu': { - if (!engine) throw new Error('load must be called before init_gpu'); - const gpuOk = await engine.init_gpu(); - return reply(id, { - gpuOk, - backendInfo: JSON.parse(engine.backend_info()), - }); - } - - case 'enable_prefill_profiling': - engine.enable_prefill_profiling(); - return reply(id, null); - - case 'disable_prefill_profiling': - engine.disable_prefill_profiling(); - return reply(id, null); - - case 'prefill_profile_json': - return reply(id, engine.prefill_profile_json()); - - case 'apply_chat_template': - return reply(id, engine.apply_chat_template(args.userMsg || '', args.systemMsg || '')); - - case 'encode_text': - return reply(id, Array.from(engine.encode_text(args.text))); - - case 'reset': - engine.reset(); - return reply(id, null); - - case 'stream': { - // One-shot streaming: the worker drains every token in a tight loop, - // posting each decoded chunk immediately. Since the worker has nothing - // else to do between tokens, the event loop stays unblocked. - const { - promptTokens, - maxTokens, - temperature, - topP, - topK, - repeatPenalty, - minP, - } = args; - engine.reset(); - engine.begin_stream_with_params( - new Uint32Array(promptTokens), - maxTokens, - temperature, - topP, - topK, - repeatPenalty, - minP, - ); - let count = 0; - while (!engine.stream_done) { - const tokId = engine.next_token(); - if (tokId === undefined || tokId === null) break; - const text = engine.decode_token_chunk(tokId); - replyStream(id, text, tokId); - count += 1; - } - return reply(id, { done: true, completionTokens: count }); - } - - case 'dispose': - engine = null; - return reply(id, null); - - default: - throw new Error(`unknown message type: ${type}`); - } -} diff --git a/examples/benchmark/index.html b/examples/benchmark/index.html index d1cce20..e191c85 100644 --- a/examples/benchmark/index.html +++ b/examples/benchmark/index.html @@ -578,64 +578,51 @@

Comparison Charts

transformersPipeline = null; } - // ── Flare Engine (via @sauravpanda/flare WASM in a dedicated Worker) ── + // ── Flare Engine (via @sauravpanda/flare WASM) ── // - // Flare's FlareEngine.load() is a synchronous WASM call that blocks the - // main thread for the full 138MB GGUF parse + GPU buffer upload. We - // run it in a dedicated Web Worker to keep the UI responsive — matches - // MLC and Transformers.js which do the same internally. - let flareWorker = null; - let flareArch = null; - let flareRpcSeq = 0; - const flareRpcPending = new Map(); - - function ensureFlareWorker() { - if (flareWorker) return flareWorker; - flareWorker = new Worker(new URL('./flare-worker.js', import.meta.url), { - type: 'module', - }); - flareWorker.addEventListener('message', (e) => { - const { id, result, error, stream } = e.data; - const slot = flareRpcPending.get(id); - if (!slot) return; - if (error) { - slot.reject(new Error(error.message || 'flare worker error')); - flareRpcPending.delete(id); - return; - } - if (stream) { - try { slot.onStream && slot.onStream(stream); } catch (err) { console.error(err); } - return; - } - slot.resolve(result); - flareRpcPending.delete(id); - }); - flareWorker.addEventListener('error', (e) => { - console.error('[flare-worker] error:', e); - log(`Flare worker error: ${e.message || e}`, 'error'); - }); - return flareWorker; - } - - function flareCall(type, args = {}, transfer = [], onStream = null) { - const id = ++flareRpcSeq; - return new Promise((resolve, reject) => { - flareRpcPending.set(id, { resolve, reject, onStream }); - flareWorker.postMessage({ id, type, args }, transfer); - }); - } + // Flare runs on the main thread. Moving it to a Web Worker is + // architecturally desirable (FlareEngine.load is a synchronous WASM + // call that briefly freezes the UI while parsing 138 MB of GGUF), but + // wgpu's blocking readback path (map_async + channel recv) deadlocks + // in a dedicated worker because the WebGPU promise queue is serviced + // by browser-internal hooks that only fire on the main thread. + // Tracked upstream — until flare-web ships a worker-safe async + // readback path we stay on the main thread to keep WebGPU active. + let flareEngine = null; + let flareLib = null; async function loadFlareEngine(config) { - if (!flareWorker) { - log('Starting Flare worker...', 'info'); - ensureFlareWorker(); + if (!flareLib) { + log('Loading @sauravpanda/flare WASM from CDN...', 'info'); const CDN = 'https://cdn.jsdelivr.net/npm/@sauravpanda/flare@0.2.12/pkg'; - await flareCall('init', { - jsUrl: `${CDN}/flare_web.js`, - wasmUrl: `${CDN}/flare_web_bg.wasm`, - patchImportMeta: true, - }); - log('Flare WASM initialized in worker.', 'success'); + const wasmUrl = `${CDN}/flare_web_bg.wasm`; + + // Fetch the JS module source, patch the WASM URL, and load via blob + // to avoid cross-origin ES module import restrictions. + const jsResp = await fetch(`${CDN}/flare_web.js`, { cache: 'no-cache' }); + if (!jsResp.ok) throw new Error(`HTTP ${jsResp.status} fetching flare_web.js`); + let jsSrc = await jsResp.text(); + log(`Fetched flare_web.js (${(jsSrc.length / 1024).toFixed(0)} KB)`, 'info'); + + jsSrc = jsSrc.replaceAll('import.meta.url', `'${CDN}/flare_web.js'`); + // Fix wasm-pack codegen bug: a JSDoc block contains "/* done */" + // which prematurely closes the outer /** */ comment. + jsSrc = jsSrc.replaceAll('/* done */', '/* done -/'); + + const blob = new Blob([jsSrc], { type: 'application/javascript' }); + const blobUrl = URL.createObjectURL(blob); + try { + flareLib = await import(/* webpackIgnore: true */ blobUrl); + } catch (importErr) { + URL.revokeObjectURL(blobUrl); + log(`Blob import failed: ${importErr.message}`, 'error'); + log('Trying direct CDN import...', 'info'); + flareLib = await import(`${CDN}/flare_web.js`); + } + URL.revokeObjectURL(blobUrl); + + await flareLib.default(wasmUrl); + log('Flare WASM initialized.', 'success'); } log(`Downloading GGUF model: ${config.url.split('/').pop()}`, 'info'); @@ -665,68 +652,49 @@

Comparison Charts

offset += chunk.length; } - log(`Downloaded ${(received / 1024 / 1024).toFixed(1)} MB, parsing GGUF in worker...`, 'info'); + log(`Downloaded ${(received / 1024 / 1024).toFixed(1)} MB, parsing GGUF...`, 'info'); - // Transfer the GGUF bytes to the worker — zero-copy. After this, - // `bytes.buffer` is detached on the main thread, so we drop the - // reference immediately. - const { architecture } = await flareCall('load', { bytes }, [bytes.buffer]); - flareArch = architecture; + flareEngine = flareLib.FlareEngine.load(bytes); - // Try WebGPU acceleration in the worker try { - const { gpuOk, backendInfo } = await flareCall('init_gpu'); - console.log('[Flare] backend_info:', backendInfo); - log(`Flare backend: ${JSON.stringify(backendInfo)}`, gpuOk ? 'success' : 'info'); - await flareCall('enable_prefill_profiling'); + const gpuOk = await flareEngine.init_gpu(); + const info = JSON.parse(flareEngine.backend_info()); + console.log('[Flare] backend_info:', info); + log(`Flare backend: ${JSON.stringify(info)}`, gpuOk ? 'success' : 'info'); + // Enable profiling; runFlareInference reads it after the first prefill + flareEngine.enable_prefill_profiling(); } catch (e) { console.warn('[Flare] GPU init failed:', e); log('Flare GPU init failed — using CPU SIMD', 'info'); } const loadTime = performance.now() - t0; - log(`Flare model loaded in ${(loadTime / 1000).toFixed(1)}s (arch: ${architecture})`, 'success'); + log(`Flare model loaded in ${(loadTime / 1000).toFixed(1)}s (arch: ${flareEngine.architecture})`, 'success'); return loadTime; } async function runFlareInference(prompt, opts) { const t0 = performance.now(); - const promptTokens = await flareCall('encode_text', { text: prompt }); - if (!promptTokens || promptTokens.length === 0) { + const promptIds = flareEngine.encode_text(prompt); + if (!promptIds || promptIds.length === 0) { throw new Error('Flare tokenizer failed to encode prompt'); } - await flareCall('reset'); - - let firstTokenTime = 0; - let tokenCount = 0; - let output = ''; - - await flareCall( - 'stream', - { - promptTokens, - maxTokens: opts.maxTokens, - temperature: opts.temperature || 0.001, - topP: 1.0, - topK: 40, - repeatPenalty: 1.0, - minP: 0.0, - }, - [], - (chunk) => { - if (tokenCount === 0) firstTokenTime = performance.now() - t0; - output += chunk.tokenText; - tokenCount += 1; - }, + flareEngine.reset(); + flareEngine.begin_stream_with_params( + promptIds, + opts.maxTokens, + opts.temperature || 0.001, + 1.0, 40, 1.0, 0.0, ); - // Read per-phase profile from the completed prefill once per page load. - if (!window.__flareProfileLogged) { + const firstTokenTime = performance.now() - t0; + + // First-run-only profile snapshot + if (typeof flareEngine.prefill_profile_json === 'function' && !window.__flareProfileLogged) { try { - const profileStr = await flareCall('prefill_profile_json'); - const profile = JSON.parse(profileStr); + const profile = JSON.parse(flareEngine.prefill_profile_json()); if (profile && profile.seq_len > 0) { console.log('[Flare] prefill profile:', profile); log(`Flare prefill profile: ${JSON.stringify(profile)}`, 'info'); @@ -737,6 +705,16 @@

Comparison Charts

} } + let tokenCount = 0; + let output = ''; + + while (!flareEngine.stream_done) { + const id = flareEngine.next_token(); + if (id === undefined) break; + tokenCount++; + output += flareEngine.decode_token_chunk(id); + } + const totalTime = performance.now() - t0; const decodeTime = totalTime - firstTokenTime; @@ -750,12 +728,7 @@

Comparison Charts

} function disposeFlare() { - if (flareWorker) { - flareWorker.terminate(); - flareWorker = null; - } - flareRpcPending.clear(); - flareArch = null; + flareEngine = null; } // ── Benchmark runner ──