diff --git a/examples/benchmark/flare-worker.js b/examples/benchmark/flare-worker.js
deleted file mode 100644
index c9dadad..0000000
--- a/examples/benchmark/flare-worker.js
+++ /dev/null
@@ -1,145 +0,0 @@
-// Dedicated Web Worker that owns the Flare FlareEngine.
-//
-// The main thread talks to this worker via structured `{id, type, args}`
-// messages and receives back either `{id, result}`, `{id, error}`, or
-// (for streaming calls) multiple `{id, stream}` messages followed by a
-// final `{id, result}`.
-//
-// Loading Flare on the main thread blocks the UI for the full duration
-// of the 138 MB GGUF parse + GPU buffer upload. Moving it here keeps
-// the tab responsive — matches how MLC and Transformers.js already work.
-
-let flareLib = null;
-let engine = null;
-
-function reply(id, result) {
- self.postMessage({ id, result });
-}
-
-function replyError(id, err) {
- self.postMessage({
- id,
- error: { message: (err && err.message) || String(err), stack: err && err.stack },
- });
-}
-
-function replyStream(id, tokenText, tokenId) {
- self.postMessage({ id, stream: { tokenText, tokenId } });
-}
-
-self.addEventListener('message', async (e) => {
- const { id, type, args } = e.data || {};
- try {
- await dispatch(id, type, args || {});
- } catch (err) {
- replyError(id, err);
- }
-});
-
-async function dispatch(id, type, args) {
- switch (type) {
- case 'init': {
- // Mirrors the main-thread blob-URL trick so the CDN-hosted ES module
- // can be imported without CORS problems and so import.meta.url resolves
- // to the correct wasm file.
- const { jsUrl, wasmUrl, patchImportMeta } = args;
- const resp = await fetch(jsUrl, { cache: 'no-cache' });
- if (!resp.ok) throw new Error(`HTTP ${resp.status} fetching ${jsUrl}`);
- let src = await resp.text();
- if (patchImportMeta) {
- src = src.replaceAll('import.meta.url', JSON.stringify(jsUrl));
- // Workaround for wasm-pack codegen: a JSDoc block contains "/* done */"
- // which prematurely closes the outer /** */ comment.
- src = src.replaceAll('/* done */', '/* done -/');
- }
- const blob = new Blob([src], { type: 'application/javascript' });
- const blobUrl = URL.createObjectURL(blob);
- try {
- flareLib = await import(blobUrl);
- } finally {
- URL.revokeObjectURL(blobUrl);
- }
- await flareLib.default(wasmUrl);
- return reply(id, { ok: true });
- }
-
- case 'load': {
- if (!flareLib) throw new Error('init must be called before load');
- // `args.bytes` is transferred from the main thread so this is a move,
- // not a copy.
- engine = flareLib.FlareEngine.load(args.bytes);
- return reply(id, { architecture: engine.architecture });
- }
-
- case 'init_gpu': {
- if (!engine) throw new Error('load must be called before init_gpu');
- const gpuOk = await engine.init_gpu();
- return reply(id, {
- gpuOk,
- backendInfo: JSON.parse(engine.backend_info()),
- });
- }
-
- case 'enable_prefill_profiling':
- engine.enable_prefill_profiling();
- return reply(id, null);
-
- case 'disable_prefill_profiling':
- engine.disable_prefill_profiling();
- return reply(id, null);
-
- case 'prefill_profile_json':
- return reply(id, engine.prefill_profile_json());
-
- case 'apply_chat_template':
- return reply(id, engine.apply_chat_template(args.userMsg || '', args.systemMsg || ''));
-
- case 'encode_text':
- return reply(id, Array.from(engine.encode_text(args.text)));
-
- case 'reset':
- engine.reset();
- return reply(id, null);
-
- case 'stream': {
- // One-shot streaming: the worker drains every token in a tight loop,
- // posting each decoded chunk immediately. Since the worker has nothing
- // else to do between tokens, the event loop stays unblocked.
- const {
- promptTokens,
- maxTokens,
- temperature,
- topP,
- topK,
- repeatPenalty,
- minP,
- } = args;
- engine.reset();
- engine.begin_stream_with_params(
- new Uint32Array(promptTokens),
- maxTokens,
- temperature,
- topP,
- topK,
- repeatPenalty,
- minP,
- );
- let count = 0;
- while (!engine.stream_done) {
- const tokId = engine.next_token();
- if (tokId === undefined || tokId === null) break;
- const text = engine.decode_token_chunk(tokId);
- replyStream(id, text, tokId);
- count += 1;
- }
- return reply(id, { done: true, completionTokens: count });
- }
-
- case 'dispose':
- engine = null;
- return reply(id, null);
-
- default:
- throw new Error(`unknown message type: ${type}`);
- }
-}
diff --git a/examples/benchmark/index.html b/examples/benchmark/index.html
index d1cce20..e191c85 100644
--- a/examples/benchmark/index.html
+++ b/examples/benchmark/index.html
@@ -578,64 +578,51 @@
Comparison Charts
transformersPipeline = null;
}
- // ── Flare Engine (via @sauravpanda/flare WASM in a dedicated Worker) ──
+ // ── Flare Engine (via @sauravpanda/flare WASM) ──
//
- // Flare's FlareEngine.load() is a synchronous WASM call that blocks the
- // main thread for the full 138MB GGUF parse + GPU buffer upload. We
- // run it in a dedicated Web Worker to keep the UI responsive — matches
- // MLC and Transformers.js which do the same internally.
- let flareWorker = null;
- let flareArch = null;
- let flareRpcSeq = 0;
- const flareRpcPending = new Map();
-
- function ensureFlareWorker() {
- if (flareWorker) return flareWorker;
- flareWorker = new Worker(new URL('./flare-worker.js', import.meta.url), {
- type: 'module',
- });
- flareWorker.addEventListener('message', (e) => {
- const { id, result, error, stream } = e.data;
- const slot = flareRpcPending.get(id);
- if (!slot) return;
- if (error) {
- slot.reject(new Error(error.message || 'flare worker error'));
- flareRpcPending.delete(id);
- return;
- }
- if (stream) {
- try { slot.onStream && slot.onStream(stream); } catch (err) { console.error(err); }
- return;
- }
- slot.resolve(result);
- flareRpcPending.delete(id);
- });
- flareWorker.addEventListener('error', (e) => {
- console.error('[flare-worker] error:', e);
- log(`Flare worker error: ${e.message || e}`, 'error');
- });
- return flareWorker;
- }
-
- function flareCall(type, args = {}, transfer = [], onStream = null) {
- const id = ++flareRpcSeq;
- return new Promise((resolve, reject) => {
- flareRpcPending.set(id, { resolve, reject, onStream });
- flareWorker.postMessage({ id, type, args }, transfer);
- });
- }
+ // Flare runs on the main thread. Moving it to a Web Worker is
+ // architecturally desirable (FlareEngine.load is a synchronous WASM
+ // call that briefly freezes the UI while parsing 138 MB of GGUF), but
+ // wgpu's blocking readback path (map_async + channel recv) deadlocks
+ // in a dedicated worker: the synchronous channel recv blocks the
+ // worker's event loop, so the map_async callback can never run.
+ // Tracked upstream — until flare-web ships a worker-safe async
+ // readback path we stay on the main thread to keep WebGPU active.
+ let flareEngine = null;
+ let flareLib = null;
async function loadFlareEngine(config) {
- if (!flareWorker) {
- log('Starting Flare worker...', 'info');
- ensureFlareWorker();
+ if (!flareLib) {
+ log('Loading @sauravpanda/flare WASM from CDN...', 'info');
const CDN = 'https://cdn.jsdelivr.net/npm/@sauravpanda/flare@0.2.12/pkg';
- await flareCall('init', {
- jsUrl: `${CDN}/flare_web.js`,
- wasmUrl: `${CDN}/flare_web_bg.wasm`,
- patchImportMeta: true,
- });
- log('Flare WASM initialized in worker.', 'success');
+ const wasmUrl = `${CDN}/flare_web_bg.wasm`;
+
+ // Fetch the JS module source, patch the WASM URL, and load via blob
+ // to avoid cross-origin ES module import restrictions.
+ const jsResp = await fetch(`${CDN}/flare_web.js`, { cache: 'no-cache' });
+ if (!jsResp.ok) throw new Error(`HTTP ${jsResp.status} fetching flare_web.js`);
+ let jsSrc = await jsResp.text();
+ log(`Fetched flare_web.js (${(jsSrc.length / 1024).toFixed(0)} KB)`, 'info');
+
+ jsSrc = jsSrc.replaceAll('import.meta.url', `'${CDN}/flare_web.js'`);
+ // Fix wasm-pack codegen bug: a JSDoc block contains "/* done */"
+ // which prematurely closes the outer /** */ comment.
+ jsSrc = jsSrc.replaceAll('/* done */', '/* done -/');
+
+ const blob = new Blob([jsSrc], { type: 'application/javascript' });
+ const blobUrl = URL.createObjectURL(blob);
+ try {
+ flareLib = await import(/* webpackIgnore: true */ blobUrl);
+ } catch (importErr) {
+ URL.revokeObjectURL(blobUrl);
+ log(`Blob import failed: ${importErr.message}`, 'error');
+ log('Trying direct CDN import...', 'info');
+ flareLib = await import(`${CDN}/flare_web.js`);
+ }
+ URL.revokeObjectURL(blobUrl);
+
+ await flareLib.default(wasmUrl);
+ log('Flare WASM initialized.', 'success');
}
log(`Downloading GGUF model: ${config.url.split('/').pop()}`, 'info');
@@ -665,68 +652,49 @@ Comparison Charts
offset += chunk.length;
}
- log(`Downloaded ${(received / 1024 / 1024).toFixed(1)} MB, parsing GGUF in worker...`, 'info');
+ log(`Downloaded ${(received / 1024 / 1024).toFixed(1)} MB, parsing GGUF...`, 'info');
- // Transfer the GGUF bytes to the worker — zero-copy. After this,
- // `bytes.buffer` is detached on the main thread, so we drop the
- // reference immediately.
- const { architecture } = await flareCall('load', { bytes }, [bytes.buffer]);
- flareArch = architecture;
+ flareEngine = flareLib.FlareEngine.load(bytes);
- // Try WebGPU acceleration in the worker
try {
- const { gpuOk, backendInfo } = await flareCall('init_gpu');
- console.log('[Flare] backend_info:', backendInfo);
- log(`Flare backend: ${JSON.stringify(backendInfo)}`, gpuOk ? 'success' : 'info');
- await flareCall('enable_prefill_profiling');
+ const gpuOk = await flareEngine.init_gpu();
+ const info = JSON.parse(flareEngine.backend_info());
+ console.log('[Flare] backend_info:', info);
+ log(`Flare backend: ${JSON.stringify(info)}`, gpuOk ? 'success' : 'info');
+ // Enable profiling; runFlareInference reads it after the first prefill
+ flareEngine.enable_prefill_profiling();
} catch (e) {
console.warn('[Flare] GPU init failed:', e);
log('Flare GPU init failed — using CPU SIMD', 'info');
}
const loadTime = performance.now() - t0;
- log(`Flare model loaded in ${(loadTime / 1000).toFixed(1)}s (arch: ${architecture})`, 'success');
+ log(`Flare model loaded in ${(loadTime / 1000).toFixed(1)}s (arch: ${flareEngine.architecture})`, 'success');
return loadTime;
}
async function runFlareInference(prompt, opts) {
const t0 = performance.now();
- const promptTokens = await flareCall('encode_text', { text: prompt });
- if (!promptTokens || promptTokens.length === 0) {
+ const promptIds = flareEngine.encode_text(prompt);
+ if (!promptIds || promptIds.length === 0) {
throw new Error('Flare tokenizer failed to encode prompt');
}
- await flareCall('reset');
-
- let firstTokenTime = 0;
- let tokenCount = 0;
- let output = '';
-
- await flareCall(
- 'stream',
- {
- promptTokens,
- maxTokens: opts.maxTokens,
- temperature: opts.temperature || 0.001,
- topP: 1.0,
- topK: 40,
- repeatPenalty: 1.0,
- minP: 0.0,
- },
- [],
- (chunk) => {
- if (tokenCount === 0) firstTokenTime = performance.now() - t0;
- output += chunk.tokenText;
- tokenCount += 1;
- },
+ flareEngine.reset();
+ flareEngine.begin_stream_with_params(
+ promptIds,
+ opts.maxTokens,
+ opts.temperature || 0.001,
+    1.0, 40, 1.0, 0.0, // topP, topK, repeatPenalty, minP
);
- // Read per-phase profile from the completed prefill once per page load.
- if (!window.__flareProfileLogged) {
+    const firstTokenTime = performance.now() - t0; // NOTE(review): taken before any token is decoded — assumes begin_stream runs the prefill; confirm
+
+ // First-run-only profile snapshot
+ if (typeof flareEngine.prefill_profile_json === 'function' && !window.__flareProfileLogged) {
try {
- const profileStr = await flareCall('prefill_profile_json');
- const profile = JSON.parse(profileStr);
+ const profile = JSON.parse(flareEngine.prefill_profile_json());
if (profile && profile.seq_len > 0) {
console.log('[Flare] prefill profile:', profile);
log(`Flare prefill profile: ${JSON.stringify(profile)}`, 'info');
@@ -737,6 +705,16 @@ Comparison Charts
}
}
+ let tokenCount = 0;
+ let output = '';
+
+ while (!flareEngine.stream_done) {
+ const id = flareEngine.next_token();
+ if (id === undefined) break;
+ tokenCount++;
+ output += flareEngine.decode_token_chunk(id);
+ }
+
const totalTime = performance.now() - t0;
const decodeTime = totalTime - firstTokenTime;
@@ -750,12 +728,7 @@ Comparison Charts
}
function disposeFlare() {
- if (flareWorker) {
- flareWorker.terminate();
- flareWorker = null;
- }
- flareRpcPending.clear();
- flareArch = null;
+ flareEngine = null;
}
// ── Benchmark runner ──