Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
145 changes: 0 additions & 145 deletions examples/benchmark/flare-worker.js

This file was deleted.

175 changes: 74 additions & 101 deletions examples/benchmark/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -578,64 +578,51 @@ <h2>Comparison Charts</h2>
transformersPipeline = null;
}

// ── Flare Engine (via @sauravpanda/flare WASM in a dedicated Worker) ──
// ── Flare Engine (via @sauravpanda/flare WASM) ──
//
// Flare's FlareEngine.load() is a synchronous WASM call that blocks the
// main thread for the full 138MB GGUF parse + GPU buffer upload. We
// run it in a dedicated Web Worker to keep the UI responsive — matches
// MLC and Transformers.js which do the same internally.
let flareWorker = null;
let flareArch = null;
let flareRpcSeq = 0;
const flareRpcPending = new Map();

function ensureFlareWorker() {
if (flareWorker) return flareWorker;
flareWorker = new Worker(new URL('./flare-worker.js', import.meta.url), {
type: 'module',
});
flareWorker.addEventListener('message', (e) => {
const { id, result, error, stream } = e.data;
const slot = flareRpcPending.get(id);
if (!slot) return;
if (error) {
slot.reject(new Error(error.message || 'flare worker error'));
flareRpcPending.delete(id);
return;
}
if (stream) {
try { slot.onStream && slot.onStream(stream); } catch (err) { console.error(err); }
return;
}
slot.resolve(result);
flareRpcPending.delete(id);
});
flareWorker.addEventListener('error', (e) => {
console.error('[flare-worker] error:', e);
log(`Flare worker error: ${e.message || e}`, 'error');
});
return flareWorker;
}

function flareCall(type, args = {}, transfer = [], onStream = null) {
const id = ++flareRpcSeq;
return new Promise((resolve, reject) => {
flareRpcPending.set(id, { resolve, reject, onStream });
flareWorker.postMessage({ id, type, args }, transfer);
});
}
// Flare runs on the main thread. Moving it to a Web Worker is
// architecturally desirable (FlareEngine.load is a synchronous WASM
// call that briefly freezes the UI while parsing 138 MB of GGUF), but
// wgpu's blocking readback path (map_async + channel recv) deadlocks
// in a dedicated worker because the WebGPU promise queue is serviced
// by browser-internal hooks that only fire on the main thread.
// Tracked upstream — until flare-web ships a worker-safe async
// readback path we stay on the main thread to keep WebGPU active.
let flareEngine = null;
let flareLib = null;

async function loadFlareEngine(config) {
if (!flareWorker) {
log('Starting Flare worker...', 'info');
ensureFlareWorker();
if (!flareLib) {
log('Loading @sauravpanda/flare WASM from CDN...', 'info');
const CDN = 'https://cdn.jsdelivr.net/npm/@sauravpanda/flare@0.2.12/pkg';
await flareCall('init', {
jsUrl: `${CDN}/flare_web.js`,
wasmUrl: `${CDN}/flare_web_bg.wasm`,
patchImportMeta: true,
});
log('Flare WASM initialized in worker.', 'success');
const wasmUrl = `${CDN}/flare_web_bg.wasm`;

// Fetch the JS module source, patch the WASM URL, and load via blob
// to avoid cross-origin ES module import restrictions.
const jsResp = await fetch(`${CDN}/flare_web.js`, { cache: 'no-cache' });
if (!jsResp.ok) throw new Error(`HTTP ${jsResp.status} fetching flare_web.js`);
let jsSrc = await jsResp.text();
log(`Fetched flare_web.js (${(jsSrc.length / 1024).toFixed(0)} KB)`, 'info');

jsSrc = jsSrc.replaceAll('import.meta.url', `'${CDN}/flare_web.js'`);
// Fix wasm-pack codegen bug: a JSDoc block contains "/* done */"
// which prematurely closes the outer /** */ comment.
jsSrc = jsSrc.replaceAll('/* done */', '/* done -/');

const blob = new Blob([jsSrc], { type: 'application/javascript' });
const blobUrl = URL.createObjectURL(blob);
try {
flareLib = await import(/* webpackIgnore: true */ blobUrl);
} catch (importErr) {
URL.revokeObjectURL(blobUrl);
log(`Blob import failed: ${importErr.message}`, 'error');
log('Trying direct CDN import...', 'info');
flareLib = await import(`${CDN}/flare_web.js`);
}
URL.revokeObjectURL(blobUrl);

await flareLib.default(wasmUrl);
log('Flare WASM initialized.', 'success');
}

log(`Downloading GGUF model: ${config.url.split('/').pop()}`, 'info');
Expand Down Expand Up @@ -665,68 +652,49 @@ <h2>Comparison Charts</h2>
offset += chunk.length;
}

log(`Downloaded ${(received / 1024 / 1024).toFixed(1)} MB, parsing GGUF in worker...`, 'info');
log(`Downloaded ${(received / 1024 / 1024).toFixed(1)} MB, parsing GGUF...`, 'info');

// Transfer the GGUF bytes to the worker — zero-copy. After this,
// `bytes.buffer` is detached on the main thread, so we drop the
// reference immediately.
const { architecture } = await flareCall('load', { bytes }, [bytes.buffer]);
flareArch = architecture;
flareEngine = flareLib.FlareEngine.load(bytes);

// Try WebGPU acceleration in the worker
try {
const { gpuOk, backendInfo } = await flareCall('init_gpu');
console.log('[Flare] backend_info:', backendInfo);
log(`Flare backend: ${JSON.stringify(backendInfo)}`, gpuOk ? 'success' : 'info');
await flareCall('enable_prefill_profiling');
const gpuOk = await flareEngine.init_gpu();
const info = JSON.parse(flareEngine.backend_info());
console.log('[Flare] backend_info:', info);
log(`Flare backend: ${JSON.stringify(info)}`, gpuOk ? 'success' : 'info');
// Enable profiling; runFlareInference reads it after the first prefill
flareEngine.enable_prefill_profiling();
} catch (e) {
console.warn('[Flare] GPU init failed:', e);
log('Flare GPU init failed — using CPU SIMD', 'info');
}

const loadTime = performance.now() - t0;
log(`Flare model loaded in ${(loadTime / 1000).toFixed(1)}s (arch: ${architecture})`, 'success');
log(`Flare model loaded in ${(loadTime / 1000).toFixed(1)}s (arch: ${flareEngine.architecture})`, 'success');
return loadTime;
}

async function runFlareInference(prompt, opts) {
const t0 = performance.now();

const promptTokens = await flareCall('encode_text', { text: prompt });
if (!promptTokens || promptTokens.length === 0) {
const promptIds = flareEngine.encode_text(prompt);
if (!promptIds || promptIds.length === 0) {
throw new Error('Flare tokenizer failed to encode prompt');
}

await flareCall('reset');

let firstTokenTime = 0;
let tokenCount = 0;
let output = '';

await flareCall(
'stream',
{
promptTokens,
maxTokens: opts.maxTokens,
temperature: opts.temperature || 0.001,
topP: 1.0,
topK: 40,
repeatPenalty: 1.0,
minP: 0.0,
},
[],
(chunk) => {
if (tokenCount === 0) firstTokenTime = performance.now() - t0;
output += chunk.tokenText;
tokenCount += 1;
},
flareEngine.reset();
flareEngine.begin_stream_with_params(
promptIds,
opts.maxTokens,
opts.temperature || 0.001,
1.0, 40, 1.0, 0.0,
);

// Read per-phase profile from the completed prefill once per page load.
if (!window.__flareProfileLogged) {
const firstTokenTime = performance.now() - t0;

// First-run-only profile snapshot
if (typeof flareEngine.prefill_profile_json === 'function' && !window.__flareProfileLogged) {
try {
const profileStr = await flareCall('prefill_profile_json');
const profile = JSON.parse(profileStr);
const profile = JSON.parse(flareEngine.prefill_profile_json());
if (profile && profile.seq_len > 0) {
console.log('[Flare] prefill profile:', profile);
log(`Flare prefill profile: ${JSON.stringify(profile)}`, 'info');
Expand All @@ -737,6 +705,16 @@ <h2>Comparison Charts</h2>
}
}

let tokenCount = 0;
let output = '';

while (!flareEngine.stream_done) {
const id = flareEngine.next_token();
if (id === undefined) break;
tokenCount++;
output += flareEngine.decode_token_chunk(id);
}

const totalTime = performance.now() - t0;
const decodeTime = totalTime - firstTokenTime;

Expand All @@ -750,12 +728,7 @@ <h2>Comparison Charts</h2>
}

function disposeFlare() {
if (flareWorker) {
flareWorker.terminate();
flareWorker = null;
}
flareRpcPending.clear();
flareArch = null;
flareEngine = null;
}

// ── Benchmark runner ──
Expand Down
Loading