diff --git a/examples/benchmark/flare-worker.js b/examples/benchmark/flare-worker.js
deleted file mode 100644
index c9dadad..0000000
--- a/examples/benchmark/flare-worker.js
+++ /dev/null
@@ -1,145 +0,0 @@
-// Dedicated Web Worker that owns the Flare FlareEngine.
-//
-// The main thread talks to this worker via structured `{id, type, args}`
-// messages and receives back either `{id, result}`, `{id, error}`, or
-// (for streaming calls) multiple `{id, stream}` messages followed by a
-// final `{id, result}`.
-//
-// Loading Flare on the main thread blocks the UI for the full duration
-// of the 138 MB GGUF parse + GPU buffer upload.  Moving it here keeps
-// the tab responsive — matches how MLC and Transformers.js already work.
-
-let flareLib = null;
-let engine = null;
-
-function reply(id, result) {
-  self.postMessage({ id, result });
-}
-
-function replyError(id, err) {
-  self.postMessage({
-    id,
-    error: { message: (err && err.message) || String(err), stack: err && err.stack },
-  });
-}
-
-function replyStream(id, tokenText, tokenId) {
-  self.postMessage({ id, stream: { tokenText, tokenId } });
-}
-
-self.addEventListener('message', async (e) => {
-  const { id, type, args } = e.data || {};
-  try {
-    await dispatch(id, type, args || {});
-  } catch (err) {
-    replyError(id, err);
-  }
-});
-
-async function dispatch(id, type, args) {
-  switch (type) {
-    case 'init': {
-      // Mirrors the main-thread blob-URL trick so the CDN-hosted ES module
-      // can be imported without CORS problems and so import.meta.url resolves
-      // to the correct wasm file.
-      const { jsUrl, wasmUrl, patchImportMeta } = args;
-      const resp = await fetch(jsUrl, { cache: 'no-cache' });
-      if (!resp.ok) throw new Error(`HTTP ${resp.status} fetching ${jsUrl}`);
-      let src = await resp.text();
-      if (patchImportMeta) {
-        src = src.replaceAll('import.meta.url', JSON.stringify(jsUrl));
-        // Workaround for wasm-pack codegen: a JSDoc block contains "/* done */"
-        // which prematurely closes the outer /** */ comment.
-        src = src.replaceAll('/* done */', '/* done -/');
-      }
-      const blob = new Blob([src], { type: 'application/javascript' });
-      const blobUrl = URL.createObjectURL(blob);
-      try {
-        flareLib = await import(blobUrl);
-      } finally {
-        URL.revokeObjectURL(blobUrl);
-      }
-      await flareLib.default(wasmUrl);
-      return reply(id, { ok: true });
-    }
-
-    case 'load': {
-      if (!flareLib) throw new Error('init must be called before load');
-      // `args.bytes` is transferred from the main thread so this is a move,
-      // not a copy.
-      engine = flareLib.FlareEngine.load(args.bytes);
-      return reply(id, { architecture: engine.architecture });
-    }
-
-    case 'init_gpu': {
-      if (!engine) throw new Error('load must be called before init_gpu');
-      const gpuOk = await engine.init_gpu();
-      return reply(id, {
-        gpuOk,
-        backendInfo: JSON.parse(engine.backend_info()),
-      });
-    }
-
-    case 'enable_prefill_profiling':
-      engine.enable_prefill_profiling();
-      return reply(id, null);
-
-    case 'disable_prefill_profiling':
-      engine.disable_prefill_profiling();
-      return reply(id, null);
-
-    case 'prefill_profile_json':
-      return reply(id, engine.prefill_profile_json());
-
-    case 'apply_chat_template':
-      return reply(id, engine.apply_chat_template(args.userMsg || '', args.systemMsg || ''));
-
-    case 'encode_text':
-      return reply(id, Array.from(engine.encode_text(args.text)));
-
-    case 'reset':
-      engine.reset();
-      return reply(id, null);
-
-    case 'stream': {
-      // One-shot streaming: the worker drains every token in a tight loop,
-      // posting each decoded chunk immediately.  Since the worker has nothing
-      // else to do between tokens, the event loop stays unblocked.
-      const {
-        promptTokens,
-        maxTokens,
-        temperature,
-        topP,
-        topK,
-        repeatPenalty,
-        minP,
-      } = args;
-      engine.reset();
-      engine.begin_stream_with_params(
-        new Uint32Array(promptTokens),
-        maxTokens,
-        temperature,
-        topP,
-        topK,
-        repeatPenalty,
-        minP,
-      );
-      let count = 0;
-      while (!engine.stream_done) {
-        const tokId = engine.next_token();
-        if (tokId === undefined || tokId === null) break;
-        const text = engine.decode_token_chunk(tokId);
-        replyStream(id, text, tokId);
-        count += 1;
-      }
-      return reply(id, { done: true, completionTokens: count });
-    }
-
-    case 'dispose':
-      engine = null;
-      return reply(id, null);
-
-    default:
-      throw new Error(`unknown message type: ${type}`);
-  }
-}
diff --git a/examples/benchmark/index.html b/examples/benchmark/index.html
index d1cce20..e191c85 100644
--- a/examples/benchmark/index.html
+++ b/examples/benchmark/index.html
@@ -578,64 +578,51 @@ <h2>Comparison Charts</h2>
         transformersPipeline = null;
       }
 
-      // ── Flare Engine (via @sauravpanda/flare WASM in a dedicated Worker) ──
+      // ── Flare Engine (via @sauravpanda/flare WASM) ──
       //
-      // Flare's FlareEngine.load() is a synchronous WASM call that blocks the
-      // main thread for the full 138MB GGUF parse + GPU buffer upload.  We
-      // run it in a dedicated Web Worker to keep the UI responsive — matches
-      // MLC and Transformers.js which do the same internally.
-      let flareWorker = null;
-      let flareArch = null;
-      let flareRpcSeq = 0;
-      const flareRpcPending = new Map();
-
-      function ensureFlareWorker() {
-        if (flareWorker) return flareWorker;
-        flareWorker = new Worker(new URL('./flare-worker.js', import.meta.url), {
-          type: 'module',
-        });
-        flareWorker.addEventListener('message', (e) => {
-          const { id, result, error, stream } = e.data;
-          const slot = flareRpcPending.get(id);
-          if (!slot) return;
-          if (error) {
-            slot.reject(new Error(error.message || 'flare worker error'));
-            flareRpcPending.delete(id);
-            return;
-          }
-          if (stream) {
-            try { slot.onStream && slot.onStream(stream); } catch (err) { console.error(err); }
-            return;
-          }
-          slot.resolve(result);
-          flareRpcPending.delete(id);
-        });
-        flareWorker.addEventListener('error', (e) => {
-          console.error('[flare-worker] error:', e);
-          log(`Flare worker error: ${e.message || e}`, 'error');
-        });
-        return flareWorker;
-      }
-
-      function flareCall(type, args = {}, transfer = [], onStream = null) {
-        const id = ++flareRpcSeq;
-        return new Promise((resolve, reject) => {
-          flareRpcPending.set(id, { resolve, reject, onStream });
-          flareWorker.postMessage({ id, type, args }, transfer);
-        });
-      }
+      // Flare runs on the main thread.  Moving it to a Web Worker is
+      // architecturally desirable (FlareEngine.load is a synchronous WASM
+      // call that briefly freezes the UI while parsing 138 MB of GGUF), but
+      // wgpu's blocking readback path (map_async + channel recv) deadlocks
+      // in a dedicated worker because the WebGPU promise queue is serviced
+      // by browser-internal hooks that only fire on the main thread.
+      // Tracked upstream — until flare-web ships a worker-safe async
+      // readback path we stay on the main thread to keep WebGPU active.
+      let flareEngine = null;
+      let flareLib = null;
 
       async function loadFlareEngine(config) {
-        if (!flareWorker) {
-          log('Starting Flare worker...', 'info');
-          ensureFlareWorker();
+        if (!flareLib) {
+          log('Loading @sauravpanda/flare WASM from CDN...', 'info');
           const CDN = 'https://cdn.jsdelivr.net/npm/@sauravpanda/flare@0.2.12/pkg';
-          await flareCall('init', {
-            jsUrl: `${CDN}/flare_web.js`,
-            wasmUrl: `${CDN}/flare_web_bg.wasm`,
-            patchImportMeta: true,
-          });
-          log('Flare WASM initialized in worker.', 'success');
+          const wasmUrl = `${CDN}/flare_web_bg.wasm`;
+
+          // Fetch the JS module source, point its import.meta.url at the CDN, and
+          // load via blob to avoid cross-origin ES module import restrictions.
+          const jsResp = await fetch(`${CDN}/flare_web.js`, { cache: 'no-cache' });
+          if (!jsResp.ok) throw new Error(`HTTP ${jsResp.status} fetching flare_web.js`);
+          let jsSrc = await jsResp.text();
+          log(`Fetched flare_web.js (${(jsSrc.length / 1024).toFixed(0)} KB)`, 'info');
+
+          jsSrc = jsSrc.replaceAll('import.meta.url', `'${CDN}/flare_web.js'`);
+          // Fix wasm-pack codegen bug: a JSDoc block contains "/* done */"
+          // which prematurely closes the outer /** */ comment.
+          jsSrc = jsSrc.replaceAll('/* done */', '/* done -/');
+
+          const blob = new Blob([jsSrc], { type: 'application/javascript' });
+          const blobUrl = URL.createObjectURL(blob);
+          try {
+            flareLib = await import(/* webpackIgnore: true */ blobUrl);
+          } catch (importErr) {
+            URL.revokeObjectURL(blobUrl);
+            log(`Blob import failed: ${importErr.message}`, 'error');
+            log('Trying direct CDN import...', 'info');
+            flareLib = await import(`${CDN}/flare_web.js`);
+          }
+          URL.revokeObjectURL(blobUrl);
+
+          await flareLib.default(wasmUrl);
+          log('Flare WASM initialized.', 'success');
         }
 
         log(`Downloading GGUF model: ${config.url.split('/').pop()}`, 'info');
@@ -665,68 +652,49 @@ <h2>Comparison Charts</h2>
           offset += chunk.length;
         }
 
-        log(`Downloaded ${(received / 1024 / 1024).toFixed(1)} MB, parsing GGUF in worker...`, 'info');
+        log(`Downloaded ${(received / 1024 / 1024).toFixed(1)} MB, parsing GGUF...`, 'info');
 
-        // Transfer the GGUF bytes to the worker — zero-copy.  After this,
-        // `bytes.buffer` is detached on the main thread, so we drop the
-        // reference immediately.
-        const { architecture } = await flareCall('load', { bytes }, [bytes.buffer]);
-        flareArch = architecture;
+        flareEngine = flareLib.FlareEngine.load(bytes);
 
-        // Try WebGPU acceleration in the worker
         try {
-          const { gpuOk, backendInfo } = await flareCall('init_gpu');
-          console.log('[Flare] backend_info:', backendInfo);
-          log(`Flare backend: ${JSON.stringify(backendInfo)}`, gpuOk ? 'success' : 'info');
-          await flareCall('enable_prefill_profiling');
+          const gpuOk = await flareEngine.init_gpu();
+          const info = JSON.parse(flareEngine.backend_info());
+          console.log('[Flare] backend_info:', info);
+          log(`Flare backend: ${JSON.stringify(info)}`, gpuOk ? 'success' : 'info');
+          // Enable profiling; runFlareInference reads it after the first prefill
+          flareEngine.enable_prefill_profiling();
         } catch (e) {
           console.warn('[Flare] GPU init failed:', e);
           log('Flare GPU init failed — using CPU SIMD', 'info');
         }
 
         const loadTime = performance.now() - t0;
-        log(`Flare model loaded in ${(loadTime / 1000).toFixed(1)}s (arch: ${architecture})`, 'success');
+        log(`Flare model loaded in ${(loadTime / 1000).toFixed(1)}s (arch: ${flareEngine.architecture})`, 'success');
         return loadTime;
       }
 
       async function runFlareInference(prompt, opts) {
         const t0 = performance.now();
 
-        const promptTokens = await flareCall('encode_text', { text: prompt });
-        if (!promptTokens || promptTokens.length === 0) {
+        const promptIds = flareEngine.encode_text(prompt);
+        if (!promptIds || promptIds.length === 0) {
           throw new Error('Flare tokenizer failed to encode prompt');
         }
 
-        await flareCall('reset');
-
-        let firstTokenTime = 0;
-        let tokenCount = 0;
-        let output = '';
-
-        await flareCall(
-          'stream',
-          {
-            promptTokens,
-            maxTokens: opts.maxTokens,
-            temperature: opts.temperature || 0.001,
-            topP: 1.0,
-            topK: 40,
-            repeatPenalty: 1.0,
-            minP: 0.0,
-          },
-          [],
-          (chunk) => {
-            if (tokenCount === 0) firstTokenTime = performance.now() - t0;
-            output += chunk.tokenText;
-            tokenCount += 1;
-          },
+        flareEngine.reset();
+        flareEngine.begin_stream_with_params(
+          promptIds,
+          opts.maxTokens,
+          opts.temperature || 0.001,
+          1.0, 40, 1.0, 0.0,
         );
 
-        // Read per-phase profile from the completed prefill once per page load.
-        if (!window.__flareProfileLogged) {
+        const firstTokenTime = performance.now() - t0;
+
+        // First-run-only profile snapshot
+        if (typeof flareEngine.prefill_profile_json === 'function' && !window.__flareProfileLogged) {
           try {
-            const profileStr = await flareCall('prefill_profile_json');
-            const profile = JSON.parse(profileStr);
+            const profile = JSON.parse(flareEngine.prefill_profile_json());
             if (profile && profile.seq_len > 0) {
               console.log('[Flare] prefill profile:', profile);
               log(`Flare prefill profile: ${JSON.stringify(profile)}`, 'info');
@@ -737,6 +705,16 @@ <h2>Comparison Charts</h2>
           }
         }
 
+        let tokenCount = 0;
+        let output = '';
+
+        while (!flareEngine.stream_done) {
+          const id = flareEngine.next_token();
+          if (id === undefined) break;
+          tokenCount++;
+          output += flareEngine.decode_token_chunk(id);
+        }
+
         const totalTime = performance.now() - t0;
         const decodeTime = totalTime - firstTokenTime;
 
@@ -750,12 +728,18 @@ <h2>Comparison Charts</h2>
       }
 
       function disposeFlare() {
-        if (flareWorker) {
-          flareWorker.terminate();
-          flareWorker = null;
-        }
-        flareRpcPending.clear();
-        flareArch = null;
+        // There is no worker to terminate any more, so nothing tears down the
+        // WASM instance for us.  wasm-bindgen classes normally expose free()
+        // to release their WASM-side allocation; call it when present so the
+        // loaded model is not left for the GC to reclaim between runs.
+        if (flareEngine && typeof flareEngine.free === 'function') {
+          try {
+            flareEngine.free();
+          } catch (err) {
+            console.warn('[Flare] engine free failed:', err);
+          }
+        }
+        flareEngine = null;
       }
 
       // ── Benchmark runner ──