diff --git a/crates/aprender-serve/src/gguf/inference/forward/debug.rs b/crates/aprender-serve/src/gguf/inference/forward/debug.rs
index d99775508..37d09fadc 100644
--- a/crates/aprender-serve/src/gguf/inference/forward/debug.rs
+++ b/crates/aprender-serve/src/gguf/inference/forward/debug.rs
@@ -474,119 +474,19 @@ impl OwnedQuantizedModel {
 
         // 2. Process through transformer layers
         for (layer_idx, layer) in self.layers.iter().enumerate() {
-            // 2a+2b. Fused attention layer norm + QKV projection
-            // For RMSNorm models: fuse norm + matmul to eliminate intermediate allocation
-            // For LayerNorm models: use separate operations (has bias)
-            let mut qkv = if use_rmsnorm {
-                self.fused_rmsnorm_qkv_matmul(
-                    &hidden,
-                    &layer.attn_norm_weight,
-                    self.config.eps,
-                    &layer.qkv_weight,
-                )?
-            } else {
-                let normed = ops::layer_norm(
-                    &hidden,
-                    &layer.attn_norm_weight,
-                    layer.attn_norm_bias.as_deref(),
-                    self.config.eps,
-                );
-                self.qkv_matmul(&normed, &layer.qkv_weight)?
-            };
-
-            // PMAT-114: Trace QKV BEFORE bias (PMAT-260)
-            self.debug_trace_qkv(&qkv, layer_idx, hidden_dim);
-
-            if let Some(ref bias) = layer.qkv_bias {
-                ops::add_bias(&mut qkv, bias);
-            }
-
-            // 2c. Extract Q, K, V with GQA-aware sizes and apply RoPE
-            // Q: [hidden_dim] = [num_heads * head_dim]
-            // K: [kv_dim] = [num_kv_heads * head_dim]
-            // V: [kv_dim] = [num_kv_heads * head_dim]
-            // Optimization: apply RoPE in-place to avoid Q/K copies
-            let num_kv_heads = self.config.num_kv_heads;
-            // GH-479: Use config methods (Qwen3 head_dim != hidden/heads)
-            let head_dim = self.config.head_dim();
-            let q_dim = self.config.q_dim();
-            let kv_dim = self.config.kv_dim();
-
-            // PMAT-114: Trace QKV after bias for layer 0 (PMAT-260)
-            self.debug_trace_qkv_after_bias(&qkv, layer, layer_idx, hidden_dim);
-
-            // GH-479: Per-head QK RMSNorm (Qwen3) — after bias, before RoPE
-            if let Some(ref q_norm) = layer.attn_q_norm_weight {
-                ops::apply_per_head_rms_norm(
-                    &mut qkv[0..q_dim],
-                    q_norm,
-                    self.config.num_heads,
-                    self.config.eps,
-                );
-            }
-            if let Some(ref k_norm) = layer.attn_k_norm_weight {
-                ops::apply_per_head_rms_norm(
-                    &mut qkv[q_dim..q_dim + kv_dim],
-                    k_norm,
-                    num_kv_heads,
-                    self.config.eps,
-                );
-            }
-
-            // GH-278: Skip RoPE for models with learned position embeddings (GPT-2)
-            if self.config.constraints.uses_rope() {
-                self.apply_rope(&mut qkv[0..q_dim], position, self.config.num_heads);
-                self.apply_rope(
-                    &mut qkv[q_dim..q_dim + kv_dim],
-                    position,
-                    num_kv_heads,
-                );
-            }
-
-            // Use slices to avoid copies (only copy K for cache storage)
-            // GH-479: Use q_dim (may differ from hidden_dim for Qwen3)
-            let q = &qkv[0..q_dim];
-            let k = &qkv[q_dim..q_dim + kv_dim];
-            let v = &qkv[q_dim + kv_dim..q_dim + 2 * kv_dim];
-
-            // 2d. Get cached K/V and compute attention with GQA support
-            let k_cache = cache.get_k(layer_idx);
-            let v_cache = cache.get_v(layer_idx);
-
-            // Use pre-allocated attention output buffer (reused across layers)
-            if k_cache.is_empty() {
-                // First token - no cache yet, output is just weighted V
-                // With single query and single K/V, need to expand V for all Q heads
-                let q_per_kv = self.config.num_heads / num_kv_heads;
-                for q_head in 0..self.config.num_heads {
-                    let kv_head = q_head / q_per_kv;
-                    let v_start = kv_head * head_dim;
-                    let out_start = q_head * head_dim;
-                    attn_out_buffer[out_start..out_start + head_dim]
-                        .copy_from_slice(&v[v_start..v_start + head_dim]);
-                }
-            } else {
-                // Use cached K/V for attention with GQA
-                // Uses pre-allocated buffer to avoid 704 Vec allocations per token
-                self.attention_with_cache_gqa_into(q, k_cache, v_cache, k, v, &mut attn_out_buffer);
-
-                // CORRECTNESS-013: Debug CPU attention output (PMAT-260)
-                Self::debug_trace_attention_output(&attn_out_buffer, layer_idx, position, head_dim);
-            }
-
-            // 2e. Store K and V in cache for future tokens
-            cache.append(layer_idx, k, v);
-
-            // 2f. Attention output projection
-            let mut attn_output = self.fused_matmul(&attn_out_buffer, &layer.attn_output_weight)?;
-            if let Some(ref bias) = layer.attn_output_bias {
-                ops::add_bias(&mut attn_output, bias);
-            }
-
-            // 2g. Residual connection
-            for i in 0..hidden_dim {
-                hidden[i] += attn_output[i];
-            }
+            // 2a-2g. Attention sub-block (norm + QKV + RoPE + cache + attn + output proj + residual).
+            // Lifted into a helper (M32d Day 1 prep refactor) so the upcoming
+            // `forward_single_qwen3_moe_with_cache` can share the SAME attention
+            // implementation — only the FFN block differs between dense and MoE paths.
+            self.attention_layer_with_cache(
+                &mut hidden,
+                layer,
+                layer_idx,
+                cache,
+                position,
+                &mut attn_out_buffer,
+                use_rmsnorm,
+            )?;
 
             // 2h+2i. FFN with optional layer norm and SwiGLU/GELU activation
             let ffn_activated = self.single_cache_ffn_block(&hidden, layer_idx, use_rmsnorm)?;
@@ -612,4 +512,161 @@ impl OwnedQuantizedModel {
         // Final output: norm + LM head + debug verification + bias
         self.single_cache_final_output(&hidden, position, use_rmsnorm)
     }
+
+    /// Per-layer attention sub-block with KV cache — M32d Day 1 prep refactor.
+    ///
+    /// Lifted verbatim from `forward_single_with_cache`'s inner loop body
+    /// (steps 2a–2g: norm + QKV (fused for RMSNorm) → bias → per-head QK norm → RoPE →
+    /// cache.get_k/get_v → attention → cache.append → output proj +
+    /// residual). The behavior is bit-identical to the previous inline
+    /// version; this method exists so the upcoming
+    /// `forward_single_qwen3_moe_with_cache` (M32d step 1) can call the
+    /// SAME attention implementation — only the FFN block differs between
+    /// dense and MoE forward paths.
+    ///
+    /// Mutates `hidden` (residual add of the attention output) and
+    /// `attn_out_buffer` (reused across layers). Calls `cache.append`
+    /// to store this token's K/V for future calls. Caller is responsible
+    /// for `cache.advance()` after all layers are processed.
+    ///
+    /// # Arguments
+    /// * `hidden` - Per-layer hidden state; mutated by the post-attention residual add
+    /// * `layer` - Quantized layer weights (qkv, attn_norm, attn_output, optional QK norms)
+    /// * `layer_idx` - Index used for cache get/append and debug traces
+    /// * `cache` - KV cache; read for past K/V, appended with this token's K/V
+    /// * `position` - Token position used by RoPE (0-indexed from prompt start)
+    /// * `attn_out_buffer` - Pre-allocated scratch for `[q_dim]` attention output (reused)
+    /// * `use_rmsnorm` - True for RMSNorm models (Llama/Qwen family); false for LayerNorm (GPT-2)
+    ///
+    /// # Errors
+    /// Propagates errors from `fused_rmsnorm_qkv_matmul`, `qkv_matmul`, and
+    /// `fused_matmul`; the `attention_with_cache_gqa_into` call is not `?`-propagated.
+    fn attention_layer_with_cache(
+        &self,
+        hidden: &mut Vec<f32>,
+        layer: &OwnedQuantizedLayer,
+        layer_idx: usize,
+        cache: &mut OwnedQuantizedKVCache,
+        position: usize,
+        attn_out_buffer: &mut [f32],
+        use_rmsnorm: bool,
+    ) -> Result<()> {
+        let hidden_dim = self.config.hidden_dim;
+
+        // 2a+2b. Fused attention layer norm + QKV projection
+        // For RMSNorm models: fuse norm + matmul to eliminate intermediate allocation
+        // For LayerNorm models: use separate operations (has bias)
+        let mut qkv = if use_rmsnorm {
+            self.fused_rmsnorm_qkv_matmul(
+                hidden,
+                &layer.attn_norm_weight,
+                self.config.eps,
+                &layer.qkv_weight,
+            )?
+        } else {
+            let normed = ops::layer_norm(
+                hidden,
+                &layer.attn_norm_weight,
+                layer.attn_norm_bias.as_deref(),
+                self.config.eps,
+            );
+            self.qkv_matmul(&normed, &layer.qkv_weight)?
+        };
+
+        // PMAT-114: Trace QKV BEFORE bias (PMAT-260)
+        self.debug_trace_qkv(&qkv, layer_idx, hidden_dim);
+
+        if let Some(ref bias) = layer.qkv_bias {
+            ops::add_bias(&mut qkv, bias);
+        }
+
+        // 2c. Extract Q, K, V with GQA-aware sizes and apply RoPE
+        // Q: [q_dim] = [num_heads * head_dim] (may differ from hidden_dim, see GH-479)
+        // K: [kv_dim] = [num_kv_heads * head_dim]
+        // V: [kv_dim] = [num_kv_heads * head_dim]
+        // Optimization: apply RoPE in-place to avoid Q/K copies
+        let num_kv_heads = self.config.num_kv_heads;
+        // GH-479: Use config methods (Qwen3 head_dim != hidden/heads)
+        let head_dim = self.config.head_dim();
+        let q_dim = self.config.q_dim();
+        let kv_dim = self.config.kv_dim();
+
+        // PMAT-114: Trace QKV after bias for layer 0 (PMAT-260)
+        self.debug_trace_qkv_after_bias(&qkv, layer, layer_idx, hidden_dim);
+
+        // GH-479: Per-head QK RMSNorm (Qwen3) — after bias, before RoPE
+        if let Some(ref q_norm) = layer.attn_q_norm_weight {
+            ops::apply_per_head_rms_norm(
+                &mut qkv[0..q_dim],
+                q_norm,
+                self.config.num_heads,
+                self.config.eps,
+            );
+        }
+        if let Some(ref k_norm) = layer.attn_k_norm_weight {
+            ops::apply_per_head_rms_norm(
+                &mut qkv[q_dim..q_dim + kv_dim],
+                k_norm,
+                num_kv_heads,
+                self.config.eps,
+            );
+        }
+
+        // GH-278: Skip RoPE for models with learned position embeddings (GPT-2)
+        if self.config.constraints.uses_rope() {
+            self.apply_rope(&mut qkv[0..q_dim], position, self.config.num_heads);
+            self.apply_rope(
+                &mut qkv[q_dim..q_dim + kv_dim],
+                position,
+                num_kv_heads,
+            );
+        }
+
+        // Borrow Q/K/V as sub-slices of `qkv` instead of copying them out (K/V go to `cache.append` below)
+        // GH-479: Use q_dim (may differ from hidden_dim for Qwen3)
+        let q = &qkv[0..q_dim];
+        let k = &qkv[q_dim..q_dim + kv_dim];
+        let v = &qkv[q_dim + kv_dim..q_dim + 2 * kv_dim];
+
+        // 2d. Get cached K/V and compute attention with GQA support
+        let k_cache = cache.get_k(layer_idx);
+        let v_cache = cache.get_v(layer_idx);
+
+        // Use pre-allocated attention output buffer (reused across layers)
+        if k_cache.is_empty() {
+            // First token - no cache yet, so the attention output is this token's V
+            // V is copied once per Q head; each group of q_per_kv Q heads shares one KV head
+            let q_per_kv = self.config.num_heads / num_kv_heads; // Q heads per KV head
+            for q_head in 0..self.config.num_heads {
+                let kv_head = q_head / q_per_kv;
+                let v_start = kv_head * head_dim;
+                let out_start = q_head * head_dim;
+                attn_out_buffer[out_start..out_start + head_dim]
+                    .copy_from_slice(&v[v_start..v_start + head_dim]);
+            }
+        } else {
+            // Use cached K/V for attention with GQA
+            // Uses pre-allocated buffer to avoid 704 Vec allocations per token
+            self.attention_with_cache_gqa_into(q, k_cache, v_cache, k, v, attn_out_buffer);
+
+            // CORRECTNESS-013: Debug CPU attention output (PMAT-260)
+            Self::debug_trace_attention_output(attn_out_buffer, layer_idx, position, head_dim);
+        }
+
+        // 2e. Store K and V in cache for future tokens
+        cache.append(layer_idx, k, v);
+
+        // 2f. Attention output projection
+        let mut attn_output = self.fused_matmul(attn_out_buffer, &layer.attn_output_weight)?;
+        if let Some(ref bias) = layer.attn_output_bias {
+            ops::add_bias(&mut attn_output, bias);
+        }
+
+        // 2g. Residual connection
+        for i in 0..hidden_dim {
+            hidden[i] += attn_output[i];
+        }
+
+        Ok(())
+    }
 }
diff --git a/crates/aprender-serve/src/gguf/inference/forward/forward_qwen3_moe.rs b/crates/aprender-serve/src/gguf/inference/forward/forward_qwen3_moe.rs
index 62bc0f33e..0f170176f 100644
--- a/crates/aprender-serve/src/gguf/inference/forward/forward_qwen3_moe.rs
+++ b/crates/aprender-serve/src/gguf/inference/forward/forward_qwen3_moe.rs
@@ -221,42 +221,28 @@ impl OwnedQuantizedModel {
                 hidden[i] += attn_output[i];
             }
 
-            // 2f. Pre-FFN norm
-            let ffn_input = if let Some(ref ffn_norm) = layer.ffn_norm_weight {
-                if use_rmsnorm {
-                    ops::rms_norm(&hidden, ffn_norm, self.config.eps)
-                } else {
-                    ops::layer_norm(
-                        &hidden,
-                        ffn_norm,
-                        layer.ffn_norm_bias.as_deref(),
-                        self.config.eps,
-                    )
-                }
-            } else {
-                hidden.clone()
-            };
-
-            // 2g. **MoE FFN** — the only piece that differs from the dense forward.
-            // Dispatch per-position through the M32c.2.2.2.0 single-layer kernel.
-            let mut ffn_output = vec![0.0f32; seq_len * hidden_dim];
+            // 2f+2g. Per-position MoE FFN block (norm + dispatch + residual).
+            // Lifted into `moe_ffn_layer` helper (M32d Day 1 prep refactor)
+            // so the upcoming `forward_single_qwen3_moe_with_cache` (M32d
+            // step 3) can call the SAME per-token MoE FFN implementation.
+            //
+            // Math is identical to the previous batched-norm + per-position
+            // dispatch + outer-residual version, provided rms_norm/layer_norm
+            // normalize each hidden_dim vector independently (no cross-token mixing).
+            let mut tok_hidden = vec![0.0f32; hidden_dim];
             for s in 0..seq_len {
-                let pos_in = &ffn_input[s * hidden_dim..(s + 1) * hidden_dim];
-                let pos_out = moe_ffn_forward_layer(
-                    pos_in,
+                tok_hidden.copy_from_slice(&hidden[s * hidden_dim..(s + 1) * hidden_dim]);
+                self.moe_ffn_layer(
+                    &mut tok_hidden,
+                    layer,
                     &moe_layers[layer_idx],
                     num_experts,
                     num_experts_per_tok,
                     intermediate,
-                    hidden_dim,
                     data,
+                    use_rmsnorm,
                 )?;
-                ffn_output[s * hidden_dim..(s + 1) * hidden_dim].copy_from_slice(&pos_out);
-            }
-
-            // Residual
-            for i in 0..hidden.len() {
-                hidden[i] += ffn_output[i];
+                hidden[s * hidden_dim..(s + 1) * hidden_dim].copy_from_slice(&tok_hidden);
             }
         }
 
@@ -282,4 +268,86 @@ impl OwnedQuantizedModel {
         }
         Ok(logits)
     }
+
+    /// Per-token MoE FFN block — M32d Day 1 prep refactor.
+    ///
+    /// Mirrors the dense path's per-layer FFN block but routes the
+    /// computation through `moe_ffn_forward_layer` (per-expert dispatch)
+    /// instead of the dense gate × up × SwiGLU × down sequence.
+    ///
+    /// Adapted from `forward_qwen3_moe`'s FFN block: the pre-FFN norm and the
+    /// residual add used to run over the whole `[seq_len * hidden_dim]`
+    /// buffer and now run on one token's vector. `forward_qwen3_moe` calls
+    /// this once per token in its `for s in 0..seq_len` loop. The upcoming
+    /// `forward_single_qwen3_moe_with_cache` (M32d step 3) calls this
+    /// exactly once per layer per generated token.
+    ///
+    /// Mutates `hidden_token` (length `hidden_dim`, debug-asserted) via the post-MoE residual add.
+    ///
+    /// # Arguments
+    /// * `hidden_token` - Single-token hidden state, `[hidden_dim]`; mutated
+    /// * `layer` - Dense layer struct (read for `ffn_norm_weight` + bias)
+    /// * `moe_layer` - Per-layer MoE expert tensor descriptors
+    /// * `num_experts`, `num_experts_per_tok`, `moe_intermediate` - MoE config
+    /// * `data` - Mmapped GGUF byte slice (borrowed by `moe_ffn_forward_layer`
+    ///   for in-place fused dequant+matvec on each selected expert)
+    /// * `use_rmsnorm` - True for RMSNorm models; false for LayerNorm
+    ///
+    /// # Errors
+    /// Propagates errors from `moe_ffn_forward_layer` (mismatched dims,
+    /// out-of-range expert, etc.).
+    #[allow(clippy::too_many_arguments)] // forwards the MoE config that `moe_ffn_forward_layer` takes
+    pub(crate) fn moe_ffn_layer(
+        &self,
+        hidden_token: &mut [f32],
+        layer: &crate::gguf::OwnedQuantizedLayer,
+        moe_layer: &Qwen3MoeQuantizedLayer,
+        num_experts: usize,
+        num_experts_per_tok: usize,
+        moe_intermediate: usize,
+        data: &[u8],
+        use_rmsnorm: bool,
+    ) -> Result<()> {
+        let hidden_dim = self.config.hidden_dim;
+        debug_assert_eq!(
+            hidden_token.len(),
+            hidden_dim,
+            "moe_ffn_layer expects per-token hidden of length hidden_dim"
+        );
+
+        // 2f. Pre-FFN norm — per-vector, math is identical whether applied
+        // batched or per-token (rms_norm/layer_norm only mix elements
+        // within a single hidden_dim vector, never across tokens).
+        let ffn_input: Vec<f32> = if let Some(ref ffn_norm) = layer.ffn_norm_weight {
+            if use_rmsnorm {
+                ops::rms_norm(hidden_token, ffn_norm, self.config.eps)
+            } else {
+                ops::layer_norm(
+                    hidden_token,
+                    ffn_norm,
+                    layer.ffn_norm_bias.as_deref(),
+                    self.config.eps,
+                )
+            }
+        } else {
+            hidden_token.to_vec() // no FFN norm weight: pass the hidden state through unnormalized
+        };
+
+        // 2g. MoE FFN — router + top-K experts + weighted sum.
+        let pos_out = moe_ffn_forward_layer(
+            &ffn_input,
+            moe_layer,
+            num_experts,
+            num_experts_per_tok,
+            moe_intermediate,
+            hidden_dim,
+            data,
+        )?;
+
+        // Residual add into the caller's hidden_token slice.
+        for i in 0..hidden_dim {
+            hidden_token[i] += pos_out[i];
+        }
+        Ok(())
+    }
 }