diff --git a/crates/aprender-serve/src/gguf/inference/forward/debug.rs b/crates/aprender-serve/src/gguf/inference/forward/debug.rs index d99775508..37d09fadc 100644 --- a/crates/aprender-serve/src/gguf/inference/forward/debug.rs +++ b/crates/aprender-serve/src/gguf/inference/forward/debug.rs @@ -474,119 +474,19 @@ impl OwnedQuantizedModel { // 2. Process through transformer layers for (layer_idx, layer) in self.layers.iter().enumerate() { - // 2a+2b. Fused attention layer norm + QKV projection - // For RMSNorm models: fuse norm + matmul to eliminate intermediate allocation - // For LayerNorm models: use separate operations (has bias) - let mut qkv = if use_rmsnorm { - self.fused_rmsnorm_qkv_matmul( - &hidden, - &layer.attn_norm_weight, - self.config.eps, - &layer.qkv_weight, - )? - } else { - let normed = ops::layer_norm( - &hidden, - &layer.attn_norm_weight, - layer.attn_norm_bias.as_deref(), - self.config.eps, - ); - self.qkv_matmul(&normed, &layer.qkv_weight)? - }; - - // PMAT-114: Trace QKV BEFORE bias (PMAT-260) - self.debug_trace_qkv(&qkv, layer_idx, hidden_dim); - - if let Some(ref bias) = layer.qkv_bias { - ops::add_bias(&mut qkv, bias); - } - - // 2c. 
Extract Q, K, V with GQA-aware sizes and apply RoPE - // Q: [hidden_dim] = [num_heads * head_dim] - // K: [kv_dim] = [num_kv_heads * head_dim] - // V: [kv_dim] = [num_kv_heads * head_dim] - // Optimization: apply RoPE in-place to avoid Q/K copies - let num_kv_heads = self.config.num_kv_heads; - // GH-479: Use config methods (Qwen3 head_dim != hidden/heads) - let head_dim = self.config.head_dim(); - let q_dim = self.config.q_dim(); - let kv_dim = self.config.kv_dim(); - - // PMAT-114: Trace QKV after bias for layer 0 (PMAT-260) - self.debug_trace_qkv_after_bias(&qkv, layer, layer_idx, hidden_dim); - - // GH-479: Per-head QK RMSNorm (Qwen3) — after bias, before RoPE - if let Some(ref q_norm) = layer.attn_q_norm_weight { - ops::apply_per_head_rms_norm( - &mut qkv[0..q_dim], - q_norm, - self.config.num_heads, - self.config.eps, - ); - } - if let Some(ref k_norm) = layer.attn_k_norm_weight { - ops::apply_per_head_rms_norm( - &mut qkv[q_dim..q_dim + kv_dim], - k_norm, - num_kv_heads, - self.config.eps, - ); - } - - // GH-278: Skip RoPE for models with learned position embeddings (GPT-2) - if self.config.constraints.uses_rope() { - self.apply_rope(&mut qkv[0..q_dim], position, self.config.num_heads); - self.apply_rope( - &mut qkv[q_dim..q_dim + kv_dim], - position, - num_kv_heads, - ); - } - - // Use slices to avoid copies (only copy K for cache storage) - // GH-479: Use q_dim (may differ from hidden_dim for Qwen3) - let q = &qkv[0..q_dim]; - let k = &qkv[q_dim..q_dim + kv_dim]; - let v = &qkv[q_dim + kv_dim..q_dim + 2 * kv_dim]; - - // 2d. 
Get cached K/V and compute attention with GQA support - let k_cache = cache.get_k(layer_idx); - let v_cache = cache.get_v(layer_idx); - - // Use pre-allocated attention output buffer (reused across layers) - if k_cache.is_empty() { - // First token - no cache yet, output is just weighted V - // With single query and single K/V, need to expand V for all Q heads - let q_per_kv = self.config.num_heads / num_kv_heads; - for q_head in 0..self.config.num_heads { - let kv_head = q_head / q_per_kv; - let v_start = kv_head * head_dim; - let out_start = q_head * head_dim; - attn_out_buffer[out_start..out_start + head_dim] - .copy_from_slice(&v[v_start..v_start + head_dim]); - } - } else { - // Use cached K/V for attention with GQA - // Uses pre-allocated buffer to avoid 704 Vec allocations per token - self.attention_with_cache_gqa_into(q, k_cache, v_cache, k, v, &mut attn_out_buffer); - - // CORRECTNESS-013: Debug CPU attention output (PMAT-260) - Self::debug_trace_attention_output(&attn_out_buffer, layer_idx, position, head_dim); - } - - // 2e. Store K and V in cache for future tokens - cache.append(layer_idx, k, v); - - // 2f. Attention output projection - let mut attn_output = self.fused_matmul(&attn_out_buffer, &layer.attn_output_weight)?; - if let Some(ref bias) = layer.attn_output_bias { - ops::add_bias(&mut attn_output, bias); - } - - // 2g. Residual connection - for i in 0..hidden_dim { - hidden[i] += attn_output[i]; - } + // 2a-2g. Attention sub-block (norm + QKV + RoPE + cache + attn + output proj + residual). + // Lifted into a helper (M32d Day 1 prep refactor) so the upcoming + // `forward_single_qwen3_moe_with_cache` can share the SAME attention + // implementation — only the FFN block differs between dense and MoE paths. + self.attention_layer_with_cache( + &mut hidden, + layer, + layer_idx, + cache, + position, + &mut attn_out_buffer, + use_rmsnorm, + )?; // 2h+2i. 
FFN with optional layer norm and SwiGLU/GELU activation let ffn_activated = self.single_cache_ffn_block(&hidden, layer_idx, use_rmsnorm)?; @@ -612,4 +512,161 @@ impl OwnedQuantizedModel { // Final output: norm + LM head + debug verification + bias self.single_cache_final_output(&hidden, position, use_rmsnorm) } + + /// Per-layer attention sub-block with KV cache — M32d Day 1 prep refactor. + /// + /// Lifted verbatim from `forward_single_with_cache`'s inner loop body + /// (steps 2a–2g: fused RMSNorm+QKV → bias → per-head QK norm → RoPE → + /// cache.get_k/get_v → attention → cache.append → output proj + + /// residual). The behavior is bit-identical to the previous inline + /// version; this method exists so the upcoming + /// `forward_single_qwen3_moe_with_cache` (M32d step 1) can call the + /// SAME attention implementation — only the FFN block differs between + /// dense and MoE forward paths. + /// + /// Mutates `hidden` (residual add of the attention output) and + /// `attn_out_buffer` (reused across layers). Calls `cache.append` + /// to store this token's K/V for future calls. Caller is responsible + /// for `cache.advance()` after all layers are processed. + /// + /// # Arguments + /// * `hidden` - Hidden state slice; elements `0..hidden_dim` receive the post-attention residual add + /// * `layer` - Quantized layer weights (qkv, attn_norm, attn_output, optional QK norms) + /// * `layer_idx` - Index used for cache get/append and debug traces + /// * `cache` - KV cache; read for past K/V, appended with this token's K/V + /// * `position` - Token position used by RoPE (0-indexed from prompt start) + /// * `attn_out_buffer` - Pre-allocated scratch for `[q_dim]` attention output (reused) + /// * `use_rmsnorm` - True for RMSNorm models (Llama/Qwen family); false for LayerNorm (GPT-2) + /// + /// # Errors + /// Propagates errors from `fused_rmsnorm_qkv_matmul`, `qkv_matmul`, + /// and `fused_matmul` (`attention_with_cache_gqa_into` is called without `?`). 
+ fn attention_layer_with_cache( + &self, + hidden: &mut [f32], + layer: &OwnedQuantizedLayer, + layer_idx: usize, + cache: &mut OwnedQuantizedKVCache, + position: usize, + attn_out_buffer: &mut [f32], + use_rmsnorm: bool, + ) -> Result<()> { + let hidden_dim = self.config.hidden_dim; + + // 2a+2b. Fused attention layer norm + QKV projection + // For RMSNorm models: fuse norm + matmul to eliminate intermediate allocation + // For LayerNorm models: use separate operations (has bias) + let mut qkv = if use_rmsnorm { + self.fused_rmsnorm_qkv_matmul( + hidden, + &layer.attn_norm_weight, + self.config.eps, + &layer.qkv_weight, + )? + } else { + let normed = ops::layer_norm( + hidden, + &layer.attn_norm_weight, + layer.attn_norm_bias.as_deref(), + self.config.eps, + ); + self.qkv_matmul(&normed, &layer.qkv_weight)? + }; + + // PMAT-114: Trace QKV BEFORE bias (PMAT-260) + self.debug_trace_qkv(&qkv, layer_idx, hidden_dim); + + if let Some(ref bias) = layer.qkv_bias { + ops::add_bias(&mut qkv, bias); + } + + // 2c. 
Extract Q, K, V with GQA-aware sizes and apply RoPE + // Q: [q_dim] = [num_heads * head_dim] + // K: [kv_dim] = [num_kv_heads * head_dim] + // V: [kv_dim] = [num_kv_heads * head_dim] + // Optimization: apply RoPE in-place to avoid Q/K copies + let num_kv_heads = self.config.num_kv_heads; + // GH-479: Use config methods (Qwen3 head_dim != hidden/heads) + let head_dim = self.config.head_dim(); + let q_dim = self.config.q_dim(); + let kv_dim = self.config.kv_dim(); + + // PMAT-114: Trace QKV after bias for layer 0 (PMAT-260) + self.debug_trace_qkv_after_bias(&qkv, layer, layer_idx, hidden_dim); + + // GH-479: Per-head QK RMSNorm (Qwen3) — after bias, before RoPE + if let Some(ref q_norm) = layer.attn_q_norm_weight { + ops::apply_per_head_rms_norm( + &mut qkv[0..q_dim], + q_norm, + self.config.num_heads, + self.config.eps, + ); + } + if let Some(ref k_norm) = layer.attn_k_norm_weight { + ops::apply_per_head_rms_norm( + &mut qkv[q_dim..q_dim + kv_dim], + k_norm, + num_kv_heads, + self.config.eps, + ); + } + + // GH-278: Skip RoPE for models with learned position embeddings (GPT-2) + if self.config.constraints.uses_rope() { + self.apply_rope(&mut qkv[0..q_dim], position, self.config.num_heads); + self.apply_rope( + &mut qkv[q_dim..q_dim + kv_dim], + position, + num_kv_heads, + ); + } + + // Use slices to avoid copies (only copy K for cache storage) + // GH-479: Use q_dim (may differ from hidden_dim for Qwen3) + let q = &qkv[0..q_dim]; + let k = &qkv[q_dim..q_dim + kv_dim]; + let v = &qkv[q_dim + kv_dim..q_dim + 2 * kv_dim]; + + // 2d. 
Get cached K/V and compute attention with GQA support + let k_cache = cache.get_k(layer_idx); + let v_cache = cache.get_v(layer_idx); + + // Use pre-allocated attention output buffer (reused across layers) + if k_cache.is_empty() { + // First token - no cache yet, output is just weighted V + // With single query and single K/V, need to expand V for all Q heads + let q_per_kv = self.config.num_heads / num_kv_heads; + for q_head in 0..self.config.num_heads { + let kv_head = q_head / q_per_kv; + let v_start = kv_head * head_dim; + let out_start = q_head * head_dim; + attn_out_buffer[out_start..out_start + head_dim] + .copy_from_slice(&v[v_start..v_start + head_dim]); + } + } else { + // Use cached K/V for attention with GQA + // Uses pre-allocated buffer to avoid 704 Vec allocations per token + self.attention_with_cache_gqa_into(q, k_cache, v_cache, k, v, attn_out_buffer); + + // CORRECTNESS-013: Debug CPU attention output (PMAT-260) + Self::debug_trace_attention_output(attn_out_buffer, layer_idx, position, head_dim); + } + + // 2e. Store K and V in cache for future tokens + cache.append(layer_idx, k, v); + + // 2f. Attention output projection + let mut attn_output = self.fused_matmul(attn_out_buffer, &layer.attn_output_weight)?; + if let Some(ref bias) = layer.attn_output_bias { + ops::add_bias(&mut attn_output, bias); + } + + // 2g. Residual connection + for i in 0..hidden_dim { + hidden[i] += attn_output[i]; + } + + Ok(()) + } } diff --git a/crates/aprender-serve/src/gguf/inference/forward/forward_qwen3_moe.rs b/crates/aprender-serve/src/gguf/inference/forward/forward_qwen3_moe.rs index 62bc0f33e..0f170176f 100644 --- a/crates/aprender-serve/src/gguf/inference/forward/forward_qwen3_moe.rs +++ b/crates/aprender-serve/src/gguf/inference/forward/forward_qwen3_moe.rs @@ -221,42 +221,28 @@ impl OwnedQuantizedModel { hidden[i] += attn_output[i]; } - // 2f. 
Pre-FFN norm - let ffn_input = if let Some(ref ffn_norm) = layer.ffn_norm_weight { - if use_rmsnorm { - ops::rms_norm(&hidden, ffn_norm, self.config.eps) - } else { - ops::layer_norm( - &hidden, - ffn_norm, - layer.ffn_norm_bias.as_deref(), - self.config.eps, - ) - } - } else { - hidden.clone() - }; - - // 2g. **MoE FFN** — the only piece that differs from the dense forward. - // Dispatch per-position through the M32c.2.2.2.0 single-layer kernel. - let mut ffn_output = vec![0.0f32; seq_len * hidden_dim]; + // 2f+2g. Per-position MoE FFN block (norm + dispatch + residual). + // Lifted into `moe_ffn_layer` helper (M32d Day 1 prep refactor) + // so the upcoming `forward_single_qwen3_moe_with_cache` (M32d + // step 3) can call the SAME per-token MoE FFN implementation. + // + // Math is byte-identical to the previous batch-norm + per-position + // dispatch + outer residual version because rms_norm/layer_norm + // are per-vector ops (no cross-token interaction). + let mut tok_hidden = vec![0.0f32; hidden_dim]; for s in 0..seq_len { - let pos_in = &ffn_input[s * hidden_dim..(s + 1) * hidden_dim]; - let pos_out = moe_ffn_forward_layer( - pos_in, + tok_hidden.copy_from_slice(&hidden[s * hidden_dim..(s + 1) * hidden_dim]); + self.moe_ffn_layer( + &mut tok_hidden, + layer, &moe_layers[layer_idx], num_experts, num_experts_per_tok, intermediate, - hidden_dim, data, + use_rmsnorm, )?; - ffn_output[s * hidden_dim..(s + 1) * hidden_dim].copy_from_slice(&pos_out); - } - - // Residual - for i in 0..hidden.len() { - hidden[i] += ffn_output[i]; + hidden[s * hidden_dim..(s + 1) * hidden_dim].copy_from_slice(&tok_hidden); } } @@ -282,4 +268,86 @@ impl OwnedQuantizedModel { } Ok(logits) } + + /// Per-token MoE FFN block — M32d Day 1 prep refactor. + /// + /// Mirrors the dense path's per-layer FFN block but routes the + /// computation through `moe_ffn_forward_layer` (per-expert dispatch) + /// instead of the dense gate × up × SwiGLU × down sequence. 
+ /// + /// The body is adapted from `forward_qwen3_moe`'s per-position + /// FFN block (formerly at the bottom of the per-layer loop). The + /// outer `forward_qwen3_moe` calls this once per token in its + /// `for s in 0..seq_len` loop. The upcoming + /// `forward_single_qwen3_moe_with_cache` (M32d step 3) calls this + /// exactly once per layer per generated token. + /// + /// Mutates `hidden_token` via the post-MoE residual add. + /// + /// # Arguments + /// * `hidden_token` - Single-token hidden state, `[hidden_dim]`; mutated + /// * `layer` - Dense layer struct (read for `ffn_norm_weight` + bias) + /// * `moe_layer` - Per-layer MoE expert tensor descriptors + /// * `num_experts`, `num_experts_per_tok`, `moe_intermediate` - MoE config + /// * `data` - Mmapped GGUF byte slice (borrowed by `moe_ffn_forward_layer` + /// for in-place fused dequant+matvec on each selected expert) + /// * `use_rmsnorm` - True for RMSNorm models; false for LayerNorm + /// + /// # Errors + /// Propagates errors from `moe_ffn_forward_layer` (mismatched dims, + /// out-of-range expert, etc.). + #[allow(clippy::too_many_arguments)] + pub(crate) fn moe_ffn_layer( + &self, + hidden_token: &mut [f32], + layer: &crate::gguf::OwnedQuantizedLayer, + moe_layer: &Qwen3MoeQuantizedLayer, + num_experts: usize, + num_experts_per_tok: usize, + moe_intermediate: usize, + data: &[u8], + use_rmsnorm: bool, + ) -> Result<()> { + let hidden_dim = self.config.hidden_dim; + debug_assert_eq!( + hidden_token.len(), + hidden_dim, + "moe_ffn_layer expects per-token hidden of length hidden_dim" + ); + + // 2f. Pre-FFN norm — per-vector, math is identical whether applied + // batched or per-token (rms_norm/layer_norm only mix elements + // within a single hidden_dim vector, never across tokens). 
+ let ffn_input: Vec<f32> = if let Some(ref ffn_norm) = layer.ffn_norm_weight { + if use_rmsnorm { + ops::rms_norm(hidden_token, ffn_norm, self.config.eps) + } else { + ops::layer_norm( + hidden_token, + ffn_norm, + layer.ffn_norm_bias.as_deref(), + self.config.eps, + ) + } + } else { + hidden_token.to_vec() + }; + + // 2g. MoE FFN — router + top-K experts + weighted sum. + let pos_out = moe_ffn_forward_layer( + &ffn_input, + moe_layer, + num_experts, + num_experts_per_tok, + moe_intermediate, + hidden_dim, + data, + )?; + + // Residual add into the caller's hidden_token slice. + for i in 0..hidden_dim { + hidden_token[i] += pos_out[i]; + } + Ok(()) + } }