From 61e916c2bdb88498637a4f0d91b4eee63d071973 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Tue, 12 May 2026 16:08:03 +0800 Subject: [PATCH 1/8] MTP: clean-up --- common/arg.cpp | 27 ++- common/download.cpp | 55 ++++-- common/download.h | 7 +- common/speculative.cpp | 2 +- convert_hf_to_gguf.py | 78 +++++++- include/llama.h | 11 ++ src/llama-arch.cpp | 2 - src/llama-arch.h | 2 - src/llama-context.cpp | 16 +- src/llama-context.h | 2 + src/llama-graph.h | 1 + src/llama-memory.h | 3 + src/llama-model.cpp | 39 ++-- src/models/models.h | 30 +--- src/models/qwen35-mtp.cpp | 207 --------------------- src/models/qwen35.cpp | 254 +++++++++++++++++++++----- src/models/qwen35moe-mtp.cpp | 252 -------------------------- src/models/qwen35moe.cpp | 306 +++++++++++++++++++++++++++----- tests/test-llama-archs.cpp | 6 +- tools/server/server-context.cpp | 41 ++--- 20 files changed, 704 insertions(+), 637 deletions(-) delete mode 100644 src/models/qwen35-mtp.cpp delete mode 100644 src/models/qwen35moe-mtp.cpp diff --git a/common/arg.cpp b/common/arg.cpp index 9fefe411ee2..e3334ab4ab6 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -335,11 +335,15 @@ static bool common_params_handle_remote_preset(common_params & params, llama_exa struct handle_model_result { bool found_mmproj = false; common_params_model mmproj; + + bool found_mtp = false; + common_params_model mtp; }; static handle_model_result common_params_handle_model(struct common_params_model & model, const std::string & bearer_token, - bool offline) { + bool offline, + bool search_mtp = false) { handle_model_result result; if (!model.docker_repo.empty()) { @@ -354,7 +358,7 @@ static handle_model_result common_params_handle_model(struct common_params_model common_download_opts opts; opts.bearer_token = bearer_token; opts.offline = offline; - auto download_result = common_download_model(model, opts, true); + auto download_result = common_download_model(model, opts, true, search_mtp); if (download_result.model_path.empty()) { LOG_ERR("error: failed to download model from Hugging Face\n"); @@ -368,6 +372,11 @@ static handle_model_result common_params_handle_model(struct common_params_model result.found_mmproj = true; result.mmproj.path = download_result.mmproj_path; } + + if (!download_result.mtp_path.empty()) { + result.found_mtp = true; + result.mtp.path = download_result.mtp_path; + } } else if (!model.url.empty()) { if (model.path.empty()) { auto f = string_split(model.url, '#').front(); @@ -588,7 +597,11 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context // handle model and download if (!skip_model_download) { - auto res = common_params_handle_model(params.model, params.hf_token, params.offline); + const bool spec_type_mtp = std::find(params.speculative.types.begin(), + params.speculative.types.end(), + COMMON_SPECULATIVE_TYPE_MTP) != params.speculative.types.end(); + + auto res = common_params_handle_model(params.model, params.hf_token, params.offline, spec_type_mtp); if (params.no_mmproj) { params.mmproj = {}; } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) { @@ -602,6 +615,14 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context break; } } + // when --spec-type mtp is set and no draft model was provided explicitly, + // fall back to the MTP head discovered alongside the -hf model + if (spec_type_mtp && res.found_mtp && + params.speculative.draft.mparams.path.empty() && + params.speculative.draft.mparams.hf_repo.empty() && + 
params.speculative.draft.mparams.url.empty()) { + params.speculative.draft.mparams.path = res.mtp.path; + } common_params_handle_model(params.speculative.draft.mparams, params.hf_token, params.offline); common_params_handle_model(params.vocoder.model, params.hf_token, params.offline); } diff --git a/common/download.cpp b/common/download.cpp index d6d47b2d2fc..71d8dfda00d 100644 --- a/common/download.cpp +++ b/common/download.cpp @@ -566,8 +566,11 @@ static hf_cache::hf_files get_split_files(const hf_cache::hf_files & files, return result; } -static hf_cache::hf_file find_best_mmproj(const hf_cache::hf_files & files, - const std::string & model) { +// pick the best sibling GGUF whose filename contains `keyword` (e.g. "mmproj" / "MTP"), +// preferring deeper shared directory prefix with the model, then closest quantization +static hf_cache::hf_file find_best_sibling(const hf_cache::hf_files & files, + const std::string & model, + const std::string & keyword) { hf_cache::hf_file best; size_t best_depth = 0; int best_diff = 0; @@ -579,20 +582,20 @@ static hf_cache::hf_file find_best_mmproj(const hf_cache::hf_files & files, for (const auto & f : files) { if (!string_ends_with(f.path, ".gguf") || - f.path.find("mmproj") == std::string::npos) { + f.path.find(keyword) == std::string::npos) { continue; } - auto mmproj_parts = string_split(f.path, '/'); - auto mmproj_dir = mmproj_parts.end() - 1; + auto sib_parts = string_split(f.path, '/'); + auto sib_dir = sib_parts.end() - 1; auto [_, dir] = std::mismatch(model_parts.begin(), model_dir, - mmproj_parts.begin(), mmproj_dir); - if (dir != mmproj_dir) { + sib_parts.begin(), sib_dir); + if (dir != sib_dir) { continue; } - size_t depth = dir - mmproj_parts.begin(); + size_t depth = dir - sib_parts.begin(); auto bits = extract_quant_bits(f.path); auto diff = std::abs(bits - model_bits); @@ -606,6 +609,16 @@ static hf_cache::hf_file find_best_mmproj(const hf_cache::hf_files & files, return best; } +static hf_cache::hf_file find_best_mmproj(const hf_cache::hf_files & files, + const std::string & model) { + return find_best_sibling(files, model, "mmproj"); +} + +static hf_cache::hf_file find_best_mtp(const hf_cache::hf_files & files, + const std::string & model) { + return find_best_sibling(files, model, "MTP"); +} + static bool gguf_filename_is_model(const std::string & filepath) { if (!string_ends_with(filepath, ".gguf")) { return false; @@ -617,7 +630,8 @@ static bool gguf_filename_is_model(const std::string & filepath) { } return filename.find("mmproj") == std::string::npos && - filename.find("imatrix") == std::string::npos; + filename.find("imatrix") == std::string::npos && + filename.find("MTP") == std::string::npos; } static hf_cache::hf_file find_best_model(const hf_cache::hf_files & files, @@ -673,11 +687,13 @@ struct hf_plan { hf_cache::hf_file primary; hf_cache::hf_files model_files; hf_cache::hf_file mmproj; + hf_cache::hf_file mtp; }; static hf_plan get_hf_plan(const common_params_model & model, const common_download_opts & opts, - bool download_mmproj) { + bool download_mmproj, + bool download_mtp) { hf_plan plan; hf_cache::hf_files all; @@ -723,6 +739,10 @@ static hf_plan get_hf_plan(const common_params_model & model, plan.mmproj = find_best_mmproj(all, primary.path); } + if (download_mtp) { + plan.mtp = find_best_mtp(all, primary.path); + } + return plan; } @@ -756,7 +776,8 @@ static std::vector get_url_tasks(const common_params_model & mode common_download_model_result common_download_model(const common_params_model & model, const 
common_download_opts & opts, - bool download_mmproj) { + bool download_mmproj, + bool download_mtp) { common_download_model_result result; std::vector tasks; hf_plan hf; @@ -764,13 +785,16 @@ common_download_model_result common_download_model(const common_params_model & bool is_hf = !model.hf_repo.empty(); if (is_hf) { - hf = get_hf_plan(model, opts, download_mmproj); + hf = get_hf_plan(model, opts, download_mmproj, download_mtp); for (const auto & f : hf.model_files) { tasks.push_back({f.url, f.local_path}); } if (!hf.mmproj.path.empty()) { tasks.push_back({hf.mmproj.url, hf.mmproj.local_path}); } + if (!hf.mtp.path.empty()) { + tasks.push_back({hf.mtp.url, hf.mtp.local_path}); + } } else if (!model.url.empty()) { tasks = get_url_tasks(model); } else { @@ -807,6 +831,10 @@ common_download_model_result common_download_model(const common_params_model & if (!hf.mmproj.path.empty()) { result.mmproj_path = hf_cache::finalize_file(hf.mmproj); } + + if (!hf.mtp.path.empty()) { + result.mtp_path = hf_cache::finalize_file(hf.mtp); + } } else { result.model_path = model.path; } @@ -946,7 +974,8 @@ std::vector common_list_cached_models() { for (const auto & f : files) { auto split = get_gguf_split_info(f.path); if (split.index != 1 || split.tag.empty() || - split.prefix.find("mmproj") != std::string::npos) { + split.prefix.find("mmproj") != std::string::npos || + split.prefix.find("MTP") != std::string::npos) { continue; } if (seen.insert(f.repo_id + ":" + split.tag).second) { diff --git a/common/download.h b/common/download.h index edc3e9f1a71..4a169ef7796 100644 --- a/common/download.h +++ b/common/download.h @@ -59,6 +59,7 @@ struct common_download_opts { struct common_download_model_result { std::string model_path; std::string mmproj_path; + std::string mtp_path; }; // Download model from HuggingFace repo or URL @@ -83,12 +84,14 @@ struct common_download_model_result { // when opts.offline=true, no network requests are made // when download_mmproj=true, searches for mmproj in same directory as model or any parent directory // then with the closest quantization bits +// when download_mtp=true, applies the same sibling search for an MTP-head GGUF // -// returns result with model_path and mmproj_path (empty on failure) +// returns result with model_path, mmproj_path and mtp_path (empty when not found / on failure) common_download_model_result common_download_model( const common_params_model & model, const common_download_opts & opts = {}, - bool download_mmproj = false + bool download_mmproj = false, + bool download_mtp = false ); // returns list of cached models diff --git a/common/speculative.cpp b/common/speculative.cpp index 01b826db344..5135c48821d 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -1198,7 +1198,7 @@ common_speculative * common_speculative_init(common_params_speculative & params, LOG_WRN("%s: draft model is not specified - cannot use 'draft' type\n", __func__); has_draft = false; } - } else if (has_draft_model) { + } else if (has_draft_model && !has_mtp && !has_draft_eagle3) { LOG_WRN("%s: draft model is specified but 'draft' speculative type is not explicitly enabled - enabling it\n", __func__); has_draft = true; } diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index f0f9d2545b5..2e71b07466a 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -5559,17 +5559,59 @@ class _Qwen35MtpMixin: gguf_writer: gguf.GGUFWriter block_count: int tensor_map: gguf.TensorNameMap + fname_out: Path + ftype: Any + metadata: Any + + # When true, `--mtp` 
was passed: filter out trunk weights so the resulting + # GGUF carries only the MTP head and the shared embeddings/output tensors. + mtp_only: bool = False + + # When true, `--no-mtp` was passed: drop `mtp.*` tensors and report block_count + # as the trunk-only layer count, producing a GGUF with no MTP head. + no_mtp: bool = False def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.block_count = self.hparams["num_hidden_layers"] + self.hparams.get("mtp_num_hidden_layers", 0) + self.block_count = self.hparams["num_hidden_layers"] + if not self.no_mtp: + self.block_count += self.hparams.get("mtp_num_hidden_layers", 0) self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) + @classmethod + def filter_tensors(cls, item): + name, _ = item + if name.startswith("mtp."): + # Qwen3Next drops `mtp.*` tensors; Qwen3.5/3.6 use them by default. `--no-mtp` opts out. + if cls.no_mtp: + return None + return item + return super().filter_tensors(item) # ty: ignore[unresolved-attribute] + def set_gguf_parameters(self): super().set_gguf_parameters() # ty: ignore[unresolved-attribute] + if self.no_mtp: + return if (n := self.hparams.get("mtp_num_hidden_layers", 0)) > 0: self.gguf_writer.add_nextn_predict_layers(n) + def prepare_metadata(self, vocab_only: bool): + super().prepare_metadata(vocab_only=vocab_only) # ty: ignore[unresolved-attribute] + + if not self.mtp_only: + return + + output_type: str = self.ftype.name.partition("_")[2] + + if self.fname_out.is_dir(): + fname_default: str = gguf.naming_convention( + self.metadata.name, self.metadata.basename, self.metadata.finetune, + self.metadata.version, size_label=None, output_type=output_type, model_type=None) + self.fname_out = self.fname_out / f"{Path(fname_default).stem}-MTP.gguf" + else: + stem = self.fname_out.stem + self.fname_out = self.fname_out.parent / f"{stem}-MTP{self.fname_out.suffix}" + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: # Multimodal Qwen3.5/3.6 wrap the text model under `model.language_model.*`. if name.startswith("model.language_model."): @@ -5577,6 +5619,17 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter elif name.startswith("language_model."): name = name[len("language_model."):] + if self.mtp_only: + # In --mtp mode keep only the MTP block plus the shared embedding/output tensors + # that the standalone MTP graph references at inference time. + keep = ( + name.startswith("mtp.") or + name in ("model.embed_tokens.weight", "model.norm.weight", "lm_head.weight") or + name in ("embed_tokens.weight", "norm.weight") + ) + if not keep: + return + # Remap MTP block tensors to llama.cpp's layer-indexed nextn naming. # HF: mtp.layers.0.* (transformer block at MTP slot 0) # mtp.fc / mtp.pre_fc_norm_embedding / mtp.pre_fc_norm_hidden / mtp.norm @@ -14034,6 +14087,14 @@ def parse_args() -> argparse.Namespace: "--mmproj", action="store_true", help="(Experimental) Export multimodal projector (mmproj) for vision models. This will only work on some vision models. A prefix 'mmproj-' will be added to the output file name.", ) + parser.add_argument( + "--mtp", action="store_true", + help="(Experimental) Export only the multi-token prediction (MTP) head as a separate GGUF, suitable for use as a speculative draft. 
Output file name will get a '-MTP' suffix.", + ) + parser.add_argument( + "--no-mtp", action="store_true", + help="(Experimental) Exclude the multi-token prediction (MTP) head from the converted GGUF. Pair with --mtp on a second run to publish trunk and MTP as two files. Note: the split form duplicates embeddings, so the bundled default is more space-efficient overall.", + ) parser.add_argument( "--mistral-format", action="store_true", help="Whether the model is stored following the Mistral format.", @@ -14193,6 +14254,18 @@ def main() -> None: else: model_class = MistralModel + if args.mtp and args.no_mtp: + logger.error("--mtp and --no-mtp are mutually exclusive") + sys.exit(1) + + if (args.mtp or args.no_mtp) and not issubclass(model_class, _Qwen35MtpMixin): + logger.error("--mtp / --no-mtp are only supported for Qwen3.5/3.6 text variants today") + sys.exit(1) + + # set on the class so __init__ sees the correct mode when computing block_count + if args.no_mtp: + model_class.no_mtp = True + model_instance = model_class(dir_model, output_type, fname_out, is_big_endian=args.bigendian, use_temp_file=args.use_temp_file, eager=args.no_lazy, @@ -14205,6 +14278,9 @@ def main() -> None: fuse_gate_up_exps=args.fuse_gate_up_exps ) + if args.mtp: + model_instance.mtp_only = True + if args.vocab_only: logger.info("Exporting model vocab...") model_instance.write_vocab() diff --git a/include/llama.h b/include/llama.h index 1b896944735..bb34b47102a 100644 --- a/include/llama.h +++ b/include/llama.h @@ -198,6 +198,13 @@ extern "C" { LLAMA_SPLIT_MODE_TENSOR = 3, }; + enum llama_graph_type { + LLAMA_GRAPH_TYPE_DEFAULT = 0, + LLAMA_GRAPH_TYPE_ENCODER = 1, + LLAMA_GRAPH_TYPE_DECODER = 2, + LLAMA_GRAPH_TYPE_DECODER_MTP = 3, + }; + // TODO: simplify (https://github.com/ggml-org/llama.cpp/pull/9294#pullrequestreview-2286561979) typedef struct llama_token_data { llama_token id; // token id @@ -383,6 +390,8 @@ extern "C" { // note: the samplers must be sampler chains (i.e. 
use llama_sampler_chain_init) struct llama_sampler_seq_config * samplers; size_t n_samplers; + + enum llama_graph_type graph_type; }; struct llama_model_tensor_override { @@ -557,6 +566,8 @@ extern "C" { LLAMA_API int32_t llama_model_n_head_kv (const struct llama_model * model); LLAMA_API int32_t llama_model_n_swa (const struct llama_model * model); + LLAMA_API bool llama_model_has_mtp (const struct llama_model * model); + // Get the model's RoPE frequency scaling factor LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model); diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 794666d09a4..ab4334da79b 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -41,8 +41,6 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_QWEN3VLMOE, "qwen3vlmoe" }, { LLM_ARCH_QWEN35, "qwen35" }, { LLM_ARCH_QWEN35MOE, "qwen35moe" }, - { LLM_ARCH_QWEN35_MTP, "qwen35_mtp" }, - { LLM_ARCH_QWEN35MOE_MTP, "qwen35moe_mtp" }, { LLM_ARCH_PHI2, "phi2" }, { LLM_ARCH_PHI3, "phi3" }, { LLM_ARCH_PHIMOE, "phimoe" }, diff --git a/src/llama-arch.h b/src/llama-arch.h index 71c2ca6e6b3..e37d548c98e 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -45,8 +45,6 @@ enum llm_arch { LLM_ARCH_QWEN3VLMOE, LLM_ARCH_QWEN35, LLM_ARCH_QWEN35MOE, - LLM_ARCH_QWEN35_MTP, - LLM_ARCH_QWEN35MOE_MTP, LLM_ARCH_PHI2, LLM_ARCH_PHI3, LLM_ARCH_PHIMOE, diff --git a/src/llama-context.cpp b/src/llama-context.cpp index aea8a0a4e81..47ada0b75a2 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -66,6 +66,14 @@ llama_context::llama_context( cparams.cb_eval = params.cb_eval; cparams.cb_eval_user_data = params.cb_eval_user_data; + switch (params.graph_type) { + case LLAMA_GRAPH_TYPE_DEFAULT: gtype = LLM_GRAPH_TYPE_DEFAULT; break; + case LLAMA_GRAPH_TYPE_ENCODER: gtype = LLM_GRAPH_TYPE_ENCODER; break; + case LLAMA_GRAPH_TYPE_DECODER: gtype = LLM_GRAPH_TYPE_DECODER; break; + case LLAMA_GRAPH_TYPE_DECODER_MTP: gtype = LLM_GRAPH_TYPE_DECODER_MTP; break; + default: throw std::runtime_error("invalid graph_type"); + } + // Initialize backend samplers here so they are part of the sampling graph // before the reserve passes run later in this function. This avoids a later // re-reserve when graph nodes change. 
@@ -279,6 +287,7 @@ llama_context::llama_context( /*.type_k =*/ params.type_k, /*.type_v =*/ params.type_v, /*.swa_full =*/ params.swa_full, + /*.gtype =*/ gtype, }; memory.reset(model.create_memory(params_mem, cparams)); @@ -1738,7 +1747,7 @@ int llama_context::decode(const llama_batch & batch_inp) { } ggml_status status; - const auto * res = process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, mctx.get(), status); + const auto * res = process_ubatch(ubatch, gtype, mctx.get(), status); if (!res) { // the last ubatch failed or was aborted -> remove all positions of that ubatch from the memory module @@ -2198,7 +2207,7 @@ ggml_cgraph * llama_context::graph_reserve( auto * res = gf_res_reserve.get(); - const auto gparams = graph_params(res, ubatch, mctx, LLM_GRAPH_TYPE_DEFAULT); + const auto gparams = graph_params(res, ubatch, mctx, gtype); res->reset(); @@ -3177,7 +3186,7 @@ void llama_context::opt_epoch_iter( auto * res = gf_res_prev.get(); - const auto gparams = graph_params(res, ubatch, mctx.get(), LLM_GRAPH_TYPE_DEFAULT); + const auto gparams = graph_params(res, ubatch, mctx.get(), gtype); res->reset(); @@ -3306,6 +3315,7 @@ llama_context_params llama_context_default_params() { /*.kv_unified =*/ false, /*.sampler =*/ nullptr, /*.n_sampler =*/ 0, + /*.graph_type =*/ LLAMA_GRAPH_TYPE_DEFAULT, }; return result; diff --git a/src/llama-context.h b/src/llama-context.h index e16ac4c618b..5d9efdf242a 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -268,6 +268,8 @@ struct llama_context { llama_cparams cparams; + llm_graph_type gtype = LLM_GRAPH_TYPE_DECODER; + llama_adapter_cvec_ptr cvec; llama_adapter_loras_ptr loras; diff --git a/src/llama-graph.h b/src/llama-graph.h index d3cd69a674c..9e55d0a675e 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -32,6 +32,7 @@ enum llm_graph_type { LLM_GRAPH_TYPE_DEFAULT, LLM_GRAPH_TYPE_ENCODER, LLM_GRAPH_TYPE_DECODER, + LLM_GRAPH_TYPE_DECODER_MTP, }; enum llm_ffn_op_type { diff --git a/src/llama-memory.h b/src/llama-memory.h index 4a157b91fdb..2875d614315 100644 --- a/src/llama-memory.h +++ b/src/llama-memory.h @@ -1,6 +1,7 @@ #pragma once #include "llama.h" +#include "llama-graph.h" #include #include @@ -20,6 +21,8 @@ struct llama_memory_params { // use full-size SWA cache bool swa_full; + + llm_graph_type gtype; }; enum llama_memory_status { diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 6abfbfb3e3b..e4b891ea96e 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -276,10 +276,6 @@ static llama_model * llama_model_mapping(llm_arch arch, const llama_model_params return new llama_model_qwen35(params); case LLM_ARCH_QWEN35MOE: return new llama_model_qwen35moe(params); - case LLM_ARCH_QWEN35_MTP: - return new llama_model_qwen35_mtp(params); - case LLM_ARCH_QWEN35MOE_MTP: - return new llama_model_qwen35moe_mtp(params); case LLM_ARCH_MISTRAL3: return new llama_model_mistral3(params); case LLM_ARCH_MIMO2: @@ -1409,8 +1405,7 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) { } } - const bool partial_load = (arch == LLM_ARCH_QWEN35_MTP || arch == LLM_ARCH_QWEN35MOE_MTP); - ml.done_getting_tensors(partial_load); + ml.done_getting_tensors(); // populate tensors_by_name for (auto & [_, ctx_ptr] : ml.ctx_map) { @@ -1948,6 +1943,12 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, // checks default: { + // The MTP head is dense-attention only on hybrid Qwen3.5/3.6, so use a plain + // attention KV cache for the MTP context instead of the hybrid wrapper. 
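+                // Example (hypothetical layer counts): with n_layer = 25 and
+                // nextn_predict_layers = 1, n_main = 24, so the trunk context keeps the
+                // hybrid cache for layers 0..23 (split attention/recurrent via
+                // is_recurrent()), while a context created with
+                // LLAMA_GRAPH_TYPE_DECODER_MTP allocates a plain unified cache filtered
+                // to layer 24 only (see the filter lambdas below).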
+ const bool mtp_on_hybrid_qwen35 = + params.gtype == LLM_GRAPH_TYPE_DECODER_MTP && + (arch == LLM_ARCH_QWEN35 || arch == LLM_ARCH_QWEN35MOE); + if (llm_arch_is_recurrent(arch)) { res = new llama_memory_recurrent( *this, @@ -1957,7 +1958,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, std::max((uint32_t) 1, cparams.n_seq_max), cparams.n_seq_max, nullptr); - } else if (llm_arch_is_hybrid(arch)) { + } else if (llm_arch_is_hybrid(arch) && !mtp_on_hybrid_qwen35) { // The main difference between hybrid architectures is the // layer filters, so pick the right one here llama_memory_hybrid::layer_filter_cb filter_attn = nullptr; @@ -1972,6 +1973,14 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, filter_recr = [&](int32_t il) { return hparams.is_recurrent(il) && hparams.n_ff(il) == 0; }; + } else if (arch == LLM_ARCH_QWEN35 || arch == LLM_ARCH_QWEN35MOE) { + const uint32_t n_main = hparams.n_layer - hparams.nextn_predict_layers; + filter_attn = [&, n_main](int32_t il) { + return (uint32_t)il < n_main && !hparams.is_recurrent(il); + }; + filter_recr = [&, n_main](int32_t il) { + return (uint32_t)il < n_main && hparams.is_recurrent(il); + }; } if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) { @@ -2014,6 +2023,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, } } else { llama_memory_i::layer_reuse_cb reuse = nullptr; + llama_kv_cache::layer_filter_cb filter = nullptr; if (arch == LLM_ARCH_GEMMA3N || arch == LLM_ARCH_GEMMA4) { reuse = [&](int32_t il) { @@ -2025,6 +2035,11 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, }; } + if (mtp_on_hybrid_qwen35) { + const uint32_t n_main = hparams.n_layer - hparams.nextn_predict_layers; + filter = [n_main](int32_t il) { return (uint32_t)il >= n_main; }; + } + if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) { GGML_ASSERT(hparams.is_swa_any()); @@ -2040,7 +2055,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, cparams.n_seq_max, cparams.n_ubatch, 1, - nullptr, + filter, reuse); } else { GGML_ASSERT(!hparams.is_swa_any()); @@ -2057,7 +2072,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, 1, hparams.n_swa, hparams.swa_type, - nullptr, + filter, nullptr); } } @@ -2161,6 +2176,10 @@ int32_t llama_model_n_swa(const llama_model * model) { return model->hparams.n_swa; } +bool llama_model_has_mtp(const llama_model * model) { + return model->hparams.nextn_predict_layers > 0; +} + uint32_t llama_model_n_cls_out(const struct llama_model * model) { return model->hparams.n_cls_out; } @@ -2328,8 +2347,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_QWEN3VLMOE: case LLM_ARCH_QWEN35: case LLM_ARCH_QWEN35MOE: - case LLM_ARCH_QWEN35_MTP: - case LLM_ARCH_QWEN35MOE_MTP: return LLAMA_ROPE_TYPE_IMROPE; case LLM_ARCH_GLM4: diff --git a/src/models/models.h b/src/models/models.h index 1f04d313d13..fe95b9b89ad 100644 --- a/src/models/models.h +++ b/src/models/models.h @@ -1739,6 +1739,10 @@ struct llama_model_qwen35 : public llama_model_base { const llama_model & model; }; + struct graph_mtp : public llm_graph_context { + graph_mtp(const llama_model & model, const llm_graph_params & params); + }; + std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; @@ -1781,30 +1785,8 @@ struct llama_model_qwen35moe : public llama_model_base { const llama_model & model; }; - std::unique_ptr build_arch_graph(const llm_graph_params & 
params) const override; -}; - - -struct llama_model_qwen35_mtp : public llama_model_base { - llama_model_qwen35_mtp(const struct llama_model_params & params) : llama_model_base(params) {} - void load_arch_hparams(llama_model_loader & ml) override; - void load_arch_tensors(llama_model_loader & ml) override; - - struct graph : public llm_graph_context { - graph(const llama_model & model, const llm_graph_params & params); - }; - - std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; -}; - - -struct llama_model_qwen35moe_mtp : public llama_model_base { - llama_model_qwen35moe_mtp(const struct llama_model_params & params) : llama_model_base(params) {} - void load_arch_hparams(llama_model_loader & ml) override; - void load_arch_tensors(llama_model_loader & ml) override; - - struct graph : public llm_graph_context { - graph(const llama_model & model, const llm_graph_params & params); + struct graph_mtp : public llm_graph_context { + graph_mtp(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; diff --git a/src/models/qwen35-mtp.cpp b/src/models/qwen35-mtp.cpp deleted file mode 100644 index 83039e98db5..00000000000 --- a/src/models/qwen35-mtp.cpp +++ /dev/null @@ -1,207 +0,0 @@ -#include "models.h" - -void llama_model_qwen35_mtp::load_arch_hparams(llama_model_loader & ml) { - ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true); - - ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false); - GGML_ASSERT(hparams.nextn_predict_layers > 0 && "QWEN35_MTP requires nextn_predict_layers > 0"); - GGML_ASSERT(hparams.nextn_predict_layers <= hparams.n_layer); - - // only the MTP layers get a KV cache, trunk layers are skipped. - hparams.kv_only_nextn = true; - hparams.n_layer_kv_from_start = -1; - for (uint32_t i = 0; i < hparams.n_layer; ++i) { - hparams.recurrent_layer_arr[i] = false; - } - - type = LLM_TYPE_UNKNOWN; -} - -void llama_model_qwen35_mtp::load_arch_tensors(llama_model_loader &) { - LLAMA_LOAD_LOCALS; - - tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0); - output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, TENSOR_NOT_REQUIRED); - output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED); - if (output == nullptr) { - output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED); - } - - const uint32_t n_main = n_layer - hparams.nextn_predict_layers; - for (int i = 0; i < n_layer; ++i) { - if (static_cast(i) < n_main) { - continue; // trunk layer — owned by the sibling QWEN35 model - } - - auto & layer = layers[i]; - - // MTP block looks like a full-attention Qwen3.5 decoder block. 
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0); - layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0); - - create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head * 2, n_embd_k_gqa, n_embd_v_gqa, 0); - layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0); - layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0); - layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0); - - layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); - layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); - layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); - - // NextN-specific tensors that define the MTP block. - layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, 0); - layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, 0); - layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, 0); - layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED); - layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED); - layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, TENSOR_NOT_REQUIRED); - } -} - -std::unique_ptr llama_model_qwen35_mtp::build_arch_graph(const llm_graph_params & params) const { - return std::make_unique(*this, params); -} - -// LLM_ARCH_QWEN35_MTP draft head for Qwen3.5/3.6 dense series -llama_model_qwen35_mtp::graph::graph(const llama_model & model, const llm_graph_params & params) - : llm_graph_context(params) { - GGML_ASSERT(hparams.nextn_predict_layers > 0 && "QWEN35_MTP requires nextn_predict_layers > 0"); - GGML_ASSERT(hparams.nextn_predict_layers == 1 && "QWEN35_MTP currently only supports a single MTP block"); - - const int64_t n_embd_head = hparams.n_embd_head_v(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); - - // The MTP block lives at the source file's original layer index. - const int il = (int) hparams.n_layer - (int) hparams.nextn_predict_layers; - const auto & layer = model.layers[il]; - - GGML_ASSERT(layer.nextn.eh_proj && "MTP block missing nextn.eh_proj"); - GGML_ASSERT(layer.nextn.enorm && "MTP block missing nextn.enorm"); - GGML_ASSERT(layer.nextn.hnorm && "MTP block missing nextn.hnorm"); - - int sections[4]; - std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections); - - auto inp = std::make_unique(hparams.n_embd); - - inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - ggml_set_input(inp->tokens); - - inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd, n_tokens); - ggml_set_input(inp->embd); - ggml_set_name(inp->embd, "mtp_h_input"); - - ggml_tensor * tok_embd_w = layer.nextn.embed_tokens ? 
layer.nextn.embed_tokens : model.tok_embd; - - ggml_tensor * h_input = inp->embd; - ggml_tensor * tok_embd = ggml_get_rows(ctx0, tok_embd_w, inp->tokens); - cb(tok_embd, "mtp_tok_embd", il); - - res->add_input(std::move(inp)); - - ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv(); - - ggml_tensor * h_norm = build_norm(h_input, layer.nextn.hnorm, nullptr, LLM_NORM_RMS, il); - cb(h_norm, "mtp_hnorm", il); - - ggml_tensor * e_norm = build_norm(tok_embd, layer.nextn.enorm, nullptr, LLM_NORM_RMS, il); - cb(e_norm, "mtp_enorm", il); - - ggml_tensor * concat = ggml_concat(ctx0, e_norm, h_norm, /*dim=*/ 0); - cb(concat, "mtp_concat", il); - - ggml_tensor * cur = build_lora_mm(layer.nextn.eh_proj, concat); - cb(cur, "mtp_eh_proj", il); - - ggml_tensor * inpSA = cur; - - cur = build_norm(cur, layer.attn_norm, nullptr, LLM_NORM_RMS, il); - cb(cur, "mtp_attn_norm", il); - - ggml_tensor * Qcur_full = build_lora_mm(layer.wq, cur, layer.wq_s); - cb(Qcur_full, "mtp_Qcur_full", il); - - ggml_tensor * Qcur = ggml_view_3d(ctx0, Qcur_full, - n_embd_head, n_head, n_tokens, - ggml_element_size(Qcur_full) * n_embd_head * 2, - ggml_element_size(Qcur_full) * n_embd_head * 2 * n_head, - 0); - Qcur = build_norm(Qcur, layer.attn_q_norm, nullptr, LLM_NORM_RMS, il); - cb(Qcur, "mtp_Qcur_normed", il); - - ggml_tensor * gate = ggml_view_3d(ctx0, Qcur_full, - n_embd_head, n_head, n_tokens, - ggml_element_size(Qcur_full) * n_embd_head * 2, - ggml_element_size(Qcur_full) * n_embd_head * 2 * n_head, - ggml_element_size(Qcur_full) * n_embd_head); - gate = ggml_cont_2d(ctx0, gate, n_embd_head * n_head, n_tokens); - cb(gate, "mtp_gate", il); - - ggml_tensor * Kcur = build_lora_mm(layer.wk, cur, layer.wk_s); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - Kcur = build_norm(Kcur, layer.attn_k_norm, nullptr, LLM_NORM_RMS, il); - cb(Kcur, "mtp_Kcur_normed", il); - - ggml_tensor * Vcur = build_lora_mm(layer.wv, cur, layer.wv_s); - Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); - cb(Vcur, "mtp_Vcur", il); - - Qcur = ggml_rope_multi(ctx0, Qcur, inp_pos, nullptr, - n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow); - Kcur = ggml_rope_multi(ctx0, Kcur, inp_pos, nullptr, - n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow); - - const float kq_scale = hparams.f_attention_scale == 0.0f - ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale; - - cur = build_attn(inp_attn, - nullptr, nullptr, nullptr, - Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il); - cb(cur, "mtp_attn_pregate", il); - - cur = ggml_mul(ctx0, cur, ggml_sigmoid(ctx0, gate)); - cur = build_lora_mm(layer.wo, cur, layer.wo_s); - cb(cur, "mtp_attn_out", il); - - cur = ggml_add(ctx0, cur, inpSA); - cb(cur, "mtp_attn_residual", il); - - ggml_tensor * ffn_residual = cur; - cur = build_norm(cur, layer.attn_post_norm, nullptr, LLM_NORM_RMS, il); - cb(cur, "mtp_attn_post_norm", il); - - cur = build_ffn(cur, - layer.ffn_up, nullptr, layer.ffn_up_s, - layer.ffn_gate, nullptr, layer.ffn_gate_s, - layer.ffn_down, nullptr, layer.ffn_down_s, - nullptr, - LLM_FFN_SILU, LLM_FFN_PAR, il); - cb(cur, "mtp_ffn_out", il); - - cur = ggml_add(ctx0, cur, ffn_residual); - cb(cur, "mtp_post_ffn", il); - - // Pre-norm hidden state: used by the AR draft loop to seed the next MTP step. - // (In the trunk graph this is `t_h_pre_norm`; the MTP head reuses the same slot.) 
- cb(cur, "h_pre_norm", -1); - res->t_h_pre_norm = cur; - - ggml_tensor * head_norm_w = layer.nextn.shared_head_norm - ? layer.nextn.shared_head_norm - : model.output_norm; - GGML_ASSERT(head_norm_w && "QWEN35_MTP: missing both nextn.shared_head_norm and output_norm"); - cur = build_norm(cur, head_norm_w, nullptr, LLM_NORM_RMS, -1); - cb(cur, "mtp_shared_head_norm", -1); - - ggml_tensor * head_w = layer.nextn.shared_head_head ? layer.nextn.shared_head_head : model.output; - GGML_ASSERT(head_w && "QWEN35_MTP: missing LM head (nextn.shared_head_head or model.output)"); - cur = build_lora_mm(head_w, cur); - cb(cur, "result_output", -1); - - res->t_logits = cur; - ggml_build_forward_expand(gf, cur); -} diff --git a/src/models/qwen35.cpp b/src/models/qwen35.cpp index 79fdd8f679b..ca4297e94f3 100644 --- a/src/models/qwen35.cpp +++ b/src/models/qwen35.cpp @@ -15,7 +15,6 @@ void llama_model_qwen35::load_arch_hparams(llama_model_loader & ml) { // NextN/MTP (Qwen3.5/3.6): extra decoder block appended beyond the main stack ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false); GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer"); - hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers; // Mark recurrent layers (linear attention layers). MTP layers are dense // attention-only and must be flagged non-recurrent. @@ -36,9 +35,14 @@ void llama_model_qwen35::load_arch_hparams(llama_model_loader & ml) { } } -void llama_model_qwen35::load_arch_tensors(llama_model_loader &) { +void llama_model_qwen35::load_arch_tensors(llama_model_loader & ml) { LLAMA_LOAD_LOCALS; + const uint32_t n_main = n_layer - hparams.nextn_predict_layers; + const bool mtp_only = (hparams.nextn_predict_layers > 0) && + (ml.get_weight("blk.0.attn_norm.weight") == nullptr); + const int trunk_flags = mtp_only ? 
TENSOR_NOT_REQUIRED : 0; + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0); // output @@ -50,60 +54,85 @@ void llama_model_qwen35::load_arch_tensors(llama_model_loader &) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED); } - // Calculate dimensions from hyperparameters - const int64_t head_k_dim = hparams.ssm_d_state; - const int64_t head_v_dim = hparams.ssm_d_state; - const int64_t n_k_heads = hparams.ssm_n_group; - const int64_t n_v_heads = hparams.ssm_dt_rank; - const int64_t key_dim = head_k_dim * n_k_heads; - const int64_t value_dim = head_v_dim * n_v_heads; - const int64_t conv_dim = key_dim * 2 + value_dim; + auto load_block_trunk = [&](int il, int flags) { + auto & layer = layers[il]; - for (int i = 0; i < n_layer; ++i) { - auto & layer = layers[i]; + // Calculate dimensions from hyperparameters + const int64_t head_k_dim = hparams.ssm_d_state; + const int64_t head_v_dim = hparams.ssm_d_state; + const int64_t n_k_heads = hparams.ssm_n_group; + const int64_t n_v_heads = hparams.ssm_dt_rank; + const int64_t key_dim = head_k_dim * n_k_heads; + const int64_t value_dim = head_v_dim * n_v_heads; + const int64_t conv_dim = key_dim * 2 + value_dim; - layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0); - layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0); + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", il), { n_embd }, flags); + layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", il), { n_embd }, flags); - if (!hparams.is_recurrent(i)) { + if (!hparams.is_recurrent(il)) { // Attention layers - create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head * 2, n_embd_k_gqa, n_embd_v_gqa, 0); - layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0); + create_tensor_qkv(layer, il, n_embd, n_embd_head_k * n_head * 2, n_embd_k_gqa, n_embd_v_gqa, flags); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", il), { n_embd_head_k * n_head, n_embd }, flags); // Q/K normalization for attention layers - layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0); - layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0); + layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", il), { n_embd_head_k }, flags); + layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", il), { n_embd_head_k }, flags); } else { // Linear attention (gated delta net) specific tensors // Create tensors with calculated dimensions - layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, key_dim * 2 + value_dim }, TENSOR_NOT_REQUIRED); - layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), { n_embd, value_dim }, TENSOR_NOT_REQUIRED); - layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0); - layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0); - layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN, i), { hparams.ssm_dt_rank }, 0); - layer.ssm_beta = create_tensor(tn(LLM_TENSOR_SSM_BETA, "weight", i), { n_embd, n_v_heads }, 0); - layer.ssm_alpha = create_tensor(tn(LLM_TENSOR_SSM_ALPHA, "weight", i), { n_embd, n_v_heads }, 0); - layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), { head_v_dim }, 0); - 
layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), { value_dim, n_embd }, 0); + layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", il), { n_embd, key_dim * 2 + value_dim }, TENSOR_NOT_REQUIRED); + layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", il), { n_embd, value_dim }, TENSOR_NOT_REQUIRED); + layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", il), { hparams.ssm_d_conv, conv_dim }, flags); + layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", il), { hparams.ssm_dt_rank }, flags); + layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN, il), { hparams.ssm_dt_rank }, flags); + layer.ssm_beta = create_tensor(tn(LLM_TENSOR_SSM_BETA, "weight", il), { n_embd, n_v_heads }, flags); + layer.ssm_alpha = create_tensor(tn(LLM_TENSOR_SSM_ALPHA, "weight", il), { n_embd, n_v_heads }, flags); + layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", il), { head_v_dim }, flags); + layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", il), { value_dim, n_embd }, flags); } - layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); - layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); - layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); - - // NextN/MTP tensors (preserved but unused) - only bound on MTP layers - if (hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers) { - layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, TENSOR_NOT_REQUIRED); - layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, TENSOR_NOT_REQUIRED); - layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, TENSOR_NOT_REQUIRED); - layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED); - layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED); - layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, TENSOR_NOT_REQUIRED); - } + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", il), {n_embd, n_ff}, flags); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", il), { n_ff, n_embd}, flags); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", il), {n_embd, n_ff}, flags); + }; + + auto load_block_mtp = [&](int il) { + auto & layer = layers[il]; + + // MTP block looks like a full-attention Qwen3.5 decoder block. 
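+        // Concretely: gated attention (fused Q+gate projection, hence the
+        // n_embd_head_k * n_head * 2 width below), per-head Q/K RMS norms, and a
+        // SwiGLU FFN, plus the NextN tensors that splice the head onto the trunk's
+        // pre-norm hidden state.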
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", il), { n_embd }, 0); + layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", il), { n_embd }, 0); + + create_tensor_qkv(layer, il, n_embd, n_embd_head_k * n_head * 2, n_embd_k_gqa, n_embd_v_gqa, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", il), { n_embd_head_k * n_head, n_embd }, 0); + layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", il), { n_embd_head_k }, 0); + layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", il), { n_embd_head_k }, 0); + + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", il), {n_embd, n_ff}, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", il), { n_ff, n_embd}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", il), {n_embd, n_ff}, 0); + + // NextN-specific tensors that define the MTP block. + layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", il), { 2 * n_embd, n_embd }, 0); + layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", il), { n_embd }, 0); + layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", il), { n_embd }, 0); + layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", il), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED); + layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", il), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED); + layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", il), { n_embd }, TENSOR_NOT_REQUIRED); + }; + + for (int i = 0; i < (int) n_main; ++i) { + load_block_trunk(i, trunk_flags); + } + for (int i = (int) n_main; i < n_layer; ++i) { + load_block_mtp(i); } } std::unique_ptr llama_model_qwen35::build_arch_graph(const llm_graph_params & params) const { + if (params.gtype == LLM_GRAPH_TYPE_DECODER_MTP) { + return std::make_unique(*this, params); + } return std::make_unique(*this, params); } @@ -493,3 +522,146 @@ ggml_tensor * llama_model_qwen35::graph::build_layer_ffn(ggml_tensor * cur, cons return cur; } + +// LLM_GRAPH_TYPE_DECODER_MTP draft head for Qwen3.5/3.6 dense series +llama_model_qwen35::graph_mtp::graph_mtp(const llama_model & model, const llm_graph_params & params) + : llm_graph_context(params) { + GGML_ASSERT(hparams.nextn_predict_layers > 0 && "QWEN35 MTP requires nextn_predict_layers > 0"); + GGML_ASSERT(hparams.nextn_predict_layers == 1 && "QWEN35 MTP currently only supports a single MTP block"); + + const int64_t n_embd_head = hparams.n_embd_head_v(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); + + // The MTP block lives at the source file's original layer index. 
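+    // (e.g. a 24-layer trunk with nextn_predict_layers == 1 stores the head at
+    // il == 24, since hparams.n_layer counts trunk plus MTP layers)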
+ const int il = (int) hparams.n_layer - (int) hparams.nextn_predict_layers; + const auto & layer = model.layers[il]; + + GGML_ASSERT(layer.nextn.eh_proj && "MTP block missing nextn.eh_proj"); + GGML_ASSERT(layer.nextn.enorm && "MTP block missing nextn.enorm"); + GGML_ASSERT(layer.nextn.hnorm && "MTP block missing nextn.hnorm"); + + int sections[4]; + std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections); + + auto inp = std::make_unique(hparams.n_embd); + + inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + ggml_set_input(inp->tokens); + + inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd, n_tokens); + ggml_set_input(inp->embd); + ggml_set_name(inp->embd, "mtp_h_input"); + + ggml_tensor * tok_embd_w = layer.nextn.embed_tokens ? layer.nextn.embed_tokens : model.tok_embd; + + ggml_tensor * h_input = inp->embd; + ggml_tensor * tok_embd = ggml_get_rows(ctx0, tok_embd_w, inp->tokens); + cb(tok_embd, "mtp_tok_embd", il); + + res->add_input(std::move(inp)); + + ggml_tensor * inp_pos = build_inp_pos(); + auto * inp_attn = build_attn_inp_kv(); + + ggml_tensor * h_norm = build_norm(h_input, layer.nextn.hnorm, nullptr, LLM_NORM_RMS, il); + cb(h_norm, "mtp_hnorm", il); + + ggml_tensor * e_norm = build_norm(tok_embd, layer.nextn.enorm, nullptr, LLM_NORM_RMS, il); + cb(e_norm, "mtp_enorm", il); + + ggml_tensor * concat = ggml_concat(ctx0, e_norm, h_norm, /*dim=*/ 0); + cb(concat, "mtp_concat", il); + + ggml_tensor * cur = build_lora_mm(layer.nextn.eh_proj, concat); + cb(cur, "mtp_eh_proj", il); + + ggml_tensor * inpSA = cur; + + cur = build_norm(cur, layer.attn_norm, nullptr, LLM_NORM_RMS, il); + cb(cur, "mtp_attn_norm", il); + + ggml_tensor * Qcur_full = build_lora_mm(layer.wq, cur, layer.wq_s); + cb(Qcur_full, "mtp_Qcur_full", il); + + ggml_tensor * Qcur = ggml_view_3d(ctx0, Qcur_full, + n_embd_head, n_head, n_tokens, + ggml_element_size(Qcur_full) * n_embd_head * 2, + ggml_element_size(Qcur_full) * n_embd_head * 2 * n_head, + 0); + Qcur = build_norm(Qcur, layer.attn_q_norm, nullptr, LLM_NORM_RMS, il); + cb(Qcur, "mtp_Qcur_normed", il); + + ggml_tensor * gate = ggml_view_3d(ctx0, Qcur_full, + n_embd_head, n_head, n_tokens, + ggml_element_size(Qcur_full) * n_embd_head * 2, + ggml_element_size(Qcur_full) * n_embd_head * 2 * n_head, + ggml_element_size(Qcur_full) * n_embd_head); + gate = ggml_cont_2d(ctx0, gate, n_embd_head * n_head, n_tokens); + cb(gate, "mtp_gate", il); + + ggml_tensor * Kcur = build_lora_mm(layer.wk, cur, layer.wk_s); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Kcur = build_norm(Kcur, layer.attn_k_norm, nullptr, LLM_NORM_RMS, il); + cb(Kcur, "mtp_Kcur_normed", il); + + ggml_tensor * Vcur = build_lora_mm(layer.wv, cur, layer.wv_s); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + cb(Vcur, "mtp_Vcur", il); + + Qcur = ggml_rope_multi(ctx0, Qcur, inp_pos, nullptr, + n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + Kcur = ggml_rope_multi(ctx0, Kcur, inp_pos, nullptr, + n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + + const float kq_scale = hparams.f_attention_scale == 0.0f + ? 
1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale; + + cur = build_attn(inp_attn, + nullptr, nullptr, nullptr, + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il); + cb(cur, "mtp_attn_pregate", il); + + cur = ggml_mul(ctx0, cur, ggml_sigmoid(ctx0, gate)); + cur = build_lora_mm(layer.wo, cur, layer.wo_s); + cb(cur, "mtp_attn_out", il); + + cur = ggml_add(ctx0, cur, inpSA); + cb(cur, "mtp_attn_residual", il); + + ggml_tensor * ffn_residual = cur; + cur = build_norm(cur, layer.attn_post_norm, nullptr, LLM_NORM_RMS, il); + cb(cur, "mtp_attn_post_norm", il); + + cur = build_ffn(cur, + layer.ffn_up, nullptr, layer.ffn_up_s, + layer.ffn_gate, nullptr, layer.ffn_gate_s, + layer.ffn_down, nullptr, layer.ffn_down_s, + nullptr, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "mtp_ffn_out", il); + + cur = ggml_add(ctx0, cur, ffn_residual); + cb(cur, "mtp_post_ffn", il); + + // Pre-norm hidden state: used by the AR draft loop to seed the next MTP step. + // (In the trunk graph this is `t_h_pre_norm`; the MTP head reuses the same slot.) + cb(cur, "h_pre_norm", -1); + res->t_h_pre_norm = cur; + + ggml_tensor * head_norm_w = layer.nextn.shared_head_norm + ? layer.nextn.shared_head_norm + : model.output_norm; + GGML_ASSERT(head_norm_w && "QWEN35 MTP: missing both nextn.shared_head_norm and output_norm"); + cur = build_norm(cur, head_norm_w, nullptr, LLM_NORM_RMS, -1); + cb(cur, "mtp_shared_head_norm", -1); + + ggml_tensor * head_w = layer.nextn.shared_head_head ? layer.nextn.shared_head_head : model.output; + GGML_ASSERT(head_w && "QWEN35 MTP: missing LM head (nextn.shared_head_head or model.output)"); + cur = build_lora_mm(head_w, cur); + cb(cur, "result_output", -1); + + res->t_logits = cur; + ggml_build_forward_expand(gf, cur); +} diff --git a/src/models/qwen35moe-mtp.cpp b/src/models/qwen35moe-mtp.cpp deleted file mode 100644 index 9f662213bee..00000000000 --- a/src/models/qwen35moe-mtp.cpp +++ /dev/null @@ -1,252 +0,0 @@ -#include "models.h" - -void llama_model_qwen35moe_mtp::load_arch_hparams(llama_model_loader & ml) { - ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false); - ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false); - ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true); - - ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false); - GGML_ASSERT(hparams.nextn_predict_layers > 0 && "QWEN35MOE_MTP requires nextn_predict_layers > 0"); - GGML_ASSERT(hparams.nextn_predict_layers <= hparams.n_layer); - GGML_ASSERT(hparams.n_expert > 0 && "QWEN35MOE_MTP requires n_expert > 0"); - - // only the MTP layers get a KV cache, trunk layers are skipped. - hparams.kv_only_nextn = true; - hparams.n_layer_kv_from_start = -1; - for (uint32_t i = 0; i < hparams.n_layer; ++i) { - hparams.recurrent_layer_arr[i] = false; - } - - type = LLM_TYPE_UNKNOWN; -} - -void llama_model_qwen35moe_mtp::load_arch_tensors(llama_model_loader &) { - LLAMA_LOAD_LOCALS; - - tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0); - output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, TENSOR_NOT_REQUIRED); - output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED); - if (output == nullptr) { - output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED); - } - - const int64_t n_ff_exp = hparams.n_ff_exp ? 
hparams.n_ff_exp : n_ff / n_expert_used; - const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff; - - const uint32_t n_main = n_layer - hparams.nextn_predict_layers; - for (int i = 0; i < n_layer; ++i) { - if (static_cast(i) < n_main) { - continue; // trunk layer — owned by the sibling QWEN35MOE model - } - - auto & layer = layers[i]; - - // MTP block looks like a full-attention Qwen3.5 decoder block with MoE FFN. - layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0); - layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0); - - create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head * 2, n_embd_k_gqa, n_embd_v_gqa, 0); - layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0); - layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0); - layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0); - - // Routed experts - layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0); - layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0); - create_tensor_gate_up_exps(layer, i, n_embd, n_ff_exp, n_expert, 0); - - // Shared experts - layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), { n_embd }, 0); - layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp }, 0); - layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp }, 0); - layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_shexp, n_embd }, 0); - - // NextN-specific tensors that define the MTP block. 
- layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, 0); - layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, 0); - layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, 0); - layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED); - layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED); - layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, TENSOR_NOT_REQUIRED); - } -} - -std::unique_ptr llama_model_qwen35moe_mtp::build_arch_graph(const llm_graph_params & params) const { - return std::make_unique(*this, params); -} - -// LLM_ARCH_QWEN35MOE_MTP draft head for Qwen3.5/3.6 MoE -llama_model_qwen35moe_mtp::graph::graph(const llama_model & model, const llm_graph_params & params) - : llm_graph_context(params) { - GGML_ASSERT(hparams.nextn_predict_layers > 0 && "QWEN35MOE_MTP requires nextn_predict_layers > 0"); - GGML_ASSERT(hparams.nextn_predict_layers == 1 && "QWEN35MOE_MTP currently only supports a single MTP block"); - - const int64_t n_embd_head = hparams.n_embd_head_v(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); - - const int il = (int) hparams.n_layer - (int) hparams.nextn_predict_layers; - const auto & layer = model.layers[il]; - - GGML_ASSERT(layer.nextn.eh_proj && "MTP block missing nextn.eh_proj"); - GGML_ASSERT(layer.nextn.enorm && "MTP block missing nextn.enorm"); - GGML_ASSERT(layer.nextn.hnorm && "MTP block missing nextn.hnorm"); - GGML_ASSERT(layer.ffn_gate_inp && "MTP block missing ffn_gate_inp"); - - int sections[4]; - std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections); - - auto inp = std::make_unique(hparams.n_embd); - - inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - ggml_set_input(inp->tokens); - - inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd, n_tokens); - ggml_set_input(inp->embd); - ggml_set_name(inp->embd, "mtp_h_input"); - - ggml_tensor * tok_embd_w = layer.nextn.embed_tokens ? 
layer.nextn.embed_tokens : model.tok_embd; - - ggml_tensor * h_input = inp->embd; - ggml_tensor * tok_embd = ggml_get_rows(ctx0, tok_embd_w, inp->tokens); - cb(tok_embd, "mtp_tok_embd", il); - - res->add_input(std::move(inp)); - - ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv(); - - ggml_tensor * h_norm = build_norm(h_input, layer.nextn.hnorm, nullptr, LLM_NORM_RMS, il); - cb(h_norm, "mtp_hnorm", il); - - ggml_tensor * e_norm = build_norm(tok_embd, layer.nextn.enorm, nullptr, LLM_NORM_RMS, il); - cb(e_norm, "mtp_enorm", il); - - ggml_tensor * concat = ggml_concat(ctx0, e_norm, h_norm, /*dim=*/ 0); - cb(concat, "mtp_concat", il); - - ggml_tensor * cur = build_lora_mm(layer.nextn.eh_proj, concat); - cb(cur, "mtp_eh_proj", il); - - ggml_tensor * inpSA = cur; - - cur = build_norm(cur, layer.attn_norm, nullptr, LLM_NORM_RMS, il); - cb(cur, "mtp_attn_norm", il); - - ggml_tensor * Qcur_full = build_lora_mm(layer.wq, cur, layer.wq_s); - cb(Qcur_full, "mtp_Qcur_full", il); - - ggml_tensor * Qcur = ggml_view_3d(ctx0, Qcur_full, - n_embd_head, n_head, n_tokens, - ggml_element_size(Qcur_full) * n_embd_head * 2, - ggml_element_size(Qcur_full) * n_embd_head * 2 * n_head, - 0); - Qcur = build_norm(Qcur, layer.attn_q_norm, nullptr, LLM_NORM_RMS, il); - cb(Qcur, "mtp_Qcur_normed", il); - - ggml_tensor * gate = ggml_view_3d(ctx0, Qcur_full, - n_embd_head, n_head, n_tokens, - ggml_element_size(Qcur_full) * n_embd_head * 2, - ggml_element_size(Qcur_full) * n_embd_head * 2 * n_head, - ggml_element_size(Qcur_full) * n_embd_head); - gate = ggml_cont_2d(ctx0, gate, n_embd_head * n_head, n_tokens); - cb(gate, "mtp_gate", il); - - ggml_tensor * Kcur = build_lora_mm(layer.wk, cur, layer.wk_s); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - Kcur = build_norm(Kcur, layer.attn_k_norm, nullptr, LLM_NORM_RMS, il); - cb(Kcur, "mtp_Kcur_normed", il); - - ggml_tensor * Vcur = build_lora_mm(layer.wv, cur, layer.wv_s); - Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); - cb(Vcur, "mtp_Vcur", il); - - Qcur = ggml_rope_multi(ctx0, Qcur, inp_pos, nullptr, - n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow); - Kcur = ggml_rope_multi(ctx0, Kcur, inp_pos, nullptr, - n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow); - - const float kq_scale = hparams.f_attention_scale == 0.0f - ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale; - - cur = build_attn(inp_attn, - nullptr, nullptr, nullptr, - Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il); - cb(cur, "mtp_attn_pregate", il); - - cur = ggml_mul(ctx0, cur, ggml_sigmoid(ctx0, gate)); - cur = build_lora_mm(layer.wo, cur, layer.wo_s); - cb(cur, "mtp_attn_out", il); - - cur = ggml_add(ctx0, cur, inpSA); - cb(cur, "mtp_attn_residual", il); - - ggml_tensor * ffn_residual = cur; - cur = build_norm(cur, layer.attn_post_norm, nullptr, LLM_NORM_RMS, il); - cb(cur, "mtp_attn_post_norm", il); - - // MoE FFN — routed experts plus gated shared expert (mirrors qwen35moe). 
- ggml_tensor * moe_out = - build_moe_ffn(cur, - layer.ffn_gate_inp, - layer.ffn_up_exps, - layer.ffn_gate_exps, - layer.ffn_down_exps, - nullptr, - n_expert, n_expert_used, - LLM_FFN_SILU, true, - hparams.expert_weights_scale, - LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il, - nullptr, layer.ffn_gate_up_exps, - layer.ffn_up_exps_s, - layer.ffn_gate_exps_s, - layer.ffn_down_exps_s); - cb(moe_out, "mtp_ffn_moe_out", il); - - if (layer.ffn_up_shexp != nullptr) { - ggml_tensor * ffn_shexp = - build_ffn(cur, - layer.ffn_up_shexp, nullptr, layer.ffn_up_shexp_s, - layer.ffn_gate_shexp, nullptr, layer.ffn_gate_shexp_s, - layer.ffn_down_shexp, nullptr, layer.ffn_down_shexp_s, - nullptr, - LLM_FFN_SILU, LLM_FFN_PAR, il); - cb(ffn_shexp, "mtp_ffn_shexp", il); - - ggml_tensor * shared_gate = build_lora_mm(layer.ffn_gate_inp_shexp, cur); - shared_gate = ggml_sigmoid(ctx0, shared_gate); - cb(shared_gate, "mtp_shared_expert_gate_sigmoid", il); - - ffn_shexp = ggml_mul(ctx0, ffn_shexp, shared_gate); - cb(ffn_shexp, "mtp_ffn_shexp_gated", il); - - cur = ggml_add(ctx0, moe_out, ffn_shexp); - } else { - cur = moe_out; - } - cb(cur, "mtp_ffn_out", il); - - cur = ggml_add(ctx0, cur, ffn_residual); - cb(cur, "mtp_post_ffn", il); - - // Pre-norm hidden state: used by the AR draft loop to seed the next MTP step. - cb(cur, "h_pre_norm", -1); - res->t_h_pre_norm = cur; - - ggml_tensor * head_norm_w = layer.nextn.shared_head_norm - ? layer.nextn.shared_head_norm - : model.output_norm; - GGML_ASSERT(head_norm_w && "QWEN35MOE_MTP: missing both nextn.shared_head_norm and output_norm"); - cur = build_norm(cur, head_norm_w, nullptr, LLM_NORM_RMS, -1); - cb(cur, "mtp_shared_head_norm", -1); - - ggml_tensor * head_w = layer.nextn.shared_head_head ? layer.nextn.shared_head_head : model.output; - GGML_ASSERT(head_w && "QWEN35MOE_MTP: missing LM head (nextn.shared_head_head or model.output)"); - cur = build_lora_mm(head_w, cur); - cb(cur, "result_output", -1); - - res->t_logits = cur; - ggml_build_forward_expand(gf, cur); -} diff --git a/src/models/qwen35moe.cpp b/src/models/qwen35moe.cpp index 5912aa38153..a4c7cb6ad14 100644 --- a/src/models/qwen35moe.cpp +++ b/src/models/qwen35moe.cpp @@ -18,7 +18,6 @@ void llama_model_qwen35moe::load_arch_hparams(llama_model_loader & ml) { // NextN/MTP (Qwen3.5/3.6): extra decoder block appended beyond the main stack ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false); GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer"); - hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers; // Mark recurrent layers (linear attention layers). MTP layers are dense // attention-only and must be flagged non-recurrent. @@ -39,9 +38,14 @@ void llama_model_qwen35moe::load_arch_hparams(llama_model_loader & ml) { } } -void llama_model_qwen35moe::load_arch_tensors(llama_model_loader &) { +void llama_model_qwen35moe::load_arch_tensors(llama_model_loader & ml) { LLAMA_LOAD_LOCALS; + const uint32_t n_main = n_layer - hparams.nextn_predict_layers; + const bool mtp_only = (hparams.nextn_predict_layers > 0) && + (ml.get_weight("blk.0.attn_norm.weight") == nullptr); + const int trunk_flags = mtp_only ? 
TENSOR_NOT_REQUIRED : 0; + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0); // output @@ -53,70 +57,105 @@ void llama_model_qwen35moe::load_arch_tensors(llama_model_loader &) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED); } - const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used; + auto load_block_trunk = [&](int il, int flags) { + auto & layer = layers[il]; - // Calculate dimensions from hyperparameters - const int64_t head_k_dim = hparams.ssm_d_state; - const int64_t head_v_dim = hparams.ssm_d_state; - const int64_t n_k_heads = hparams.ssm_n_group; - const int64_t n_v_heads = hparams.ssm_dt_rank; - const int64_t key_dim = head_k_dim * n_k_heads; - const int64_t value_dim = head_v_dim * n_v_heads; - const int64_t conv_dim = key_dim * 2 + value_dim; + const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used; + const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff; - for (int i = 0; i < n_layer; ++i) { - auto & layer = layers[i]; + // Calculate dimensions from hyperparameters + const int64_t head_k_dim = hparams.ssm_d_state; + const int64_t head_v_dim = hparams.ssm_d_state; + const int64_t n_k_heads = hparams.ssm_n_group; + const int64_t n_v_heads = hparams.ssm_dt_rank; + const int64_t key_dim = head_k_dim * n_k_heads; + const int64_t value_dim = head_v_dim * n_v_heads; + const int64_t conv_dim = key_dim * 2 + value_dim; - layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0); - layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0); + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", il), { n_embd }, flags); + layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", il), { n_embd }, flags); - if (!hparams.is_recurrent(i)) { + if (!hparams.is_recurrent(il)) { // Attention layers - create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head * 2, n_embd_k_gqa, n_embd_v_gqa, 0); - layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0); + create_tensor_qkv(layer, il, n_embd, n_embd_head_k * n_head * 2, n_embd_k_gqa, n_embd_v_gqa, flags); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", il), { n_embd_head_k * n_head, n_embd }, flags); // Q/K normalization for attention layers - layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0); - layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0); + layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", il), { n_embd_head_k }, flags); + layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", il), { n_embd_head_k }, flags); } else { // Linear attention (gated delta net) specific tensors // Create tensors with calculated dimensions - layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, key_dim * 2 + value_dim }, TENSOR_NOT_REQUIRED); - layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), { n_embd, value_dim }, TENSOR_NOT_REQUIRED); - layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0); - layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0); - layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN, i), { hparams.ssm_dt_rank }, 0); - layer.ssm_beta = 
create_tensor(tn(LLM_TENSOR_SSM_BETA, "weight", i), { n_embd, n_v_heads }, 0); - layer.ssm_alpha = create_tensor(tn(LLM_TENSOR_SSM_ALPHA, "weight", i), { n_embd, n_v_heads }, 0); - layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), { head_v_dim }, 0); - layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), { value_dim, n_embd }, 0); + layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", il), { n_embd, key_dim * 2 + value_dim }, TENSOR_NOT_REQUIRED); + layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", il), { n_embd, value_dim }, TENSOR_NOT_REQUIRED); + layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", il), { hparams.ssm_d_conv, conv_dim }, flags); + layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", il), { hparams.ssm_dt_rank }, flags); + layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN, il), { hparams.ssm_dt_rank }, flags); + layer.ssm_beta = create_tensor(tn(LLM_TENSOR_SSM_BETA, "weight", il), { n_embd, n_v_heads }, flags); + layer.ssm_alpha = create_tensor(tn(LLM_TENSOR_SSM_ALPHA, "weight", il), { n_embd, n_v_heads }, flags); + layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", il), { head_v_dim }, flags); + layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", il), { value_dim, n_embd }, flags); } - layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0); - layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0); - create_tensor_gate_up_exps(layer, i, n_embd, n_ff_exp, n_expert, 0); + // Routed experts + layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", il), { n_embd, n_expert }, flags); + layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", il), { n_ff_exp, n_embd, n_expert }, flags); + create_tensor_gate_up_exps(layer, il, n_embd, n_ff_exp, n_expert, flags); // Shared experts + layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", il), { n_embd }, flags); + layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", il), { n_embd, n_ff_shexp }, flags); + layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", il), { n_embd, n_ff_shexp }, flags); + layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", il), { n_ff_shexp, n_embd }, flags); + }; + + auto load_block_mtp = [&](int il) { + auto & layer = layers[il]; + + const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used; const int64_t n_ff_shexp = hparams.n_ff_shexp ? 
hparams.n_ff_shexp : n_ff;
-        layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), { n_embd }, 0);
-        layer.ffn_gate_shexp     = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp }, 0);
-        layer.ffn_up_shexp       = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp }, 0);
-        layer.ffn_down_shexp     = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_shexp, n_embd }, 0);
-
-        // NextN/MTP tensors (preserved but unused) - only bound on MTP layers
-        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
-            layer.nextn.eh_proj          = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, TENSOR_NOT_REQUIRED);
-            layer.nextn.enorm            = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, TENSOR_NOT_REQUIRED);
-            layer.nextn.hnorm            = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, TENSOR_NOT_REQUIRED);
-            layer.nextn.embed_tokens     = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
-            layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
-            layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, TENSOR_NOT_REQUIRED);
-        }
+        // MTP block looks like a full-attention Qwen3.5 decoder block with MoE FFN.
+        layer.attn_norm      = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", il), { n_embd }, 0);
+        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", il), { n_embd }, 0);
+
+        create_tensor_qkv(layer, il, n_embd, n_embd_head_k * n_head * 2, n_embd_k_gqa, n_embd_v_gqa, 0);
+        layer.wo          = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", il), { n_embd_head_k * n_head, n_embd }, 0);
+        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", il), { n_embd_head_k }, 0);
+        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", il), { n_embd_head_k }, 0);
+
+        // Routed experts
+        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", il), { n_embd, n_expert }, 0);
+        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", il), { n_ff_exp, n_embd, n_expert }, 0);
+        create_tensor_gate_up_exps(layer, il, n_embd, n_ff_exp, n_expert, 0);
+
+        // Shared experts
+        layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", il), { n_embd }, 0);
+        layer.ffn_gate_shexp     = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", il), { n_embd, n_ff_shexp }, 0);
+        layer.ffn_up_shexp       = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", il), { n_embd, n_ff_shexp }, 0);
+        layer.ffn_down_shexp     = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", il), { n_ff_shexp, n_embd }, 0);
+
+        // NextN-specific tensors that define the MTP block.
+        layer.nextn.eh_proj          = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", il), { 2 * n_embd, n_embd }, 0);
+        layer.nextn.enorm            = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", il), { n_embd }, 0);
+        layer.nextn.hnorm            = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", il), { n_embd }, 0);
+        layer.nextn.embed_tokens     = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", il), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
+        layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", il), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
+        layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", il), { n_embd }, TENSOR_NOT_REQUIRED);
+    };
+
+    for (int i = 0; i < (int) n_main; ++i) {
+        load_block_trunk(i, trunk_flags);
+    }
+    for (int i = (int) n_main; i < n_layer; ++i) {
+        load_block_mtp(i);
+    }
 }
 
 std::unique_ptr<llm_graph_context> llama_model_qwen35moe::build_arch_graph(const llm_graph_params & params) const {
+    if (params.gtype == LLM_GRAPH_TYPE_DECODER_MTP) {
+        return std::make_unique<graph_mtp>(*this, params);
+    }
     return std::make_unique<graph>(*this, params);
 }
 
@@ -547,3 +586,178 @@ ggml_tensor * llama_model_qwen35moe::graph::build_layer_ffn(ggml_tensor * cur, c
 
     return cur;
 }
+
+// LLM_GRAPH_TYPE_DECODER_MTP draft head for Qwen3.5/3.6 MoE
+llama_model_qwen35moe::graph_mtp::graph_mtp(const llama_model & model, const llm_graph_params & params)
+    : llm_graph_context(params) {
+    GGML_ASSERT(hparams.nextn_predict_layers > 0 && "QWEN35MOE MTP requires nextn_predict_layers > 0");
+    GGML_ASSERT(hparams.nextn_predict_layers == 1 && "QWEN35MOE MTP currently only supports a single MTP block");
+
+    const int64_t n_embd_head = hparams.n_embd_head_v();
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+
+    const int il = (int) hparams.n_layer - (int) hparams.nextn_predict_layers;
+    const auto & layer = model.layers[il];
+
+    GGML_ASSERT(layer.nextn.eh_proj && "MTP block missing nextn.eh_proj");
+    GGML_ASSERT(layer.nextn.enorm && "MTP block missing nextn.enorm");
+    GGML_ASSERT(layer.nextn.hnorm && "MTP block missing nextn.hnorm");
+    GGML_ASSERT(layer.ffn_gate_inp && "MTP block missing ffn_gate_inp");
+
+    int sections[4];
+    std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
+
+    auto inp = std::make_unique<llm_graph_input_embd>(hparams.n_embd);
+
+    inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+    ggml_set_input(inp->tokens);
+
+    inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd, n_tokens);
+    ggml_set_input(inp->embd);
+    ggml_set_name(inp->embd, "mtp_h_input");
+
+    ggml_tensor * tok_embd_w = layer.nextn.embed_tokens ?
layer.nextn.embed_tokens : model.tok_embd; + + ggml_tensor * h_input = inp->embd; + ggml_tensor * tok_embd = ggml_get_rows(ctx0, tok_embd_w, inp->tokens); + cb(tok_embd, "mtp_tok_embd", il); + + res->add_input(std::move(inp)); + + ggml_tensor * inp_pos = build_inp_pos(); + auto * inp_attn = build_attn_inp_kv(); + + ggml_tensor * h_norm = build_norm(h_input, layer.nextn.hnorm, nullptr, LLM_NORM_RMS, il); + cb(h_norm, "mtp_hnorm", il); + + ggml_tensor * e_norm = build_norm(tok_embd, layer.nextn.enorm, nullptr, LLM_NORM_RMS, il); + cb(e_norm, "mtp_enorm", il); + + ggml_tensor * concat = ggml_concat(ctx0, e_norm, h_norm, /*dim=*/ 0); + cb(concat, "mtp_concat", il); + + ggml_tensor * cur = build_lora_mm(layer.nextn.eh_proj, concat); + cb(cur, "mtp_eh_proj", il); + + ggml_tensor * inpSA = cur; + + cur = build_norm(cur, layer.attn_norm, nullptr, LLM_NORM_RMS, il); + cb(cur, "mtp_attn_norm", il); + + ggml_tensor * Qcur_full = build_lora_mm(layer.wq, cur, layer.wq_s); + cb(Qcur_full, "mtp_Qcur_full", il); + + ggml_tensor * Qcur = ggml_view_3d(ctx0, Qcur_full, + n_embd_head, n_head, n_tokens, + ggml_element_size(Qcur_full) * n_embd_head * 2, + ggml_element_size(Qcur_full) * n_embd_head * 2 * n_head, + 0); + Qcur = build_norm(Qcur, layer.attn_q_norm, nullptr, LLM_NORM_RMS, il); + cb(Qcur, "mtp_Qcur_normed", il); + + ggml_tensor * gate = ggml_view_3d(ctx0, Qcur_full, + n_embd_head, n_head, n_tokens, + ggml_element_size(Qcur_full) * n_embd_head * 2, + ggml_element_size(Qcur_full) * n_embd_head * 2 * n_head, + ggml_element_size(Qcur_full) * n_embd_head); + gate = ggml_cont_2d(ctx0, gate, n_embd_head * n_head, n_tokens); + cb(gate, "mtp_gate", il); + + ggml_tensor * Kcur = build_lora_mm(layer.wk, cur, layer.wk_s); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Kcur = build_norm(Kcur, layer.attn_k_norm, nullptr, LLM_NORM_RMS, il); + cb(Kcur, "mtp_Kcur_normed", il); + + ggml_tensor * Vcur = build_lora_mm(layer.wv, cur, layer.wv_s); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + cb(Vcur, "mtp_Vcur", il); + + Qcur = ggml_rope_multi(ctx0, Qcur, inp_pos, nullptr, + n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + Kcur = ggml_rope_multi(ctx0, Kcur, inp_pos, nullptr, + n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + + const float kq_scale = hparams.f_attention_scale == 0.0f + ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale; + + cur = build_attn(inp_attn, + nullptr, nullptr, nullptr, + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il); + cb(cur, "mtp_attn_pregate", il); + + cur = ggml_mul(ctx0, cur, ggml_sigmoid(ctx0, gate)); + cur = build_lora_mm(layer.wo, cur, layer.wo_s); + cb(cur, "mtp_attn_out", il); + + cur = ggml_add(ctx0, cur, inpSA); + cb(cur, "mtp_attn_residual", il); + + ggml_tensor * ffn_residual = cur; + cur = build_norm(cur, layer.attn_post_norm, nullptr, LLM_NORM_RMS, il); + cb(cur, "mtp_attn_post_norm", il); + + // MoE FFN — routed experts plus gated shared expert (mirrors qwen35moe). 
+ ggml_tensor * moe_out = + build_moe_ffn(cur, + layer.ffn_gate_inp, + layer.ffn_up_exps, + layer.ffn_gate_exps, + layer.ffn_down_exps, + nullptr, + n_expert, n_expert_used, + LLM_FFN_SILU, true, + hparams.expert_weights_scale, + LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il, + nullptr, layer.ffn_gate_up_exps, + layer.ffn_up_exps_s, + layer.ffn_gate_exps_s, + layer.ffn_down_exps_s); + cb(moe_out, "mtp_ffn_moe_out", il); + + if (layer.ffn_up_shexp != nullptr) { + ggml_tensor * ffn_shexp = + build_ffn(cur, + layer.ffn_up_shexp, nullptr, layer.ffn_up_shexp_s, + layer.ffn_gate_shexp, nullptr, layer.ffn_gate_shexp_s, + layer.ffn_down_shexp, nullptr, layer.ffn_down_shexp_s, + nullptr, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(ffn_shexp, "mtp_ffn_shexp", il); + + ggml_tensor * shared_gate = build_lora_mm(layer.ffn_gate_inp_shexp, cur); + shared_gate = ggml_sigmoid(ctx0, shared_gate); + cb(shared_gate, "mtp_shared_expert_gate_sigmoid", il); + + ffn_shexp = ggml_mul(ctx0, ffn_shexp, shared_gate); + cb(ffn_shexp, "mtp_ffn_shexp_gated", il); + + cur = ggml_add(ctx0, moe_out, ffn_shexp); + } else { + cur = moe_out; + } + cb(cur, "mtp_ffn_out", il); + + cur = ggml_add(ctx0, cur, ffn_residual); + cb(cur, "mtp_post_ffn", il); + + // Pre-norm hidden state: used by the AR draft loop to seed the next MTP step. + cb(cur, "h_pre_norm", -1); + res->t_h_pre_norm = cur; + + ggml_tensor * head_norm_w = layer.nextn.shared_head_norm + ? layer.nextn.shared_head_norm + : model.output_norm; + GGML_ASSERT(head_norm_w && "QWEN35MOE MTP: missing both nextn.shared_head_norm and output_norm"); + cur = build_norm(cur, head_norm_w, nullptr, LLM_NORM_RMS, -1); + cb(cur, "mtp_shared_head_norm", -1); + + ggml_tensor * head_w = layer.nextn.shared_head_head ? layer.nextn.shared_head_head : model.output; + GGML_ASSERT(head_w && "QWEN35MOE MTP: missing LM head (nextn.shared_head_head or model.output)"); + cur = build_lora_mm(head_w, cur); + cb(cur, "result_output", -1); + + res->t_logits = cur; + ggml_build_forward_expand(gf, cur); +} diff --git a/tests/test-llama-archs.cpp b/tests/test-llama-archs.cpp index fd0d3696d77..03d7c19c78b 100644 --- a/tests/test-llama-archs.cpp +++ b/tests/test-llama-archs.cpp @@ -406,11 +406,7 @@ static bool arch_supported(const llm_arch arch) { if (arch == LLM_ARCH_DEEPSEEK2OCR) { return false; } - if (arch == LLM_ARCH_QWEN35_MTP || arch == LLM_ARCH_QWEN35MOE_MTP) { - return false; // MTP-only arch; requires a sibling trunk model and cannot run standalone. 
- } - - // FIXME some models are segfaulting with WebGPU: +// FIXME some models are segfaulting with WebGPU: #ifdef GGML_USE_WEBGPU if (arch == LLM_ARCH_QWEN3NEXT || arch == LLM_ARCH_QWEN35 || arch == LLM_ARCH_QWEN35MOE || arch == LLM_ARCH_KIMI_LINEAR) { return false; diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 9430c31b4c5..4e37c2ea850 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -756,6 +756,14 @@ struct server_context_impl { } auto cparams = common_context_params_to_llama(params_dft); + + const bool spec_mtp = std::find(params_base.speculative.types.begin(), + params_base.speculative.types.end(), + COMMON_SPECULATIVE_TYPE_MTP) != params_base.speculative.types.end(); + if (spec_mtp) { + cparams.graph_type = LLAMA_GRAPH_TYPE_DECODER_MTP; + } + ctx_dft.reset(llama_init_from_model(model_dft.get(), cparams)); ctx_dft_seq_rm_type = common_context_can_seq_rm(ctx_dft.get()); @@ -764,36 +772,21 @@ struct server_context_impl { params_base.speculative.draft.ctx_dft = ctx_dft.get(); } else if (std::find(params_base.speculative.types.begin(), params_base.speculative.types.end(), COMMON_SPECULATIVE_TYPE_MTP) != params_base.speculative.types.end()) { - // MTP head lives in the *target* GGUF — load it as a sibling model - // with override_arch and feed it through the existing ctx_dft slot. - char trunk_arch[64] = {0}; - llama_model_meta_val_str(model_tgt, "general.architecture", trunk_arch, sizeof(trunk_arch)); - - const char * mtp_arch = nullptr; - if (std::string(trunk_arch) == "qwen35") { - mtp_arch = "qwen35_mtp"; - } else if (std::string(trunk_arch) == "qwen35moe") { - mtp_arch = "qwen35moe_mtp"; - } else { - SRV_ERR("MTP not supported for trunk architecture '%s'\n", trunk_arch); + // MTP head lives in the *target* model — share the same llama_model and + // spin up a second context that builds the MTP graph instead of the trunk graph. 
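
The two comment lines above describe the core design change in this patch: one llama_model serving both the target context and the MTP draft context. A minimal sketch of that wiring, not part of the patch, using the public API as it stands after patch 1 (a later commit in this series renames graph_type to ctx_type); the GGUF path is illustrative:

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_model_load_from_file("qwen35moe-f16.gguf", mparams);

    llama_context_params cparams = llama_context_default_params();
    llama_context * ctx_tgt = llama_init_from_model(model, cparams);  // trunk graph

    cparams.graph_type = LLAMA_GRAPH_TYPE_DECODER_MTP;
    llama_context * ctx_dft = llama_init_from_model(model, cparams);  // MTP head, same weights

Because both contexts share the same llama_model, the draft adds no extra weight memory; only the MTP context's KV cache is additional.
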
+ if (!llama_model_has_mtp(model_tgt)) { + SRV_ERR("MTP requested but the target model '%s' has no MTP tensors\n", + params_base.model.path.c_str()); return false; } - SRV_INF("loading MTP head from '%s' (override_arch=%s)\n", - params_base.model.path.c_str(), mtp_arch); - - auto mparams_mtp = common_model_params_to_llama(params_base); - mparams_mtp.override_arch = mtp_arch; - - model_dft.reset(llama_model_load_from_file(params_base.model.path.c_str(), mparams_mtp)); - if (model_dft == nullptr) { - SRV_ERR("failed to load MTP head from '%s'\n", params_base.model.path.c_str()); - return false; - } + SRV_INF("creating MTP draft context against the target model '%s'\n", + params_base.model.path.c_str()); auto cparams_mtp = common_context_params_to_llama(params_base); + cparams_mtp.graph_type = LLAMA_GRAPH_TYPE_DECODER_MTP; - ctx_dft.reset(llama_init_from_model(model_dft.get(), cparams_mtp)); + ctx_dft.reset(llama_init_from_model(model_tgt, cparams_mtp)); if (ctx_dft == nullptr) { SRV_ERR("%s", "failed to create MTP context\n"); return false; From 5b9283992c629e4216c498a2fc54a92549c40d31 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Tue, 12 May 2026 21:40:06 +0800 Subject: [PATCH 2/8] review: use llama_context_type instead of llama_graph_type --- include/llama.h | 11 ++++------- src/llama-context.cpp | 28 ++++++++++++++++------------ src/llama-context.h | 2 -- src/llama-cparams.h | 1 + src/llama-memory.h | 2 +- src/llama-model.cpp | 2 +- tools/server/server-context.cpp | 4 ++-- 7 files changed, 25 insertions(+), 25 deletions(-) diff --git a/include/llama.h b/include/llama.h index bb34b47102a..19326b72ac4 100644 --- a/include/llama.h +++ b/include/llama.h @@ -198,11 +198,9 @@ extern "C" { LLAMA_SPLIT_MODE_TENSOR = 3, }; - enum llama_graph_type { - LLAMA_GRAPH_TYPE_DEFAULT = 0, - LLAMA_GRAPH_TYPE_ENCODER = 1, - LLAMA_GRAPH_TYPE_DECODER = 2, - LLAMA_GRAPH_TYPE_DECODER_MTP = 3, + enum llama_context_type { + LLAMA_CONTEXT_TYPE_DEFAULT = 0, + LLAMA_CONTEXT_TYPE_MTP = 1, }; // TODO: simplify (https://github.com/ggml-org/llama.cpp/pull/9294#pullrequestreview-2286561979) @@ -346,6 +344,7 @@ extern "C" { int32_t n_threads; // number of threads to use for generation int32_t n_threads_batch; // number of threads to use for batch processing + enum llama_context_type ctx_type; // set the context type (e.g. MTP) enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type` enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id enum llama_attention_type attention_type; // attention type to use for embeddings @@ -390,8 +389,6 @@ extern "C" { // note: the samplers must be sampler chains (i.e. 
use llama_sampler_chain_init) struct llama_sampler_seq_config * samplers; size_t n_samplers; - - enum llama_graph_type graph_type; }; struct llama_model_tensor_override { diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 47ada0b75a2..26697276ec2 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -2,6 +2,7 @@ #include "ggml.h" #include "llama-arch.h" +#include "llama-graph.h" #include "llama-impl.h" #include "llama-batch.h" #include "llama-io.h" @@ -21,6 +22,14 @@ // llama_context // +static llm_graph_type ctx_type_to_graph_type(llama_context_type ctx_type) { + switch (ctx_type) { + case LLAMA_CONTEXT_TYPE_DEFAULT: return LLM_GRAPH_TYPE_DEFAULT; + case LLAMA_CONTEXT_TYPE_MTP : return LLM_GRAPH_TYPE_DECODER_MTP; + } + throw std::runtime_error("Unsupported ctx type"); +} + llama_context::llama_context( const llama_model & model, llama_context_params params) : @@ -66,13 +75,7 @@ llama_context::llama_context( cparams.cb_eval = params.cb_eval; cparams.cb_eval_user_data = params.cb_eval_user_data; - switch (params.graph_type) { - case LLAMA_GRAPH_TYPE_DEFAULT: gtype = LLM_GRAPH_TYPE_DEFAULT; break; - case LLAMA_GRAPH_TYPE_ENCODER: gtype = LLM_GRAPH_TYPE_ENCODER; break; - case LLAMA_GRAPH_TYPE_DECODER: gtype = LLM_GRAPH_TYPE_DECODER; break; - case LLAMA_GRAPH_TYPE_DECODER_MTP: gtype = LLM_GRAPH_TYPE_DECODER_MTP; break; - default: throw std::runtime_error("invalid graph_type"); - } + cparams.ctx_type = params.ctx_type; // Initialize backend samplers here so they are part of the sampling graph // before the reserve passes run later in this function. This avoids a later @@ -287,7 +290,7 @@ llama_context::llama_context( /*.type_k =*/ params.type_k, /*.type_v =*/ params.type_v, /*.swa_full =*/ params.swa_full, - /*.gtype =*/ gtype, + /*.ctx_type=*/ cparams.ctx_type, }; memory.reset(model.create_memory(params_mem, cparams)); @@ -1747,7 +1750,8 @@ int llama_context::decode(const llama_batch & batch_inp) { } ggml_status status; - const auto * res = process_ubatch(ubatch, gtype, mctx.get(), status); + + const auto * res = process_ubatch(ubatch, ctx_type_to_graph_type(cparams.ctx_type), mctx.get(), status); if (!res) { // the last ubatch failed or was aborted -> remove all positions of that ubatch from the memory module @@ -2207,7 +2211,7 @@ ggml_cgraph * llama_context::graph_reserve( auto * res = gf_res_reserve.get(); - const auto gparams = graph_params(res, ubatch, mctx, gtype); + const auto gparams = graph_params(res, ubatch, mctx, ctx_type_to_graph_type(cparams.ctx_type)); res->reset(); @@ -3186,7 +3190,7 @@ void llama_context::opt_epoch_iter( auto * res = gf_res_prev.get(); - const auto gparams = graph_params(res, ubatch, mctx.get(), gtype); + const auto gparams = graph_params(res, ubatch, mctx.get(), ctx_type_to_graph_type(cparams.ctx_type)); res->reset(); @@ -3289,6 +3293,7 @@ llama_context_params llama_context_default_params() { /*.n_seq_max =*/ 1, /*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default /*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS, + /*.ctx_type =*/ LLAMA_CONTEXT_TYPE_DEFAULT, /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED, /*.pooling_type =*/ LLAMA_POOLING_TYPE_UNSPECIFIED, /*.attention_type =*/ LLAMA_ATTENTION_TYPE_UNSPECIFIED, @@ -3315,7 +3320,6 @@ llama_context_params llama_context_default_params() { /*.kv_unified =*/ false, /*.sampler =*/ nullptr, /*.n_sampler =*/ 0, - /*.graph_type =*/ LLAMA_GRAPH_TYPE_DEFAULT, }; return result; diff --git a/src/llama-context.h b/src/llama-context.h index 5d9efdf242a..e16ac4c618b 100644 --- 
a/src/llama-context.h +++ b/src/llama-context.h @@ -268,8 +268,6 @@ struct llama_context { llama_cparams cparams; - llm_graph_type gtype = LLM_GRAPH_TYPE_DECODER; - llama_adapter_cvec_ptr cvec; llama_adapter_loras_ptr loras; diff --git a/src/llama-cparams.h b/src/llama-cparams.h index 1e4e9e29ed8..9a4c2274407 100644 --- a/src/llama-cparams.h +++ b/src/llama-cparams.h @@ -42,6 +42,7 @@ struct llama_cparams { bool pipeline_parallel; enum llama_pooling_type pooling_type; + enum llama_context_type ctx_type; ggml_backend_sched_eval_callback cb_eval; void * cb_eval_user_data; diff --git a/src/llama-memory.h b/src/llama-memory.h index 2875d614315..4ad1612e45b 100644 --- a/src/llama-memory.h +++ b/src/llama-memory.h @@ -22,7 +22,7 @@ struct llama_memory_params { // use full-size SWA cache bool swa_full; - llm_graph_type gtype; + llama_context_type ctx_type; }; enum llama_memory_status { diff --git a/src/llama-model.cpp b/src/llama-model.cpp index e4b891ea96e..bc239cc4c44 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1946,7 +1946,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, // The MTP head is dense-attention only on hybrid Qwen3.5/3.6, so use a plain // attention KV cache for the MTP context instead of the hybrid wrapper. const bool mtp_on_hybrid_qwen35 = - params.gtype == LLM_GRAPH_TYPE_DECODER_MTP && + params.ctx_type == LLAMA_CONTEXT_TYPE_MTP && (arch == LLM_ARCH_QWEN35 || arch == LLM_ARCH_QWEN35MOE); if (llm_arch_is_recurrent(arch)) { diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 4e37c2ea850..91954531cda 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -761,7 +761,7 @@ struct server_context_impl { params_base.speculative.types.end(), COMMON_SPECULATIVE_TYPE_MTP) != params_base.speculative.types.end(); if (spec_mtp) { - cparams.graph_type = LLAMA_GRAPH_TYPE_DECODER_MTP; + cparams.ctx_type = LLAMA_CONTEXT_TYPE_MTP; } ctx_dft.reset(llama_init_from_model(model_dft.get(), cparams)); @@ -784,7 +784,7 @@ struct server_context_impl { params_base.model.path.c_str()); auto cparams_mtp = common_context_params_to_llama(params_base); - cparams_mtp.graph_type = LLAMA_GRAPH_TYPE_DECODER_MTP; + cparams_mtp.ctx_type = LLAMA_CONTEXT_TYPE_MTP; ctx_dft.reset(llama_init_from_model(model_tgt, cparams_mtp)); if (ctx_dft == nullptr) { From 46c080153d87cf17123d215319fefd4459fb16c2 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Tue, 12 May 2026 22:02:43 +0800 Subject: [PATCH 3/8] review: remove llama_model_has_mtp --- include/llama.h | 2 -- src/llama-context.cpp | 6 ++++++ src/llama-model.cpp | 3 --- tools/server/server-context.cpp | 8 -------- 4 files changed, 6 insertions(+), 13 deletions(-) diff --git a/include/llama.h b/include/llama.h index 19326b72ac4..2dcec3bb7a7 100644 --- a/include/llama.h +++ b/include/llama.h @@ -563,8 +563,6 @@ extern "C" { LLAMA_API int32_t llama_model_n_head_kv (const struct llama_model * model); LLAMA_API int32_t llama_model_n_swa (const struct llama_model * model); - LLAMA_API bool llama_model_has_mtp (const struct llama_model * model); - // Get the model's RoPE frequency scaling factor LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model); diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 26697276ec2..43e694494c8 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -3397,6 +3397,12 @@ llama_context * llama_init_from_model( model->hparams.pooling_type, params.pooling_type); } + if (params.ctx_type 
== LLAMA_CONTEXT_TYPE_MTP && model->hparams.nextn_predict_layers == 0) { + LLAMA_LOG_WARN("%s: context type MTP request but model doesn't contain MTP layers\n", __func__); + return nullptr; + } + + try { auto * ctx = new llama_context(*model, params); return ctx; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index bc239cc4c44..2a157eb1299 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2176,9 +2176,6 @@ int32_t llama_model_n_swa(const llama_model * model) { return model->hparams.n_swa; } -bool llama_model_has_mtp(const llama_model * model) { - return model->hparams.nextn_predict_layers > 0; -} uint32_t llama_model_n_cls_out(const struct llama_model * model) { return model->hparams.n_cls_out; diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 91954531cda..76b4294f897 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -772,14 +772,6 @@ struct server_context_impl { params_base.speculative.draft.ctx_dft = ctx_dft.get(); } else if (std::find(params_base.speculative.types.begin(), params_base.speculative.types.end(), COMMON_SPECULATIVE_TYPE_MTP) != params_base.speculative.types.end()) { - // MTP head lives in the *target* model — share the same llama_model and - // spin up a second context that builds the MTP graph instead of the trunk graph. - if (!llama_model_has_mtp(model_tgt)) { - SRV_ERR("MTP requested but the target model '%s' has no MTP tensors\n", - params_base.model.path.c_str()); - return false; - } - SRV_INF("creating MTP draft context against the target model '%s'\n", params_base.model.path.c_str()); From f87f0f440b81d53bfab1ca35f1f0acc7b685d055 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Tue, 12 May 2026 22:43:30 +0800 Subject: [PATCH 4/8] review: fix convert issues --- convert_hf_to_gguf.py | 52 ++++++++++++++----------------------------- 1 file changed, 17 insertions(+), 35 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 2e71b07466a..f218607438a 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -95,6 +95,7 @@ class ModelBase: gguf_writer: gguf.GGUFWriter model_name: str | None metadata_override: Path | None + metadata: gguf.Metadata dir_model_card: Path remote_hf_model_id: str | None @@ -5559,16 +5560,8 @@ class _Qwen35MtpMixin: gguf_writer: gguf.GGUFWriter block_count: int tensor_map: gguf.TensorNameMap - fname_out: Path - ftype: Any - metadata: Any - # When true, `--mtp` was passed: filter out trunk weights so the resulting - # GGUF carries only the MTP head and the shared embeddings/output tensors. mtp_only: bool = False - - # When true, `--no-mtp` was passed: drop `mtp.*` tensors and report block_count - # as the trunk-only layer count, producing a GGUF with no MTP head. no_mtp: bool = False def __init__(self, *args, **kwargs): @@ -5582,10 +5575,19 @@ def __init__(self, *args, **kwargs): def filter_tensors(cls, item): name, _ = item if name.startswith("mtp."): - # Qwen3Next drops `mtp.*` tensors; Qwen3.5/3.6 use them by default. `--no-mtp` opts out. if cls.no_mtp: return None return item + if cls.mtp_only: + # In --mtp mode, drop trunk weights and keep only the shared embeddings/output + # tensors that the standalone MTP graph references at inference time. 
+ canonical = name.replace("language_model.", "") + keep = canonical in ( + "model.embed_tokens.weight", "model.norm.weight", "lm_head.weight", + "embed_tokens.weight", "norm.weight", + ) + if not keep: + return None return super().filter_tensors(item) # ty: ignore[unresolved-attribute] def set_gguf_parameters(self): @@ -5601,38 +5603,19 @@ def prepare_metadata(self, vocab_only: bool): if not self.mtp_only: return - output_type: str = self.ftype.name.partition("_")[2] + output_type: str = self.ftype.name.partition("_")[2] # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] if self.fname_out.is_dir(): fname_default: str = gguf.naming_convention( - self.metadata.name, self.metadata.basename, self.metadata.finetune, - self.metadata.version, size_label=None, output_type=output_type, model_type=None) + self.metadata.name, self.metadata.basename, self.metadata.finetune, # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] + self.metadata.version, size_label=None, output_type=output_type, model_type=None) # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] self.fname_out = self.fname_out / f"{Path(fname_default).stem}-MTP.gguf" else: stem = self.fname_out.stem self.fname_out = self.fname_out.parent / f"{stem}-MTP{self.fname_out.suffix}" def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # Multimodal Qwen3.5/3.6 wrap the text model under `model.language_model.*`. - if name.startswith("model.language_model."): - name = "model." + name[len("model.language_model."):] - elif name.startswith("language_model."): - name = name[len("language_model."):] - - if self.mtp_only: - # In --mtp mode keep only the MTP block plus the shared embedding/output tensors - # that the standalone MTP graph references at inference time. - keep = ( - name.startswith("mtp.") or - name in ("model.embed_tokens.weight", "model.norm.weight", "lm_head.weight") or - name in ("embed_tokens.weight", "norm.weight") - ) - if not keep: - return - # Remap MTP block tensors to llama.cpp's layer-indexed nextn naming. 
-        # HF: mtp.layers.0.* (transformer block at MTP slot 0)
-        #     mtp.fc / mtp.pre_fc_norm_embedding / mtp.pre_fc_norm_hidden / mtp.norm
         if name.startswith("mtp."):
             n_layer = self.hparams["num_hidden_layers"]
             if name.find("layers.") != -1:
@@ -14262,9 +14245,11 @@ def main() -> None:
             logger.error("--mtp / --no-mtp are only supported for Qwen3.5/3.6 text variants today")
             sys.exit(1)
 
-        # set on the class so __init__ sees the correct mode when computing block_count
+        # set on the class so __init__ / filter_tensors see the correct mode
        if args.no_mtp:
            model_class.no_mtp = True
+        if args.mtp:
+            model_class.mtp_only = True
 
     model_instance = model_class(dir_model, output_type, fname_out, is_big_endian=args.bigendian,
                                  use_temp_file=args.use_temp_file,
@@ -14278,9 +14263,6 @@ def main() -> None:
                                  fuse_gate_up_exps=args.fuse_gate_up_exps
                                  )
 
-    if args.mtp:
-        model_instance.mtp_only = True
-
     if args.vocab_only:
         logger.info("Exporting model vocab...")
         model_instance.write_vocab()

From d769c578ebc38ca877a4b028619164fac887507c Mon Sep 17 00:00:00 2001
From: Aman Gupta
Date: Tue, 12 May 2026 22:54:39 +0800
Subject: [PATCH 5/8] convert: fix pycheck

---
 convert_hf_to_gguf.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index f218607438a..73f7aeb6a36 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -14247,9 +14247,9 @@ def main() -> None:
 
     # set on the class so __init__ / filter_tensors see the correct mode
     if args.no_mtp:
-        model_class.no_mtp = True
+        model_class.no_mtp = True  # ty: ignore[unresolved-attribute]
     if args.mtp:
-        model_class.mtp_only = True
+        model_class.mtp_only = True  # ty: ignore[unresolved-attribute]
 
     model_instance = model_class(dir_model, output_type, fname_out, is_big_endian=args.bigendian,
                                  use_temp_file=args.use_temp_file,

From 84f00ce76028349d603f45b936d6bb11cfd48331 Mon Sep 17 00:00:00 2001
From: Aman Gupta
Date: Tue, 12 May 2026 22:59:31 +0800
Subject: [PATCH 6/8] review: formatting

---
 include/llama.h       | 4 ++--
 src/llama-context.cpp | 7 ++++---
 src/llama-cparams.h   | 2 +-
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/include/llama.h b/include/llama.h
index 2dcec3bb7a7..b814e2c58de 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -199,8 +199,8 @@ extern "C" {
     };
 
     enum llama_context_type {
-        LLAMA_CONTEXT_TYPE_DEFAULT = 0,
-        LLAMA_CONTEXT_TYPE_MTP = 1,
+        LLAMA_CONTEXT_TYPE_DEFAULT = 0,
+        LLAMA_CONTEXT_TYPE_MTP     = 1,
     };
 
     // TODO: simplify (https://github.com/ggml-org/llama.cpp/pull/9294#pullrequestreview-2286561979)
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 43e694494c8..6ecbe1b6083 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -290,7 +290,7 @@ llama_context::llama_context(
         /*.type_k   =*/ params.type_k,
         /*.type_v   =*/ params.type_v,
         /*.swa_full =*/ params.swa_full,
-        /*.ctx_type=*/ cparams.ctx_type,
+        /*.ctx_type =*/ cparams.ctx_type,
     };
 
     memory.reset(model.create_memory(params_mem, cparams));
@@ -3397,8 +3397,9 @@ llama_context * llama_init_from_model(
             model->hparams.pooling_type, params.pooling_type);
     }
 
-    if (params.ctx_type == LLAMA_CONTEXT_TYPE_MTP && model->hparams.nextn_predict_layers == 0) {
-        LLAMA_LOG_WARN("%s: context type MTP request but model doesn't contain MTP layers\n", __func__);
+    if (params.ctx_type == LLAMA_CONTEXT_TYPE_MTP &&
+        model->hparams.nextn_predict_layers == 0) {
+        LLAMA_LOG_WARN("%s: context type MTP requested but model doesn't contain MTP layers\n", __func__);
         return nullptr;
     }
 
diff --git a/src/llama-cparams.h
b/src/llama-cparams.h index 9a4c2274407..cbf74eba63e 100644 --- a/src/llama-cparams.h +++ b/src/llama-cparams.h @@ -41,8 +41,8 @@ struct llama_cparams { bool kv_unified; bool pipeline_parallel; - enum llama_pooling_type pooling_type; enum llama_context_type ctx_type; + enum llama_pooling_type pooling_type; ggml_backend_sched_eval_callback cb_eval; void * cb_eval_user_data; From f6f29e6fb89ff8abdd9f92f84679d27dd03a1c10 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Wed, 13 May 2026 00:53:20 +0800 Subject: [PATCH 7/8] use `mtp-` for identifying mtp models --- common/download.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/common/download.cpp b/common/download.cpp index 71d8dfda00d..c1cbe2033aa 100644 --- a/common/download.cpp +++ b/common/download.cpp @@ -566,7 +566,7 @@ static hf_cache::hf_files get_split_files(const hf_cache::hf_files & files, return result; } -// pick the best sibling GGUF whose filename contains `keyword` (e.g. "mmproj" / "MTP"), +// pick the best sibling GGUF whose filename contains `keyword` (e.g. "mmproj" / "mtp"), // preferring deeper shared directory prefix with the model, then closest quantization static hf_cache::hf_file find_best_sibling(const hf_cache::hf_files & files, const std::string & model, @@ -616,7 +616,7 @@ static hf_cache::hf_file find_best_mmproj(const hf_cache::hf_files & files, static hf_cache::hf_file find_best_mtp(const hf_cache::hf_files & files, const std::string & model) { - return find_best_sibling(files, model, "MTP"); + return find_best_sibling(files, model, "mtp-"); } static bool gguf_filename_is_model(const std::string & filepath) { @@ -631,7 +631,7 @@ static bool gguf_filename_is_model(const std::string & filepath) { return filename.find("mmproj") == std::string::npos && filename.find("imatrix") == std::string::npos && - filename.find("MTP") == std::string::npos; + filename.find("mtp-") == std::string::npos; } static hf_cache::hf_file find_best_model(const hf_cache::hf_files & files, From 0712378b45828768f0f5830abae2f217e6aea962 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Wed, 13 May 2026 11:11:48 +0800 Subject: [PATCH 8/8] convert: fix mtp conversion --- convert_hf_to_gguf.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 73f7aeb6a36..0857d9e6803 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -5598,21 +5598,19 @@ def set_gguf_parameters(self): self.gguf_writer.add_nextn_predict_layers(n) def prepare_metadata(self, vocab_only: bool): + # TextModel.prepare_metadata resolves a directory fname_out into a concrete + # file path, so snapshot is_dir() first to decide whether to apply the mtp- prefix. 
+ from_dir = self.fname_out.is_dir() super().prepare_metadata(vocab_only=vocab_only) # ty: ignore[unresolved-attribute] - if not self.mtp_only: + if not self.mtp_only or not from_dir: return output_type: str = self.ftype.name.partition("_")[2] # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] - - if self.fname_out.is_dir(): - fname_default: str = gguf.naming_convention( - self.metadata.name, self.metadata.basename, self.metadata.finetune, # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] - self.metadata.version, size_label=None, output_type=output_type, model_type=None) # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] - self.fname_out = self.fname_out / f"{Path(fname_default).stem}-MTP.gguf" - else: - stem = self.fname_out.stem - self.fname_out = self.fname_out.parent / f"{stem}-MTP{self.fname_out.suffix}" + fname_default: str = gguf.naming_convention( + self.metadata.name, self.metadata.basename, self.metadata.finetune, # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] + self.metadata.version, size_label=None, output_type=output_type, model_type=None) # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] + self.fname_out = self.fname_out.parent / f"mtp-{fname_default}.gguf" def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: # Remap MTP block tensors to llama.cpp's layer-indexed nextn naming.
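
Taken together, the last two commits settle the sibling-file convention: conversion (patch 8) names a standalone head mtp-<model>.gguf, and the download side (patch 7) classifies cached GGUFs by the same marker. A compact restatement of the resulting filename rules as a sketch; the helper names here are hypothetical, while the string tests mirror the common/download.cpp changes above:

    #include <string>

    // hypothetical helper: a GGUF is treated as an MTP draft head if its
    // filename carries the "mtp-" marker introduced in patch 7
    static bool filename_is_mtp(const std::string & name) {
        return name.find("mtp-") != std::string::npos;
    }

    // hypothetical helper: a GGUF counts as a loadable trunk model only if it
    // is none of the recognized sibling artifacts
    static bool filename_is_model(const std::string & name) {
        return name.find("mmproj")  == std::string::npos &&  // multimodal projector
               name.find("imatrix") == std::string::npos &&  // importance matrix
               !filename_is_mtp(name);                       // MTP draft head
    }

With -hf and --spec-type mtp, find_best_mtp() then selects the best mtp- sibling next to the chosen trunk GGUF via find_best_sibling(), the same preference order (shared directory depth, then closest quantization) already used for mmproj discovery.
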