From 61e916c2bdb88498637a4f0d91b4eee63d071973 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Tue, 12 May 2026 16:08:03 +0800 Subject: [PATCH 1/8] MTP: clean-up --- common/arg.cpp | 27 ++- common/download.cpp | 55 ++++-- common/download.h | 7 +- common/speculative.cpp | 2 +- convert_hf_to_gguf.py | 78 +++++++- include/llama.h | 11 ++ src/llama-arch.cpp | 2 - src/llama-arch.h | 2 - src/llama-context.cpp | 16 +- src/llama-context.h | 2 + src/llama-graph.h | 1 + src/llama-memory.h | 3 + src/llama-model.cpp | 39 ++-- src/models/models.h | 30 +--- src/models/qwen35-mtp.cpp | 207 --------------------- src/models/qwen35.cpp | 254 +++++++++++++++++++++----- src/models/qwen35moe-mtp.cpp | 252 -------------------------- src/models/qwen35moe.cpp | 306 +++++++++++++++++++++++++++----- tests/test-llama-archs.cpp | 6 +- tools/server/server-context.cpp | 41 ++--- 20 files changed, 704 insertions(+), 637 deletions(-) delete mode 100644 src/models/qwen35-mtp.cpp delete mode 100644 src/models/qwen35moe-mtp.cpp diff --git a/common/arg.cpp b/common/arg.cpp index 9fefe411ee2..e3334ab4ab6 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -335,11 +335,15 @@ static bool common_params_handle_remote_preset(common_params & params, llama_exa struct handle_model_result { bool found_mmproj = false; common_params_model mmproj; + + bool found_mtp = false; + common_params_model mtp; }; static handle_model_result common_params_handle_model(struct common_params_model & model, const std::string & bearer_token, - bool offline) { + bool offline, + bool search_mtp = false) { handle_model_result result; if (!model.docker_repo.empty()) { @@ -354,7 +358,7 @@ static handle_model_result common_params_handle_model(struct common_params_model common_download_opts opts; opts.bearer_token = bearer_token; opts.offline = offline; - auto download_result = common_download_model(model, opts, true); + auto download_result = common_download_model(model, opts, true, search_mtp); if (download_result.model_path.empty()) { LOG_ERR("error: failed to download model from Hugging Face\n"); @@ -368,6 +372,11 @@ static handle_model_result common_params_handle_model(struct common_params_model result.found_mmproj = true; result.mmproj.path = download_result.mmproj_path; } + + if (!download_result.mtp_path.empty()) { + result.found_mtp = true; + result.mtp.path = download_result.mtp_path; + } } else if (!model.url.empty()) { if (model.path.empty()) { auto f = string_split(model.url, '#').front(); @@ -588,7 +597,11 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context // handle model and download if (!skip_model_download) { - auto res = common_params_handle_model(params.model, params.hf_token, params.offline); + const bool spec_type_mtp = std::find(params.speculative.types.begin(), + params.speculative.types.end(), + COMMON_SPECULATIVE_TYPE_MTP) != params.speculative.types.end(); + + auto res = common_params_handle_model(params.model, params.hf_token, params.offline, spec_type_mtp); if (params.no_mmproj) { params.mmproj = {}; } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) { @@ -602,6 +615,14 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context break; } } + // when --spec-type mtp is set and no draft model was provided explicitly, + // fall back to the MTP head discovered alongside the -hf model + if (spec_type_mtp && res.found_mtp && + params.speculative.draft.mparams.path.empty() && + params.speculative.draft.mparams.hf_repo.empty() && + 
params.speculative.draft.mparams.url.empty()) { + params.speculative.draft.mparams.path = res.mtp.path; + } common_params_handle_model(params.speculative.draft.mparams, params.hf_token, params.offline); common_params_handle_model(params.vocoder.model, params.hf_token, params.offline); } diff --git a/common/download.cpp b/common/download.cpp index d6d47b2d2fc..71d8dfda00d 100644 --- a/common/download.cpp +++ b/common/download.cpp @@ -566,8 +566,11 @@ static hf_cache::hf_files get_split_files(const hf_cache::hf_files & files, return result; } -static hf_cache::hf_file find_best_mmproj(const hf_cache::hf_files & files, - const std::string & model) { +// pick the best sibling GGUF whose filename contains `keyword` (e.g. "mmproj" / "MTP"), +// preferring deeper shared directory prefix with the model, then closest quantization +static hf_cache::hf_file find_best_sibling(const hf_cache::hf_files & files, + const std::string & model, + const std::string & keyword) { hf_cache::hf_file best; size_t best_depth = 0; int best_diff = 0; @@ -579,20 +582,20 @@ static hf_cache::hf_file find_best_mmproj(const hf_cache::hf_files & files, for (const auto & f : files) { if (!string_ends_with(f.path, ".gguf") || - f.path.find("mmproj") == std::string::npos) { + f.path.find(keyword) == std::string::npos) { continue; } - auto mmproj_parts = string_split(f.path, '/'); - auto mmproj_dir = mmproj_parts.end() - 1; + auto sib_parts = string_split(f.path, '/'); + auto sib_dir = sib_parts.end() - 1; auto [_, dir] = std::mismatch(model_parts.begin(), model_dir, - mmproj_parts.begin(), mmproj_dir); - if (dir != mmproj_dir) { + sib_parts.begin(), sib_dir); + if (dir != sib_dir) { continue; } - size_t depth = dir - mmproj_parts.begin(); + size_t depth = dir - sib_parts.begin(); auto bits = extract_quant_bits(f.path); auto diff = std::abs(bits - model_bits); @@ -606,6 +609,16 @@ static hf_cache::hf_file find_best_mmproj(const hf_cache::hf_files & files, return best; } +static hf_cache::hf_file find_best_mmproj(const hf_cache::hf_files & files, + const std::string & model) { + return find_best_sibling(files, model, "mmproj"); +} + +static hf_cache::hf_file find_best_mtp(const hf_cache::hf_files & files, + const std::string & model) { + return find_best_sibling(files, model, "MTP"); +} + static bool gguf_filename_is_model(const std::string & filepath) { if (!string_ends_with(filepath, ".gguf")) { return false; @@ -617,7 +630,8 @@ static bool gguf_filename_is_model(const std::string & filepath) { } return filename.find("mmproj") == std::string::npos && - filename.find("imatrix") == std::string::npos; + filename.find("imatrix") == std::string::npos && + filename.find("MTP") == std::string::npos; } static hf_cache::hf_file find_best_model(const hf_cache::hf_files & files, @@ -673,11 +687,13 @@ struct hf_plan { hf_cache::hf_file primary; hf_cache::hf_files model_files; hf_cache::hf_file mmproj; + hf_cache::hf_file mtp; }; static hf_plan get_hf_plan(const common_params_model & model, const common_download_opts & opts, - bool download_mmproj) { + bool download_mmproj, + bool download_mtp) { hf_plan plan; hf_cache::hf_files all; @@ -723,6 +739,10 @@ static hf_plan get_hf_plan(const common_params_model & model, plan.mmproj = find_best_mmproj(all, primary.path); } + if (download_mtp) { + plan.mtp = find_best_mtp(all, primary.path); + } + return plan; } @@ -756,7 +776,8 @@ static std::vector get_url_tasks(const common_params_model & mode common_download_model_result common_download_model(const common_params_model & model, const 
common_download_opts & opts, - bool download_mmproj) { + bool download_mmproj, + bool download_mtp) { common_download_model_result result; std::vector tasks; hf_plan hf; @@ -764,13 +785,16 @@ common_download_model_result common_download_model(const common_params_model & bool is_hf = !model.hf_repo.empty(); if (is_hf) { - hf = get_hf_plan(model, opts, download_mmproj); + hf = get_hf_plan(model, opts, download_mmproj, download_mtp); for (const auto & f : hf.model_files) { tasks.push_back({f.url, f.local_path}); } if (!hf.mmproj.path.empty()) { tasks.push_back({hf.mmproj.url, hf.mmproj.local_path}); } + if (!hf.mtp.path.empty()) { + tasks.push_back({hf.mtp.url, hf.mtp.local_path}); + } } else if (!model.url.empty()) { tasks = get_url_tasks(model); } else { @@ -807,6 +831,10 @@ common_download_model_result common_download_model(const common_params_model & if (!hf.mmproj.path.empty()) { result.mmproj_path = hf_cache::finalize_file(hf.mmproj); } + + if (!hf.mtp.path.empty()) { + result.mtp_path = hf_cache::finalize_file(hf.mtp); + } } else { result.model_path = model.path; } @@ -946,7 +974,8 @@ std::vector common_list_cached_models() { for (const auto & f : files) { auto split = get_gguf_split_info(f.path); if (split.index != 1 || split.tag.empty() || - split.prefix.find("mmproj") != std::string::npos) { + split.prefix.find("mmproj") != std::string::npos || + split.prefix.find("MTP") != std::string::npos) { continue; } if (seen.insert(f.repo_id + ":" + split.tag).second) { diff --git a/common/download.h b/common/download.h index edc3e9f1a71..4a169ef7796 100644 --- a/common/download.h +++ b/common/download.h @@ -59,6 +59,7 @@ struct common_download_opts { struct common_download_model_result { std::string model_path; std::string mmproj_path; + std::string mtp_path; }; // Download model from HuggingFace repo or URL @@ -83,12 +84,14 @@ struct common_download_model_result { // when opts.offline=true, no network requests are made // when download_mmproj=true, searches for mmproj in same directory as model or any parent directory // then with the closest quantization bits +// when download_mtp=true, applies the same sibling search for an MTP-head GGUF // -// returns result with model_path and mmproj_path (empty on failure) +// returns result with model_path, mmproj_path and mtp_path (empty when not found / on failure) common_download_model_result common_download_model( const common_params_model & model, const common_download_opts & opts = {}, - bool download_mmproj = false + bool download_mmproj = false, + bool download_mtp = false ); // returns list of cached models diff --git a/common/speculative.cpp b/common/speculative.cpp index 01b826db344..5135c48821d 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -1198,7 +1198,7 @@ common_speculative * common_speculative_init(common_params_speculative & params, LOG_WRN("%s: draft model is not specified - cannot use 'draft' type\n", __func__); has_draft = false; } - } else if (has_draft_model) { + } else if (has_draft_model && !has_mtp && !has_draft_eagle3) { LOG_WRN("%s: draft model is specified but 'draft' speculative type is not explicitly enabled - enabling it\n", __func__); has_draft = true; } diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index f0f9d2545b5..2e71b07466a 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -5559,17 +5559,59 @@ class _Qwen35MtpMixin: gguf_writer: gguf.GGUFWriter block_count: int tensor_map: gguf.TensorNameMap + fname_out: Path + ftype: Any + metadata: Any + + # When true, `--mtp` 
was passed: filter out trunk weights so the resulting + # GGUF carries only the MTP head and the shared embeddings/output tensors. + mtp_only: bool = False + + # When true, `--no-mtp` was passed: drop `mtp.*` tensors and report block_count + # as the trunk-only layer count, producing a GGUF with no MTP head. + no_mtp: bool = False def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.block_count = self.hparams["num_hidden_layers"] + self.hparams.get("mtp_num_hidden_layers", 0) + self.block_count = self.hparams["num_hidden_layers"] + if not self.no_mtp: + self.block_count += self.hparams.get("mtp_num_hidden_layers", 0) self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) + @classmethod + def filter_tensors(cls, item): + name, _ = item + if name.startswith("mtp."): + # Qwen3Next drops `mtp.*` tensors; Qwen3.5/3.6 use them by default. `--no-mtp` opts out. + if cls.no_mtp: + return None + return item + return super().filter_tensors(item) # ty: ignore[unresolved-attribute] + def set_gguf_parameters(self): super().set_gguf_parameters() # ty: ignore[unresolved-attribute] + if self.no_mtp: + return if (n := self.hparams.get("mtp_num_hidden_layers", 0)) > 0: self.gguf_writer.add_nextn_predict_layers(n) + def prepare_metadata(self, vocab_only: bool): + super().prepare_metadata(vocab_only=vocab_only) # ty: ignore[unresolved-attribute] + + if not self.mtp_only: + return + + output_type: str = self.ftype.name.partition("_")[2] + + if self.fname_out.is_dir(): + fname_default: str = gguf.naming_convention( + self.metadata.name, self.metadata.basename, self.metadata.finetune, + self.metadata.version, size_label=None, output_type=output_type, model_type=None) + self.fname_out = self.fname_out / f"{Path(fname_default).stem}-MTP.gguf" + else: + stem = self.fname_out.stem + self.fname_out = self.fname_out.parent / f"{stem}-MTP{self.fname_out.suffix}" + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: # Multimodal Qwen3.5/3.6 wrap the text model under `model.language_model.*`. if name.startswith("model.language_model."): @@ -5577,6 +5619,17 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter elif name.startswith("language_model."): name = name[len("language_model."):] + if self.mtp_only: + # In --mtp mode keep only the MTP block plus the shared embedding/output tensors + # that the standalone MTP graph references at inference time. + keep = ( + name.startswith("mtp.") or + name in ("model.embed_tokens.weight", "model.norm.weight", "lm_head.weight") or + name in ("embed_tokens.weight", "norm.weight") + ) + if not keep: + return + # Remap MTP block tensors to llama.cpp's layer-indexed nextn naming. # HF: mtp.layers.0.* (transformer block at MTP slot 0) # mtp.fc / mtp.pre_fc_norm_embedding / mtp.pre_fc_norm_hidden / mtp.norm @@ -14034,6 +14087,14 @@ def parse_args() -> argparse.Namespace: "--mmproj", action="store_true", help="(Experimental) Export multimodal projector (mmproj) for vision models. This will only work on some vision models. A prefix 'mmproj-' will be added to the output file name.", ) + parser.add_argument( + "--mtp", action="store_true", + help="(Experimental) Export only the multi-token prediction (MTP) head as a separate GGUF, suitable for use as a speculative draft. 
Output file name will get a '-MTP' suffix.", + ) + parser.add_argument( + "--no-mtp", action="store_true", + help="(Experimental) Exclude the multi-token prediction (MTP) head from the converted GGUF. Pair with --mtp on a second run to publish trunk and MTP as two files. Note: the split form duplicates embeddings, so the bundled default is more space-efficient overall.", + ) parser.add_argument( "--mistral-format", action="store_true", help="Whether the model is stored following the Mistral format.", @@ -14193,6 +14254,18 @@ def main() -> None: else: model_class = MistralModel + if args.mtp and args.no_mtp: + logger.error("--mtp and --no-mtp are mutually exclusive") + sys.exit(1) + + if (args.mtp or args.no_mtp) and not issubclass(model_class, _Qwen35MtpMixin): + logger.error("--mtp / --no-mtp are only supported for Qwen3.5/3.6 text variants today") + sys.exit(1) + + # set on the class so __init__ sees the correct mode when computing block_count + if args.no_mtp: + model_class.no_mtp = True + model_instance = model_class(dir_model, output_type, fname_out, is_big_endian=args.bigendian, use_temp_file=args.use_temp_file, eager=args.no_lazy, @@ -14205,6 +14278,9 @@ def main() -> None: fuse_gate_up_exps=args.fuse_gate_up_exps ) + if args.mtp: + model_instance.mtp_only = True + if args.vocab_only: logger.info("Exporting model vocab...") model_instance.write_vocab() diff --git a/include/llama.h b/include/llama.h index 1b896944735..bb34b47102a 100644 --- a/include/llama.h +++ b/include/llama.h @@ -198,6 +198,13 @@ extern "C" { LLAMA_SPLIT_MODE_TENSOR = 3, }; + enum llama_graph_type { + LLAMA_GRAPH_TYPE_DEFAULT = 0, + LLAMA_GRAPH_TYPE_ENCODER = 1, + LLAMA_GRAPH_TYPE_DECODER = 2, + LLAMA_GRAPH_TYPE_DECODER_MTP = 3, + }; + // TODO: simplify (https://github.com/ggml-org/llama.cpp/pull/9294#pullrequestreview-2286561979) typedef struct llama_token_data { llama_token id; // token id @@ -383,6 +390,8 @@ extern "C" { // note: the samplers must be sampler chains (i.e. 
use llama_sampler_chain_init) struct llama_sampler_seq_config * samplers; size_t n_samplers; + + enum llama_graph_type graph_type; }; struct llama_model_tensor_override { @@ -557,6 +566,8 @@ extern "C" { LLAMA_API int32_t llama_model_n_head_kv (const struct llama_model * model); LLAMA_API int32_t llama_model_n_swa (const struct llama_model * model); + LLAMA_API bool llama_model_has_mtp (const struct llama_model * model); + // Get the model's RoPE frequency scaling factor LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model); diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 794666d09a4..ab4334da79b 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -41,8 +41,6 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_QWEN3VLMOE, "qwen3vlmoe" }, { LLM_ARCH_QWEN35, "qwen35" }, { LLM_ARCH_QWEN35MOE, "qwen35moe" }, - { LLM_ARCH_QWEN35_MTP, "qwen35_mtp" }, - { LLM_ARCH_QWEN35MOE_MTP, "qwen35moe_mtp" }, { LLM_ARCH_PHI2, "phi2" }, { LLM_ARCH_PHI3, "phi3" }, { LLM_ARCH_PHIMOE, "phimoe" }, diff --git a/src/llama-arch.h b/src/llama-arch.h index 71c2ca6e6b3..e37d548c98e 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -45,8 +45,6 @@ enum llm_arch { LLM_ARCH_QWEN3VLMOE, LLM_ARCH_QWEN35, LLM_ARCH_QWEN35MOE, - LLM_ARCH_QWEN35_MTP, - LLM_ARCH_QWEN35MOE_MTP, LLM_ARCH_PHI2, LLM_ARCH_PHI3, LLM_ARCH_PHIMOE, diff --git a/src/llama-context.cpp b/src/llama-context.cpp index aea8a0a4e81..47ada0b75a2 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -66,6 +66,14 @@ llama_context::llama_context( cparams.cb_eval = params.cb_eval; cparams.cb_eval_user_data = params.cb_eval_user_data; + switch (params.graph_type) { + case LLAMA_GRAPH_TYPE_DEFAULT: gtype = LLM_GRAPH_TYPE_DEFAULT; break; + case LLAMA_GRAPH_TYPE_ENCODER: gtype = LLM_GRAPH_TYPE_ENCODER; break; + case LLAMA_GRAPH_TYPE_DECODER: gtype = LLM_GRAPH_TYPE_DECODER; break; + case LLAMA_GRAPH_TYPE_DECODER_MTP: gtype = LLM_GRAPH_TYPE_DECODER_MTP; break; + default: throw std::runtime_error("invalid graph_type"); + } + // Initialize backend samplers here so they are part of the sampling graph // before the reserve passes run later in this function. This avoids a later // re-reserve when graph nodes change. 
@@ -279,6 +287,7 @@ llama_context::llama_context( /*.type_k =*/ params.type_k, /*.type_v =*/ params.type_v, /*.swa_full =*/ params.swa_full, + /*.gtype =*/ gtype, }; memory.reset(model.create_memory(params_mem, cparams)); @@ -1738,7 +1747,7 @@ int llama_context::decode(const llama_batch & batch_inp) { } ggml_status status; - const auto * res = process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, mctx.get(), status); + const auto * res = process_ubatch(ubatch, gtype, mctx.get(), status); if (!res) { // the last ubatch failed or was aborted -> remove all positions of that ubatch from the memory module @@ -2198,7 +2207,7 @@ ggml_cgraph * llama_context::graph_reserve( auto * res = gf_res_reserve.get(); - const auto gparams = graph_params(res, ubatch, mctx, LLM_GRAPH_TYPE_DEFAULT); + const auto gparams = graph_params(res, ubatch, mctx, gtype); res->reset(); @@ -3177,7 +3186,7 @@ void llama_context::opt_epoch_iter( auto * res = gf_res_prev.get(); - const auto gparams = graph_params(res, ubatch, mctx.get(), LLM_GRAPH_TYPE_DEFAULT); + const auto gparams = graph_params(res, ubatch, mctx.get(), gtype); res->reset(); @@ -3306,6 +3315,7 @@ llama_context_params llama_context_default_params() { /*.kv_unified =*/ false, /*.sampler =*/ nullptr, /*.n_sampler =*/ 0, + /*.graph_type =*/ LLAMA_GRAPH_TYPE_DEFAULT, }; return result; diff --git a/src/llama-context.h b/src/llama-context.h index e16ac4c618b..5d9efdf242a 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -268,6 +268,8 @@ struct llama_context { llama_cparams cparams; + llm_graph_type gtype = LLM_GRAPH_TYPE_DECODER; + llama_adapter_cvec_ptr cvec; llama_adapter_loras_ptr loras; diff --git a/src/llama-graph.h b/src/llama-graph.h index d3cd69a674c..9e55d0a675e 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -32,6 +32,7 @@ enum llm_graph_type { LLM_GRAPH_TYPE_DEFAULT, LLM_GRAPH_TYPE_ENCODER, LLM_GRAPH_TYPE_DECODER, + LLM_GRAPH_TYPE_DECODER_MTP, }; enum llm_ffn_op_type { diff --git a/src/llama-memory.h b/src/llama-memory.h index 4a157b91fdb..2875d614315 100644 --- a/src/llama-memory.h +++ b/src/llama-memory.h @@ -1,6 +1,7 @@ #pragma once #include "llama.h" +#include "llama-graph.h" #include #include @@ -20,6 +21,8 @@ struct llama_memory_params { // use full-size SWA cache bool swa_full; + + llm_graph_type gtype; }; enum llama_memory_status { diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 6abfbfb3e3b..e4b891ea96e 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -276,10 +276,6 @@ static llama_model * llama_model_mapping(llm_arch arch, const llama_model_params return new llama_model_qwen35(params); case LLM_ARCH_QWEN35MOE: return new llama_model_qwen35moe(params); - case LLM_ARCH_QWEN35_MTP: - return new llama_model_qwen35_mtp(params); - case LLM_ARCH_QWEN35MOE_MTP: - return new llama_model_qwen35moe_mtp(params); case LLM_ARCH_MISTRAL3: return new llama_model_mistral3(params); case LLM_ARCH_MIMO2: @@ -1409,8 +1405,7 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) { } } - const bool partial_load = (arch == LLM_ARCH_QWEN35_MTP || arch == LLM_ARCH_QWEN35MOE_MTP); - ml.done_getting_tensors(partial_load); + ml.done_getting_tensors(); // populate tensors_by_name for (auto & [_, ctx_ptr] : ml.ctx_map) { @@ -1948,6 +1943,12 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, // checks default: { + // The MTP head is dense-attention only on hybrid Qwen3.5/3.6, so use a plain + // attention KV cache for the MTP context instead of the hybrid wrapper. 
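+                // Example (hypothetical layer counts): with n_layer = 25 and
+                // nextn_predict_layers = 1, n_main = 24, so the trunk context keeps the
+                // hybrid cache for layers 0..23 (split attention/recurrent via
+                // is_recurrent()), while a context created with
+                // LLAMA_GRAPH_TYPE_DECODER_MTP allocates a plain unified cache filtered
+                // to layer 24 only (see the filter lambdas below).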
+ const bool mtp_on_hybrid_qwen35 = + params.gtype == LLM_GRAPH_TYPE_DECODER_MTP && + (arch == LLM_ARCH_QWEN35 || arch == LLM_ARCH_QWEN35MOE); + if (llm_arch_is_recurrent(arch)) { res = new llama_memory_recurrent( *this, @@ -1957,7 +1958,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, std::max((uint32_t) 1, cparams.n_seq_max), cparams.n_seq_max, nullptr); - } else if (llm_arch_is_hybrid(arch)) { + } else if (llm_arch_is_hybrid(arch) && !mtp_on_hybrid_qwen35) { // The main difference between hybrid architectures is the // layer filters, so pick the right one here llama_memory_hybrid::layer_filter_cb filter_attn = nullptr; @@ -1972,6 +1973,14 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, filter_recr = [&](int32_t il) { return hparams.is_recurrent(il) && hparams.n_ff(il) == 0; }; + } else if (arch == LLM_ARCH_QWEN35 || arch == LLM_ARCH_QWEN35MOE) { + const uint32_t n_main = hparams.n_layer - hparams.nextn_predict_layers; + filter_attn = [&, n_main](int32_t il) { + return (uint32_t)il < n_main && !hparams.is_recurrent(il); + }; + filter_recr = [&, n_main](int32_t il) { + return (uint32_t)il < n_main && hparams.is_recurrent(il); + }; } if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) { @@ -2014,6 +2023,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, } } else { llama_memory_i::layer_reuse_cb reuse = nullptr; + llama_kv_cache::layer_filter_cb filter = nullptr; if (arch == LLM_ARCH_GEMMA3N || arch == LLM_ARCH_GEMMA4) { reuse = [&](int32_t il) { @@ -2025,6 +2035,11 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, }; } + if (mtp_on_hybrid_qwen35) { + const uint32_t n_main = hparams.n_layer - hparams.nextn_predict_layers; + filter = [n_main](int32_t il) { return (uint32_t)il >= n_main; }; + } + if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) { GGML_ASSERT(hparams.is_swa_any()); @@ -2040,7 +2055,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, cparams.n_seq_max, cparams.n_ubatch, 1, - nullptr, + filter, reuse); } else { GGML_ASSERT(!hparams.is_swa_any()); @@ -2057,7 +2072,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, 1, hparams.n_swa, hparams.swa_type, - nullptr, + filter, nullptr); } } @@ -2161,6 +2176,10 @@ int32_t llama_model_n_swa(const llama_model * model) { return model->hparams.n_swa; } +bool llama_model_has_mtp(const llama_model * model) { + return model->hparams.nextn_predict_layers > 0; +} + uint32_t llama_model_n_cls_out(const struct llama_model * model) { return model->hparams.n_cls_out; } @@ -2328,8 +2347,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_QWEN3VLMOE: case LLM_ARCH_QWEN35: case LLM_ARCH_QWEN35MOE: - case LLM_ARCH_QWEN35_MTP: - case LLM_ARCH_QWEN35MOE_MTP: return LLAMA_ROPE_TYPE_IMROPE; case LLM_ARCH_GLM4: diff --git a/src/models/models.h b/src/models/models.h index 1f04d313d13..fe95b9b89ad 100644 --- a/src/models/models.h +++ b/src/models/models.h @@ -1739,6 +1739,10 @@ struct llama_model_qwen35 : public llama_model_base { const llama_model & model; }; + struct graph_mtp : public llm_graph_context { + graph_mtp(const llama_model & model, const llm_graph_params & params); + }; + std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; @@ -1781,30 +1785,8 @@ struct llama_model_qwen35moe : public llama_model_base { const llama_model & model; }; - std::unique_ptr build_arch_graph(const llm_graph_params & 
params) const override; -}; - - -struct llama_model_qwen35_mtp : public llama_model_base { - llama_model_qwen35_mtp(const struct llama_model_params & params) : llama_model_base(params) {} - void load_arch_hparams(llama_model_loader & ml) override; - void load_arch_tensors(llama_model_loader & ml) override; - - struct graph : public llm_graph_context { - graph(const llama_model & model, const llm_graph_params & params); - }; - - std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; -}; - - -struct llama_model_qwen35moe_mtp : public llama_model_base { - llama_model_qwen35moe_mtp(const struct llama_model_params & params) : llama_model_base(params) {} - void load_arch_hparams(llama_model_loader & ml) override; - void load_arch_tensors(llama_model_loader & ml) override; - - struct graph : public llm_graph_context { - graph(const llama_model & model, const llm_graph_params & params); + struct graph_mtp : public llm_graph_context { + graph_mtp(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; diff --git a/src/models/qwen35-mtp.cpp b/src/models/qwen35-mtp.cpp deleted file mode 100644 index 83039e98db5..00000000000 --- a/src/models/qwen35-mtp.cpp +++ /dev/null @@ -1,207 +0,0 @@ -#include "models.h" - -void llama_model_qwen35_mtp::load_arch_hparams(llama_model_loader & ml) { - ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true); - - ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false); - GGML_ASSERT(hparams.nextn_predict_layers > 0 && "QWEN35_MTP requires nextn_predict_layers > 0"); - GGML_ASSERT(hparams.nextn_predict_layers <= hparams.n_layer); - - // only the MTP layers get a KV cache, trunk layers are skipped. - hparams.kv_only_nextn = true; - hparams.n_layer_kv_from_start = -1; - for (uint32_t i = 0; i < hparams.n_layer; ++i) { - hparams.recurrent_layer_arr[i] = false; - } - - type = LLM_TYPE_UNKNOWN; -} - -void llama_model_qwen35_mtp::load_arch_tensors(llama_model_loader &) { - LLAMA_LOAD_LOCALS; - - tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0); - output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, TENSOR_NOT_REQUIRED); - output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED); - if (output == nullptr) { - output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED); - } - - const uint32_t n_main = n_layer - hparams.nextn_predict_layers; - for (int i = 0; i < n_layer; ++i) { - if (static_cast(i) < n_main) { - continue; // trunk layer — owned by the sibling QWEN35 model - } - - auto & layer = layers[i]; - - // MTP block looks like a full-attention Qwen3.5 decoder block. 
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0); - layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0); - - create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head * 2, n_embd_k_gqa, n_embd_v_gqa, 0); - layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0); - layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0); - layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0); - - layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); - layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); - layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); - - // NextN-specific tensors that define the MTP block. - layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, 0); - layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, 0); - layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, 0); - layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED); - layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED); - layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, TENSOR_NOT_REQUIRED); - } -} - -std::unique_ptr llama_model_qwen35_mtp::build_arch_graph(const llm_graph_params & params) const { - return std::make_unique(*this, params); -} - -// LLM_ARCH_QWEN35_MTP draft head for Qwen3.5/3.6 dense series -llama_model_qwen35_mtp::graph::graph(const llama_model & model, const llm_graph_params & params) - : llm_graph_context(params) { - GGML_ASSERT(hparams.nextn_predict_layers > 0 && "QWEN35_MTP requires nextn_predict_layers > 0"); - GGML_ASSERT(hparams.nextn_predict_layers == 1 && "QWEN35_MTP currently only supports a single MTP block"); - - const int64_t n_embd_head = hparams.n_embd_head_v(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); - - // The MTP block lives at the source file's original layer index. - const int il = (int) hparams.n_layer - (int) hparams.nextn_predict_layers; - const auto & layer = model.layers[il]; - - GGML_ASSERT(layer.nextn.eh_proj && "MTP block missing nextn.eh_proj"); - GGML_ASSERT(layer.nextn.enorm && "MTP block missing nextn.enorm"); - GGML_ASSERT(layer.nextn.hnorm && "MTP block missing nextn.hnorm"); - - int sections[4]; - std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections); - - auto inp = std::make_unique(hparams.n_embd); - - inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - ggml_set_input(inp->tokens); - - inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd, n_tokens); - ggml_set_input(inp->embd); - ggml_set_name(inp->embd, "mtp_h_input"); - - ggml_tensor * tok_embd_w = layer.nextn.embed_tokens ? 
layer.nextn.embed_tokens : model.tok_embd; - - ggml_tensor * h_input = inp->embd; - ggml_tensor * tok_embd = ggml_get_rows(ctx0, tok_embd_w, inp->tokens); - cb(tok_embd, "mtp_tok_embd", il); - - res->add_input(std::move(inp)); - - ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv(); - - ggml_tensor * h_norm = build_norm(h_input, layer.nextn.hnorm, nullptr, LLM_NORM_RMS, il); - cb(h_norm, "mtp_hnorm", il); - - ggml_tensor * e_norm = build_norm(tok_embd, layer.nextn.enorm, nullptr, LLM_NORM_RMS, il); - cb(e_norm, "mtp_enorm", il); - - ggml_tensor * concat = ggml_concat(ctx0, e_norm, h_norm, /*dim=*/ 0); - cb(concat, "mtp_concat", il); - - ggml_tensor * cur = build_lora_mm(layer.nextn.eh_proj, concat); - cb(cur, "mtp_eh_proj", il); - - ggml_tensor * inpSA = cur; - - cur = build_norm(cur, layer.attn_norm, nullptr, LLM_NORM_RMS, il); - cb(cur, "mtp_attn_norm", il); - - ggml_tensor * Qcur_full = build_lora_mm(layer.wq, cur, layer.wq_s); - cb(Qcur_full, "mtp_Qcur_full", il); - - ggml_tensor * Qcur = ggml_view_3d(ctx0, Qcur_full, - n_embd_head, n_head, n_tokens, - ggml_element_size(Qcur_full) * n_embd_head * 2, - ggml_element_size(Qcur_full) * n_embd_head * 2 * n_head, - 0); - Qcur = build_norm(Qcur, layer.attn_q_norm, nullptr, LLM_NORM_RMS, il); - cb(Qcur, "mtp_Qcur_normed", il); - - ggml_tensor * gate = ggml_view_3d(ctx0, Qcur_full, - n_embd_head, n_head, n_tokens, - ggml_element_size(Qcur_full) * n_embd_head * 2, - ggml_element_size(Qcur_full) * n_embd_head * 2 * n_head, - ggml_element_size(Qcur_full) * n_embd_head); - gate = ggml_cont_2d(ctx0, gate, n_embd_head * n_head, n_tokens); - cb(gate, "mtp_gate", il); - - ggml_tensor * Kcur = build_lora_mm(layer.wk, cur, layer.wk_s); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - Kcur = build_norm(Kcur, layer.attn_k_norm, nullptr, LLM_NORM_RMS, il); - cb(Kcur, "mtp_Kcur_normed", il); - - ggml_tensor * Vcur = build_lora_mm(layer.wv, cur, layer.wv_s); - Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); - cb(Vcur, "mtp_Vcur", il); - - Qcur = ggml_rope_multi(ctx0, Qcur, inp_pos, nullptr, - n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow); - Kcur = ggml_rope_multi(ctx0, Kcur, inp_pos, nullptr, - n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow); - - const float kq_scale = hparams.f_attention_scale == 0.0f - ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale; - - cur = build_attn(inp_attn, - nullptr, nullptr, nullptr, - Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il); - cb(cur, "mtp_attn_pregate", il); - - cur = ggml_mul(ctx0, cur, ggml_sigmoid(ctx0, gate)); - cur = build_lora_mm(layer.wo, cur, layer.wo_s); - cb(cur, "mtp_attn_out", il); - - cur = ggml_add(ctx0, cur, inpSA); - cb(cur, "mtp_attn_residual", il); - - ggml_tensor * ffn_residual = cur; - cur = build_norm(cur, layer.attn_post_norm, nullptr, LLM_NORM_RMS, il); - cb(cur, "mtp_attn_post_norm", il); - - cur = build_ffn(cur, - layer.ffn_up, nullptr, layer.ffn_up_s, - layer.ffn_gate, nullptr, layer.ffn_gate_s, - layer.ffn_down, nullptr, layer.ffn_down_s, - nullptr, - LLM_FFN_SILU, LLM_FFN_PAR, il); - cb(cur, "mtp_ffn_out", il); - - cur = ggml_add(ctx0, cur, ffn_residual); - cb(cur, "mtp_post_ffn", il); - - // Pre-norm hidden state: used by the AR draft loop to seed the next MTP step. - // (In the trunk graph this is `t_h_pre_norm`; the MTP head reuses the same slot.) 
- cb(cur, "h_pre_norm", -1); - res->t_h_pre_norm = cur; - - ggml_tensor * head_norm_w = layer.nextn.shared_head_norm - ? layer.nextn.shared_head_norm - : model.output_norm; - GGML_ASSERT(head_norm_w && "QWEN35_MTP: missing both nextn.shared_head_norm and output_norm"); - cur = build_norm(cur, head_norm_w, nullptr, LLM_NORM_RMS, -1); - cb(cur, "mtp_shared_head_norm", -1); - - ggml_tensor * head_w = layer.nextn.shared_head_head ? layer.nextn.shared_head_head : model.output; - GGML_ASSERT(head_w && "QWEN35_MTP: missing LM head (nextn.shared_head_head or model.output)"); - cur = build_lora_mm(head_w, cur); - cb(cur, "result_output", -1); - - res->t_logits = cur; - ggml_build_forward_expand(gf, cur); -} diff --git a/src/models/qwen35.cpp b/src/models/qwen35.cpp index 79fdd8f679b..ca4297e94f3 100644 --- a/src/models/qwen35.cpp +++ b/src/models/qwen35.cpp @@ -15,7 +15,6 @@ void llama_model_qwen35::load_arch_hparams(llama_model_loader & ml) { // NextN/MTP (Qwen3.5/3.6): extra decoder block appended beyond the main stack ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false); GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer"); - hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers; // Mark recurrent layers (linear attention layers). MTP layers are dense // attention-only and must be flagged non-recurrent. @@ -36,9 +35,14 @@ void llama_model_qwen35::load_arch_hparams(llama_model_loader & ml) { } } -void llama_model_qwen35::load_arch_tensors(llama_model_loader &) { +void llama_model_qwen35::load_arch_tensors(llama_model_loader & ml) { LLAMA_LOAD_LOCALS; + const uint32_t n_main = n_layer - hparams.nextn_predict_layers; + const bool mtp_only = (hparams.nextn_predict_layers > 0) && + (ml.get_weight("blk.0.attn_norm.weight") == nullptr); + const int trunk_flags = mtp_only ? 
TENSOR_NOT_REQUIRED : 0; + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0); // output @@ -50,60 +54,85 @@ void llama_model_qwen35::load_arch_tensors(llama_model_loader &) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED); } - // Calculate dimensions from hyperparameters - const int64_t head_k_dim = hparams.ssm_d_state; - const int64_t head_v_dim = hparams.ssm_d_state; - const int64_t n_k_heads = hparams.ssm_n_group; - const int64_t n_v_heads = hparams.ssm_dt_rank; - const int64_t key_dim = head_k_dim * n_k_heads; - const int64_t value_dim = head_v_dim * n_v_heads; - const int64_t conv_dim = key_dim * 2 + value_dim; + auto load_block_trunk = [&](int il, int flags) { + auto & layer = layers[il]; - for (int i = 0; i < n_layer; ++i) { - auto & layer = layers[i]; + // Calculate dimensions from hyperparameters + const int64_t head_k_dim = hparams.ssm_d_state; + const int64_t head_v_dim = hparams.ssm_d_state; + const int64_t n_k_heads = hparams.ssm_n_group; + const int64_t n_v_heads = hparams.ssm_dt_rank; + const int64_t key_dim = head_k_dim * n_k_heads; + const int64_t value_dim = head_v_dim * n_v_heads; + const int64_t conv_dim = key_dim * 2 + value_dim; - layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0); - layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0); + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", il), { n_embd }, flags); + layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", il), { n_embd }, flags); - if (!hparams.is_recurrent(i)) { + if (!hparams.is_recurrent(il)) { // Attention layers - create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head * 2, n_embd_k_gqa, n_embd_v_gqa, 0); - layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0); + create_tensor_qkv(layer, il, n_embd, n_embd_head_k * n_head * 2, n_embd_k_gqa, n_embd_v_gqa, flags); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", il), { n_embd_head_k * n_head, n_embd }, flags); // Q/K normalization for attention layers - layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0); - layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0); + layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", il), { n_embd_head_k }, flags); + layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", il), { n_embd_head_k }, flags); } else { // Linear attention (gated delta net) specific tensors // Create tensors with calculated dimensions - layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, key_dim * 2 + value_dim }, TENSOR_NOT_REQUIRED); - layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), { n_embd, value_dim }, TENSOR_NOT_REQUIRED); - layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0); - layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0); - layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN, i), { hparams.ssm_dt_rank }, 0); - layer.ssm_beta = create_tensor(tn(LLM_TENSOR_SSM_BETA, "weight", i), { n_embd, n_v_heads }, 0); - layer.ssm_alpha = create_tensor(tn(LLM_TENSOR_SSM_ALPHA, "weight", i), { n_embd, n_v_heads }, 0); - layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), { head_v_dim }, 0); - 
layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), { value_dim, n_embd }, 0); + layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", il), { n_embd, key_dim * 2 + value_dim }, TENSOR_NOT_REQUIRED); + layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", il), { n_embd, value_dim }, TENSOR_NOT_REQUIRED); + layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", il), { hparams.ssm_d_conv, conv_dim }, flags); + layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", il), { hparams.ssm_dt_rank }, flags); + layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN, il), { hparams.ssm_dt_rank }, flags); + layer.ssm_beta = create_tensor(tn(LLM_TENSOR_SSM_BETA, "weight", il), { n_embd, n_v_heads }, flags); + layer.ssm_alpha = create_tensor(tn(LLM_TENSOR_SSM_ALPHA, "weight", il), { n_embd, n_v_heads }, flags); + layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", il), { head_v_dim }, flags); + layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", il), { value_dim, n_embd }, flags); } - layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); - layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); - layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); - - // NextN/MTP tensors (preserved but unused) - only bound on MTP layers - if (hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers) { - layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, TENSOR_NOT_REQUIRED); - layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, TENSOR_NOT_REQUIRED); - layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, TENSOR_NOT_REQUIRED); - layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED); - layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED); - layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, TENSOR_NOT_REQUIRED); - } + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", il), {n_embd, n_ff}, flags); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", il), { n_ff, n_embd}, flags); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", il), {n_embd, n_ff}, flags); + }; + + auto load_block_mtp = [&](int il) { + auto & layer = layers[il]; + + // MTP block looks like a full-attention Qwen3.5 decoder block. 
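+        // Concretely: gated attention (fused Q+gate projection, hence the
+        // n_embd_head_k * n_head * 2 width below), per-head Q/K RMS norms, and a
+        // SwiGLU FFN, plus the NextN tensors that splice the head onto the trunk's
+        // pre-norm hidden state.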
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", il), { n_embd }, 0); + layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", il), { n_embd }, 0); + + create_tensor_qkv(layer, il, n_embd, n_embd_head_k * n_head * 2, n_embd_k_gqa, n_embd_v_gqa, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", il), { n_embd_head_k * n_head, n_embd }, 0); + layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", il), { n_embd_head_k }, 0); + layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", il), { n_embd_head_k }, 0); + + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", il), {n_embd, n_ff}, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", il), { n_ff, n_embd}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", il), {n_embd, n_ff}, 0); + + // NextN-specific tensors that define the MTP block. + layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", il), { 2 * n_embd, n_embd }, 0); + layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", il), { n_embd }, 0); + layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", il), { n_embd }, 0); + layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", il), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED); + layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", il), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED); + layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", il), { n_embd }, TENSOR_NOT_REQUIRED); + }; + + for (int i = 0; i < (int) n_main; ++i) { + load_block_trunk(i, trunk_flags); + } + for (int i = (int) n_main; i < n_layer; ++i) { + load_block_mtp(i); } } std::unique_ptr llama_model_qwen35::build_arch_graph(const llm_graph_params & params) const { + if (params.gtype == LLM_GRAPH_TYPE_DECODER_MTP) { + return std::make_unique(*this, params); + } return std::make_unique(*this, params); } @@ -493,3 +522,146 @@ ggml_tensor * llama_model_qwen35::graph::build_layer_ffn(ggml_tensor * cur, cons return cur; } + +// LLM_GRAPH_TYPE_DECODER_MTP draft head for Qwen3.5/3.6 dense series +llama_model_qwen35::graph_mtp::graph_mtp(const llama_model & model, const llm_graph_params & params) + : llm_graph_context(params) { + GGML_ASSERT(hparams.nextn_predict_layers > 0 && "QWEN35 MTP requires nextn_predict_layers > 0"); + GGML_ASSERT(hparams.nextn_predict_layers == 1 && "QWEN35 MTP currently only supports a single MTP block"); + + const int64_t n_embd_head = hparams.n_embd_head_v(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); + + // The MTP block lives at the source file's original layer index. 
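+    // (e.g. a 24-layer trunk with nextn_predict_layers == 1 stores the head at
+    // il == 24, since hparams.n_layer counts trunk plus MTP layers)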
+ const int il = (int) hparams.n_layer - (int) hparams.nextn_predict_layers; + const auto & layer = model.layers[il]; + + GGML_ASSERT(layer.nextn.eh_proj && "MTP block missing nextn.eh_proj"); + GGML_ASSERT(layer.nextn.enorm && "MTP block missing nextn.enorm"); + GGML_ASSERT(layer.nextn.hnorm && "MTP block missing nextn.hnorm"); + + int sections[4]; + std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections); + + auto inp = std::make_unique(hparams.n_embd); + + inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + ggml_set_input(inp->tokens); + + inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd, n_tokens); + ggml_set_input(inp->embd); + ggml_set_name(inp->embd, "mtp_h_input"); + + ggml_tensor * tok_embd_w = layer.nextn.embed_tokens ? layer.nextn.embed_tokens : model.tok_embd; + + ggml_tensor * h_input = inp->embd; + ggml_tensor * tok_embd = ggml_get_rows(ctx0, tok_embd_w, inp->tokens); + cb(tok_embd, "mtp_tok_embd", il); + + res->add_input(std::move(inp)); + + ggml_tensor * inp_pos = build_inp_pos(); + auto * inp_attn = build_attn_inp_kv(); + + ggml_tensor * h_norm = build_norm(h_input, layer.nextn.hnorm, nullptr, LLM_NORM_RMS, il); + cb(h_norm, "mtp_hnorm", il); + + ggml_tensor * e_norm = build_norm(tok_embd, layer.nextn.enorm, nullptr, LLM_NORM_RMS, il); + cb(e_norm, "mtp_enorm", il); + + ggml_tensor * concat = ggml_concat(ctx0, e_norm, h_norm, /*dim=*/ 0); + cb(concat, "mtp_concat", il); + + ggml_tensor * cur = build_lora_mm(layer.nextn.eh_proj, concat); + cb(cur, "mtp_eh_proj", il); + + ggml_tensor * inpSA = cur; + + cur = build_norm(cur, layer.attn_norm, nullptr, LLM_NORM_RMS, il); + cb(cur, "mtp_attn_norm", il); + + ggml_tensor * Qcur_full = build_lora_mm(layer.wq, cur, layer.wq_s); + cb(Qcur_full, "mtp_Qcur_full", il); + + ggml_tensor * Qcur = ggml_view_3d(ctx0, Qcur_full, + n_embd_head, n_head, n_tokens, + ggml_element_size(Qcur_full) * n_embd_head * 2, + ggml_element_size(Qcur_full) * n_embd_head * 2 * n_head, + 0); + Qcur = build_norm(Qcur, layer.attn_q_norm, nullptr, LLM_NORM_RMS, il); + cb(Qcur, "mtp_Qcur_normed", il); + + ggml_tensor * gate = ggml_view_3d(ctx0, Qcur_full, + n_embd_head, n_head, n_tokens, + ggml_element_size(Qcur_full) * n_embd_head * 2, + ggml_element_size(Qcur_full) * n_embd_head * 2 * n_head, + ggml_element_size(Qcur_full) * n_embd_head); + gate = ggml_cont_2d(ctx0, gate, n_embd_head * n_head, n_tokens); + cb(gate, "mtp_gate", il); + + ggml_tensor * Kcur = build_lora_mm(layer.wk, cur, layer.wk_s); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Kcur = build_norm(Kcur, layer.attn_k_norm, nullptr, LLM_NORM_RMS, il); + cb(Kcur, "mtp_Kcur_normed", il); + + ggml_tensor * Vcur = build_lora_mm(layer.wv, cur, layer.wv_s); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + cb(Vcur, "mtp_Vcur", il); + + Qcur = ggml_rope_multi(ctx0, Qcur, inp_pos, nullptr, + n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + Kcur = ggml_rope_multi(ctx0, Kcur, inp_pos, nullptr, + n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + + const float kq_scale = hparams.f_attention_scale == 0.0f + ? 
1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale; + + cur = build_attn(inp_attn, + nullptr, nullptr, nullptr, + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il); + cb(cur, "mtp_attn_pregate", il); + + cur = ggml_mul(ctx0, cur, ggml_sigmoid(ctx0, gate)); + cur = build_lora_mm(layer.wo, cur, layer.wo_s); + cb(cur, "mtp_attn_out", il); + + cur = ggml_add(ctx0, cur, inpSA); + cb(cur, "mtp_attn_residual", il); + + ggml_tensor * ffn_residual = cur; + cur = build_norm(cur, layer.attn_post_norm, nullptr, LLM_NORM_RMS, il); + cb(cur, "mtp_attn_post_norm", il); + + cur = build_ffn(cur, + layer.ffn_up, nullptr, layer.ffn_up_s, + layer.ffn_gate, nullptr, layer.ffn_gate_s, + layer.ffn_down, nullptr, layer.ffn_down_s, + nullptr, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "mtp_ffn_out", il); + + cur = ggml_add(ctx0, cur, ffn_residual); + cb(cur, "mtp_post_ffn", il); + + // Pre-norm hidden state: used by the AR draft loop to seed the next MTP step. + // (In the trunk graph this is `t_h_pre_norm`; the MTP head reuses the same slot.) + cb(cur, "h_pre_norm", -1); + res->t_h_pre_norm = cur; + + ggml_tensor * head_norm_w = layer.nextn.shared_head_norm + ? layer.nextn.shared_head_norm + : model.output_norm; + GGML_ASSERT(head_norm_w && "QWEN35 MTP: missing both nextn.shared_head_norm and output_norm"); + cur = build_norm(cur, head_norm_w, nullptr, LLM_NORM_RMS, -1); + cb(cur, "mtp_shared_head_norm", -1); + + ggml_tensor * head_w = layer.nextn.shared_head_head ? layer.nextn.shared_head_head : model.output; + GGML_ASSERT(head_w && "QWEN35 MTP: missing LM head (nextn.shared_head_head or model.output)"); + cur = build_lora_mm(head_w, cur); + cb(cur, "result_output", -1); + + res->t_logits = cur; + ggml_build_forward_expand(gf, cur); +} diff --git a/src/models/qwen35moe-mtp.cpp b/src/models/qwen35moe-mtp.cpp deleted file mode 100644 index 9f662213bee..00000000000 --- a/src/models/qwen35moe-mtp.cpp +++ /dev/null @@ -1,252 +0,0 @@ -#include "models.h" - -void llama_model_qwen35moe_mtp::load_arch_hparams(llama_model_loader & ml) { - ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false); - ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false); - ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true); - - ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false); - GGML_ASSERT(hparams.nextn_predict_layers > 0 && "QWEN35MOE_MTP requires nextn_predict_layers > 0"); - GGML_ASSERT(hparams.nextn_predict_layers <= hparams.n_layer); - GGML_ASSERT(hparams.n_expert > 0 && "QWEN35MOE_MTP requires n_expert > 0"); - - // only the MTP layers get a KV cache, trunk layers are skipped. - hparams.kv_only_nextn = true; - hparams.n_layer_kv_from_start = -1; - for (uint32_t i = 0; i < hparams.n_layer; ++i) { - hparams.recurrent_layer_arr[i] = false; - } - - type = LLM_TYPE_UNKNOWN; -} - -void llama_model_qwen35moe_mtp::load_arch_tensors(llama_model_loader &) { - LLAMA_LOAD_LOCALS; - - tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0); - output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, TENSOR_NOT_REQUIRED); - output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED); - if (output == nullptr) { - output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED); - } - - const int64_t n_ff_exp = hparams.n_ff_exp ? 
hparams.n_ff_exp : n_ff / n_expert_used; - const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff; - - const uint32_t n_main = n_layer - hparams.nextn_predict_layers; - for (int i = 0; i < n_layer; ++i) { - if (static_cast(i) < n_main) { - continue; // trunk layer — owned by the sibling QWEN35MOE model - } - - auto & layer = layers[i]; - - // MTP block looks like a full-attention Qwen3.5 decoder block with MoE FFN. - layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0); - layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0); - - create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head * 2, n_embd_k_gqa, n_embd_v_gqa, 0); - layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0); - layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0); - layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0); - - // Routed experts - layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0); - layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0); - create_tensor_gate_up_exps(layer, i, n_embd, n_ff_exp, n_expert, 0); - - // Shared experts - layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), { n_embd }, 0); - layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp }, 0); - layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp }, 0); - layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_shexp, n_embd }, 0); - - // NextN-specific tensors that define the MTP block. 
- layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, 0); - layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, 0); - layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, 0); - layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED); - layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED); - layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, TENSOR_NOT_REQUIRED); - } -} - -std::unique_ptr llama_model_qwen35moe_mtp::build_arch_graph(const llm_graph_params & params) const { - return std::make_unique(*this, params); -} - -// LLM_ARCH_QWEN35MOE_MTP draft head for Qwen3.5/3.6 MoE -llama_model_qwen35moe_mtp::graph::graph(const llama_model & model, const llm_graph_params & params) - : llm_graph_context(params) { - GGML_ASSERT(hparams.nextn_predict_layers > 0 && "QWEN35MOE_MTP requires nextn_predict_layers > 0"); - GGML_ASSERT(hparams.nextn_predict_layers == 1 && "QWEN35MOE_MTP currently only supports a single MTP block"); - - const int64_t n_embd_head = hparams.n_embd_head_v(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); - - const int il = (int) hparams.n_layer - (int) hparams.nextn_predict_layers; - const auto & layer = model.layers[il]; - - GGML_ASSERT(layer.nextn.eh_proj && "MTP block missing nextn.eh_proj"); - GGML_ASSERT(layer.nextn.enorm && "MTP block missing nextn.enorm"); - GGML_ASSERT(layer.nextn.hnorm && "MTP block missing nextn.hnorm"); - GGML_ASSERT(layer.ffn_gate_inp && "MTP block missing ffn_gate_inp"); - - int sections[4]; - std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections); - - auto inp = std::make_unique(hparams.n_embd); - - inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - ggml_set_input(inp->tokens); - - inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd, n_tokens); - ggml_set_input(inp->embd); - ggml_set_name(inp->embd, "mtp_h_input"); - - ggml_tensor * tok_embd_w = layer.nextn.embed_tokens ? 
layer.nextn.embed_tokens : model.tok_embd; - - ggml_tensor * h_input = inp->embd; - ggml_tensor * tok_embd = ggml_get_rows(ctx0, tok_embd_w, inp->tokens); - cb(tok_embd, "mtp_tok_embd", il); - - res->add_input(std::move(inp)); - - ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv(); - - ggml_tensor * h_norm = build_norm(h_input, layer.nextn.hnorm, nullptr, LLM_NORM_RMS, il); - cb(h_norm, "mtp_hnorm", il); - - ggml_tensor * e_norm = build_norm(tok_embd, layer.nextn.enorm, nullptr, LLM_NORM_RMS, il); - cb(e_norm, "mtp_enorm", il); - - ggml_tensor * concat = ggml_concat(ctx0, e_norm, h_norm, /*dim=*/ 0); - cb(concat, "mtp_concat", il); - - ggml_tensor * cur = build_lora_mm(layer.nextn.eh_proj, concat); - cb(cur, "mtp_eh_proj", il); - - ggml_tensor * inpSA = cur; - - cur = build_norm(cur, layer.attn_norm, nullptr, LLM_NORM_RMS, il); - cb(cur, "mtp_attn_norm", il); - - ggml_tensor * Qcur_full = build_lora_mm(layer.wq, cur, layer.wq_s); - cb(Qcur_full, "mtp_Qcur_full", il); - - ggml_tensor * Qcur = ggml_view_3d(ctx0, Qcur_full, - n_embd_head, n_head, n_tokens, - ggml_element_size(Qcur_full) * n_embd_head * 2, - ggml_element_size(Qcur_full) * n_embd_head * 2 * n_head, - 0); - Qcur = build_norm(Qcur, layer.attn_q_norm, nullptr, LLM_NORM_RMS, il); - cb(Qcur, "mtp_Qcur_normed", il); - - ggml_tensor * gate = ggml_view_3d(ctx0, Qcur_full, - n_embd_head, n_head, n_tokens, - ggml_element_size(Qcur_full) * n_embd_head * 2, - ggml_element_size(Qcur_full) * n_embd_head * 2 * n_head, - ggml_element_size(Qcur_full) * n_embd_head); - gate = ggml_cont_2d(ctx0, gate, n_embd_head * n_head, n_tokens); - cb(gate, "mtp_gate", il); - - ggml_tensor * Kcur = build_lora_mm(layer.wk, cur, layer.wk_s); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - Kcur = build_norm(Kcur, layer.attn_k_norm, nullptr, LLM_NORM_RMS, il); - cb(Kcur, "mtp_Kcur_normed", il); - - ggml_tensor * Vcur = build_lora_mm(layer.wv, cur, layer.wv_s); - Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); - cb(Vcur, "mtp_Vcur", il); - - Qcur = ggml_rope_multi(ctx0, Qcur, inp_pos, nullptr, - n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow); - Kcur = ggml_rope_multi(ctx0, Kcur, inp_pos, nullptr, - n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow); - - const float kq_scale = hparams.f_attention_scale == 0.0f - ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale; - - cur = build_attn(inp_attn, - nullptr, nullptr, nullptr, - Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il); - cb(cur, "mtp_attn_pregate", il); - - cur = ggml_mul(ctx0, cur, ggml_sigmoid(ctx0, gate)); - cur = build_lora_mm(layer.wo, cur, layer.wo_s); - cb(cur, "mtp_attn_out", il); - - cur = ggml_add(ctx0, cur, inpSA); - cb(cur, "mtp_attn_residual", il); - - ggml_tensor * ffn_residual = cur; - cur = build_norm(cur, layer.attn_post_norm, nullptr, LLM_NORM_RMS, il); - cb(cur, "mtp_attn_post_norm", il); - - // MoE FFN — routed experts plus gated shared expert (mirrors qwen35moe). 
- ggml_tensor * moe_out = - build_moe_ffn(cur, - layer.ffn_gate_inp, - layer.ffn_up_exps, - layer.ffn_gate_exps, - layer.ffn_down_exps, - nullptr, - n_expert, n_expert_used, - LLM_FFN_SILU, true, - hparams.expert_weights_scale, - LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il, - nullptr, layer.ffn_gate_up_exps, - layer.ffn_up_exps_s, - layer.ffn_gate_exps_s, - layer.ffn_down_exps_s); - cb(moe_out, "mtp_ffn_moe_out", il); - - if (layer.ffn_up_shexp != nullptr) { - ggml_tensor * ffn_shexp = - build_ffn(cur, - layer.ffn_up_shexp, nullptr, layer.ffn_up_shexp_s, - layer.ffn_gate_shexp, nullptr, layer.ffn_gate_shexp_s, - layer.ffn_down_shexp, nullptr, layer.ffn_down_shexp_s, - nullptr, - LLM_FFN_SILU, LLM_FFN_PAR, il); - cb(ffn_shexp, "mtp_ffn_shexp", il); - - ggml_tensor * shared_gate = build_lora_mm(layer.ffn_gate_inp_shexp, cur); - shared_gate = ggml_sigmoid(ctx0, shared_gate); - cb(shared_gate, "mtp_shared_expert_gate_sigmoid", il); - - ffn_shexp = ggml_mul(ctx0, ffn_shexp, shared_gate); - cb(ffn_shexp, "mtp_ffn_shexp_gated", il); - - cur = ggml_add(ctx0, moe_out, ffn_shexp); - } else { - cur = moe_out; - } - cb(cur, "mtp_ffn_out", il); - - cur = ggml_add(ctx0, cur, ffn_residual); - cb(cur, "mtp_post_ffn", il); - - // Pre-norm hidden state: used by the AR draft loop to seed the next MTP step. - cb(cur, "h_pre_norm", -1); - res->t_h_pre_norm = cur; - - ggml_tensor * head_norm_w = layer.nextn.shared_head_norm - ? layer.nextn.shared_head_norm - : model.output_norm; - GGML_ASSERT(head_norm_w && "QWEN35MOE_MTP: missing both nextn.shared_head_norm and output_norm"); - cur = build_norm(cur, head_norm_w, nullptr, LLM_NORM_RMS, -1); - cb(cur, "mtp_shared_head_norm", -1); - - ggml_tensor * head_w = layer.nextn.shared_head_head ? layer.nextn.shared_head_head : model.output; - GGML_ASSERT(head_w && "QWEN35MOE_MTP: missing LM head (nextn.shared_head_head or model.output)"); - cur = build_lora_mm(head_w, cur); - cb(cur, "result_output", -1); - - res->t_logits = cur; - ggml_build_forward_expand(gf, cur); -} diff --git a/src/models/qwen35moe.cpp b/src/models/qwen35moe.cpp index 5912aa38153..a4c7cb6ad14 100644 --- a/src/models/qwen35moe.cpp +++ b/src/models/qwen35moe.cpp @@ -18,7 +18,6 @@ void llama_model_qwen35moe::load_arch_hparams(llama_model_loader & ml) { // NextN/MTP (Qwen3.5/3.6): extra decoder block appended beyond the main stack ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false); GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer"); - hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers; // Mark recurrent layers (linear attention layers). MTP layers are dense // attention-only and must be flagged non-recurrent. @@ -39,9 +38,14 @@ void llama_model_qwen35moe::load_arch_hparams(llama_model_loader & ml) { } } -void llama_model_qwen35moe::load_arch_tensors(llama_model_loader &) { +void llama_model_qwen35moe::load_arch_tensors(llama_model_loader & ml) { LLAMA_LOAD_LOCALS; + const uint32_t n_main = n_layer - hparams.nextn_predict_layers; + const bool mtp_only = (hparams.nextn_predict_layers > 0) && + (ml.get_weight("blk.0.attn_norm.weight") == nullptr); + const int trunk_flags = mtp_only ? 
TENSOR_NOT_REQUIRED : 0; + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0); // output @@ -53,70 +57,105 @@ void llama_model_qwen35moe::load_arch_tensors(llama_model_loader &) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED); } - const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used; + auto load_block_trunk = [&](int il, int flags) { + auto & layer = layers[il]; - // Calculate dimensions from hyperparameters - const int64_t head_k_dim = hparams.ssm_d_state; - const int64_t head_v_dim = hparams.ssm_d_state; - const int64_t n_k_heads = hparams.ssm_n_group; - const int64_t n_v_heads = hparams.ssm_dt_rank; - const int64_t key_dim = head_k_dim * n_k_heads; - const int64_t value_dim = head_v_dim * n_v_heads; - const int64_t conv_dim = key_dim * 2 + value_dim; + const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used; + const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff; - for (int i = 0; i < n_layer; ++i) { - auto & layer = layers[i]; + // Calculate dimensions from hyperparameters + const int64_t head_k_dim = hparams.ssm_d_state; + const int64_t head_v_dim = hparams.ssm_d_state; + const int64_t n_k_heads = hparams.ssm_n_group; + const int64_t n_v_heads = hparams.ssm_dt_rank; + const int64_t key_dim = head_k_dim * n_k_heads; + const int64_t value_dim = head_v_dim * n_v_heads; + const int64_t conv_dim = key_dim * 2 + value_dim; - layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0); - layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0); + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", il), { n_embd }, flags); + layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", il), { n_embd }, flags); - if (!hparams.is_recurrent(i)) { + if (!hparams.is_recurrent(il)) { // Attention layers - create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head * 2, n_embd_k_gqa, n_embd_v_gqa, 0); - layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0); + create_tensor_qkv(layer, il, n_embd, n_embd_head_k * n_head * 2, n_embd_k_gqa, n_embd_v_gqa, flags); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", il), { n_embd_head_k * n_head, n_embd }, flags); // Q/K normalization for attention layers - layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0); - layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0); + layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", il), { n_embd_head_k }, flags); + layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", il), { n_embd_head_k }, flags); } else { // Linear attention (gated delta net) specific tensors // Create tensors with calculated dimensions - layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, key_dim * 2 + value_dim }, TENSOR_NOT_REQUIRED); - layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), { n_embd, value_dim }, TENSOR_NOT_REQUIRED); - layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0); - layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0); - layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN, i), { hparams.ssm_dt_rank }, 0); - layer.ssm_beta = 
create_tensor(tn(LLM_TENSOR_SSM_BETA, "weight", i), { n_embd, n_v_heads }, 0); - layer.ssm_alpha = create_tensor(tn(LLM_TENSOR_SSM_ALPHA, "weight", i), { n_embd, n_v_heads }, 0); - layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), { head_v_dim }, 0); - layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), { value_dim, n_embd }, 0); + layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", il), { n_embd, key_dim * 2 + value_dim }, TENSOR_NOT_REQUIRED); + layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", il), { n_embd, value_dim }, TENSOR_NOT_REQUIRED); + layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", il), { hparams.ssm_d_conv, conv_dim }, flags); + layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", il), { hparams.ssm_dt_rank }, flags); + layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN, il), { hparams.ssm_dt_rank }, flags); + layer.ssm_beta = create_tensor(tn(LLM_TENSOR_SSM_BETA, "weight", il), { n_embd, n_v_heads }, flags); + layer.ssm_alpha = create_tensor(tn(LLM_TENSOR_SSM_ALPHA, "weight", il), { n_embd, n_v_heads }, flags); + layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", il), { head_v_dim }, flags); + layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", il), { value_dim, n_embd }, flags); } - layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0); - layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0); - create_tensor_gate_up_exps(layer, i, n_embd, n_ff_exp, n_expert, 0); + // Routed experts + layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", il), { n_embd, n_expert }, flags); + layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", il), { n_ff_exp, n_embd, n_expert }, flags); + create_tensor_gate_up_exps(layer, il, n_embd, n_ff_exp, n_expert, flags); // Shared experts + layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", il), { n_embd }, flags); + layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", il), { n_embd, n_ff_shexp }, flags); + layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", il), { n_embd, n_ff_shexp }, flags); + layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", il), { n_ff_shexp, n_embd }, flags); + }; + + auto load_block_mtp = [&](int il) { + auto & layer = layers[il]; + + const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used; const int64_t n_ff_shexp = hparams.n_ff_shexp ? 
hparams.n_ff_shexp : n_ff;
-        layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), { n_embd }, 0);
-        layer.ffn_gate_shexp     = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp }, 0);
-        layer.ffn_up_shexp       = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp }, 0);
-        layer.ffn_down_shexp     = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_shexp, n_embd }, 0);
-
-        // NextN/MTP tensors (preserved but unused) - only bound on MTP layers
-        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
-            layer.nextn.eh_proj          = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, TENSOR_NOT_REQUIRED);
-            layer.nextn.enorm            = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, TENSOR_NOT_REQUIRED);
-            layer.nextn.hnorm            = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, TENSOR_NOT_REQUIRED);
-            layer.nextn.embed_tokens     = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
-            layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
-            layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, TENSOR_NOT_REQUIRED);
-        }
+        // MTP block looks like a full-attention Qwen3.5 decoder block with MoE FFN.
+        layer.attn_norm      = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", il), { n_embd }, 0);
+        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", il), { n_embd }, 0);
+
+        create_tensor_qkv(layer, il, n_embd, n_embd_head_k * n_head * 2, n_embd_k_gqa, n_embd_v_gqa, 0);
+        layer.wo          = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", il), { n_embd_head_k * n_head, n_embd }, 0);
+        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", il), { n_embd_head_k }, 0);
+        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", il), { n_embd_head_k }, 0);
+
+        // Routed experts
+        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", il), { n_embd, n_expert }, 0);
+        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", il), { n_ff_exp, n_embd, n_expert }, 0);
+        create_tensor_gate_up_exps(layer, il, n_embd, n_ff_exp, n_expert, 0);
+
+        // Shared experts
+        layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", il), { n_embd }, 0);
+        layer.ffn_gate_shexp     = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", il), { n_embd, n_ff_shexp }, 0);
+        layer.ffn_up_shexp       = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", il), { n_embd, n_ff_shexp }, 0);
+        layer.ffn_down_shexp     = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", il), { n_ff_shexp, n_embd }, 0);
+
+        // NextN-specific tensors that define the MTP block.
+        layer.nextn.eh_proj          = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", il), { 2 * n_embd, n_embd }, 0);
+        layer.nextn.enorm            = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", il), { n_embd }, 0);
+        layer.nextn.hnorm            = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", il), { n_embd }, 0);
+        layer.nextn.embed_tokens     = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", il), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
+        layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", il), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
+        layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", il), { n_embd }, TENSOR_NOT_REQUIRED);
+    };
+
+    for (int i = 0; i < (int) n_main; ++i) {
+        load_block_trunk(i, trunk_flags);
+    }
+    for (int i = (int) n_main; i < n_layer; ++i) {
+        load_block_mtp(i);
+    }
 }
 
 std::unique_ptr<llm_graph_context> llama_model_qwen35moe::build_arch_graph(const llm_graph_params & params) const {
+    if (params.gtype == LLM_GRAPH_TYPE_DECODER_MTP) {
+        return std::make_unique<graph_mtp>(*this, params);
+    }
     return std::make_unique<graph>(*this, params);
 }
 
@@ -547,3 +586,178 @@ ggml_tensor * llama_model_qwen35moe::graph::build_layer_ffn(ggml_tensor * cur, c
 
     return cur;
 }
+
+// LLM_GRAPH_TYPE_DECODER_MTP draft head for Qwen3.5/3.6 MoE
+llama_model_qwen35moe::graph_mtp::graph_mtp(const llama_model & model, const llm_graph_params & params)
+    : llm_graph_context(params) {
+    GGML_ASSERT(hparams.nextn_predict_layers > 0 && "QWEN35MOE MTP requires nextn_predict_layers > 0");
+    GGML_ASSERT(hparams.nextn_predict_layers == 1 && "QWEN35MOE MTP currently only supports a single MTP block");
+
+    const int64_t n_embd_head = hparams.n_embd_head_v();
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+
+    const int il = (int) hparams.n_layer - (int) hparams.nextn_predict_layers;
+    const auto & layer = model.layers[il];
+
+    GGML_ASSERT(layer.nextn.eh_proj && "MTP block missing nextn.eh_proj");
+    GGML_ASSERT(layer.nextn.enorm && "MTP block missing nextn.enorm");
+    GGML_ASSERT(layer.nextn.hnorm && "MTP block missing nextn.hnorm");
+    GGML_ASSERT(layer.ffn_gate_inp && "MTP block missing ffn_gate_inp");
+
+    int sections[4];
+    std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
+
+    auto inp = std::make_unique<llm_graph_input_embd>(hparams.n_embd);
+
+    inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+    ggml_set_input(inp->tokens);
+
+    inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd, n_tokens);
+    ggml_set_input(inp->embd);
+    ggml_set_name(inp->embd, "mtp_h_input");
+
+    ggml_tensor * tok_embd_w = layer.nextn.embed_tokens ?
layer.nextn.embed_tokens : model.tok_embd; + + ggml_tensor * h_input = inp->embd; + ggml_tensor * tok_embd = ggml_get_rows(ctx0, tok_embd_w, inp->tokens); + cb(tok_embd, "mtp_tok_embd", il); + + res->add_input(std::move(inp)); + + ggml_tensor * inp_pos = build_inp_pos(); + auto * inp_attn = build_attn_inp_kv(); + + ggml_tensor * h_norm = build_norm(h_input, layer.nextn.hnorm, nullptr, LLM_NORM_RMS, il); + cb(h_norm, "mtp_hnorm", il); + + ggml_tensor * e_norm = build_norm(tok_embd, layer.nextn.enorm, nullptr, LLM_NORM_RMS, il); + cb(e_norm, "mtp_enorm", il); + + ggml_tensor * concat = ggml_concat(ctx0, e_norm, h_norm, /*dim=*/ 0); + cb(concat, "mtp_concat", il); + + ggml_tensor * cur = build_lora_mm(layer.nextn.eh_proj, concat); + cb(cur, "mtp_eh_proj", il); + + ggml_tensor * inpSA = cur; + + cur = build_norm(cur, layer.attn_norm, nullptr, LLM_NORM_RMS, il); + cb(cur, "mtp_attn_norm", il); + + ggml_tensor * Qcur_full = build_lora_mm(layer.wq, cur, layer.wq_s); + cb(Qcur_full, "mtp_Qcur_full", il); + + ggml_tensor * Qcur = ggml_view_3d(ctx0, Qcur_full, + n_embd_head, n_head, n_tokens, + ggml_element_size(Qcur_full) * n_embd_head * 2, + ggml_element_size(Qcur_full) * n_embd_head * 2 * n_head, + 0); + Qcur = build_norm(Qcur, layer.attn_q_norm, nullptr, LLM_NORM_RMS, il); + cb(Qcur, "mtp_Qcur_normed", il); + + ggml_tensor * gate = ggml_view_3d(ctx0, Qcur_full, + n_embd_head, n_head, n_tokens, + ggml_element_size(Qcur_full) * n_embd_head * 2, + ggml_element_size(Qcur_full) * n_embd_head * 2 * n_head, + ggml_element_size(Qcur_full) * n_embd_head); + gate = ggml_cont_2d(ctx0, gate, n_embd_head * n_head, n_tokens); + cb(gate, "mtp_gate", il); + + ggml_tensor * Kcur = build_lora_mm(layer.wk, cur, layer.wk_s); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Kcur = build_norm(Kcur, layer.attn_k_norm, nullptr, LLM_NORM_RMS, il); + cb(Kcur, "mtp_Kcur_normed", il); + + ggml_tensor * Vcur = build_lora_mm(layer.wv, cur, layer.wv_s); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + cb(Vcur, "mtp_Vcur", il); + + Qcur = ggml_rope_multi(ctx0, Qcur, inp_pos, nullptr, + n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + Kcur = ggml_rope_multi(ctx0, Kcur, inp_pos, nullptr, + n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + + const float kq_scale = hparams.f_attention_scale == 0.0f + ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale; + + cur = build_attn(inp_attn, + nullptr, nullptr, nullptr, + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il); + cb(cur, "mtp_attn_pregate", il); + + cur = ggml_mul(ctx0, cur, ggml_sigmoid(ctx0, gate)); + cur = build_lora_mm(layer.wo, cur, layer.wo_s); + cb(cur, "mtp_attn_out", il); + + cur = ggml_add(ctx0, cur, inpSA); + cb(cur, "mtp_attn_residual", il); + + ggml_tensor * ffn_residual = cur; + cur = build_norm(cur, layer.attn_post_norm, nullptr, LLM_NORM_RMS, il); + cb(cur, "mtp_attn_post_norm", il); + + // MoE FFN — routed experts plus gated shared expert (mirrors qwen35moe). 
+ ggml_tensor * moe_out = + build_moe_ffn(cur, + layer.ffn_gate_inp, + layer.ffn_up_exps, + layer.ffn_gate_exps, + layer.ffn_down_exps, + nullptr, + n_expert, n_expert_used, + LLM_FFN_SILU, true, + hparams.expert_weights_scale, + LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il, + nullptr, layer.ffn_gate_up_exps, + layer.ffn_up_exps_s, + layer.ffn_gate_exps_s, + layer.ffn_down_exps_s); + cb(moe_out, "mtp_ffn_moe_out", il); + + if (layer.ffn_up_shexp != nullptr) { + ggml_tensor * ffn_shexp = + build_ffn(cur, + layer.ffn_up_shexp, nullptr, layer.ffn_up_shexp_s, + layer.ffn_gate_shexp, nullptr, layer.ffn_gate_shexp_s, + layer.ffn_down_shexp, nullptr, layer.ffn_down_shexp_s, + nullptr, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(ffn_shexp, "mtp_ffn_shexp", il); + + ggml_tensor * shared_gate = build_lora_mm(layer.ffn_gate_inp_shexp, cur); + shared_gate = ggml_sigmoid(ctx0, shared_gate); + cb(shared_gate, "mtp_shared_expert_gate_sigmoid", il); + + ffn_shexp = ggml_mul(ctx0, ffn_shexp, shared_gate); + cb(ffn_shexp, "mtp_ffn_shexp_gated", il); + + cur = ggml_add(ctx0, moe_out, ffn_shexp); + } else { + cur = moe_out; + } + cb(cur, "mtp_ffn_out", il); + + cur = ggml_add(ctx0, cur, ffn_residual); + cb(cur, "mtp_post_ffn", il); + + // Pre-norm hidden state: used by the AR draft loop to seed the next MTP step. + cb(cur, "h_pre_norm", -1); + res->t_h_pre_norm = cur; + + ggml_tensor * head_norm_w = layer.nextn.shared_head_norm + ? layer.nextn.shared_head_norm + : model.output_norm; + GGML_ASSERT(head_norm_w && "QWEN35MOE MTP: missing both nextn.shared_head_norm and output_norm"); + cur = build_norm(cur, head_norm_w, nullptr, LLM_NORM_RMS, -1); + cb(cur, "mtp_shared_head_norm", -1); + + ggml_tensor * head_w = layer.nextn.shared_head_head ? layer.nextn.shared_head_head : model.output; + GGML_ASSERT(head_w && "QWEN35MOE MTP: missing LM head (nextn.shared_head_head or model.output)"); + cur = build_lora_mm(head_w, cur); + cb(cur, "result_output", -1); + + res->t_logits = cur; + ggml_build_forward_expand(gf, cur); +} diff --git a/tests/test-llama-archs.cpp b/tests/test-llama-archs.cpp index fd0d3696d77..03d7c19c78b 100644 --- a/tests/test-llama-archs.cpp +++ b/tests/test-llama-archs.cpp @@ -406,11 +406,7 @@ static bool arch_supported(const llm_arch arch) { if (arch == LLM_ARCH_DEEPSEEK2OCR) { return false; } - if (arch == LLM_ARCH_QWEN35_MTP || arch == LLM_ARCH_QWEN35MOE_MTP) { - return false; // MTP-only arch; requires a sibling trunk model and cannot run standalone. 
- } - - // FIXME some models are segfaulting with WebGPU: +// FIXME some models are segfaulting with WebGPU: #ifdef GGML_USE_WEBGPU if (arch == LLM_ARCH_QWEN3NEXT || arch == LLM_ARCH_QWEN35 || arch == LLM_ARCH_QWEN35MOE || arch == LLM_ARCH_KIMI_LINEAR) { return false; diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 9430c31b4c5..4e37c2ea850 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -756,6 +756,14 @@ struct server_context_impl { } auto cparams = common_context_params_to_llama(params_dft); + + const bool spec_mtp = std::find(params_base.speculative.types.begin(), + params_base.speculative.types.end(), + COMMON_SPECULATIVE_TYPE_MTP) != params_base.speculative.types.end(); + if (spec_mtp) { + cparams.graph_type = LLAMA_GRAPH_TYPE_DECODER_MTP; + } + ctx_dft.reset(llama_init_from_model(model_dft.get(), cparams)); ctx_dft_seq_rm_type = common_context_can_seq_rm(ctx_dft.get()); @@ -764,36 +772,21 @@ struct server_context_impl { params_base.speculative.draft.ctx_dft = ctx_dft.get(); } else if (std::find(params_base.speculative.types.begin(), params_base.speculative.types.end(), COMMON_SPECULATIVE_TYPE_MTP) != params_base.speculative.types.end()) { - // MTP head lives in the *target* GGUF — load it as a sibling model - // with override_arch and feed it through the existing ctx_dft slot. - char trunk_arch[64] = {0}; - llama_model_meta_val_str(model_tgt, "general.architecture", trunk_arch, sizeof(trunk_arch)); - - const char * mtp_arch = nullptr; - if (std::string(trunk_arch) == "qwen35") { - mtp_arch = "qwen35_mtp"; - } else if (std::string(trunk_arch) == "qwen35moe") { - mtp_arch = "qwen35moe_mtp"; - } else { - SRV_ERR("MTP not supported for trunk architecture '%s'\n", trunk_arch); + // MTP head lives in the *target* model — share the same llama_model and + // spin up a second context that builds the MTP graph instead of the trunk graph. 
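
The two comment lines above describe the core design change in this patch: one llama_model serving both the target context and the MTP draft context. A minimal sketch of that wiring, not part of the patch, using the public API as it stands after patch 1 (a later commit in this series renames graph_type to ctx_type); the GGUF path is illustrative:

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_model_load_from_file("qwen35moe-f16.gguf", mparams);

    llama_context_params cparams = llama_context_default_params();
    llama_context * ctx_tgt = llama_init_from_model(model, cparams);  // trunk graph

    cparams.graph_type = LLAMA_GRAPH_TYPE_DECODER_MTP;
    llama_context * ctx_dft = llama_init_from_model(model, cparams);  // MTP head, same weights

Because both contexts share the same llama_model, the draft adds no extra weight memory; only the MTP context's KV cache is additional.
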
+ if (!llama_model_has_mtp(model_tgt)) { + SRV_ERR("MTP requested but the target model '%s' has no MTP tensors\n", + params_base.model.path.c_str()); return false; } - SRV_INF("loading MTP head from '%s' (override_arch=%s)\n", - params_base.model.path.c_str(), mtp_arch); - - auto mparams_mtp = common_model_params_to_llama(params_base); - mparams_mtp.override_arch = mtp_arch; - - model_dft.reset(llama_model_load_from_file(params_base.model.path.c_str(), mparams_mtp)); - if (model_dft == nullptr) { - SRV_ERR("failed to load MTP head from '%s'\n", params_base.model.path.c_str()); - return false; - } + SRV_INF("creating MTP draft context against the target model '%s'\n", + params_base.model.path.c_str()); auto cparams_mtp = common_context_params_to_llama(params_base); + cparams_mtp.graph_type = LLAMA_GRAPH_TYPE_DECODER_MTP; - ctx_dft.reset(llama_init_from_model(model_dft.get(), cparams_mtp)); + ctx_dft.reset(llama_init_from_model(model_tgt, cparams_mtp)); if (ctx_dft == nullptr) { SRV_ERR("%s", "failed to create MTP context\n"); return false; From 5b9283992c629e4216c498a2fc54a92549c40d31 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Tue, 12 May 2026 21:40:06 +0800 Subject: [PATCH 2/8] review: use llama_context_type instead of llama_graph_type --- include/llama.h | 11 ++++------- src/llama-context.cpp | 28 ++++++++++++++++------------ src/llama-context.h | 2 -- src/llama-cparams.h | 1 + src/llama-memory.h | 2 +- src/llama-model.cpp | 2 +- tools/server/server-context.cpp | 4 ++-- 7 files changed, 25 insertions(+), 25 deletions(-) diff --git a/include/llama.h b/include/llama.h index bb34b47102a..19326b72ac4 100644 --- a/include/llama.h +++ b/include/llama.h @@ -198,11 +198,9 @@ extern "C" { LLAMA_SPLIT_MODE_TENSOR = 3, }; - enum llama_graph_type { - LLAMA_GRAPH_TYPE_DEFAULT = 0, - LLAMA_GRAPH_TYPE_ENCODER = 1, - LLAMA_GRAPH_TYPE_DECODER = 2, - LLAMA_GRAPH_TYPE_DECODER_MTP = 3, + enum llama_context_type { + LLAMA_CONTEXT_TYPE_DEFAULT = 0, + LLAMA_CONTEXT_TYPE_MTP = 1, }; // TODO: simplify (https://github.com/ggml-org/llama.cpp/pull/9294#pullrequestreview-2286561979) @@ -346,6 +344,7 @@ extern "C" { int32_t n_threads; // number of threads to use for generation int32_t n_threads_batch; // number of threads to use for batch processing + enum llama_context_type ctx_type; // set the context type (e.g. MTP) enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type` enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id enum llama_attention_type attention_type; // attention type to use for embeddings @@ -390,8 +389,6 @@ extern "C" { // note: the samplers must be sampler chains (i.e. 
use llama_sampler_chain_init) struct llama_sampler_seq_config * samplers; size_t n_samplers; - - enum llama_graph_type graph_type; }; struct llama_model_tensor_override { diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 47ada0b75a2..26697276ec2 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -2,6 +2,7 @@ #include "ggml.h" #include "llama-arch.h" +#include "llama-graph.h" #include "llama-impl.h" #include "llama-batch.h" #include "llama-io.h" @@ -21,6 +22,14 @@ // llama_context // +static llm_graph_type ctx_type_to_graph_type(llama_context_type ctx_type) { + switch (ctx_type) { + case LLAMA_CONTEXT_TYPE_DEFAULT: return LLM_GRAPH_TYPE_DEFAULT; + case LLAMA_CONTEXT_TYPE_MTP : return LLM_GRAPH_TYPE_DECODER_MTP; + } + throw std::runtime_error("Unsupported ctx type"); +} + llama_context::llama_context( const llama_model & model, llama_context_params params) : @@ -66,13 +75,7 @@ llama_context::llama_context( cparams.cb_eval = params.cb_eval; cparams.cb_eval_user_data = params.cb_eval_user_data; - switch (params.graph_type) { - case LLAMA_GRAPH_TYPE_DEFAULT: gtype = LLM_GRAPH_TYPE_DEFAULT; break; - case LLAMA_GRAPH_TYPE_ENCODER: gtype = LLM_GRAPH_TYPE_ENCODER; break; - case LLAMA_GRAPH_TYPE_DECODER: gtype = LLM_GRAPH_TYPE_DECODER; break; - case LLAMA_GRAPH_TYPE_DECODER_MTP: gtype = LLM_GRAPH_TYPE_DECODER_MTP; break; - default: throw std::runtime_error("invalid graph_type"); - } + cparams.ctx_type = params.ctx_type; // Initialize backend samplers here so they are part of the sampling graph // before the reserve passes run later in this function. This avoids a later @@ -287,7 +290,7 @@ llama_context::llama_context( /*.type_k =*/ params.type_k, /*.type_v =*/ params.type_v, /*.swa_full =*/ params.swa_full, - /*.gtype =*/ gtype, + /*.ctx_type=*/ cparams.ctx_type, }; memory.reset(model.create_memory(params_mem, cparams)); @@ -1747,7 +1750,8 @@ int llama_context::decode(const llama_batch & batch_inp) { } ggml_status status; - const auto * res = process_ubatch(ubatch, gtype, mctx.get(), status); + + const auto * res = process_ubatch(ubatch, ctx_type_to_graph_type(cparams.ctx_type), mctx.get(), status); if (!res) { // the last ubatch failed or was aborted -> remove all positions of that ubatch from the memory module @@ -2207,7 +2211,7 @@ ggml_cgraph * llama_context::graph_reserve( auto * res = gf_res_reserve.get(); - const auto gparams = graph_params(res, ubatch, mctx, gtype); + const auto gparams = graph_params(res, ubatch, mctx, ctx_type_to_graph_type(cparams.ctx_type)); res->reset(); @@ -3186,7 +3190,7 @@ void llama_context::opt_epoch_iter( auto * res = gf_res_prev.get(); - const auto gparams = graph_params(res, ubatch, mctx.get(), gtype); + const auto gparams = graph_params(res, ubatch, mctx.get(), ctx_type_to_graph_type(cparams.ctx_type)); res->reset(); @@ -3289,6 +3293,7 @@ llama_context_params llama_context_default_params() { /*.n_seq_max =*/ 1, /*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default /*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS, + /*.ctx_type =*/ LLAMA_CONTEXT_TYPE_DEFAULT, /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED, /*.pooling_type =*/ LLAMA_POOLING_TYPE_UNSPECIFIED, /*.attention_type =*/ LLAMA_ATTENTION_TYPE_UNSPECIFIED, @@ -3315,7 +3320,6 @@ llama_context_params llama_context_default_params() { /*.kv_unified =*/ false, /*.sampler =*/ nullptr, /*.n_sampler =*/ 0, - /*.graph_type =*/ LLAMA_GRAPH_TYPE_DEFAULT, }; return result; diff --git a/src/llama-context.h b/src/llama-context.h index 5d9efdf242a..e16ac4c618b 100644 --- 
a/src/llama-context.h +++ b/src/llama-context.h @@ -268,8 +268,6 @@ struct llama_context { llama_cparams cparams; - llm_graph_type gtype = LLM_GRAPH_TYPE_DECODER; - llama_adapter_cvec_ptr cvec; llama_adapter_loras_ptr loras; diff --git a/src/llama-cparams.h b/src/llama-cparams.h index 1e4e9e29ed8..9a4c2274407 100644 --- a/src/llama-cparams.h +++ b/src/llama-cparams.h @@ -42,6 +42,7 @@ struct llama_cparams { bool pipeline_parallel; enum llama_pooling_type pooling_type; + enum llama_context_type ctx_type; ggml_backend_sched_eval_callback cb_eval; void * cb_eval_user_data; diff --git a/src/llama-memory.h b/src/llama-memory.h index 2875d614315..4ad1612e45b 100644 --- a/src/llama-memory.h +++ b/src/llama-memory.h @@ -22,7 +22,7 @@ struct llama_memory_params { // use full-size SWA cache bool swa_full; - llm_graph_type gtype; + llama_context_type ctx_type; }; enum llama_memory_status { diff --git a/src/llama-model.cpp b/src/llama-model.cpp index e4b891ea96e..bc239cc4c44 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1946,7 +1946,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, // The MTP head is dense-attention only on hybrid Qwen3.5/3.6, so use a plain // attention KV cache for the MTP context instead of the hybrid wrapper. const bool mtp_on_hybrid_qwen35 = - params.gtype == LLM_GRAPH_TYPE_DECODER_MTP && + params.ctx_type == LLAMA_CONTEXT_TYPE_MTP && (arch == LLM_ARCH_QWEN35 || arch == LLM_ARCH_QWEN35MOE); if (llm_arch_is_recurrent(arch)) { diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 4e37c2ea850..91954531cda 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -761,7 +761,7 @@ struct server_context_impl { params_base.speculative.types.end(), COMMON_SPECULATIVE_TYPE_MTP) != params_base.speculative.types.end(); if (spec_mtp) { - cparams.graph_type = LLAMA_GRAPH_TYPE_DECODER_MTP; + cparams.ctx_type = LLAMA_CONTEXT_TYPE_MTP; } ctx_dft.reset(llama_init_from_model(model_dft.get(), cparams)); @@ -784,7 +784,7 @@ struct server_context_impl { params_base.model.path.c_str()); auto cparams_mtp = common_context_params_to_llama(params_base); - cparams_mtp.graph_type = LLAMA_GRAPH_TYPE_DECODER_MTP; + cparams_mtp.ctx_type = LLAMA_CONTEXT_TYPE_MTP; ctx_dft.reset(llama_init_from_model(model_tgt, cparams_mtp)); if (ctx_dft == nullptr) { From 46c080153d87cf17123d215319fefd4459fb16c2 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Tue, 12 May 2026 22:02:43 +0800 Subject: [PATCH 3/8] review: remove llama_model_has_mtp --- include/llama.h | 2 -- src/llama-context.cpp | 6 ++++++ src/llama-model.cpp | 3 --- tools/server/server-context.cpp | 8 -------- 4 files changed, 6 insertions(+), 13 deletions(-) diff --git a/include/llama.h b/include/llama.h index 19326b72ac4..2dcec3bb7a7 100644 --- a/include/llama.h +++ b/include/llama.h @@ -563,8 +563,6 @@ extern "C" { LLAMA_API int32_t llama_model_n_head_kv (const struct llama_model * model); LLAMA_API int32_t llama_model_n_swa (const struct llama_model * model); - LLAMA_API bool llama_model_has_mtp (const struct llama_model * model); - // Get the model's RoPE frequency scaling factor LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model); diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 26697276ec2..43e694494c8 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -3397,6 +3397,12 @@ llama_context * llama_init_from_model( model->hparams.pooling_type, params.pooling_type); } + if (params.ctx_type 
== LLAMA_CONTEXT_TYPE_MTP && model->hparams.nextn_predict_layers == 0) { + LLAMA_LOG_WARN("%s: context type MTP request but model doesn't contain MTP layers\n", __func__); + return nullptr; + } + + try { auto * ctx = new llama_context(*model, params); return ctx; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index bc239cc4c44..2a157eb1299 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2176,9 +2176,6 @@ int32_t llama_model_n_swa(const llama_model * model) { return model->hparams.n_swa; } -bool llama_model_has_mtp(const llama_model * model) { - return model->hparams.nextn_predict_layers > 0; -} uint32_t llama_model_n_cls_out(const struct llama_model * model) { return model->hparams.n_cls_out; diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 91954531cda..76b4294f897 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -772,14 +772,6 @@ struct server_context_impl { params_base.speculative.draft.ctx_dft = ctx_dft.get(); } else if (std::find(params_base.speculative.types.begin(), params_base.speculative.types.end(), COMMON_SPECULATIVE_TYPE_MTP) != params_base.speculative.types.end()) { - // MTP head lives in the *target* model — share the same llama_model and - // spin up a second context that builds the MTP graph instead of the trunk graph. - if (!llama_model_has_mtp(model_tgt)) { - SRV_ERR("MTP requested but the target model '%s' has no MTP tensors\n", - params_base.model.path.c_str()); - return false; - } - SRV_INF("creating MTP draft context against the target model '%s'\n", params_base.model.path.c_str()); From f87f0f440b81d53bfab1ca35f1f0acc7b685d055 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Tue, 12 May 2026 22:43:30 +0800 Subject: [PATCH 4/8] review: fix convert issues --- convert_hf_to_gguf.py | 52 ++++++++++++++----------------------------- 1 file changed, 17 insertions(+), 35 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 2e71b07466a..f218607438a 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -95,6 +95,7 @@ class ModelBase: gguf_writer: gguf.GGUFWriter model_name: str | None metadata_override: Path | None + metadata: gguf.Metadata dir_model_card: Path remote_hf_model_id: str | None @@ -5559,16 +5560,8 @@ class _Qwen35MtpMixin: gguf_writer: gguf.GGUFWriter block_count: int tensor_map: gguf.TensorNameMap - fname_out: Path - ftype: Any - metadata: Any - # When true, `--mtp` was passed: filter out trunk weights so the resulting - # GGUF carries only the MTP head and the shared embeddings/output tensors. mtp_only: bool = False - - # When true, `--no-mtp` was passed: drop `mtp.*` tensors and report block_count - # as the trunk-only layer count, producing a GGUF with no MTP head. no_mtp: bool = False def __init__(self, *args, **kwargs): @@ -5582,10 +5575,19 @@ def __init__(self, *args, **kwargs): def filter_tensors(cls, item): name, _ = item if name.startswith("mtp."): - # Qwen3Next drops `mtp.*` tensors; Qwen3.5/3.6 use them by default. `--no-mtp` opts out. if cls.no_mtp: return None return item + if cls.mtp_only: + # In --mtp mode, drop trunk weights and keep only the shared embeddings/output + # tensors that the standalone MTP graph references at inference time. 
+ canonical = name.replace("language_model.", "") + keep = canonical in ( + "model.embed_tokens.weight", "model.norm.weight", "lm_head.weight", + "embed_tokens.weight", "norm.weight", + ) + if not keep: + return None return super().filter_tensors(item) # ty: ignore[unresolved-attribute] def set_gguf_parameters(self): @@ -5601,38 +5603,19 @@ def prepare_metadata(self, vocab_only: bool): if not self.mtp_only: return - output_type: str = self.ftype.name.partition("_")[2] + output_type: str = self.ftype.name.partition("_")[2] # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] if self.fname_out.is_dir(): fname_default: str = gguf.naming_convention( - self.metadata.name, self.metadata.basename, self.metadata.finetune, - self.metadata.version, size_label=None, output_type=output_type, model_type=None) + self.metadata.name, self.metadata.basename, self.metadata.finetune, # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] + self.metadata.version, size_label=None, output_type=output_type, model_type=None) # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] self.fname_out = self.fname_out / f"{Path(fname_default).stem}-MTP.gguf" else: stem = self.fname_out.stem self.fname_out = self.fname_out.parent / f"{stem}-MTP{self.fname_out.suffix}" def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # Multimodal Qwen3.5/3.6 wrap the text model under `model.language_model.*`. - if name.startswith("model.language_model."): - name = "model." + name[len("model.language_model."):] - elif name.startswith("language_model."): - name = name[len("language_model."):] - - if self.mtp_only: - # In --mtp mode keep only the MTP block plus the shared embedding/output tensors - # that the standalone MTP graph references at inference time. - keep = ( - name.startswith("mtp.") or - name in ("model.embed_tokens.weight", "model.norm.weight", "lm_head.weight") or - name in ("embed_tokens.weight", "norm.weight") - ) - if not keep: - return - # Remap MTP block tensors to llama.cpp's layer-indexed nextn naming. 
-        # HF: mtp.layers.0.* (transformer block at MTP slot 0)
-        #     mtp.fc / mtp.pre_fc_norm_embedding / mtp.pre_fc_norm_hidden / mtp.norm
         if name.startswith("mtp."):
             n_layer = self.hparams["num_hidden_layers"]
             if name.find("layers.") != -1:
@@ -14262,9 +14245,11 @@ def main() -> None:
             logger.error("--mtp / --no-mtp are only supported for Qwen3.5/3.6 text variants today")
             sys.exit(1)
 
-        # set on the class so __init__ sees the correct mode when computing block_count
+        # set on the class so __init__ / filter_tensors see the correct mode
        if args.no_mtp:
            model_class.no_mtp = True
+        if args.mtp:
+            model_class.mtp_only = True
 
     model_instance = model_class(dir_model, output_type, fname_out, is_big_endian=args.bigendian,
                                  use_temp_file=args.use_temp_file,
@@ -14278,9 +14263,6 @@ def main() -> None:
                                  fuse_gate_up_exps=args.fuse_gate_up_exps
                                  )
 
-    if args.mtp:
-        model_instance.mtp_only = True
-
     if args.vocab_only:
         logger.info("Exporting model vocab...")
         model_instance.write_vocab()

From d769c578ebc38ca877a4b028619164fac887507c Mon Sep 17 00:00:00 2001
From: Aman Gupta
Date: Tue, 12 May 2026 22:54:39 +0800
Subject: [PATCH 5/8] convert: fix pycheck

---
 convert_hf_to_gguf.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index f218607438a..73f7aeb6a36 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -14247,9 +14247,9 @@ def main() -> None:
 
     # set on the class so __init__ / filter_tensors see the correct mode
     if args.no_mtp:
-        model_class.no_mtp = True
+        model_class.no_mtp = True  # ty: ignore[unresolved-attribute]
     if args.mtp:
-        model_class.mtp_only = True
+        model_class.mtp_only = True  # ty: ignore[unresolved-attribute]
 
     model_instance = model_class(dir_model, output_type, fname_out, is_big_endian=args.bigendian,
                                  use_temp_file=args.use_temp_file,

From 84f00ce76028349d603f45b936d6bb11cfd48331 Mon Sep 17 00:00:00 2001
From: Aman Gupta
Date: Tue, 12 May 2026 22:59:31 +0800
Subject: [PATCH 6/8] review: formatting

---
 include/llama.h       | 4 ++--
 src/llama-context.cpp | 7 ++++---
 src/llama-cparams.h   | 2 +-
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/include/llama.h b/include/llama.h
index 2dcec3bb7a7..b814e2c58de 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -199,8 +199,8 @@ extern "C" {
     };
 
     enum llama_context_type {
-        LLAMA_CONTEXT_TYPE_DEFAULT = 0,
-        LLAMA_CONTEXT_TYPE_MTP = 1,
+        LLAMA_CONTEXT_TYPE_DEFAULT = 0,
+        LLAMA_CONTEXT_TYPE_MTP     = 1,
     };
 
     // TODO: simplify (https://github.com/ggml-org/llama.cpp/pull/9294#pullrequestreview-2286561979)
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 43e694494c8..6ecbe1b6083 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -290,7 +290,7 @@ llama_context::llama_context(
         /*.type_k   =*/ params.type_k,
         /*.type_v   =*/ params.type_v,
         /*.swa_full =*/ params.swa_full,
-        /*.ctx_type=*/ cparams.ctx_type,
+        /*.ctx_type =*/ cparams.ctx_type,
     };
 
     memory.reset(model.create_memory(params_mem, cparams));
@@ -3397,8 +3397,9 @@ llama_context * llama_init_from_model(
             model->hparams.pooling_type, params.pooling_type);
     }
 
-    if (params.ctx_type == LLAMA_CONTEXT_TYPE_MTP && model->hparams.nextn_predict_layers == 0) {
-        LLAMA_LOG_WARN("%s: context type MTP request but model doesn't contain MTP layers\n", __func__);
+    if (params.ctx_type == LLAMA_CONTEXT_TYPE_MTP &&
+        model->hparams.nextn_predict_layers == 0) {
+        LLAMA_LOG_WARN("%s: context type MTP requested but model doesn't contain MTP layers\n", __func__);
         return nullptr;
     }
 
diff --git a/src/llama-cparams.h
b/src/llama-cparams.h index 9a4c2274407..cbf74eba63e 100644 --- a/src/llama-cparams.h +++ b/src/llama-cparams.h @@ -41,8 +41,8 @@ struct llama_cparams { bool kv_unified; bool pipeline_parallel; - enum llama_pooling_type pooling_type; enum llama_context_type ctx_type; + enum llama_pooling_type pooling_type; ggml_backend_sched_eval_callback cb_eval; void * cb_eval_user_data; From f6f29e6fb89ff8abdd9f92f84679d27dd03a1c10 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Wed, 13 May 2026 00:53:20 +0800 Subject: [PATCH 7/8] use `mtp-` for identifying mtp models --- common/download.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/common/download.cpp b/common/download.cpp index 71d8dfda00d..c1cbe2033aa 100644 --- a/common/download.cpp +++ b/common/download.cpp @@ -566,7 +566,7 @@ static hf_cache::hf_files get_split_files(const hf_cache::hf_files & files, return result; } -// pick the best sibling GGUF whose filename contains `keyword` (e.g. "mmproj" / "MTP"), +// pick the best sibling GGUF whose filename contains `keyword` (e.g. "mmproj" / "mtp"), // preferring deeper shared directory prefix with the model, then closest quantization static hf_cache::hf_file find_best_sibling(const hf_cache::hf_files & files, const std::string & model, @@ -616,7 +616,7 @@ static hf_cache::hf_file find_best_mmproj(const hf_cache::hf_files & files, static hf_cache::hf_file find_best_mtp(const hf_cache::hf_files & files, const std::string & model) { - return find_best_sibling(files, model, "MTP"); + return find_best_sibling(files, model, "mtp-"); } static bool gguf_filename_is_model(const std::string & filepath) { @@ -631,7 +631,7 @@ static bool gguf_filename_is_model(const std::string & filepath) { return filename.find("mmproj") == std::string::npos && filename.find("imatrix") == std::string::npos && - filename.find("MTP") == std::string::npos; + filename.find("mtp-") == std::string::npos; } static hf_cache::hf_file find_best_model(const hf_cache::hf_files & files, From 0712378b45828768f0f5830abae2f217e6aea962 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Wed, 13 May 2026 11:11:48 +0800 Subject: [PATCH 8/8] convert: fix mtp conversion --- convert_hf_to_gguf.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 73f7aeb6a36..0857d9e6803 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -5598,21 +5598,19 @@ def set_gguf_parameters(self): self.gguf_writer.add_nextn_predict_layers(n) def prepare_metadata(self, vocab_only: bool): + # TextModel.prepare_metadata resolves a directory fname_out into a concrete + # file path, so snapshot is_dir() first to decide whether to apply the mtp- prefix. 
+ from_dir = self.fname_out.is_dir() super().prepare_metadata(vocab_only=vocab_only) # ty: ignore[unresolved-attribute] - if not self.mtp_only: + if not self.mtp_only or not from_dir: return output_type: str = self.ftype.name.partition("_")[2] # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] - - if self.fname_out.is_dir(): - fname_default: str = gguf.naming_convention( - self.metadata.name, self.metadata.basename, self.metadata.finetune, # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] - self.metadata.version, size_label=None, output_type=output_type, model_type=None) # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] - self.fname_out = self.fname_out / f"{Path(fname_default).stem}-MTP.gguf" - else: - stem = self.fname_out.stem - self.fname_out = self.fname_out.parent / f"{stem}-MTP{self.fname_out.suffix}" + fname_default: str = gguf.naming_convention( + self.metadata.name, self.metadata.basename, self.metadata.finetune, # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] + self.metadata.version, size_label=None, output_type=output_type, model_type=None) # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] + self.fname_out = self.fname_out.parent / f"mtp-{fname_default}.gguf" def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: # Remap MTP block tensors to llama.cpp's layer-indexed nextn naming.
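
Taken together, the last two commits settle the sibling-file convention: conversion (patch 8) names a standalone head mtp-<model>.gguf, and the download side (patch 7) classifies cached GGUFs by the same marker. A compact restatement of the resulting filename rules as a sketch; the helper names here are hypothetical, while the string tests mirror the common/download.cpp changes above:

    #include <string>

    // hypothetical helper: a GGUF is treated as an MTP draft head if its
    // filename carries the "mtp-" marker introduced in patch 7
    static bool filename_is_mtp(const std::string & name) {
        return name.find("mtp-") != std::string::npos;
    }

    // hypothetical helper: a GGUF counts as a loadable trunk model only if it
    // is none of the recognized sibling artifacts
    static bool filename_is_model(const std::string & name) {
        return name.find("mmproj")  == std::string::npos &&  // multimodal projector
               name.find("imatrix") == std::string::npos &&  // importance matrix
               !filename_is_mtp(name);                       // MTP draft head
    }

With -hf and --spec-type mtp, find_best_mtp() then selects the best mtp- sibling next to the chosen trunk GGUF via find_best_sibling(), the same preference order (shared directory depth, then closest quantization) already used for mmproj discovery.
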