hparams : refactor hparams.n_layer (#24060)

* hparams : refactor hparams.n_layer
* cont : remove `n_layer_kv()`, use n_layer_all instead
* cont : type consistency
* pi : update SYSTEM.md
* models : fix Step3.5 MTP
* cont : remove duplicate switch cases
* cont : explicitly set `false` to extra layers for `is_swa` and `is_recr`
* cont : fix nextn layer count handling

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2026-06-09 07:16:44 +02:00 · 2026-06-05 11:09:36 +03:00
parent 3ecfb150a4
commit 7acb4e8cd2
129 changed files with 412 additions and 431 deletions
@@ -16,12 +16,12 @@ Pull requests (PRs):
 - New branch names are prefixed with "gg/"
 - Before opening a pull request, ask the user to confirm the description
 - When creating a pull request, look for the repository's PR template and follow it
- For the AI usage disclosure section, write "YES. llama.cpp + pi + [MODEL]"
+- For the AI usage disclosure section, write "YES. pi:llama.cpp/[MODEL]"
 - Ask the user to tell you what model was used and write it in place of [MODEL]
 - Always create the pull requests in draft mode

 Commits:
- On every commit that you make, include a "Assisted-by: llama.cpp:local pi" tag
+- On every commit that you make, include an "Assisted-by: pi:llama.cpp/[MODEL]" tag
 - Do not explicitly set the git author in commits - rely on the default git config
 - Always use `--no-gpg-sign` when committing
 - Never `git push` without explicit confirmation from the user
@@ -41,7 +41,7 @@ bool llama_adapter_cvec::init(const llama_model & model) {
        auto it = ctx_map.find(buft);
        if (it == ctx_map.end()) {
            ggml_init_params params = {
-                /*.mem_size   =*/ hparams.n_layer*ggml_tensor_overhead(),
+                /*.mem_size   =*/ hparams.n_layer()*ggml_tensor_overhead(),
                /*.mem_buffer =*/ NULL,
                /*.no_alloc   =*/ true,
            };
@@ -61,9 +61,9 @@ bool llama_adapter_cvec::init(const llama_model & model) {
    };

    // make tensors
-    tensors.reserve(hparams.n_layer);
+    tensors.reserve(hparams.n_layer());
    tensors.push_back(nullptr); // there's never a tensor for layer 0
-    for (size_t il = 1; il < hparams.n_layer; il++) {
+    for (size_t il = 1; il < hparams.n_layer(); il++) {
        ggml_backend_buffer_type_t buft = model.select_buft(il);
        ggml_context * ctx = ctx_for_buft(buft);
        if (!ctx) {
@@ -121,7 +121,7 @@ bool llama_adapter_cvec::apply(
    layer_start = il_start;
    layer_end   = il_end;

-    for (size_t il = 1; il < hparams.n_layer; il++) {
+    for (size_t il = 1; il < hparams.n_layer(); il++) {
        assert(tensors[il] != nullptr);

        const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present
@@ -341,7 +341,7 @@ llama_context::llama_context(
        // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
        bool pipeline_parallel =
            model.n_devices() > 1 &&
-            model.n_gpu_layers() > model.hparams.n_layer &&
+            model.n_gpu_layers() > model.hparams.n_layer() &&
            model.split_mode() == LLAMA_SPLIT_MODE_LAYER &&
            cparams.offload_kqv &&
            !model.has_tensor_overrides();
@@ -2351,7 +2351,7 @@ llm_graph_cb llama_context::graph_get_cb() const {

        // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
        // FIXME: fix in ggml_backend_sched
-        const bool full_offload = model.n_gpu_layers() > model.hparams.n_layer;
+        const bool full_offload = model.n_gpu_layers() > model.hparams.n_layer();
        if (ubatch.n_tokens < 32 || full_offload) {
            if (il != -1 && strcmp(name, "norm") == 0) {
                const auto & dev_layer = model.dev_layer(il);
@@ -3416,7 +3416,7 @@ llama_context * llama_init_from_model(

    if (params.flash_attn_type != LLAMA_FLASH_ATTN_TYPE_DISABLED && ggml_is_quantized(params.type_k)) {
        const uint32_t blck_size = ggml_blck_size(params.type_k);
-        for (uint32_t il = 0; il < model->hparams.n_layer; ++il) {
+        for (uint32_t il = 0; il < model->hparams.n_layer(); ++il) {
            if (model->hparams.n_embd_head_k(il) % blck_size != 0) {
                LLAMA_LOG_ERROR("%s: K cache type %s with block size %u does not divide n_embd_head_k=%u\n",
                    __func__, ggml_type_name(params.type_k), blck_size, model->hparams.n_embd_head_k(il));
@@ -3427,7 +3427,7 @@ llama_context * llama_init_from_model(

    if (params.flash_attn_type != LLAMA_FLASH_ATTN_TYPE_DISABLED && ggml_is_quantized(params.type_v)) {
        const uint32_t blck_size = ggml_blck_size(params.type_v);
-        for (uint32_t il = 0; il < model->hparams.n_layer; ++il) {
+        for (uint32_t il = 0; il < model->hparams.n_layer(); ++il) {
            if (model->hparams.n_embd_head_v(il) % blck_size != 0) {
                LLAMA_LOG_ERROR("%s: V cache type %s with block size %u does not divide n_embd_head_v=%u\n",
                    __func__, ggml_type_name(params.type_v), blck_size, model->hparams.n_embd_head_v(il));
@@ -3449,7 +3449,7 @@ llama_context * llama_init_from_model(
    }

    if (params.ctx_type == LLAMA_CONTEXT_TYPE_MTP &&
-        model->hparams.nextn_predict_layers == 0) {
+        model->hparams.n_layer_nextn == 0) {
        LLAMA_LOG_WARN("%s: context type MTP requested but model doesn't contain MTP layers\n", __func__);
        return nullptr;
    }
@@ -1005,7 +1005,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
    cparams          (params.cparams),
    ubatch           (params.ubatch),
    n_embd           (hparams.n_embd),
-    n_layer          (hparams.n_layer),
+    n_layer          (hparams.n_layer()),
    n_rot            (hparams.n_rot()),
    n_ctx            (cparams.n_ctx),
    n_head           (hparams.n_head()),
@@ -7,31 +7,38 @@

 void llama_hparams::set_swa_pattern(uint32_t n_pattern, bool dense_first) {
    if (dense_first) {
-        for (uint32_t il = 0; il < n_layer; ++il) {
+        for (uint32_t il = 0; il < n_layer(); ++il) {
            is_swa_impl[il] = n_pattern == 0 || (il % n_pattern != 0);
        }
    } else {
-        for (uint32_t il = 0; il < n_layer; ++il) {
+        for (uint32_t il = 0; il < n_layer(); ++il) {
            is_swa_impl[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1));
        }
    }
+
+    for (uint32_t il = n_layer(); il < n_layer_all; ++il) {
+        is_swa_impl[il] = false;
+    }
 }

-// TODO: implement
-//void llama_hparams::set_recr_pattern(uint32_t n_pattern, bool dense_first) {
-//    if (dense_first) {
-//        for (uint32_t il = 0; il < n_layer; ++il) {
-//            is_recr_impl[il] = n_pattern == 0 || (il % n_pattern != 0);
-//        }
-//    } else {
-//        for (uint32_t il = 0; il < n_layer; ++il) {
-//            is_recr_impl[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1));
-//        }
-//    }
-//}
+void llama_hparams::set_recr_pattern(uint32_t n_pattern, bool dense_first) {
+    if (dense_first) {
+        for (uint32_t il = 0; il < n_layer(); ++il) {
+            is_recr_impl[il] = n_pattern == 0 || (il % n_pattern != 0);
+        }
+    } else {
+        for (uint32_t il = 0; il < n_layer(); ++il) {
+            is_recr_impl[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1));
+        }
+    }
+
+    for (uint32_t il = n_layer(); il < n_layer_all; ++il) {
+        is_recr_impl[il] = false;
+    }
+}

 bool llama_hparams::is_swa_any() const {
-    for (uint32_t il = 0; il < n_layer; ++il) {
+    for (uint32_t il = 0; il < n_layer_all; ++il) {
        if (is_swa_impl[il]) {
            return true;
        }
@@ -41,7 +48,7 @@ bool llama_hparams::is_swa_any() const {
 }

 uint32_t llama_hparams::n_head(uint32_t il) const {
-    if (il < n_layer) {
+    if (il < n_layer_all) {
        return n_head_arr[il];
    }

@@ -49,7 +56,7 @@ uint32_t llama_hparams::n_head(uint32_t il) const {
 }

 uint32_t llama_hparams::n_head_kv(uint32_t il) const {
-    if (il < n_layer) {
+    if (il < n_layer_all) {
        return n_head_kv_arr[il];
    }

@@ -57,7 +64,7 @@ uint32_t llama_hparams::n_head_kv(uint32_t il) const {
 }

 uint32_t llama_hparams::n_ff(uint32_t il) const {
-    if (il < n_layer) {
+    if (il < n_layer_all) {
        return n_ff_arr[il];
    }

@@ -76,7 +83,7 @@ uint32_t llama_hparams::n_gqa(uint32_t il) const {
 }

 uint32_t llama_hparams::n_rot(uint32_t il) const {
-    if (il < n_layer) {
+    if (il < n_layer_all) {
        return is_swa(il) ? n_rot_swa : n_rot_full;
    }

@@ -98,7 +105,7 @@ uint32_t llama_hparams::n_embd_out() const {
 }

 uint32_t llama_hparams::n_embd_head_k(uint32_t il) const {
-    if (il < n_layer) {
+    if (il < n_layer_all) {
        return is_swa(il) ? n_embd_head_k_swa : n_embd_head_k_full;
    }

@@ -106,7 +113,7 @@ uint32_t llama_hparams::n_embd_head_k(uint32_t il) const {
 }

 uint32_t llama_hparams::n_embd_head_v(uint32_t il) const {
-    if (il < n_layer) {
+    if (il < n_layer_all) {
        return is_swa(il) ? n_embd_head_v_swa : n_embd_head_v_full;
    }

@@ -127,7 +134,7 @@ uint32_t llama_hparams::n_embd_v_gqa(uint32_t il) const {

 bool llama_hparams::is_n_embd_k_gqa_variable() const {
    const uint32_t val = n_embd_k_gqa();
-    for (uint32_t il = 0; il < n_layer; ++il) {
+    for (uint32_t il = 0; il < n_layer_all; ++il) {
        if (val != n_embd_k_gqa(il)) {
            return true;
        }
@@ -138,7 +145,7 @@ bool llama_hparams::is_n_embd_k_gqa_variable() const {

 bool llama_hparams::is_n_embd_v_gqa_variable() const {
    const uint32_t val = n_embd_v_gqa();
-    for (uint32_t il = 0; il < n_layer; ++il) {
+    for (uint32_t il = 0; il < n_layer_all; ++il) {
        if (val != n_embd_v_gqa(il)) {
            return true;
        }
@@ -149,7 +156,7 @@ bool llama_hparams::is_n_embd_v_gqa_variable() const {

 uint32_t llama_hparams::n_embd_k_gqa_max() const {
    uint32_t val = n_embd_k_gqa();
-    for (uint32_t il = 0; il < n_layer; ++il) {
+    for (uint32_t il = 0; il < n_layer_all; ++il) {
        val = std::max(val, n_embd_k_gqa(il));
    }

@@ -158,7 +165,7 @@ uint32_t llama_hparams::n_embd_k_gqa_max() const {

 uint32_t llama_hparams::n_embd_v_gqa_max() const {
    uint32_t val = n_embd_v_gqa();
-    for (uint32_t il = 0; il < n_layer; ++il) {
+    for (uint32_t il = 0; il < n_layer_all; ++il) {
        val = std::max(val, n_embd_v_gqa(il));
    }

@@ -207,11 +214,11 @@ uint32_t llama_hparams::n_embd_s() const {
 }

 bool llama_hparams::is_recr(uint32_t il) const {
-    if (il < n_layer) {
+    if (il < n_layer_all) {
        return is_recr_impl[il];
    }

-    GGML_ABORT("%s: il (%u) out of bounds (n_layer: %u)\n", __func__, il, n_layer);
+    GGML_ABORT("%s: il (%u) out of bounds (n_layer_all: %u)\n", __func__, il, n_layer_all);
 }

 uint32_t llama_hparams::n_pos_per_embd() const {
@@ -219,11 +226,11 @@ uint32_t llama_hparams::n_pos_per_embd() const {
 }

 bool llama_hparams::is_swa(uint32_t il) const {
-    if (il < n_layer) {
+    if (il < n_layer_all) {
        return is_swa_impl[il];
    }

-    GGML_ABORT("fatal error");
+    GGML_ABORT("%s: il (%u) out of bounds (n_layer_all: %u)\n", __func__, il, n_layer_all);
 }

 bool llama_hparams::is_mla() const {
@@ -242,12 +249,6 @@ uint32_t llama_hparams::n_embd_head_v_mla() const {
 }

 bool llama_hparams::has_kv(uint32_t il) const {
-    if (kv_only_nextn) {
-        // MTP head: only the trailing nextn_predict_layers blocks own a KV cache;
-        // the leading trunk blocks are not executed in this graph.
-        return nextn_predict_layers > 0 && il >= (n_layer - nextn_predict_layers);
-    }
-
    if (n_layer_kv_from_start >= 0) {
        if (il < (uint32_t) n_layer_kv_from_start) {
            return true;
@@ -260,16 +261,8 @@ bool llama_hparams::has_kv(uint32_t il) const {
    return true;
 }

-uint32_t llama_hparams::n_layer_kv() const {
-    uint32_t res = 0;
-
-    for (uint32_t il = 0; il < n_layer; ++il) {
-        if (has_kv(il)) {
-            res++;
-        }
-    }
-
-    return res;
+uint32_t llama_hparams::n_layer() const {
+    return n_layer_all - n_layer_nextn;
 }

 bool llama_hparams::use_mrope() const {
@@ -48,12 +48,15 @@ struct llama_hparams {

    uint32_t n_ctx_train; // context size the model was trained on
    uint32_t n_embd;
-    uint32_t n_layer;
-    int32_t  n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache
+    uint32_t n_layer_all;
+    uint32_t n_layer_nextn = 0;
    uint32_t n_expert = 0;
    uint32_t n_expert_used = 0;
    uint32_t n_rel_attn_bkts = 0;

+    // TODO: this needs to be reworked
+    int32_t  n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache
+
    // different head size for full_attention and SWA layers
    uint32_t n_embd_head_k_full; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
    uint32_t n_embd_head_v_full; // dimension of values (d_v) aka n_embd_head
@@ -96,9 +99,6 @@ struct llama_hparams {
    uint32_t expert_gating_func   = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
    uint32_t moe_every_n_layers   = 0;
    uint32_t moe_latent_size      = 0;
-    uint32_t nextn_predict_layers = 0;
-
-    bool kv_only_nextn = false; // if true, only the last nextn_predict_layers blocks have a KV cache (MTP head arches)

    float f_norm_eps;
    float f_norm_rms_eps;
@@ -272,8 +272,7 @@ struct llama_hparams {

    bool is_swa(uint32_t il) const;

-    // TODO: implement
-    //void set_recr_pattern(uint32_t n_pattern, bool dense_first = false);
+    void set_recr_pattern(uint32_t n_pattern, bool dense_first = false);

    // whether or not the given layer is recurrent (for hybrid models)
    bool is_recr(uint32_t il) const;
@@ -329,8 +328,8 @@ struct llama_hparams {

    bool has_kv(uint32_t il) const;

-    // number of layers for which has_kv() returns true
-    uint32_t n_layer_kv() const;
+    // number of effective layers (excludes nextn layers)
+    uint32_t n_layer() const;

    // note that this function uses different SWA parameters from those in the hparams
    // note: inlined on purpose for performance reasons
@@ -97,7 +97,7 @@ llama_kv_cache::llama_kv_cache(

    GGML_ASSERT(kv_size % n_pad == 0);

-    const uint32_t n_layer_kv = hparams.n_layer_kv();
+    const uint32_t n_layer = hparams.n_layer_all;

    // define a comparator for the buft -> ctx map to ensure that the order is well-defined:
    struct ggml_backend_buft_comparator {
@@ -112,7 +112,7 @@ llama_kv_cache::llama_kv_cache(
        auto it = ctx_map.find(buft);
        if (it == ctx_map.end()) {
            ggml_init_params params = {
-                /*.mem_size   =*/ size_t(2u*(1 + n_stream)*n_layer_kv*ggml_tensor_overhead()),
+                /*.mem_size   =*/ size_t(2u*(1 + n_stream)*n_layer*ggml_tensor_overhead()),
                /*.mem_buffer =*/ NULL,
                /*.no_alloc   =*/ true,
            };
@@ -160,7 +160,7 @@ llama_kv_cache::llama_kv_cache(

    const bool is_mla = hparams.is_mla();

-    for (uint32_t il = 0; il < hparams.n_layer; il++) {
+    for (uint32_t il = 0; il < n_layer; il++) {
        if (!hparams.has_kv(il)) {
            LLAMA_LOG_DEBUG("%s: layer %3d: does not have KV cache\n", __func__, il);
            continue;
@@ -230,7 +230,7 @@ llama_kv_cache::llama_kv_cache(
    if (reuse) {
        LLAMA_LOG_DEBUG("%s: reusing layers:\n", __func__);

-        for (uint32_t il = 0; il < hparams.n_layer; il++) {
+        for (uint32_t il = 0; il < n_layer; il++) {
            const int32_t il_reuse = reuse(il);

            if (il_reuse < 0) {
@@ -26,7 +26,7 @@ llama_memory_recurrent::llama_memory_recurrent(
                 uint32_t   n_seq_max,
                 uint32_t   n_rs_seq,
    const layer_filter_cb & filter) : hparams(model.hparams), n_seq_max(n_seq_max) {
-    const int32_t n_layer = hparams.n_layer;
+    const int32_t n_layer = hparams.n_layer();

    head = 0;
    size = mem_size;
@@ -863,7 +863,7 @@ void llama_memory_recurrent::state_write_meta(llama_io_write_i & io, const std::

 void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const {
    const uint32_t s_trans = 0;
-    const uint32_t n_layer = hparams.n_layer;
+    const uint32_t n_layer = hparams.n_layer();

    io.write(&s_trans, sizeof(s_trans));
    io.write(&n_layer, sizeof(n_layer));
@@ -1047,8 +1047,8 @@ bool llama_memory_recurrent::state_read_data(llama_io_read_i & io, uint32_t cell
    io.read(&s_trans, sizeof(s_trans));
    io.read(&n_layer, sizeof(n_layer));

-    if (n_layer != hparams.n_layer) {
-        LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, hparams.n_layer);
+    if (n_layer != hparams.n_layer()) {
+        LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, hparams.n_layer());
        return false;
    }
    if (cell_count > size) {
@@ -1050,10 +1050,10 @@ struct ggml_tensor * llama_model_loader::create_tensor(
        if (it == ctx_map.end()) {
            // one ggml context per buffer type
            int max_n_tensors = n_tensors;
-            max_n_tensors += 1;                 // duplicated output tensor
-            max_n_tensors += hparams.n_layer*2; // duplicated rope freq tensors
+            max_n_tensors += 1;                   // duplicated output tensor
+            max_n_tensors += hparams.n_layer()*2; // duplicated rope freq tensors
            if (files.empty()) {
-                max_n_tensors += hparams.n_layer*256; // this should be well above what any model actually uses
+                max_n_tensors += hparams.n_layer()*256; // this should be well above what any model actually uses
            }
            const size_t ctx_size = ggml_tensor_overhead()*max_n_tensors;

@@ -77,7 +77,7 @@ void llama_model_saver::add_kv(const enum llm_kv key, const char value) {
 template <typename Container>
 void llama_model_saver::add_kv(const enum llm_kv key, const Container & value, const bool per_layer) {
    GGML_ASSERT(model != nullptr || !per_layer);
-    const size_t n_values = per_layer ? size_t(model->hparams.n_layer) : value.size();
+    const size_t n_values = per_layer ? size_t(model->hparams.n_layer()) : value.size();
    GGML_ASSERT(n_values <= value.size());

    if (n_values == 0) {
@@ -206,7 +206,7 @@ void llama_model_saver::add_kv_from_model() {
    if (hparams.n_embd_out_impl > 0) {
        add_kv(LLM_KV_EMBEDDING_LENGTH_OUT,          hparams.n_embd_out_impl);
    }
-    add_kv(LLM_KV_BLOCK_COUNT,                       hparams.n_layer);
+    add_kv(LLM_KV_BLOCK_COUNT,                       hparams.n_layer_all);
    add_kv(LLM_KV_LEADING_DENSE_BLOCK_COUNT,         hparams.n_layer_dense_lead);
    add_kv(LLM_KV_FEED_FORWARD_LENGTH,               hparams.n_ff_arr, true);
    add_kv(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp);
@@ -227,7 +227,7 @@ void llama_model_saver::add_kv_from_model() {
    add_kv(LLM_KV_EXPERT_GROUP_SCALE,                hparams.expert_group_scale);
    add_kv(LLM_KV_EXPERTS_PER_GROUP,                 hparams.n_group_experts);
    add_kv(LLM_KV_MOE_EVERY_N_LAYERS,                hparams.moe_every_n_layers);
-    add_kv(LLM_KV_NEXTN_PREDICT_LAYERS,              hparams.nextn_predict_layers);
+    add_kv(LLM_KV_NEXTN_PREDICT_LAYERS,              hparams.n_layer_nextn);
    add_kv(LLM_KV_NUM_DEEPSTACK_LAYERS,              hparams.n_deepstack_layers);
    add_kv(LLM_KV_POOLING_TYPE,                      uint32_t(hparams.pooling_type));
    add_kv(LLM_KV_LOGIT_SCALE,                       hparams.f_logit_scale);
@@ -398,7 +398,7 @@ struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const str
            rotation = get_il_eff(il) % ud->n_devices;
        } else {
            il = 0;
-            rotation = hparams.n_layer % ud->n_devices;
+            rotation = hparams.n_layer() % ud->n_devices;
        }
        const ggml_tensor * tensor_axis_0 = suffix.empty() ? tensor : ud->model->get_tensor((prefix + suffix).c_str());
        if (tensor_axis_0 == nullptr) {
@@ -1034,7 +1034,7 @@ void llama_model_base::load_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_EMBEDDING_LENGTH_OUT,    hparams.n_embd_out_impl, false);
    ml.get_key(LLM_KV_ATTENTION_CAUSAL,        hparams.causal_attn,     false);
    ml.get_key(LLM_KV_POOLING_TYPE,            hparams.pooling_type,    false);
-    ml.get_key(LLM_KV_BLOCK_COUNT,             hparams.n_layer);
+    ml.get_key(LLM_KV_BLOCK_COUNT,             hparams.n_layer_all);
    ml.get_key(LLM_KV_EXPERT_COUNT,            hparams.n_expert,        false);
    ml.get_key(LLM_KV_EXPERT_USED_COUNT,       hparams.n_expert_used,   false);
    ml.get_key(LLM_KV_EXPERT_GROUP_COUNT,      hparams.n_expert_groups, false);
@@ -1089,13 +1089,13 @@ void llama_model_base::load_hparams(llama_model_loader & ml) {
    std::fill(hparams.swiglu_clamp_exp.begin(),   hparams.swiglu_clamp_exp.end(),   0.0f);
    std::fill(hparams.swiglu_clamp_shexp.begin(), hparams.swiglu_clamp_shexp.end(), 0.0f);

-    ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH,  hparams.n_ff_arr,   hparams.n_layer, false);
-    ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
+    ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH,  hparams.n_ff_arr,   hparams.n_layer(), false);
+    ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer(), false);

    // n_head_kv is optional, default to n_head
    hparams.n_head_kv_arr = hparams.n_head_arr;

-    ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, hparams.n_layer, false);
+    ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, hparams.n_layer(), false);

    bool rope_finetuned = false;
    ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
@@ -1194,7 +1194,7 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) {
    const auto & use_mlock    = params.use_mlock;
    const auto & tensor_split = params.tensor_split;

-    const int n_layer      = hparams.n_layer;
+    const int n_layer = hparams.n_layer_all;
    const int n_gpu_layers = this->n_gpu_layers();

    const bool use_mmap_buffer = true;
@@ -1251,10 +1251,10 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) {
        splits[i] /= split_sum;
    }

-    const int i_gpu_start = std::max(int(hparams.n_layer) + 1 - n_gpu_layers, 0);
-    const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, int(n_layer) + 1);
+    const int i_gpu_start = std::max(n_layer + 1 - n_gpu_layers, 0);
+    const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, n_layer + 1);
    auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
-        const bool is_swa = il < int(hparams.n_layer) && hparams.is_swa(il);
+        const bool is_swa = il < n_layer && hparams.is_swa(il);
        if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
            LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(cpu_dev), is_swa);
            return {cpu_dev, &pimpl->cpu_buft_list};
@@ -1557,7 +1557,7 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) {
    }

    if (llama_supports_gpu_offload()) {
-        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
+        const int n_gpu = std::min(n_gpu_layers, n_layer);

        int n_repeating = n_gpu;
        if (n_repeating > 0) {
@@ -1566,8 +1566,8 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) {
        }
        LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_repeating);

-        const int max_backend_supported_layers = hparams.n_layer + 1;
-        const int max_offloadable_layers       = hparams.n_layer + 1;
+        const int max_backend_supported_layers = n_layer + 1;
+        const int max_offloadable_layers       = n_layer + 1;

        LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
    }
@@ -1636,7 +1636,7 @@ const float * llama_model::tensor_split() const {
 }

 uint32_t llama_model::n_gpu_layers() const {
-    return params.n_gpu_layers >= 0 ? params.n_gpu_layers : hparams.n_layer + 1;
+    return params.n_gpu_layers >= 0 ? params.n_gpu_layers : hparams.n_layer() + 1;
 }

 llama_split_mode llama_model::split_mode() const {
@@ -1707,17 +1707,17 @@ void llama_model::print_info() const {
        LLAMA_LOG_INFO("%s: n_ctx_train           = %u\n",     __func__, hparams.n_ctx_train);
        LLAMA_LOG_INFO("%s: n_embd                = %u\n",     __func__, hparams.n_embd);
        LLAMA_LOG_INFO("%s: n_embd_inp            = %u\n",     __func__, hparams.n_embd_inp());
-        LLAMA_LOG_INFO("%s: n_layer               = %u\n",     __func__, hparams.n_layer);
-        LLAMA_LOG_INFO("%s: n_head                = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_head(il);    }, hparams.n_layer).c_str());
-        LLAMA_LOG_INFO("%s: n_head_kv             = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
+        LLAMA_LOG_INFO("%s: n_layer               = %u\n",     __func__, hparams.n_layer());
+        LLAMA_LOG_INFO("%s: n_head                = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_head(il);    }, hparams.n_layer()).c_str());
+        LLAMA_LOG_INFO("%s: n_head_kv             = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer()).c_str());
        LLAMA_LOG_INFO("%s: n_rot                 = %u\n",     __func__, hparams.n_rot_full);
        LLAMA_LOG_INFO("%s: n_swa                 = %u\n",     __func__, hparams.n_swa);
        LLAMA_LOG_INFO("%s: is_swa_any            = %u\n",     __func__, hparams.is_swa_any());
        LLAMA_LOG_INFO("%s: n_embd_head_k         = %u\n",     __func__, hparams.n_embd_head_k_full);
        LLAMA_LOG_INFO("%s: n_embd_head_v         = %u\n",     __func__, hparams.n_embd_head_v_full);
-        LLAMA_LOG_INFO("%s: n_gqa                 = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il);        }, hparams.n_layer).c_str());
-        LLAMA_LOG_INFO("%s: n_embd_k_gqa          = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer).c_str());
-        LLAMA_LOG_INFO("%s: n_embd_v_gqa          = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer).c_str());
+        LLAMA_LOG_INFO("%s: n_gqa                 = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il);        }, hparams.n_layer()).c_str());
+        LLAMA_LOG_INFO("%s: n_embd_k_gqa          = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer()).c_str());
+        LLAMA_LOG_INFO("%s: n_embd_v_gqa          = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer()).c_str());
        LLAMA_LOG_INFO("%s: f_norm_eps            = %.1e\n",   __func__, hparams.f_norm_eps);
        LLAMA_LOG_INFO("%s: f_norm_rms_eps        = %.1e\n",   __func__, hparams.f_norm_rms_eps);
        LLAMA_LOG_INFO("%s: f_clamp_kqv           = %.1e\n",   __func__, hparams.f_clamp_kqv);
@@ -1725,7 +1725,7 @@ void llama_model::print_info() const {
        LLAMA_LOG_INFO("%s: f_logit_scale         = %.1e\n",   __func__, hparams.f_logit_scale);
        LLAMA_LOG_INFO("%s: f_attn_scale          = %.1e\n",   __func__, hparams.f_attention_scale);
        LLAMA_LOG_INFO("%s: f_attn_value_scale    = %.4f\n",   __func__, hparams.f_attn_value_scale);
-        LLAMA_LOG_INFO("%s: n_ff                  = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer).c_str());
+        LLAMA_LOG_INFO("%s: n_ff                  = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer()).c_str());
        LLAMA_LOG_INFO("%s: n_expert              = %u\n",     __func__, hparams.n_expert);
        LLAMA_LOG_INFO("%s: n_expert_used         = %u\n",     __func__, hparams.n_expert_used);
        LLAMA_LOG_INFO("%s: n_expert_groups       = %d\n",     __func__, hparams.n_expert_groups);
@@ -1852,7 +1852,7 @@ void llama_model::print_info() const {
            LLAMA_LOG_INFO("%s: expert_weights_scale  = %.1f\n",   __func__, hparams.expert_weights_scale);
            LLAMA_LOG_INFO("%s: expert_weights_norm   = %d\n",     __func__, hparams.expert_weights_norm);
            LLAMA_LOG_INFO("%s: expert_gating_func    = %s\n",     __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
-            LLAMA_LOG_INFO("%s: nextn_predict_layers  = %d\n",     __func__, hparams.nextn_predict_layers);
+            LLAMA_LOG_INFO("%s: n_layer_nextn         = %d\n",     __func__, hparams.n_layer_nextn);
        }

        if (arch == LLM_ARCH_SMALLTHINKER || arch == LLM_ARCH_LFM2MOE) {
@@ -2034,22 +2034,21 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                    llama_memory_hybrid::layer_filter_cb filter_attn = nullptr;
                    llama_memory_hybrid::layer_filter_cb filter_recr = nullptr;
                    if (arch == LLM_ARCH_FALCON_H1) {
-                        filter_attn = [&](int32_t) { return true; };
-                        filter_recr = [&](int32_t) { return true; };
+                        filter_attn = [&](uint32_t) { return true; };
+                        filter_recr = [&](uint32_t) { return true; };
                    } else if (arch == LLM_ARCH_NEMOTRON_H || arch == LLM_ARCH_NEMOTRON_H_MOE) {
-                        filter_attn = [&](int32_t il) {
+                        filter_attn = [&](uint32_t il) {
                            return !hparams.is_recr(il) && hparams.n_ff(il) == 0;
                        };
-                        filter_recr = [&](int32_t il) {
+                        filter_recr = [&](uint32_t il) {
                            return hparams.is_recr(il) && hparams.n_ff(il) == 0;
                        };
                    } else if (arch == LLM_ARCH_QWEN35 || arch == LLM_ARCH_QWEN35MOE) {
-                        const uint32_t n_main = hparams.n_layer - hparams.nextn_predict_layers;
-                        filter_attn = [&, n_main](int32_t il) {
-                            return (uint32_t)il < n_main && !hparams.is_recr(il);
+                        filter_attn = [&](uint32_t il) {
+                            return il < hparams.n_layer() && !hparams.is_recr(il);
                        };
-                        filter_recr = [&, n_main](int32_t il) {
-                            return (uint32_t)il < n_main && hparams.is_recr(il);
+                        filter_recr = [&](uint32_t il) {
+                            return il < hparams.n_layer() && hparams.is_recr(il);
                        };
                    }

@@ -2098,9 +2097,11 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                    llama_kv_cache::layer_filter_cb filter = nullptr;

                    if (arch == LLM_ARCH_GEMMA3N || arch == LLM_ARCH_GEMMA4) {
-                        reuse = [&](int32_t il) {
-                            if (il >= (int32_t) hparams.n_layer_kv_from_start) {
-                                return (int32_t) hparams.n_layer_kv_from_start - (hparams.is_swa(il) ? 2 : 1);
+                        reuse = [&](uint32_t il) {
+                            GGML_ASSERT(hparams.n_layer_kv_from_start >= 2);
+
+                            if (il >= (uint32_t)hparams.n_layer_kv_from_start) {
+                                return hparams.n_layer_kv_from_start - (hparams.is_swa(il) ? 2 : 1);
                            }

                            return -1;
@@ -2108,16 +2109,14 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                    }

                    if (mtp_on_hybrid_qwen35) {
-                        const uint32_t n_main = hparams.n_layer - hparams.nextn_predict_layers;
-                        filter = [n_main](int32_t il) { return (uint32_t)il >= n_main; };
+                        filter = [&](uint32_t il) { return il >= hparams.n_layer(); };
                    }

-                    if (arch == LLM_ARCH_STEP35 && hparams.nextn_predict_layers > 0) {
-                        const uint32_t n_main = hparams.n_layer - hparams.nextn_predict_layers;
+                    if (arch == LLM_ARCH_STEP35 && hparams.n_layer_nextn > 0) {
                        if (params.ctx_type == LLAMA_CONTEXT_TYPE_MTP) {
-                            filter = [n_main](int32_t il) { return (uint32_t)il >= n_main; };
+                            filter = [&](uint32_t il) { return il >= hparams.n_layer(); };
                        } else {
-                            filter = [n_main](int32_t il) { return (uint32_t)il <  n_main; };
+                            filter = [&](uint32_t il) { return il <  hparams.n_layer(); };
                        }
                    }

@@ -2242,7 +2241,7 @@ int32_t llama_model_n_embd_out(const llama_model * model) {
 }

 int32_t llama_model_n_layer(const llama_model * model) {
-    return model->hparams.n_layer;
+    return model->hparams.n_layer();
 }

 int32_t llama_model_n_head(const llama_model * model) {
@@ -700,7 +700,8 @@ const char * llm_type_name(llm_type type);
 // convenience macro for loading local variables for load_tensors() in llama_model_base
 // note: cast to int64_t since we will use these for the tensor dimensions
 #define LLAMA_LOAD_LOCALS \
-    const int     n_layer        = hparams.n_layer;          GGML_UNUSED(n_layer); \
+    const int     n_layer        = hparams.n_layer();        GGML_UNUSED(n_layer); \
+    const int     n_layer_all    = hparams.n_layer_all;      GGML_UNUSED(n_layer_all); \
    const int64_t n_head         = hparams.n_head();         GGML_UNUSED(n_head); \
    const int64_t n_head_kv      = hparams.n_head_kv();      GGML_UNUSED(n_head_kv); \
    const int64_t n_embd         = hparams.n_embd;           GGML_UNUSED(n_embd); \
@@ -847,7 +847,7 @@ static void init_quantize_state_counters(quantize_state_impl & qs, std::vector<t
            qs.has_tied_embeddings = false;
        }
    }
-    qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)qs.model.hparams.n_layer;
+    qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)qs.model.hparams.n_layer();
 }

 //
@@ -1348,7 +1348,7 @@ llama_model * llama_quant_model_from_metadata(const llama_quant_model_desc * des
    model->hparams.n_embd             = desc->n_embd;
    model->hparams.n_embd_head_k_full = desc->n_embd_head_k;
    model->hparams.n_embd_head_v_full = desc->n_embd_head_v;
-    model->hparams.n_layer            = desc->n_layer;
+    model->hparams.n_layer_all        = desc->n_layer;
    model->hparams.n_expert           = desc->n_expert;

    for (uint32_t i = 0; i < desc->n_layer; i++) {
@@ -30,7 +30,7 @@ void llama_model_afmoe::load_arch_hparams(llama_model_loader & ml) {
        hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
    }

-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
        case 56: type = LLM_TYPE_6B; break;
        case 32: type = LLM_TYPE_26B; break;
        default: type = LLM_TYPE_UNKNOWN;
@@ -2,12 +2,13 @@

 void llama_model_apertus::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-    ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_N,        hparams.xielu_alpha_n, hparams.n_layer);
-    ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_P,        hparams.xielu_alpha_p, hparams.n_layer);
-    ml.get_key_or_arr(LLM_KV_XIELU_BETA,           hparams.xielu_beta,    hparams.n_layer);
-    ml.get_key_or_arr(LLM_KV_XIELU_EPS,            hparams.xielu_eps,     hparams.n_layer);

-    switch (hparams.n_layer) {
+    ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_N, hparams.xielu_alpha_n, hparams.n_layer());
+    ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_P, hparams.xielu_alpha_p, hparams.n_layer());
+    ml.get_key_or_arr(LLM_KV_XIELU_BETA,    hparams.xielu_beta,    hparams.n_layer());
+    ml.get_key_or_arr(LLM_KV_XIELU_EPS,     hparams.xielu_eps,     hparams.n_layer());
+
+    switch (hparams.n_layer()) {
        case 32: type = LLM_TYPE_8B; break;
        default: type = LLM_TYPE_UNKNOWN;
    }
@@ -4,7 +4,7 @@ void llama_model_arcee::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

    // Arcee uses the same structure as Llama
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
        case 36: type = LLM_TYPE_4B; break;
        default: type = LLM_TYPE_UNKNOWN;
    }
@@ -4,7 +4,7 @@ void llama_model_arctic::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

    if (hparams.n_expert == 128) {
-        switch (hparams.n_layer) {
+        switch (hparams.n_layer()) {
            case 35: type = LLM_TYPE_10B_128x3_66B; break;
            default: type = LLM_TYPE_UNKNOWN;
        }
@@ -10,7 +10,7 @@ void llama_model_arwkv7::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_ATTENTION_GATE_LORA_RANK,               hparams.n_lora_gate, false);
    ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT,                      hparams.token_shift_count, false);

-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
        case 12:
            switch (hparams.n_embd) {
                case 768: type = LLM_TYPE_190M; break;
@@ -2,7 +2,7 @@

 void llama_model_baichuan::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
        case 32: type = LLM_TYPE_7B; break;
        case 40: type = LLM_TYPE_13B; break;
        default: type = LLM_TYPE_UNKNOWN;
@@ -8,7 +8,7 @@ void llama_model_bailingmoe::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,        hparams.expert_weights_scale, false);
    ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,         hparams.expert_weights_norm, false);

-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
        case 28: type = LLM_TYPE_16B; break;
        case 88: type = LLM_TYPE_290B; break;
        default: type = LLM_TYPE_UNKNOWN;
@@ -9,17 +9,13 @@ void llama_model_bailingmoe2::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,              hparams.expert_weights_scale, false);
    ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,               hparams.expert_weights_norm, false);
    ml.get_key(LLM_KV_EXPERT_GATING_FUNC,                hparams.expert_gating_func);
-    ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS,              hparams.nextn_predict_layers, false);
-    GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer");
+    ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS,              hparams.n_layer_nextn, false);

-    // TODO: when MTP is implemented, this should probably be updated if needed
-    hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
+    GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer_all");

-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
        case 20: type = LLM_TYPE_16B_A1B; break;
-        case 21: type = LLM_TYPE_16B_A1B; break;
        case 32: type = LLM_TYPE_100B_A6B; break;
-        case 33: type = LLM_TYPE_100B_A6B; break;
        default: type = LLM_TYPE_UNKNOWN;
    }
 }
@@ -39,9 +35,9 @@ void llama_model_bailingmoe2::load_arch_tensors(llama_model_loader &) {
    GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for bailingmoe2");
    GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for bailingmoe2");

-    for (int i = 0; i < n_layer; ++i) {
+    for (int i = 0; i < n_layer_all; ++i) {
        int flags = 0;
-        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+        if (i >= n_layer) {
            // skip all tensors in the NextN layers
            flags |= TENSOR_SKIP;
        }
@@ -78,7 +74,7 @@ void llama_model_bailingmoe2::load_arch_tensors(llama_model_loader &) {
        }

        // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
-        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+        if (i >= n_layer) {
            layer.nextn.eh_proj          = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
            layer.nextn.embed_tokens     = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED | flags);
            layer.nextn.enorm            = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
@@ -112,8 +108,7 @@ llama_model_bailingmoe2::graph::graph(const llama_model & model, const llm_graph

    ggml_tensor * inp_out_ids = build_inp_out_ids();

-    const int n_transformer_layers = n_layer - hparams.nextn_predict_layers;
-    for (int il = 0; il < n_transformer_layers; ++il) {
+    for (int il = 0; il < n_layer; ++il) {
        ggml_tensor * inpSA = inpL;

        // norm
@@ -146,7 +141,7 @@ llama_model_bailingmoe2::graph::graph(const llama_model & model, const llm_graph
                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
        }

-        if (il == n_transformer_layers - 1 && inp_out_ids) {
+        if (il == n_layer - 1 && inp_out_ids) {
            cur   = ggml_get_rows(ctx0, cur, inp_out_ids);
            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
        }
@@ -1,9 +1,9 @@
 #include "models.h"

 void llama_model_bert::load_arch_hparams(llama_model_loader & ml) {
-    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,    hparams.f_norm_eps);
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);

-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
        case 3:
            type = LLM_TYPE_17M; break; // bge-micro
        case 6:
@@ -3,7 +3,7 @@
 void llama_model_bitnet::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
        case 26: type = LLM_TYPE_3B; break;
        default: type = LLM_TYPE_UNKNOWN;
    }
@@ -3,7 +3,7 @@
 void llama_model_bloom::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);

-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
        case 24: type = LLM_TYPE_1B; break;
        case 30:
            switch (hparams.n_embd) {
@@ -6,7 +6,7 @@ void llama_model_chameleon::load_arch_hparams(llama_model_loader & ml) {
    hparams.f_norm_eps = 1e-5;  // eps for qk-norm, torch default
    ml.get_key(LLM_KV_SWIN_NORM, hparams.swin_norm, false);

-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
        case 32: type = LLM_TYPE_7B; break;
        case 48: type = LLM_TYPE_34B; break;
        default: type = LLM_TYPE_UNKNOWN;
@@ -2,7 +2,8 @@

 void llama_model_chatglm::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-    switch (hparams.n_layer) {
+
+    switch (hparams.n_layer()) {
        case 28: {
            if (hparams.n_head(0) == 16) {
                type = LLM_TYPE_1_5B;
@@ -2,7 +2,8 @@

 void llama_model_codeshell::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
-    switch (hparams.n_layer) {
+
+    switch (hparams.n_layer()) {
        case 42: type = LLM_TYPE_7B; break;
        default: type = LLM_TYPE_UNKNOWN;
    }
@@ -2,7 +2,8 @@

 void llama_model_cogvlm::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-    switch (hparams.n_layer) {
+
+    switch (hparams.n_layer()) {
        case 32: type = LLM_TYPE_13B; break;
        default: type = LLM_TYPE_UNKNOWN;
    }
@@ -5,6 +5,7 @@ void llama_model_cohere2::load_arch_hparams(llama_model_loader & ml) {
    uint32_t swa_period = 4;
    ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
    hparams.set_swa_pattern(swa_period);
+
    hparams.rope_freq_base_train_swa  = hparams.rope_freq_base_train;
    hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;

@@ -12,7 +13,8 @@ void llama_model_cohere2::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
    ml.get_key(LLM_KV_LOGIT_SCALE,              hparams.f_logit_scale);
    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,  hparams.f_norm_eps);
-    switch (hparams.n_layer) {
+
+    switch (hparams.n_layer()) {
        case 32: type = LLM_TYPE_8B; break;
        default: type = LLM_TYPE_UNKNOWN;
    }
@@ -3,7 +3,8 @@
 void llama_model_command_r::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_LOGIT_SCALE,             hparams.f_logit_scale, false);
    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
-    switch (hparams.n_layer) {
+
+    switch (hparams.n_layer()) {
        case 40: type = LLM_TYPE_35B; break;
        default: type = LLM_TYPE_UNKNOWN;
    }
@@ -1,14 +1,14 @@
 #include "models.h"

 void llama_model_dbrx::load_arch_hparams(llama_model_loader & ml) {
-ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
-ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV,     hparams.f_clamp_kqv);
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+    ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV,     hparams.f_clamp_kqv);

-switch (hparams.n_layer) {
-    case 40: type = LLM_TYPE_16x12B; break;
-    default: type = LLM_TYPE_UNKNOWN;
+    switch (hparams.n_layer()) {
+        case 40: type = LLM_TYPE_16x12B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
 }
-        }

 void llama_model_dbrx::load_arch_tensors(llama_model_loader &) {
    LLAMA_LOAD_LOCALS;
@@ -2,7 +2,8 @@

 void llama_model_deci::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-    switch (hparams.n_layer) {
+
+    switch (hparams.n_layer()) {
        case 32: type = LLM_TYPE_7B; break;
        case 80: type = LLM_TYPE_70B; break;
        case 162: type = LLM_TYPE_405B; break;
@@ -5,7 +5,7 @@ void llama_model_deepseek2::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false);

    // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B, Kanana-2-30B-A3B
-    const bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26 || (hparams.n_layer == 48 && n_vocab == 128256));
+    const bool is_lite = (hparams.n_layer() == 27 || hparams.n_layer() == 26 || (hparams.n_layer() == 48 && n_vocab == 128256));

    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
    ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,   hparams.n_layer_dense_lead, false);
@@ -23,7 +23,7 @@ void llama_model_deepseek2::load_arch_hparams(llama_model_loader & ml) {
    if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
        // for compatibility with existing DeepSeek V2 and V2.5 GGUFs
        // that have no expert_gating_func model parameter set
-        if ((hparams.n_layer == 47 || hparams.n_layer == 48) && n_vocab == 154880) {
+        if ((hparams.n_layer() == 47 || hparams.n_layer() == 48) && n_vocab == 154880) {
            // GLM 4.7 Lite
            hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
        } else {
@@ -43,7 +43,7 @@ void llama_model_deepseek2::load_arch_hparams(llama_model_loader & ml) {

    hparams.f_attn_temp_offset = 0.0f;

-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
        case 27: type = LLM_TYPE_16B; break;
        case 47: type = LLM_TYPE_30B_A3B; break;
        case 60: type = LLM_TYPE_236B; break;
@@ -191,8 +191,7 @@ llama_model_deepseek2::graph::graph(const llama_model & model, const llm_graph_p

    ggml_tensor * inp_out_ids = build_inp_out_ids();

-    int effective_n_layers = hparams.n_layer - hparams.nextn_predict_layers;
-    for (int il = 0; il < effective_n_layers; ++il) {
+    for (int il = 0; il < n_layer; ++il) {
        ggml_tensor * inpSA = inpL;

        // norm
@@ -366,7 +365,7 @@ llama_model_deepseek2::graph::graph(const llama_model & model, const llm_graph_p
                            Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
            }
        }
-        if (il == effective_n_layers - 1 && inp_out_ids) {
+        if (il == n_layer - 1 && inp_out_ids) {
            cur   = ggml_get_rows(ctx0, cur, inp_out_ids);
            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
        }
@@ -14,7 +14,7 @@ void llama_model_deepseek2ocr::load_arch_hparams(llama_model_loader & ml) {
        hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
    }

-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
        case 12: type = LLM_TYPE_3B; break;
        default: type = LLM_TYPE_UNKNOWN;
    }
@@ -31,7 +31,7 @@ void llama_model_deepseek32::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_ATTENTION_INDEXER_TOP_K,      hparams.indexer_top_k);

    // Expert gating function
-    ml.get_key(LLM_KV_EXPERT_GATING_FUNC,          hparams.expert_gating_func);
+    ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);

    if (ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f)) {
        // [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
@@ -40,13 +40,10 @@ void llama_model_deepseek32::load_arch_hparams(llama_model_loader & ml) {
    }

    // NextN/MTP parameters
-    ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS,        hparams.nextn_predict_layers, false);
-    GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer");
+    ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false);
+    GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer_all");

-    // TODO: when MTP is implemented, this should probably be updated if needed
-    hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
-
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
        case 62: type = LLM_TYPE_685B_A37B; break;
        default: type = LLM_TYPE_UNKNOWN;
    }
@@ -82,9 +79,9 @@ void llama_model_deepseek32::load_arch_tensors(llama_model_loader &) {
        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
    }

-    for (int i = 0; i < n_layer; ++i) {
+    for (int i = 0; i < n_layer_all; ++i) {
        int flags = 0;
-        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+        if (i >= n_layer) {
            // skip all tensors in the NextN layers
            // TODO @ngxson : TENSOR_NOT_REQUIRED was a hack, need to remove it later
            flags |= TENSOR_SKIP | TENSOR_NOT_REQUIRED;
@@ -142,7 +139,7 @@ void llama_model_deepseek32::load_arch_tensors(llama_model_loader &) {
        }

        // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
-        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+        if (i >= n_layer) {
            layer.nextn.eh_proj          = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
            layer.nextn.enorm            = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
            layer.nextn.hnorm            = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
@@ -205,8 +202,7 @@ llama_model_deepseek32::graph::graph(const llama_model & model, const llm_graph_

    ggml_tensor * inp_out_ids = build_inp_out_ids();

-    int effective_n_layers = hparams.n_layer - hparams.nextn_predict_layers;
-    for (int il = 0; il < effective_n_layers; ++il) {
+    for (int il = 0; il < n_layer; ++il) {
        ggml_tensor * inpSA = inpL;

        // norm
@@ -427,7 +423,7 @@ llama_model_deepseek32::graph::graph(const llama_model & model, const llm_graph_
                        Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].wv_b, top_k, kq_scale, il);
            }
        }
-        if (il == effective_n_layers - 1 && inp_out_ids) {
+        if (il == n_layer - 1 && inp_out_ids) {
            cur   = ggml_get_rows(ctx0, cur, inp_out_ids);
            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
        }
@@ -8,7 +8,8 @@ void llama_model_dots1::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,        hparams.expert_weights_scale, false);
    ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,         hparams.expert_weights_norm, false);
    ml.get_key(LLM_KV_EXPERT_GATING_FUNC,          hparams.expert_gating_func, false);
-    switch (hparams.n_layer) {
+
+    switch (hparams.n_layer()) {
        case 62: type = LLM_TYPE_142B; break;
        default: type = LLM_TYPE_UNKNOWN;
    }
@@ -2,8 +2,9 @@

 void llama_model_dream::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
    // Dream models are primarily 7B with 28 layers
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
        case 28:
            type = LLM_TYPE_7B;
            break;
@@ -12,7 +12,7 @@ void llama_model_ernie4_5::load_arch_hparams(llama_model_loader & ml) {
        ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,         hparams.n_layer_dense_lead, false);
    }

-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
        case 18: type = LLM_TYPE_0_3B; break;
        case 28: type = LLM_TYPE_21B_A3B; break;
        case 54: type = LLM_TYPE_300B_A47B; break;
@@ -3,7 +3,7 @@
 void llama_model_eurobert::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

-    if (hparams.n_layer == 12) {
+    if (hparams.n_layer() == 12) {
        type = LLM_TYPE_SMALL;  // 0.2B
    }
 }
@@ -20,13 +20,12 @@ void llama_model_exaone_moe::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,               hparams.expert_weights_norm, false);
    ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,         hparams.n_layer_dense_lead, false);

-    ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS,              hparams.nextn_predict_layers, false);
-    GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer");
+    ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false);
+    GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer_all");

-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
        case 32: type = LLM_TYPE_30B_A3B; break;
-        case 48:
-        case 49: type = LLM_TYPE_235B_A22B; break;
+        case 48: type = LLM_TYPE_235B_A22B; break;
        default: type = LLM_TYPE_UNKNOWN;
    }
 }
@@ -50,9 +49,9 @@ void llama_model_exaone_moe::load_arch_tensors(llama_model_loader &) {
        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
    }

-    for (int i = 0; i < n_layer; ++i) {
+    for (int i = 0; i < n_layer_all; ++i) {
        int flags = 0;
-        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+        if (i >= n_layer) {
            // skip all tensors in the NextN layers
            flags |= TENSOR_SKIP;
        }
@@ -70,7 +69,7 @@ void llama_model_exaone_moe::load_arch_tensors(llama_model_loader &) {
        layer.ffn_norm     = create_tensor(tn(LLM_TENSOR_FFN_NORM,    "weight", i), {n_embd}, flags);

        // dense layers for first n_layer_dense_lead layers or nextn_predict_layers layers at the end
-        if (i < (int) hparams.n_layer_dense_lead || (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers)) {
+        if (i < (int) hparams.n_layer_dense_lead || (i >= n_layer)) {
            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, flags);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, flags);
            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, flags);
@@ -95,7 +94,7 @@ void llama_model_exaone_moe::load_arch_tensors(llama_model_loader &) {
        }

        // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
-        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+        if (i >= n_layer) {
            layer.nextn.eh_proj          = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), {2 * n_embd, n_embd}, flags);
            layer.nextn.enorm            = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM,   "weight", i), {n_embd}, flags);
            layer.nextn.hnorm            = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM,   "weight", i), {n_embd}, flags);
@@ -130,8 +129,7 @@ llama_model_exaone_moe::graph::graph(const llama_model & model, const llm_graph_

    ggml_tensor * inp_out_ids = build_inp_out_ids();

-    const int n_transformer_layers = n_layer - hparams.nextn_predict_layers;
-    for (int il = 0; il < n_transformer_layers; ++il) {
+    for (int il = 0; il < n_layer; ++il) {
        ggml_tensor * inpSA = inpL;

        // use RoPE for SWA layers
@@ -170,7 +168,7 @@ llama_model_exaone_moe::graph::graph(const llama_model & model, const llm_graph_
                Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
            cb(cur, "attn_out", il);
        }
-        if (il == n_transformer_layers - 1 && inp_out_ids) {
+        if (il == n_layer - 1 && inp_out_ids) {
            cur   = ggml_get_rows(ctx0, cur, inp_out_ids);
            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
        }
@@ -3,7 +3,7 @@
 void llama_model_exaone::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
        case 32: type = LLM_TYPE_8B; break;
        default: type = LLM_TYPE_UNKNOWN;
    }
@@ -1,7 +1,7 @@
 #include "models.h"

 void llama_model_exaone4::load_arch_hparams(llama_model_loader & ml) {
-    if (hparams.n_layer == 64) {    // 32B
+    if (hparams.n_layer() == 64) {    // 32B
        hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
        hparams.n_swa = 4096;
        uint32_t swa_period = 4;
@@ -15,11 +15,11 @@ void llama_model_exaone4::load_arch_hparams(llama_model_loader & ml) {

    ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,    hparams.n_swa, false);
    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-    ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS,        hparams.nextn_predict_layers, false);
-    GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer");
-    hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
+    ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS,        hparams.n_layer_nextn, false);

-    switch (hparams.n_layer) {
+    GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer_all");
+
+    switch (hparams.n_layer()) {
        case 30: type = LLM_TYPE_1_2B; break;
        case 64: type = LLM_TYPE_32B; break;
        default: type = LLM_TYPE_UNKNOWN;
@@ -40,8 +40,8 @@ void llama_model_exaone4::load_arch_tensors(llama_model_loader &) {
        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
    }

-    for (int i = 0; i < n_layer; ++i) {
-        const bool is_nextn = hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers;
+    for (int i = 0; i < n_layer_all; ++i) {
+        const bool is_nextn = i >= n_layer;
        int flags = 0;
        if (is_nextn) {
            // NextN/MTP layers are preserved in GGUF but are not executed yet.
@@ -109,11 +109,7 @@ llama_model_exaone4::graph<iswa>::graph(const llama_model & model, const llm_gra
    }
    ggml_tensor * inp_out_ids = build_inp_out_ids();

-    // MTP / NextN tail blocks are loaded for compatibility but not executed (same as exaone-moe).
-    const int n_layer_main = int(n_layer) - int(hparams.nextn_predict_layers);
-    GGML_ASSERT(n_layer_main > 0);
-
-    for (int il = 0; il < n_layer_main; ++il) {
+    for (int il = 0; il < n_layer; ++il) {
        ggml_tensor * inpSA = inpL;

        // use RoPE for SWA layers or non-SWA models
@@ -149,7 +145,7 @@ llama_model_exaone4::graph<iswa>::graph(const llama_model & model, const llm_gra
                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
            cb(cur, "attn_out", il);
        }
-        if (il == n_layer_main - 1 && inp_out_ids) {
+        if (il == n_layer - 1 && inp_out_ids) {
            cur   = ggml_get_rows(ctx0, cur, inp_out_ids);
            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
        }
@@ -13,7 +13,7 @@ void llama_model_falcon_h1::load_arch_hparams(llama_model_loader & ml) {

    std::fill(hparams.is_recr_impl.begin(), hparams.is_recr_impl.end(), true);

-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
        case 36:
            type = LLM_TYPE_0_5B; break;
        case 24:
@@ -3,7 +3,7 @@
 void llama_model_falcon::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);

-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
        case 32: type = LLM_TYPE_7B; break;
        case 60: type = LLM_TYPE_40B; break;
        default: type = LLM_TYPE_UNKNOWN;
@@ -21,7 +21,7 @@ void llama_model_gemma_embedding::load_arch_hparams(llama_model_loader & ml) {
    GGML_ASSERT((hparams.dense_2_feat_in == 0 || hparams.dense_2_feat_in == hparams.n_embd) && "dense_2_feat_in must be equal to n_embd");
    GGML_ASSERT((hparams.dense_3_feat_out == 0 || hparams.dense_3_feat_out == hparams.n_embd) && "dense_3_feat_out must be equal to n_embd");

-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
        case 24: type = LLM_TYPE_0_3B; break;
        default: type = LLM_TYPE_UNKNOWN;
    }
@@ -3,7 +3,7 @@
 void llama_model_gemma::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
        case 18: type = LLM_TYPE_2B; break;
        case 28: type = LLM_TYPE_7B; break;
        default: type = LLM_TYPE_UNKNOWN;
@@ -16,7 +16,7 @@ void llama_model_gemma2::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING,      hparams.f_attn_logit_softcapping, false);
    ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING,     hparams.f_final_logit_softcapping, false);

-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
        case 26: type = LLM_TYPE_2B; break;
        case 42: type = LLM_TYPE_9B; break;
        case 46: type = LLM_TYPE_27B; break;
@@ -17,7 +17,7 @@ void llama_model_gemma3::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
        case 18: type = LLM_TYPE_270M; break;
        case 26: type = LLM_TYPE_1B; break;
        case 32: type = LLM_TYPE_8B; break; // Rnj-1
@@ -6,14 +6,14 @@ void llama_model_gemma3n::load_arch_hparams(llama_model_loader & ml) {
    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
    hparams.set_swa_pattern(swa_period);

-    hparams.n_layer_kv_from_start     = 20;
-    hparams.f_attention_scale         = 1.0f;
+    hparams.n_layer_kv_from_start = 20;
+    hparams.f_attention_scale     = 1.0f;

    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA,          hparams.rope_freq_base_train_swa, false);
    ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,    hparams.n_swa);
    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
        case 30: type = LLM_TYPE_E2B; break;
        case 35: type = LLM_TYPE_E4B; break;
        default: type = LLM_TYPE_UNKNOWN;
@@ -2,12 +2,12 @@

 void llama_model_gemma4::load_arch_hparams(llama_model_loader & ml) {
    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
-    ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer);
+    ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer());

    uint32_t n_kv_shared_layers = 0;
    ml.get_key(LLM_KV_ATTENTION_SHARED_KV_LAYERS, n_kv_shared_layers, false);

-    hparams.n_layer_kv_from_start = hparams.n_layer - (int32_t)n_kv_shared_layers;
+    hparams.n_layer_kv_from_start = hparams.n_layer_all - (int32_t)n_kv_shared_layers;
    hparams.f_attention_scale     = 1.0f; // Gemma4 uses self.scaling = 1.0 (no pre-attn scaling)

    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA,          hparams.rope_freq_base_train_swa, false);
@@ -19,7 +19,7 @@ void llama_model_gemma4::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_SWA,  hparams.n_embd_head_v_swa);
    ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING,     hparams.f_final_logit_softcapping, false);

-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
        case 30: type = LLM_TYPE_26B_A4B; break;
        case 35: type = LLM_TYPE_E2B; break;
        case 42: type = LLM_TYPE_E4B; break;
@@ -33,13 +33,10 @@ void llama_model_glm_dsa::load_arch_hparams(llama_model_loader & ml) {
    }

    // NextN/MTP parameters
-    ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS,        hparams.nextn_predict_layers, false);
-    GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer");
+    ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false);
+    GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer_all");

-    // TODO: when MTP is implemented, this should probably be updated if needed
-    hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
-
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
        case 79: type = LLM_TYPE_744B_A40B; break;
        default: type = LLM_TYPE_UNKNOWN;
    }
@@ -76,9 +73,9 @@ void llama_model_glm_dsa::load_arch_tensors(llama_model_loader &) {
        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
    }

-    for (int i = 0; i < n_layer; ++i) {
+    for (int i = 0; i < n_layer_all; ++i) {
        int flags = 0;
-        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+        if (i >= n_layer) {
            // skip all tensors in the NextN layers
            // TODO @ngxson : TENSOR_NOT_REQUIRED was a hack, need to remove it later
            flags |= TENSOR_SKIP | TENSOR_NOT_REQUIRED;
@@ -135,8 +132,8 @@ void llama_model_glm_dsa::load_arch_tensors(llama_model_loader &) {
            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_exp * n_expert_shared}, flags);
        }

-        // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
-        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+        // NextN/MTP tensors (preserved but unused) - only created for the last n_layer_nextn layers
+        if (i >= n_layer) {
            layer.nextn.eh_proj          = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
            layer.nextn.enorm            = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
            layer.nextn.hnorm            = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
@@ -20,16 +20,13 @@ void llama_model_glm4_moe::load_arch_hparams(llama_model_loader & ml) {
    }

    // NextN/MTP parameters
-    ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS,        hparams.nextn_predict_layers, false);
-    GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer");
+    ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false);
+    GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer_all");

-    // TODO: when MTP is implemented, this should probably be updated if needed
-    hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
-
-    switch (hparams.n_layer) {
-        case 47: type = LLM_TYPE_106B_A12B; break; // GLM-4.5-Air (46 layers + 1 NextN layer)
-        case 48: type = LLM_TYPE_102B_A12B; break; // Solar Open
-        case 93: type = LLM_TYPE_355B_A32B; break; // GLM-4.5 (92 layers + 1 NextN layer)
+    switch (hparams.n_layer()) {
+        case 46: type = LLM_TYPE_106B_A12B; break; // GLM-4.5-Air
+        case 48: type = LLM_TYPE_102B_A12B; break; // Solar Open
+        case 92: type = LLM_TYPE_355B_A32B; break; // GLM-4.5
        default: type = LLM_TYPE_UNKNOWN;
    }
 }
@@ -54,9 +51,9 @@ void llama_model_glm4_moe::load_arch_tensors(llama_model_loader &) {

    // Load ALL tensors including NextN layer to satisfy total tensor count
    // but only PROCESS up to last layer (skipping final NextN layer) in forward pass
-    for (int i = 0; i < n_layer; ++i) {
+    for (int i = 0; i < n_layer_all; ++i) {
        int flags = 0;
-        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+        if (i >= n_layer) {
            // skip all tensors in the NextN layers
            flags |= TENSOR_SKIP;
        }
@@ -116,7 +113,7 @@ void llama_model_glm4_moe::load_arch_tensors(llama_model_loader &) {
        }

        // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
-        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+        if (i >= n_layer) {
            layer.nextn.eh_proj          = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
            layer.nextn.enorm            = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
            layer.nextn.hnorm            = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
@@ -161,8 +158,7 @@ llama_model_glm4_moe::graph::graph(const llama_model & model, const llm_graph_pa

    // Only process up to last layer (skip final NextN layer)
    // Final layer tensors are loaded but not processed in forward pass
-    const int n_transformer_layers = n_layer - hparams.nextn_predict_layers;
-    for (int il = 0; il < n_transformer_layers; ++il) {
+    for (int il = 0; il < n_layer; ++il) {
        ggml_tensor * inpSA = inpL;

        // Pre-attention norm
@@ -211,7 +207,7 @@ llama_model_glm4_moe::graph::graph(const llama_model & model, const llm_graph_pa
                    model.layers[il].wo, NULL, model.layers[il].wo_s,
                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
        }
-        if (il == n_transformer_layers - 1 && inp_out_ids) {
+        if (il == n_layer - 1 && inp_out_ids) {
            cur   = ggml_get_rows(ctx0, cur, inp_out_ids);
            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
        }
@@ -5,13 +5,10 @@ void llama_model_glm4::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);

    // NextN/MTP parameters (GLM-OCR)
-    ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
-    GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer");
+    ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false);
+    GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer_all");

-    // TODO: when MTP is implemented, this should probably be updated if needed
-    hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
-
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
        case 17: type = LLM_TYPE_1B; break; // GLM-OCR
        case 40: type = LLM_TYPE_9B; break;
        case 61: type = LLM_TYPE_32B; break;
@@ -32,9 +29,9 @@ void llama_model_glm4::load_arch_tensors(llama_model_loader &) {
        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
    }

-    for (int i = 0; i < n_layer; ++i) {
+    for (int i = 0; i < n_layer_all; ++i) {
        int flags = 0;
-        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+        if (i >= n_layer) {
            // skip all tensors in the NextN layers
            flags |= TENSOR_SKIP;
        }
@@ -55,7 +52,7 @@ void llama_model_glm4::load_arch_tensors(llama_model_loader &) {
        layer.ffn_post_norm  = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, flags);

        // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
-        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+        if (i >= n_layer) {
            layer.nextn.eh_proj          = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
            layer.nextn.enorm            = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
            layer.nextn.hnorm            = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
@@ -100,8 +97,7 @@ llama_model_glm4::graph::graph(const llama_model & model, const llm_graph_params

    // Only process up to last layer (skip final NextN layer)
    // Final layer tensors are loaded but not processed in forward pass
-    const int n_transformer_layers = n_layer - hparams.nextn_predict_layers;
-    for (int il = 0; il < n_transformer_layers; ++il) {
+    for (int il = 0; il < n_layer; ++il) {
        ggml_tensor * inpSA = inpL;

        // Pre-attention norm
@@ -140,7 +136,7 @@ llama_model_glm4::graph::graph(const llama_model & model, const llm_graph_params
                    model.layers[il].wo, NULL, model.layers[il].wo_s,
                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
        }
-        if (il == n_transformer_layers - 1 && inp_out_ids) {
+        if (il == n_layer - 1 && inp_out_ids) {
            cur   = ggml_get_rows(ctx0, cur, inp_out_ids);
            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
        }
@@ -2,7 +2,8 @@

 void llama_model_gpt2::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
-    switch (hparams.n_layer) {
+
+    switch (hparams.n_layer()) {
        case 12: type = LLM_TYPE_SMALL; break;
        case 24: type = LLM_TYPE_MEDIUM; break;
        case 36: type = LLM_TYPE_LARGE; break;
@@ -3,7 +3,8 @@
 void llama_model_gptneox::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
    ml.get_key(LLM_KV_USE_PARALLEL_RESIDUAL,   hparams.use_par_res);
-    switch (hparams.n_layer) {
+
+    switch (hparams.n_layer()) {
        case 6:
            switch (hparams.n_ff()) {
                case 512:  type = LLM_TYPE_14M; break;
@@ -19,7 +19,7 @@ void llama_model_granite_hybrid::load_arch_hparams(llama_model_loader & ml) {
    hparams.rope_finetuned = rope_finetuned;

    // A layer is recurrent IFF the n_head_kv value is set to 0
-    for (uint32_t i = 0; i < hparams.n_layer; ++i) {
+    for (uint32_t i = 0; i < hparams.n_layer(); ++i) {
        hparams.is_recr_impl[i] = hparams.n_head_kv(i) == 0;
    }

@@ -12,7 +12,7 @@ void llama_model_granite_moe::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
    hparams.rope_finetuned = rope_finetuned;

-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
        case 32: type = LLM_TYPE_3B; break;
        case 40: type = LLM_TYPE_3B; break;
        // Add additional layer/vocab/etc checks here for other model sizes
@@ -12,7 +12,7 @@ void llama_model_granite::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
    hparams.rope_finetuned = rope_finetuned;

-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
        case 32: type = LLM_TYPE_3B; break;
        case 40: type = LLM_TYPE_3B; break;
        // Add additional layer/vocab/etc checks here for other model sizes
@@ -26,7 +26,7 @@ void llama_model_grok::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST,   hparams.yarn_beta_fast, false);
    ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW,   hparams.yarn_beta_slow, false);

-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
        case 64: type = LLM_TYPE_314B; break;
        default: type = LLM_TYPE_UNKNOWN;
    }
@@ -7,7 +7,7 @@ void llama_model_grovemoe::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_EXPERTS_PER_GROUP,                 hparams.n_group_experts);
    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,       hparams.f_norm_rms_eps);

-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
        case 48: type = LLM_TYPE_30B_A3B; break;
        default: type = LLM_TYPE_UNKNOWN;
    }
@@ -5,7 +5,7 @@ void llama_model_hunyuan_moe::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp);
    ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);

-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
        case 32: type = LLM_TYPE_A13B; break;
        default: type = LLM_TYPE_UNKNOWN;
    }
@@ -2,7 +2,8 @@

 void llama_model_internlm2::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-    switch (hparams.n_layer) {
+
+    switch (hparams.n_layer()) {
        case 32: type = LLM_TYPE_7B; break;
        case 48: type = LLM_TYPE_20B; break;
        default: type = LLM_TYPE_UNKNOWN;
@@ -4,7 +4,7 @@ void llama_model_jais::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
    ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias, false);

-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
        case 24: type = LLM_TYPE_1_3B; break;
        case 40: type = LLM_TYPE_13B; break;
        /* TODO: add variants */
@@ -3,7 +3,7 @@
 void llama_model_jais2::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);

-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
        case 32: type = LLM_TYPE_8B; break;
        case 68: type = LLM_TYPE_70B; break;
        default: type = LLM_TYPE_UNKNOWN;
@@ -8,11 +8,11 @@ void llama_model_jamba::load_arch_hparams(llama_model_loader & ml) {

    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

-    for (uint32_t i = 0; i < hparams.n_layer; ++i) {
+    for (uint32_t i = 0; i < hparams.n_layer(); ++i) {
        hparams.is_recr_impl[i] = hparams.n_head_kv(i) == 0;
    }

-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
        // TODO: Jamba layers are a bit heterogeneous, so naming this is hard.
        case 12: // 900M  8x???M
        case 32: // 51B  16x?B
@@ -4,7 +4,7 @@ void llama_model_jina_bert_v2::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,    hparams.f_norm_eps);
    hparams.f_max_alibi_bias = 8.0f;

-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
        case 4:  type = LLM_TYPE_33M;  break; // jina-embeddings-small
        case 12: type = LLM_TYPE_137M; break; // jina-embeddings-base
        default: type = LLM_TYPE_UNKNOWN;
@@ -3,7 +3,7 @@
 void llama_model_jina_bert_v3::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,    hparams.f_norm_eps);

-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
        case 24:
            type = LLM_TYPE_558M; break;
        default: type = LLM_TYPE_UNKNOWN;
@@ -14,7 +14,7 @@ void llama_model_kimi_linear::load_arch_hparams(llama_model_loader & ml) {

    // Mark KDA layers as recurrent using n_head_kv pattern (like Jamba)
    // Set n_head_kv = 0 for KDA layers (recurrent), n_head_kv = n_head for MLA layers (attention)
-    for (uint32_t i = 0; i < hparams.n_layer; ++i) {
+    for (uint32_t i = 0; i < hparams.n_layer(); ++i) {
        hparams.is_recr_impl[i] = hparams.n_head_kv(i) == 0;  // KDA layers are recurrent
    }

@@ -25,7 +25,7 @@ void llama_model_kimi_linear::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,              hparams.expert_weights_scale, false);
    ml.get_key(LLM_KV_EXPERT_GATING_FUNC,                hparams.expert_gating_func);

-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
        case 27: type = LLM_TYPE_48B_A3B; break; // Kimi-Linear-48B-A3B
        default: type = LLM_TYPE_UNKNOWN;
    }
@@ -5,10 +5,13 @@
 void llama_model_lfm2::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_SHORTCONV_L_CACHE,           hparams.n_shortconv_l_cache);
    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-    for (uint32_t il = 0; il < hparams.n_layer; ++il) {
+
+    for (uint32_t il = 0; il < hparams.n_layer(); ++il) {
        hparams.is_recr_impl[il] = hparams.n_head_kv(il) == 0;
    }
-    hparams.n_layer_dense_lead = hparams.n_layer;
+
+    hparams.n_layer_dense_lead = hparams.n_layer();
+
    switch (hparams.n_ff()) {
        case  4608: type = LLM_TYPE_350M; break;
        case  6912: type = LLM_TYPE_700M; break;
@@ -16,9 +19,10 @@ void llama_model_lfm2::load_arch_hparams(llama_model_loader & ml) {
        case 10752: type = LLM_TYPE_2_6B; break;
        default:    type = LLM_TYPE_UNKNOWN;
    }
+
    if (const auto is_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false); is_swa && hparams.n_swa > 0) {
        hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
-        for (uint32_t il = 0; il < hparams.n_layer; ++il) {
+        for (uint32_t il = 0; il < hparams.n_layer(); ++il) {
            hparams.is_swa_impl[il] = !hparams.is_recr_impl[il];
        }
    }
@@ -9,11 +9,11 @@ void llama_model_lfm2moe::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp);
    ml.get_key(LLM_KV_EXPERT_GATING_FUNC,          hparams.expert_gating_func);

-    for (uint32_t il = 0; il < hparams.n_layer; ++il) {
+    for (uint32_t il = 0; il < hparams.n_layer(); ++il) {
        hparams.is_recr_impl[il] = hparams.n_head_kv(il) == 0;
    }

-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
        case 24: type = LLM_TYPE_8B_A1B;  break;
        case 40: type = LLM_TYPE_24B_A2B; break;
        default: type = LLM_TYPE_UNKNOWN;
@@ -2,11 +2,12 @@

 void llama_model_llada_moe::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
-
    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
    // diffusion language model uses non-causal attention
    hparams.causal_attn = false;
-    switch (hparams.n_layer) {
+
+    switch (hparams.n_layer()) {
        case 16: type = LLM_TYPE_A1_7B; break;
        default: type = LLM_TYPE_UNKNOWN;
    }
@@ -2,14 +2,16 @@

 void llama_model_llada::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
    // LLaDA-8B has 32 layers, similar to LLaMA but for diffusion
-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
        case 32:
            type = LLM_TYPE_8B;
            break;
        default:
            type = LLM_TYPE_UNKNOWN;
    }
+
    // Set non-causal attention for diffusion models
    hparams.causal_attn = false;
 }
@@ -7,13 +7,13 @@ void llama_model_llama::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

    if (hparams.n_expert == 8) {
-        switch (hparams.n_layer) {
+        switch (hparams.n_layer()) {
            case 32: type = LLM_TYPE_8x7B; break;
            case 56: type = LLM_TYPE_8x22B; break;
            default: type = LLM_TYPE_UNKNOWN;
        }
    } else {
-        switch (hparams.n_layer) {
+        switch (hparams.n_layer()) {
            case 16: type = LLM_TYPE_1B; break; // Llama 3.2 1B
            case 22: type = LLM_TYPE_1B; break;
            case 26: type = LLM_TYPE_3B; break;
@@ -8,7 +8,7 @@ void llama_model_llama4::load_arch_hparams(llama_model_loader & ml) {
    const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
    if (found_swa && hparams.n_swa == 0) {
        hparams.swa_type             = LLAMA_SWA_TYPE_NONE;
-        hparams.n_no_rope_layer_step = hparams.n_layer; // always use rope
+        hparams.n_no_rope_layer_step = hparams.n_layer(); // always use rope
    } else {
        hparams.swa_type                = LLAMA_SWA_TYPE_CHUNKED;
        hparams.n_swa                   = 8192;
@@ -2,7 +2,8 @@

 void llama_model_maincoder::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-    switch (hparams.n_layer) {
+
+    switch (hparams.n_layer()) {
        case 32: type = LLM_TYPE_1B; break;
        default: type = LLM_TYPE_UNKNOWN;
    }
@@ -9,7 +9,7 @@ void llama_model_mamba::load_arch_hparams(llama_model_loader & ml) {

    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
        case 24:
            switch (hparams.n_embd) {
                case 768: type = LLM_TYPE_SMALL; break;
@@ -9,7 +9,7 @@ void llama_model_mamba2::load_arch_hparams(llama_model_loader & ml) {

    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
        case 24:
            switch (hparams.n_embd) {
                case 768: type = LLM_TYPE_SMALL; break;
@@ -13,7 +13,7 @@ void llama_model_mellum::load_arch_hparams(llama_model_loader & ml) {
        if (res) {
            hparams.set_swa_pattern(swa_period);
        } else {
-            ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer);
+            ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer());
        }

        hparams.rope_freq_base_train_swa  = hparams.rope_freq_base_train;
@@ -24,7 +24,7 @@ void llama_model_mellum::load_arch_hparams(llama_model_loader & ml) {
        hparams.swa_type = LLAMA_SWA_TYPE_NONE;
    }

-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
        case 28: type = LLM_TYPE_12B_A2_5B; break;
        default: type = LLM_TYPE_UNKNOWN;
    }
@@ -9,18 +9,17 @@ void llama_model_mimo2::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,   hparams.n_swa);
    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA,         hparams.rope_freq_base_train_swa, false);

-    ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer);
+    ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer());

    float value_scale = 0.0f;
    if (ml.get_key(LLM_KV_ATTENTION_VALUE_SCALE, value_scale, false) && value_scale != 1.0f) {
        hparams.f_attn_value_scale = value_scale;
    }

-    ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
-    GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer");
-    hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
+    ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false);
+    GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer_all");

-    switch (hparams.n_layer - hparams.nextn_predict_layers) {
+    switch (hparams.n_layer()) {
        case 48: type = LLM_TYPE_310B_A15B; break;
        default: type = LLM_TYPE_UNKNOWN;
    }
@@ -35,16 +34,14 @@ void llama_model_mimo2::load_arch_tensors(llama_model_loader &) {
    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);

-    const uint32_t n_nextn = hparams.nextn_predict_layers;
-
-    for (int i = 0; i < n_layer; ++i) {
+    for (int i = 0; i < n_layer_all; ++i) {
        auto & layer = layers[i];
        uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i);
        uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i);
        uint32_t n_head = hparams.n_head(i);

        // NextN/MTP layers (the last n_nextn blocks) are preserved but disabled pending support
-        const bool is_nextn = (n_nextn > 0) && (static_cast<uint32_t>(i) >= n_layer - n_nextn);
+        const bool is_nextn = i >= n_layer;
        const int  skip     = is_nextn ? TENSOR_SKIP : 0;

        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_k_gqa, n_embd_v_gqa, skip);
@@ -93,10 +90,7 @@ llama_model_mimo2::graph::graph(const llama_model & model, const llm_graph_param

    const float v_scale = hparams.f_attn_value_scale;

-    // The last hparams.nextn_predict_layers blocks are MTP heads, currently inactive
-    const int n_transformer_layers = n_layer - hparams.nextn_predict_layers;
-
-    for (int il = 0; il < n_transformer_layers; ++il) {
+    for (int il = 0; il < n_layer; ++il) {
        ggml_tensor * inpSA = inpL;

        uint32_t n_head_l    = hparams.n_head(il);
@@ -174,7 +168,7 @@ llama_model_mimo2::graph::graph(const llama_model & model, const llm_graph_param
            }
        }

-        if (il == n_transformer_layers - 1 && inp_out_ids) {
+        if (il == n_layer - 1 && inp_out_ids) {
            cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
        }
@@ -3,7 +3,7 @@
 void llama_model_minicpm::load_arch_hparams(llama_model_loader & ml) {
    // Backward-compatible defaults for older MiniCPM GGUFs
    hparams.f_embedding_scale = 12.0f;
-    hparams.f_residual_scale  = 1.4f / sqrtf(float(hparams.n_layer));
+    hparams.f_residual_scale  = 1.4f / sqrtf(float(hparams.n_layer()));
    hparams.f_logit_scale     = hparams.n_embd ? (256.0f / float(hparams.n_embd)) : 1.0f;

    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -16,7 +16,7 @@ void llama_model_minicpm::load_arch_hparams(llama_model_loader & ml) {
    // MiniCPM uses rope by default, unlike Granite which uses it as a switch
    hparams.rope_finetuned = true;

-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
        case 52: type = LLM_TYPE_1B; break;
        case 40: type = LLM_TYPE_2B; break;
        default: type = LLM_TYPE_UNKNOWN;
@@ -5,7 +5,7 @@ void llama_model_minicpm3::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK,       hparams.n_lora_q);
    ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK,      hparams.n_lora_kv);

-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
        case 62: type = LLM_TYPE_4B; break;
        default: type = LLM_TYPE_UNKNOWN;
    }
@@ -5,7 +5,7 @@ void llama_model_minimax_m2::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,   hparams.n_ff_exp);
    ml.get_key(LLM_KV_EXPERT_GATING_FUNC,           hparams.expert_gating_func, false);

-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
        case 62: type = LLM_TYPE_230B_A10B; break;
        default: type = LLM_TYPE_UNKNOWN;
    }
@@ -18,7 +18,7 @@ void llama_model_mistral3::load_arch_hparams(llama_model_loader & ml) {
        }
    }

-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
        case 26: type = LLM_TYPE_3B; break;
        case 34: type = LLM_TYPE_8B; break;
        case 40: type = LLM_TYPE_14B; break;
@@ -22,7 +22,7 @@ void llama_model_modern_bert::load_arch_hparams(llama_model_loader & ml) {
        hparams.llm_ffn_op = llm_ffn_op_type_from_string(hidden_act, LLM_FFN_GEGLU);
    }

-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
        case 12:
            type = LLM_TYPE_47M; break; // granite-embedding-small
        case 22:
@@ -5,7 +5,7 @@ void llama_model_mpt::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV,      hparams.f_clamp_kqv, false);
    ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias, false);

-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
        case 32: type = LLM_TYPE_7B; break;
        case 48: type = LLM_TYPE_30B; break;
        default: type = LLM_TYPE_UNKNOWN;
@@ -9,7 +9,7 @@ void llama_model_nemotron_h::load_arch_hparams(llama_model_loader & ml) {

    // A layer is recurrent IFF the n_head_kv value is set to 0 and
    // the n_ff value is set to 0
-    for (uint32_t i = 0; i < hparams.n_layer; ++i) {
+    for (uint32_t i = 0; i < hparams.n_layer(); ++i) {
        hparams.is_recr_impl[i] = (hparams.n_head_kv(i) == 0 && hparams.n_ff(i) == 0);
    }

@@ -22,7 +22,7 @@ void llama_model_nemotron_h::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,              hparams.expert_weights_scale, false);
    ml.get_key(LLM_KV_MOE_LATENT_SIZE,                   hparams.moe_latent_size, false);

-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
        case 52: type = LLM_TYPE_31B_A3_5B; break; // Nemotron-H_MOE 31B
        case 56: type = LLM_TYPE_9B; break;
        case 88: type = LLM_TYPE_120B_A12B; break;
@@ -2,7 +2,8 @@

 void llama_model_nemotron::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
-    switch (hparams.n_layer) {
+
+    switch (hparams.n_layer()) {
        case 32: type = LLM_TYPE_4B; break;
        default: type = LLM_TYPE_UNKNOWN;
    }
@@ -3,7 +3,7 @@
 void llama_model_neo_bert::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

-    if (hparams.n_layer == 28) {
+    if (hparams.n_layer() == 28) {
        type = LLM_TYPE_250M;
    }
 }
@@ -4,7 +4,7 @@ void llama_model_nomic_bert_moe::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,    hparams.f_norm_eps);
    ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS,         hparams.moe_every_n_layers, 0);

-    if (hparams.n_layer == 12 && hparams.n_embd == 768) {
+    if (hparams.n_layer() == 12 && hparams.n_embd == 768) {
        if (arch == LLM_ARCH_NOMIC_BERT) {
            type = LLM_TYPE_137M;
        } else if (arch == LLM_ARCH_NOMIC_BERT_MOE && hparams.moe_every_n_layers == 2) {
@@ -4,7 +4,7 @@ void llama_model_nomic_bert::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,    hparams.f_norm_eps);
    ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS,         hparams.moe_every_n_layers, 0);

-    if (hparams.n_layer == 12 && hparams.n_embd == 768) {
+    if (hparams.n_layer() == 12 && hparams.n_embd == 768) {
        if (arch == LLM_ARCH_NOMIC_BERT) {
            type = LLM_TYPE_137M;
        } else if (arch == LLM_ARCH_NOMIC_BERT_MOE && hparams.moe_every_n_layers == 2) {
@@ -4,7 +4,7 @@ void llama_model_olmo::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
    ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV,     hparams.f_clamp_kqv, false);

-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
        case 22: type = LLM_TYPE_1B; break;
        case 32: type = LLM_TYPE_7B; break;
        case 80: type = LLM_TYPE_70B; break;
@@ -17,7 +17,7 @@ void llama_model_olmo2::load_arch_hparams(llama_model_loader & ml) {
        hparams.swa_type = LLAMA_SWA_TYPE_NONE;
    }

-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
        case 16: type = LLM_TYPE_1B; break;
        case 32: type = LLM_TYPE_7B; break;
        case 40: type = LLM_TYPE_13B; break;
@@ -2,7 +2,8 @@

 void llama_model_olmoe::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-    switch (hparams.n_layer) {
+
+    switch (hparams.n_layer()) {
        case 16: type = LLM_TYPE_A1_7B; break;
        default: type = LLM_TYPE_UNKNOWN;
    }
@@ -14,7 +14,7 @@ void llama_model_openai_moe::load_arch_hparams(llama_model_loader & ml) {
    hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);

-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
        case 24: type = LLM_TYPE_20B; break;
        case 36: type = LLM_TYPE_120B; break;
        default: type = LLM_TYPE_UNKNOWN;
@@ -3,12 +3,12 @@
 void llama_model_openelm::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

-    switch (hparams.n_layer) {
-    case 16: type = LLM_TYPE_270M; break;
-    case 20: type = LLM_TYPE_450M; break;
-    case 28: type = LLM_TYPE_1B; break;
-    case 36: type = LLM_TYPE_3B; break;
-    default: type = LLM_TYPE_UNKNOWN;
+    switch (hparams.n_layer()) {
+        case 16: type = LLM_TYPE_270M; break;
+        case 20: type = LLM_TYPE_450M; break;
+        case 28: type = LLM_TYPE_1B; break;
+        case 36: type = LLM_TYPE_3B; break;
+        default: type = LLM_TYPE_UNKNOWN;
    }
 }

@@ -3,7 +3,7 @@
 void llama_model_orion::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);

-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
        case 40: type = LLM_TYPE_14B; break;
        default: type = LLM_TYPE_UNKNOWN;
    }
@@ -2,7 +2,8 @@

 void llama_model_pangu_embed::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-    switch (hparams.n_layer) {
+
+    switch (hparams.n_layer()) {
        case 26: type = LLM_TYPE_1B; break; // openPangu-Embedded-1B-V1.1
        case 34: type = LLM_TYPE_7B; break; // openPangu-Embedded-7B-V1.1
        default: type = LLM_TYPE_UNKNOWN;
@@ -3,7 +3,7 @@
 void llama_model_phi2::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);

-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
        case 24: type = LLM_TYPE_1B; break;
        case 32: type = LLM_TYPE_3B; break;
        default: type = LLM_TYPE_UNKNOWN;
@@ -3,7 +3,7 @@
 void llama_model_phi3::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
        case 24: type = LLM_TYPE_1B; break;
        case 32: type = LLM_TYPE_3B; break;
        case 40: type = LLM_TYPE_14B; break;
@@ -3,7 +3,7 @@
 void llama_model_phimoe::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

-    switch (hparams.n_layer) {
+    switch (hparams.n_layer()) {
        case 32: type = LLM_TYPE_16x3_8B; break;
        default: type = LLM_TYPE_UNKNOWN;
    }
--- a/Show More
+++ b/Show More