mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-09 07:16:44 +02:00
hparams : refactor hparams.n_layer (#24060)
* hparams : refactor hparams.n_layer
* cont : remove `n_layer_kv()`, use n_layer_all instead
* cont : type consistency
* pi : update SYSTEM.md
* models : fix Step3.5 MTP
* cont : remove duplicate switch cases
* cont : explicitly set `false` to extra layers for `is_swa` and `is_recr`
* cont : fix nextn layer count handling

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
This commit is contained in:
+2
-2
@@ -16,12 +16,12 @@ Pull requests (PRs):
|
||||
- New branch names are prefixed with "gg/"
|
||||
- Before opening a pull request, ask the user to confirm the description
|
||||
- When creating a pull request, look for the repository's PR template and follow it
|
||||
- For the AI usage disclosure section, write "YES. llama.cpp + pi + [MODEL]"
|
||||
- For the AI usage disclosure section, write "YES. pi:llama.cpp/[MODEL]"
|
||||
- Ask the user to tell you what model was used and write it in place of [MODEL]
|
||||
- Always create the pull requests in draft mode
|
||||
|
||||
Commits:
|
||||
- On every commit that you make, include a "Assisted-by: llama.cpp:local pi" tag
|
||||
- On every commit that you make, include a "Assisted-by: pi:llama.cpp/[MODEL]" tag
|
||||
- Do not explicitly set the git author in commits - rely on the default git config
|
||||
- Always use `--no-gpg-sign` when committing
|
||||
- Never `git push` without explicit confirmation from the user
|
||||
|
||||
@@ -41,7 +41,7 @@ bool llama_adapter_cvec::init(const llama_model & model) {
|
||||
auto it = ctx_map.find(buft);
|
||||
if (it == ctx_map.end()) {
|
||||
ggml_init_params params = {
|
||||
/*.mem_size =*/ hparams.n_layer*ggml_tensor_overhead(),
|
||||
/*.mem_size =*/ hparams.n_layer()*ggml_tensor_overhead(),
|
||||
/*.mem_buffer =*/ NULL,
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
@@ -61,9 +61,9 @@ bool llama_adapter_cvec::init(const llama_model & model) {
|
||||
};
|
||||
|
||||
// make tensors
|
||||
tensors.reserve(hparams.n_layer);
|
||||
tensors.reserve(hparams.n_layer());
|
||||
tensors.push_back(nullptr); // there's never a tensor for layer 0
|
||||
for (size_t il = 1; il < hparams.n_layer; il++) {
|
||||
for (size_t il = 1; il < hparams.n_layer(); il++) {
|
||||
ggml_backend_buffer_type_t buft = model.select_buft(il);
|
||||
ggml_context * ctx = ctx_for_buft(buft);
|
||||
if (!ctx) {
|
||||
@@ -121,7 +121,7 @@ bool llama_adapter_cvec::apply(
|
||||
layer_start = il_start;
|
||||
layer_end = il_end;
|
||||
|
||||
for (size_t il = 1; il < hparams.n_layer; il++) {
|
||||
for (size_t il = 1; il < hparams.n_layer(); il++) {
|
||||
assert(tensors[il] != nullptr);
|
||||
|
||||
const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present
|
||||
|
||||
@@ -341,7 +341,7 @@ llama_context::llama_context(
|
||||
// enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
|
||||
bool pipeline_parallel =
|
||||
model.n_devices() > 1 &&
|
||||
model.n_gpu_layers() > model.hparams.n_layer &&
|
||||
model.n_gpu_layers() > model.hparams.n_layer() &&
|
||||
model.split_mode() == LLAMA_SPLIT_MODE_LAYER &&
|
||||
cparams.offload_kqv &&
|
||||
!model.has_tensor_overrides();
|
||||
@@ -2351,7 +2351,7 @@ llm_graph_cb llama_context::graph_get_cb() const {
|
||||
|
||||
// norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
|
||||
// FIXME: fix in ggml_backend_sched
|
||||
const bool full_offload = model.n_gpu_layers() > model.hparams.n_layer;
|
||||
const bool full_offload = model.n_gpu_layers() > model.hparams.n_layer();
|
||||
if (ubatch.n_tokens < 32 || full_offload) {
|
||||
if (il != -1 && strcmp(name, "norm") == 0) {
|
||||
const auto & dev_layer = model.dev_layer(il);
|
||||
@@ -3416,7 +3416,7 @@ llama_context * llama_init_from_model(
|
||||
|
||||
if (params.flash_attn_type != LLAMA_FLASH_ATTN_TYPE_DISABLED && ggml_is_quantized(params.type_k)) {
|
||||
const uint32_t blck_size = ggml_blck_size(params.type_k);
|
||||
for (uint32_t il = 0; il < model->hparams.n_layer; ++il) {
|
||||
for (uint32_t il = 0; il < model->hparams.n_layer(); ++il) {
|
||||
if (model->hparams.n_embd_head_k(il) % blck_size != 0) {
|
||||
LLAMA_LOG_ERROR("%s: K cache type %s with block size %u does not divide n_embd_head_k=%u\n",
|
||||
__func__, ggml_type_name(params.type_k), blck_size, model->hparams.n_embd_head_k(il));
|
||||
@@ -3427,7 +3427,7 @@ llama_context * llama_init_from_model(
|
||||
|
||||
if (params.flash_attn_type != LLAMA_FLASH_ATTN_TYPE_DISABLED && ggml_is_quantized(params.type_v)) {
|
||||
const uint32_t blck_size = ggml_blck_size(params.type_v);
|
||||
for (uint32_t il = 0; il < model->hparams.n_layer; ++il) {
|
||||
for (uint32_t il = 0; il < model->hparams.n_layer(); ++il) {
|
||||
if (model->hparams.n_embd_head_v(il) % blck_size != 0) {
|
||||
LLAMA_LOG_ERROR("%s: V cache type %s with block size %u does not divide n_embd_head_v=%u\n",
|
||||
__func__, ggml_type_name(params.type_v), blck_size, model->hparams.n_embd_head_v(il));
|
||||
@@ -3449,7 +3449,7 @@ llama_context * llama_init_from_model(
|
||||
}
|
||||
|
||||
if (params.ctx_type == LLAMA_CONTEXT_TYPE_MTP &&
|
||||
model->hparams.nextn_predict_layers == 0) {
|
||||
model->hparams.n_layer_nextn == 0) {
|
||||
LLAMA_LOG_WARN("%s: context type MTP requested but model doesn't contain MTP layers\n", __func__);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
+1
-1
@@ -1005,7 +1005,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
|
||||
cparams (params.cparams),
|
||||
ubatch (params.ubatch),
|
||||
n_embd (hparams.n_embd),
|
||||
n_layer (hparams.n_layer),
|
||||
n_layer (hparams.n_layer()),
|
||||
n_rot (hparams.n_rot()),
|
||||
n_ctx (cparams.n_ctx),
|
||||
n_head (hparams.n_head()),
|
||||
|
||||
+38
-45
@@ -7,31 +7,38 @@
|
||||
|
||||
void llama_hparams::set_swa_pattern(uint32_t n_pattern, bool dense_first) {
|
||||
if (dense_first) {
|
||||
for (uint32_t il = 0; il < n_layer; ++il) {
|
||||
for (uint32_t il = 0; il < n_layer(); ++il) {
|
||||
is_swa_impl[il] = n_pattern == 0 || (il % n_pattern != 0);
|
||||
}
|
||||
} else {
|
||||
for (uint32_t il = 0; il < n_layer; ++il) {
|
||||
for (uint32_t il = 0; il < n_layer(); ++il) {
|
||||
is_swa_impl[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1));
|
||||
}
|
||||
}
|
||||
|
||||
for (uint32_t il = n_layer(); il < n_layer_all; ++il) {
|
||||
is_swa_impl[il] = false;
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: implement
|
||||
//void llama_hparams::set_recr_pattern(uint32_t n_pattern, bool dense_first) {
|
||||
// if (dense_first) {
|
||||
// for (uint32_t il = 0; il < n_layer; ++il) {
|
||||
// is_recr_impl[il] = n_pattern == 0 || (il % n_pattern != 0);
|
||||
// }
|
||||
// } else {
|
||||
// for (uint32_t il = 0; il < n_layer; ++il) {
|
||||
// is_recr_impl[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1));
|
||||
// }
|
||||
// }
|
||||
//}
|
||||
void llama_hparams::set_recr_pattern(uint32_t n_pattern, bool dense_first) {
|
||||
if (dense_first) {
|
||||
for (uint32_t il = 0; il < n_layer(); ++il) {
|
||||
is_recr_impl[il] = n_pattern == 0 || (il % n_pattern != 0);
|
||||
}
|
||||
} else {
|
||||
for (uint32_t il = 0; il < n_layer(); ++il) {
|
||||
is_recr_impl[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1));
|
||||
}
|
||||
}
|
||||
|
||||
for (uint32_t il = n_layer(); il < n_layer_all; ++il) {
|
||||
is_recr_impl[il] = false;
|
||||
}
|
||||
}
|
||||
|
||||
bool llama_hparams::is_swa_any() const {
|
||||
for (uint32_t il = 0; il < n_layer; ++il) {
|
||||
for (uint32_t il = 0; il < n_layer_all; ++il) {
|
||||
if (is_swa_impl[il]) {
|
||||
return true;
|
||||
}
|
||||
@@ -41,7 +48,7 @@ bool llama_hparams::is_swa_any() const {
|
||||
}
|
||||
|
||||
uint32_t llama_hparams::n_head(uint32_t il) const {
|
||||
if (il < n_layer) {
|
||||
if (il < n_layer_all) {
|
||||
return n_head_arr[il];
|
||||
}
|
||||
|
||||
@@ -49,7 +56,7 @@ uint32_t llama_hparams::n_head(uint32_t il) const {
|
||||
}
|
||||
|
||||
uint32_t llama_hparams::n_head_kv(uint32_t il) const {
|
||||
if (il < n_layer) {
|
||||
if (il < n_layer_all) {
|
||||
return n_head_kv_arr[il];
|
||||
}
|
||||
|
||||
@@ -57,7 +64,7 @@ uint32_t llama_hparams::n_head_kv(uint32_t il) const {
|
||||
}
|
||||
|
||||
uint32_t llama_hparams::n_ff(uint32_t il) const {
|
||||
if (il < n_layer) {
|
||||
if (il < n_layer_all) {
|
||||
return n_ff_arr[il];
|
||||
}
|
||||
|
||||
@@ -76,7 +83,7 @@ uint32_t llama_hparams::n_gqa(uint32_t il) const {
|
||||
}
|
||||
|
||||
uint32_t llama_hparams::n_rot(uint32_t il) const {
|
||||
if (il < n_layer) {
|
||||
if (il < n_layer_all) {
|
||||
return is_swa(il) ? n_rot_swa : n_rot_full;
|
||||
}
|
||||
|
||||
@@ -98,7 +105,7 @@ uint32_t llama_hparams::n_embd_out() const {
|
||||
}
|
||||
|
||||
uint32_t llama_hparams::n_embd_head_k(uint32_t il) const {
|
||||
if (il < n_layer) {
|
||||
if (il < n_layer_all) {
|
||||
return is_swa(il) ? n_embd_head_k_swa : n_embd_head_k_full;
|
||||
}
|
||||
|
||||
@@ -106,7 +113,7 @@ uint32_t llama_hparams::n_embd_head_k(uint32_t il) const {
|
||||
}
|
||||
|
||||
uint32_t llama_hparams::n_embd_head_v(uint32_t il) const {
|
||||
if (il < n_layer) {
|
||||
if (il < n_layer_all) {
|
||||
return is_swa(il) ? n_embd_head_v_swa : n_embd_head_v_full;
|
||||
}
|
||||
|
||||
@@ -127,7 +134,7 @@ uint32_t llama_hparams::n_embd_v_gqa(uint32_t il) const {
|
||||
|
||||
bool llama_hparams::is_n_embd_k_gqa_variable() const {
|
||||
const uint32_t val = n_embd_k_gqa();
|
||||
for (uint32_t il = 0; il < n_layer; ++il) {
|
||||
for (uint32_t il = 0; il < n_layer_all; ++il) {
|
||||
if (val != n_embd_k_gqa(il)) {
|
||||
return true;
|
||||
}
|
||||
@@ -138,7 +145,7 @@ bool llama_hparams::is_n_embd_k_gqa_variable() const {
|
||||
|
||||
bool llama_hparams::is_n_embd_v_gqa_variable() const {
|
||||
const uint32_t val = n_embd_v_gqa();
|
||||
for (uint32_t il = 0; il < n_layer; ++il) {
|
||||
for (uint32_t il = 0; il < n_layer_all; ++il) {
|
||||
if (val != n_embd_v_gqa(il)) {
|
||||
return true;
|
||||
}
|
||||
@@ -149,7 +156,7 @@ bool llama_hparams::is_n_embd_v_gqa_variable() const {
|
||||
|
||||
uint32_t llama_hparams::n_embd_k_gqa_max() const {
|
||||
uint32_t val = n_embd_k_gqa();
|
||||
for (uint32_t il = 0; il < n_layer; ++il) {
|
||||
for (uint32_t il = 0; il < n_layer_all; ++il) {
|
||||
val = std::max(val, n_embd_k_gqa(il));
|
||||
}
|
||||
|
||||
@@ -158,7 +165,7 @@ uint32_t llama_hparams::n_embd_k_gqa_max() const {
|
||||
|
||||
uint32_t llama_hparams::n_embd_v_gqa_max() const {
|
||||
uint32_t val = n_embd_v_gqa();
|
||||
for (uint32_t il = 0; il < n_layer; ++il) {
|
||||
for (uint32_t il = 0; il < n_layer_all; ++il) {
|
||||
val = std::max(val, n_embd_v_gqa(il));
|
||||
}
|
||||
|
||||
@@ -207,11 +214,11 @@ uint32_t llama_hparams::n_embd_s() const {
|
||||
}
|
||||
|
||||
bool llama_hparams::is_recr(uint32_t il) const {
|
||||
if (il < n_layer) {
|
||||
if (il < n_layer_all) {
|
||||
return is_recr_impl[il];
|
||||
}
|
||||
|
||||
GGML_ABORT("%s: il (%u) out of bounds (n_layer: %u)\n", __func__, il, n_layer);
|
||||
GGML_ABORT("%s: il (%u) out of bounds (n_layer_all: %u)\n", __func__, il, n_layer_all);
|
||||
}
|
||||
|
||||
uint32_t llama_hparams::n_pos_per_embd() const {
|
||||
@@ -219,11 +226,11 @@ uint32_t llama_hparams::n_pos_per_embd() const {
|
||||
}
|
||||
|
||||
bool llama_hparams::is_swa(uint32_t il) const {
|
||||
if (il < n_layer) {
|
||||
if (il < n_layer_all) {
|
||||
return is_swa_impl[il];
|
||||
}
|
||||
|
||||
GGML_ABORT("fatal error");
|
||||
GGML_ABORT("%s: il (%u) out of bounds (n_layer_all: %u)\n", __func__, il, n_layer_all);
|
||||
}
|
||||
|
||||
bool llama_hparams::is_mla() const {
|
||||
@@ -242,12 +249,6 @@ uint32_t llama_hparams::n_embd_head_v_mla() const {
|
||||
}
|
||||
|
||||
bool llama_hparams::has_kv(uint32_t il) const {
|
||||
if (kv_only_nextn) {
|
||||
// MTP head: only the trailing nextn_predict_layers blocks own a KV cache;
|
||||
// the leading trunk blocks are not executed in this graph.
|
||||
return nextn_predict_layers > 0 && il >= (n_layer - nextn_predict_layers);
|
||||
}
|
||||
|
||||
if (n_layer_kv_from_start >= 0) {
|
||||
if (il < (uint32_t) n_layer_kv_from_start) {
|
||||
return true;
|
||||
@@ -260,16 +261,8 @@ bool llama_hparams::has_kv(uint32_t il) const {
|
||||
return true;
|
||||
}
|
||||
|
||||
uint32_t llama_hparams::n_layer_kv() const {
|
||||
uint32_t res = 0;
|
||||
|
||||
for (uint32_t il = 0; il < n_layer; ++il) {
|
||||
if (has_kv(il)) {
|
||||
res++;
|
||||
}
|
||||
}
|
||||
|
||||
return res;
|
||||
uint32_t llama_hparams::n_layer() const {
|
||||
return n_layer_all - n_layer_nextn;
|
||||
}
|
||||
|
||||
bool llama_hparams::use_mrope() const {
|
||||
|
||||
+8
-9
@@ -48,12 +48,15 @@ struct llama_hparams {
|
||||
|
||||
uint32_t n_ctx_train; // context size the model was trained on
|
||||
uint32_t n_embd;
|
||||
uint32_t n_layer;
|
||||
int32_t n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache
|
||||
uint32_t n_layer_all;
|
||||
uint32_t n_layer_nextn = 0;
|
||||
uint32_t n_expert = 0;
|
||||
uint32_t n_expert_used = 0;
|
||||
uint32_t n_rel_attn_bkts = 0;
|
||||
|
||||
// TODO: this needs to be reworked
|
||||
int32_t n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache
|
||||
|
||||
// different head size for full_attention and SWA layers
|
||||
uint32_t n_embd_head_k_full; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
|
||||
uint32_t n_embd_head_v_full; // dimension of values (d_v) aka n_embd_head
|
||||
@@ -96,9 +99,6 @@ struct llama_hparams {
|
||||
uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
|
||||
uint32_t moe_every_n_layers = 0;
|
||||
uint32_t moe_latent_size = 0;
|
||||
uint32_t nextn_predict_layers = 0;
|
||||
|
||||
bool kv_only_nextn = false; // if true, only the last nextn_predict_layers blocks have a KV cache (MTP head arches)
|
||||
|
||||
float f_norm_eps;
|
||||
float f_norm_rms_eps;
|
||||
@@ -272,8 +272,7 @@ struct llama_hparams {
|
||||
|
||||
bool is_swa(uint32_t il) const;
|
||||
|
||||
// TODO: implement
|
||||
//void set_recr_pattern(uint32_t n_pattern, bool dense_first = false);
|
||||
void set_recr_pattern(uint32_t n_pattern, bool dense_first = false);
|
||||
|
||||
// whether or not the given layer is recurrent (for hybrid models)
|
||||
bool is_recr(uint32_t il) const;
|
||||
@@ -329,8 +328,8 @@ struct llama_hparams {
|
||||
|
||||
bool has_kv(uint32_t il) const;
|
||||
|
||||
// number of layers for which has_kv() returns true
|
||||
uint32_t n_layer_kv() const;
|
||||
// number of effective layers (excludes nextn layers)
|
||||
uint32_t n_layer() const;
|
||||
|
||||
// note that this function uses different SWA parameters from those in the hparams
|
||||
// note: inlined on purpose for performance reasons
|
||||
|
||||
@@ -97,7 +97,7 @@ llama_kv_cache::llama_kv_cache(
|
||||
|
||||
GGML_ASSERT(kv_size % n_pad == 0);
|
||||
|
||||
const uint32_t n_layer_kv = hparams.n_layer_kv();
|
||||
const uint32_t n_layer = hparams.n_layer_all;
|
||||
|
||||
// define a comparator for the buft -> ctx map to ensure that the order is well-defined:
|
||||
struct ggml_backend_buft_comparator {
|
||||
@@ -112,7 +112,7 @@ llama_kv_cache::llama_kv_cache(
|
||||
auto it = ctx_map.find(buft);
|
||||
if (it == ctx_map.end()) {
|
||||
ggml_init_params params = {
|
||||
/*.mem_size =*/ size_t(2u*(1 + n_stream)*n_layer_kv*ggml_tensor_overhead()),
|
||||
/*.mem_size =*/ size_t(2u*(1 + n_stream)*n_layer*ggml_tensor_overhead()),
|
||||
/*.mem_buffer =*/ NULL,
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
@@ -160,7 +160,7 @@ llama_kv_cache::llama_kv_cache(
|
||||
|
||||
const bool is_mla = hparams.is_mla();
|
||||
|
||||
for (uint32_t il = 0; il < hparams.n_layer; il++) {
|
||||
for (uint32_t il = 0; il < n_layer; il++) {
|
||||
if (!hparams.has_kv(il)) {
|
||||
LLAMA_LOG_DEBUG("%s: layer %3d: does not have KV cache\n", __func__, il);
|
||||
continue;
|
||||
@@ -230,7 +230,7 @@ llama_kv_cache::llama_kv_cache(
|
||||
if (reuse) {
|
||||
LLAMA_LOG_DEBUG("%s: reusing layers:\n", __func__);
|
||||
|
||||
for (uint32_t il = 0; il < hparams.n_layer; il++) {
|
||||
for (uint32_t il = 0; il < n_layer; il++) {
|
||||
const int32_t il_reuse = reuse(il);
|
||||
|
||||
if (il_reuse < 0) {
|
||||
|
||||
@@ -26,7 +26,7 @@ llama_memory_recurrent::llama_memory_recurrent(
|
||||
uint32_t n_seq_max,
|
||||
uint32_t n_rs_seq,
|
||||
const layer_filter_cb & filter) : hparams(model.hparams), n_seq_max(n_seq_max) {
|
||||
const int32_t n_layer = hparams.n_layer;
|
||||
const int32_t n_layer = hparams.n_layer();
|
||||
|
||||
head = 0;
|
||||
size = mem_size;
|
||||
@@ -863,7 +863,7 @@ void llama_memory_recurrent::state_write_meta(llama_io_write_i & io, const std::
|
||||
|
||||
void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const {
|
||||
const uint32_t s_trans = 0;
|
||||
const uint32_t n_layer = hparams.n_layer;
|
||||
const uint32_t n_layer = hparams.n_layer();
|
||||
|
||||
io.write(&s_trans, sizeof(s_trans));
|
||||
io.write(&n_layer, sizeof(n_layer));
|
||||
@@ -1047,8 +1047,8 @@ bool llama_memory_recurrent::state_read_data(llama_io_read_i & io, uint32_t cell
|
||||
io.read(&s_trans, sizeof(s_trans));
|
||||
io.read(&n_layer, sizeof(n_layer));
|
||||
|
||||
if (n_layer != hparams.n_layer) {
|
||||
LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, hparams.n_layer);
|
||||
if (n_layer != hparams.n_layer()) {
|
||||
LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, hparams.n_layer());
|
||||
return false;
|
||||
}
|
||||
if (cell_count > size) {
|
||||
|
||||
@@ -1050,10 +1050,10 @@ struct ggml_tensor * llama_model_loader::create_tensor(
|
||||
if (it == ctx_map.end()) {
|
||||
// one ggml context per buffer type
|
||||
int max_n_tensors = n_tensors;
|
||||
max_n_tensors += 1; // duplicated output tensor
|
||||
max_n_tensors += hparams.n_layer*2; // duplicated rope freq tensors
|
||||
max_n_tensors += 1; // duplicated output tensor
|
||||
max_n_tensors += hparams.n_layer()*2; // duplicated rope freq tensors
|
||||
if (files.empty()) {
|
||||
max_n_tensors += hparams.n_layer*256; // this should be well above what any model actually uses
|
||||
max_n_tensors += hparams.n_layer()*256; // this should be well above what any model actually uses
|
||||
}
|
||||
const size_t ctx_size = ggml_tensor_overhead()*max_n_tensors;
|
||||
|
||||
|
||||
@@ -77,7 +77,7 @@ void llama_model_saver::add_kv(const enum llm_kv key, const char value) {
|
||||
template <typename Container>
|
||||
void llama_model_saver::add_kv(const enum llm_kv key, const Container & value, const bool per_layer) {
|
||||
GGML_ASSERT(model != nullptr || !per_layer);
|
||||
const size_t n_values = per_layer ? size_t(model->hparams.n_layer) : value.size();
|
||||
const size_t n_values = per_layer ? size_t(model->hparams.n_layer()) : value.size();
|
||||
GGML_ASSERT(n_values <= value.size());
|
||||
|
||||
if (n_values == 0) {
|
||||
@@ -206,7 +206,7 @@ void llama_model_saver::add_kv_from_model() {
|
||||
if (hparams.n_embd_out_impl > 0) {
|
||||
add_kv(LLM_KV_EMBEDDING_LENGTH_OUT, hparams.n_embd_out_impl);
|
||||
}
|
||||
add_kv(LLM_KV_BLOCK_COUNT, hparams.n_layer);
|
||||
add_kv(LLM_KV_BLOCK_COUNT, hparams.n_layer_all);
|
||||
add_kv(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
|
||||
add_kv(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, true);
|
||||
add_kv(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
||||
@@ -227,7 +227,7 @@ void llama_model_saver::add_kv_from_model() {
|
||||
add_kv(LLM_KV_EXPERT_GROUP_SCALE, hparams.expert_group_scale);
|
||||
add_kv(LLM_KV_EXPERTS_PER_GROUP, hparams.n_group_experts);
|
||||
add_kv(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers);
|
||||
add_kv(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers);
|
||||
add_kv(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn);
|
||||
add_kv(LLM_KV_NUM_DEEPSTACK_LAYERS, hparams.n_deepstack_layers);
|
||||
add_kv(LLM_KV_POOLING_TYPE, uint32_t(hparams.pooling_type));
|
||||
add_kv(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
|
||||
|
||||
+39
-40
@@ -398,7 +398,7 @@ struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const str
|
||||
rotation = get_il_eff(il) % ud->n_devices;
|
||||
} else {
|
||||
il = 0;
|
||||
rotation = hparams.n_layer % ud->n_devices;
|
||||
rotation = hparams.n_layer() % ud->n_devices;
|
||||
}
|
||||
const ggml_tensor * tensor_axis_0 = suffix.empty() ? tensor : ud->model->get_tensor((prefix + suffix).c_str());
|
||||
if (tensor_axis_0 == nullptr) {
|
||||
@@ -1034,7 +1034,7 @@ void llama_model_base::load_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_EMBEDDING_LENGTH_OUT, hparams.n_embd_out_impl, false);
|
||||
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
|
||||
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
|
||||
ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
|
||||
ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer_all);
|
||||
ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
|
||||
ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
|
||||
ml.get_key(LLM_KV_EXPERT_GROUP_COUNT, hparams.n_expert_groups, false);
|
||||
@@ -1089,13 +1089,13 @@ void llama_model_base::load_hparams(llama_model_loader & ml) {
|
||||
std::fill(hparams.swiglu_clamp_exp.begin(), hparams.swiglu_clamp_exp.end(), 0.0f);
|
||||
std::fill(hparams.swiglu_clamp_shexp.begin(), hparams.swiglu_clamp_shexp.end(), 0.0f);
|
||||
|
||||
ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false);
|
||||
ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
|
||||
ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer(), false);
|
||||
ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer(), false);
|
||||
|
||||
// n_head_kv is optional, default to n_head
|
||||
hparams.n_head_kv_arr = hparams.n_head_arr;
|
||||
|
||||
ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, hparams.n_layer, false);
|
||||
ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, hparams.n_layer(), false);
|
||||
|
||||
bool rope_finetuned = false;
|
||||
ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
|
||||
@@ -1194,7 +1194,7 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) {
|
||||
const auto & use_mlock = params.use_mlock;
|
||||
const auto & tensor_split = params.tensor_split;
|
||||
|
||||
const int n_layer = hparams.n_layer;
|
||||
const int n_layer = hparams.n_layer_all;
|
||||
const int n_gpu_layers = this->n_gpu_layers();
|
||||
|
||||
const bool use_mmap_buffer = true;
|
||||
@@ -1251,10 +1251,10 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) {
|
||||
splits[i] /= split_sum;
|
||||
}
|
||||
|
||||
const int i_gpu_start = std::max(int(hparams.n_layer) + 1 - n_gpu_layers, 0);
|
||||
const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, int(n_layer) + 1);
|
||||
const int i_gpu_start = std::max(n_layer + 1 - n_gpu_layers, 0);
|
||||
const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, n_layer + 1);
|
||||
auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
|
||||
const bool is_swa = il < int(hparams.n_layer) && hparams.is_swa(il);
|
||||
const bool is_swa = il < n_layer && hparams.is_swa(il);
|
||||
if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
|
||||
LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(cpu_dev), is_swa);
|
||||
return {cpu_dev, &pimpl->cpu_buft_list};
|
||||
@@ -1557,7 +1557,7 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) {
|
||||
}
|
||||
|
||||
if (llama_supports_gpu_offload()) {
|
||||
const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
|
||||
const int n_gpu = std::min(n_gpu_layers, n_layer);
|
||||
|
||||
int n_repeating = n_gpu;
|
||||
if (n_repeating > 0) {
|
||||
@@ -1566,8 +1566,8 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) {
|
||||
}
|
||||
LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_repeating);
|
||||
|
||||
const int max_backend_supported_layers = hparams.n_layer + 1;
|
||||
const int max_offloadable_layers = hparams.n_layer + 1;
|
||||
const int max_backend_supported_layers = n_layer + 1;
|
||||
const int max_offloadable_layers = n_layer + 1;
|
||||
|
||||
LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
|
||||
}
|
||||
@@ -1636,7 +1636,7 @@ const float * llama_model::tensor_split() const {
|
||||
}
|
||||
|
||||
uint32_t llama_model::n_gpu_layers() const {
|
||||
return params.n_gpu_layers >= 0 ? params.n_gpu_layers : hparams.n_layer + 1;
|
||||
return params.n_gpu_layers >= 0 ? params.n_gpu_layers : hparams.n_layer() + 1;
|
||||
}
|
||||
|
||||
llama_split_mode llama_model::split_mode() const {
|
||||
@@ -1707,17 +1707,17 @@ void llama_model::print_info() const {
|
||||
LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
|
||||
LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd);
|
||||
LLAMA_LOG_INFO("%s: n_embd_inp = %u\n", __func__, hparams.n_embd_inp());
|
||||
LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
|
||||
LLAMA_LOG_INFO("%s: n_head = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head(il); }, hparams.n_layer).c_str());
|
||||
LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
|
||||
LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer());
|
||||
LLAMA_LOG_INFO("%s: n_head = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head(il); }, hparams.n_layer()).c_str());
|
||||
LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer()).c_str());
|
||||
LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot_full);
|
||||
LLAMA_LOG_INFO("%s: n_swa = %u\n", __func__, hparams.n_swa);
|
||||
LLAMA_LOG_INFO("%s: is_swa_any = %u\n", __func__, hparams.is_swa_any());
|
||||
LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k_full);
|
||||
LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v_full);
|
||||
LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer).c_str());
|
||||
LLAMA_LOG_INFO("%s: n_embd_k_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer).c_str());
|
||||
LLAMA_LOG_INFO("%s: n_embd_v_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer).c_str());
|
||||
LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer()).c_str());
|
||||
LLAMA_LOG_INFO("%s: n_embd_k_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer()).c_str());
|
||||
LLAMA_LOG_INFO("%s: n_embd_v_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer()).c_str());
|
||||
LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
|
||||
LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
|
||||
LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
|
||||
@@ -1725,7 +1725,7 @@ void llama_model::print_info() const {
|
||||
LLAMA_LOG_INFO("%s: f_logit_scale = %.1e\n", __func__, hparams.f_logit_scale);
|
||||
LLAMA_LOG_INFO("%s: f_attn_scale = %.1e\n", __func__, hparams.f_attention_scale);
|
||||
LLAMA_LOG_INFO("%s: f_attn_value_scale = %.4f\n", __func__, hparams.f_attn_value_scale);
|
||||
LLAMA_LOG_INFO("%s: n_ff = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer).c_str());
|
||||
LLAMA_LOG_INFO("%s: n_ff = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer()).c_str());
|
||||
LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
|
||||
LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
|
||||
LLAMA_LOG_INFO("%s: n_expert_groups = %d\n", __func__, hparams.n_expert_groups);
|
||||
@@ -1852,7 +1852,7 @@ void llama_model::print_info() const {
|
||||
LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
|
||||
LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
|
||||
LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
|
||||
LLAMA_LOG_INFO("%s: nextn_predict_layers = %d\n", __func__, hparams.nextn_predict_layers);
|
||||
LLAMA_LOG_INFO("%s: n_layer_nextn = %d\n", __func__, hparams.n_layer_nextn);
|
||||
}
|
||||
|
||||
if (arch == LLM_ARCH_SMALLTHINKER || arch == LLM_ARCH_LFM2MOE) {
|
||||
@@ -2034,22 +2034,21 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
|
||||
llama_memory_hybrid::layer_filter_cb filter_attn = nullptr;
|
||||
llama_memory_hybrid::layer_filter_cb filter_recr = nullptr;
|
||||
if (arch == LLM_ARCH_FALCON_H1) {
|
||||
filter_attn = [&](int32_t) { return true; };
|
||||
filter_recr = [&](int32_t) { return true; };
|
||||
filter_attn = [&](uint32_t) { return true; };
|
||||
filter_recr = [&](uint32_t) { return true; };
|
||||
} else if (arch == LLM_ARCH_NEMOTRON_H || arch == LLM_ARCH_NEMOTRON_H_MOE) {
|
||||
filter_attn = [&](int32_t il) {
|
||||
filter_attn = [&](uint32_t il) {
|
||||
return !hparams.is_recr(il) && hparams.n_ff(il) == 0;
|
||||
};
|
||||
filter_recr = [&](int32_t il) {
|
||||
filter_recr = [&](uint32_t il) {
|
||||
return hparams.is_recr(il) && hparams.n_ff(il) == 0;
|
||||
};
|
||||
} else if (arch == LLM_ARCH_QWEN35 || arch == LLM_ARCH_QWEN35MOE) {
|
||||
const uint32_t n_main = hparams.n_layer - hparams.nextn_predict_layers;
|
||||
filter_attn = [&, n_main](int32_t il) {
|
||||
return (uint32_t)il < n_main && !hparams.is_recr(il);
|
||||
filter_attn = [&](uint32_t il) {
|
||||
return il < hparams.n_layer() && !hparams.is_recr(il);
|
||||
};
|
||||
filter_recr = [&, n_main](int32_t il) {
|
||||
return (uint32_t)il < n_main && hparams.is_recr(il);
|
||||
filter_recr = [&](uint32_t il) {
|
||||
return il < hparams.n_layer() && hparams.is_recr(il);
|
||||
};
|
||||
}
|
||||
|
||||
@@ -2098,9 +2097,11 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
|
||||
llama_kv_cache::layer_filter_cb filter = nullptr;
|
||||
|
||||
if (arch == LLM_ARCH_GEMMA3N || arch == LLM_ARCH_GEMMA4) {
|
||||
reuse = [&](int32_t il) {
|
||||
if (il >= (int32_t) hparams.n_layer_kv_from_start) {
|
||||
return (int32_t) hparams.n_layer_kv_from_start - (hparams.is_swa(il) ? 2 : 1);
|
||||
reuse = [&](uint32_t il) {
|
||||
GGML_ASSERT(hparams.n_layer_kv_from_start >= 2);
|
||||
|
||||
if (il >= (uint32_t)hparams.n_layer_kv_from_start) {
|
||||
return hparams.n_layer_kv_from_start - (hparams.is_swa(il) ? 2 : 1);
|
||||
}
|
||||
|
||||
return -1;
|
||||
@@ -2108,16 +2109,14 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
|
||||
}
|
||||
|
||||
if (mtp_on_hybrid_qwen35) {
|
||||
const uint32_t n_main = hparams.n_layer - hparams.nextn_predict_layers;
|
||||
filter = [n_main](int32_t il) { return (uint32_t)il >= n_main; };
|
||||
filter = [&](uint32_t il) { return il >= hparams.n_layer(); };
|
||||
}
|
||||
|
||||
if (arch == LLM_ARCH_STEP35 && hparams.nextn_predict_layers > 0) {
|
||||
const uint32_t n_main = hparams.n_layer - hparams.nextn_predict_layers;
|
||||
if (arch == LLM_ARCH_STEP35 && hparams.n_layer_nextn > 0) {
|
||||
if (params.ctx_type == LLAMA_CONTEXT_TYPE_MTP) {
|
||||
filter = [n_main](int32_t il) { return (uint32_t)il >= n_main; };
|
||||
filter = [&](uint32_t il) { return il >= hparams.n_layer(); };
|
||||
} else {
|
||||
filter = [n_main](int32_t il) { return (uint32_t)il < n_main; };
|
||||
filter = [&](uint32_t il) { return il < hparams.n_layer(); };
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2242,7 +2241,7 @@ int32_t llama_model_n_embd_out(const llama_model * model) {
|
||||
}
|
||||
|
||||
int32_t llama_model_n_layer(const llama_model * model) {
|
||||
return model->hparams.n_layer;
|
||||
return model->hparams.n_layer();
|
||||
}
|
||||
|
||||
int32_t llama_model_n_head(const llama_model * model) {
|
||||
|
||||
+2
-1
@@ -700,7 +700,8 @@ const char * llm_type_name(llm_type type);
|
||||
// convenience macro for loading local variables for load_tensors() in llama_model_base
|
||||
// note: cast to int64_t since we will use these for the tensor dimensions
|
||||
#define LLAMA_LOAD_LOCALS \
|
||||
const int n_layer = hparams.n_layer; GGML_UNUSED(n_layer); \
|
||||
const int n_layer = hparams.n_layer(); GGML_UNUSED(n_layer); \
|
||||
const int n_layer_all = hparams.n_layer_all; GGML_UNUSED(n_layer_all); \
|
||||
const int64_t n_head = hparams.n_head(); GGML_UNUSED(n_head); \
|
||||
const int64_t n_head_kv = hparams.n_head_kv(); GGML_UNUSED(n_head_kv); \
|
||||
const int64_t n_embd = hparams.n_embd; GGML_UNUSED(n_embd); \
|
||||
|
||||
+2
-2
@@ -847,7 +847,7 @@ static void init_quantize_state_counters(quantize_state_impl & qs, std::vector<t
|
||||
qs.has_tied_embeddings = false;
|
||||
}
|
||||
}
|
||||
qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)qs.model.hparams.n_layer;
|
||||
qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)qs.model.hparams.n_layer();
|
||||
}
|
||||
|
||||
//
|
||||
@@ -1348,7 +1348,7 @@ llama_model * llama_quant_model_from_metadata(const llama_quant_model_desc * des
|
||||
model->hparams.n_embd = desc->n_embd;
|
||||
model->hparams.n_embd_head_k_full = desc->n_embd_head_k;
|
||||
model->hparams.n_embd_head_v_full = desc->n_embd_head_v;
|
||||
model->hparams.n_layer = desc->n_layer;
|
||||
model->hparams.n_layer_all = desc->n_layer;
|
||||
model->hparams.n_expert = desc->n_expert;
|
||||
|
||||
for (uint32_t i = 0; i < desc->n_layer; i++) {
|
||||
|
||||
@@ -30,7 +30,7 @@ void llama_model_afmoe::load_arch_hparams(llama_model_loader & ml) {
|
||||
hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
|
||||
}
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
switch (hparams.n_layer()) {
|
||||
case 56: type = LLM_TYPE_6B; break;
|
||||
case 32: type = LLM_TYPE_26B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
|
||||
@@ -2,12 +2,13 @@
|
||||
|
||||
void llama_model_apertus::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_N, hparams.xielu_alpha_n, hparams.n_layer);
|
||||
ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_P, hparams.xielu_alpha_p, hparams.n_layer);
|
||||
ml.get_key_or_arr(LLM_KV_XIELU_BETA, hparams.xielu_beta, hparams.n_layer);
|
||||
ml.get_key_or_arr(LLM_KV_XIELU_EPS, hparams.xielu_eps, hparams.n_layer);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_N, hparams.xielu_alpha_n, hparams.n_layer());
|
||||
ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_P, hparams.xielu_alpha_p, hparams.n_layer());
|
||||
ml.get_key_or_arr(LLM_KV_XIELU_BETA, hparams.xielu_beta, hparams.n_layer());
|
||||
ml.get_key_or_arr(LLM_KV_XIELU_EPS, hparams.xielu_eps, hparams.n_layer());
|
||||
|
||||
switch (hparams.n_layer()) {
|
||||
case 32: type = LLM_TYPE_8B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
|
||||
@@ -4,7 +4,7 @@ void llama_model_arcee::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
|
||||
// Arcee uses the same structure as Llama
|
||||
switch (hparams.n_layer) {
|
||||
switch (hparams.n_layer()) {
|
||||
case 36: type = LLM_TYPE_4B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
|
||||
@@ -4,7 +4,7 @@ void llama_model_arctic::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
|
||||
if (hparams.n_expert == 128) {
|
||||
switch (hparams.n_layer) {
|
||||
switch (hparams.n_layer()) {
|
||||
case 35: type = LLM_TYPE_10B_128x3_66B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
|
||||
@@ -10,7 +10,7 @@ void llama_model_arwkv7::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_ATTENTION_GATE_LORA_RANK, hparams.n_lora_gate, false);
|
||||
ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT, hparams.token_shift_count, false);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
switch (hparams.n_layer()) {
|
||||
case 12:
|
||||
switch (hparams.n_embd) {
|
||||
case 768: type = LLM_TYPE_190M; break;
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
void llama_model_baichuan::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
switch (hparams.n_layer) {
|
||||
switch (hparams.n_layer()) {
|
||||
case 32: type = LLM_TYPE_7B; break;
|
||||
case 40: type = LLM_TYPE_13B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
|
||||
@@ -8,7 +8,7 @@ void llama_model_bailingmoe::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
|
||||
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
switch (hparams.n_layer()) {
|
||||
case 28: type = LLM_TYPE_16B; break;
|
||||
case 88: type = LLM_TYPE_290B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
|
||||
@@ -9,17 +9,13 @@ void llama_model_bailingmoe2::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
|
||||
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
|
||||
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);
|
||||
ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
|
||||
GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer");
|
||||
ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false);
|
||||
|
||||
// TODO: when MTP is implemented, this should probably be updated if needed
|
||||
hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
|
||||
GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer_impl");
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
switch (hparams.n_layer()) {
|
||||
case 20: type = LLM_TYPE_16B_A1B; break;
|
||||
case 21: type = LLM_TYPE_16B_A1B; break;
|
||||
case 32: type = LLM_TYPE_100B_A6B; break;
|
||||
case 33: type = LLM_TYPE_100B_A6B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
}
|
||||
@@ -39,9 +35,9 @@ void llama_model_bailingmoe2::load_arch_tensors(llama_model_loader &) {
|
||||
GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for bailingmoe2");
|
||||
GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for bailingmoe2");
|
||||
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
for (int i = 0; i < n_layer_all; ++i) {
|
||||
int flags = 0;
|
||||
if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
|
||||
if (i >= n_layer) {
|
||||
// skip all tensors in the NextN layers
|
||||
flags |= TENSOR_SKIP;
|
||||
}
|
||||
@@ -78,7 +74,7 @@ void llama_model_bailingmoe2::load_arch_tensors(llama_model_loader &) {
|
||||
}
|
||||
|
||||
// NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
|
||||
if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
|
||||
if (i >= n_layer) {
|
||||
layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
|
||||
layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED | flags);
|
||||
layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
|
||||
@@ -112,8 +108,7 @@ llama_model_bailingmoe2::graph::graph(const llama_model & model, const llm_graph
|
||||
|
||||
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||
|
||||
const int n_transformer_layers = n_layer - hparams.nextn_predict_layers;
|
||||
for (int il = 0; il < n_transformer_layers; ++il) {
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
ggml_tensor * inpSA = inpL;
|
||||
|
||||
// norm
|
||||
@@ -146,7 +141,7 @@ llama_model_bailingmoe2::graph::graph(const llama_model & model, const llm_graph
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
|
||||
}
|
||||
|
||||
if (il == n_transformer_layers - 1 && inp_out_ids) {
|
||||
if (il == n_layer - 1 && inp_out_ids) {
|
||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
||||
}
|
||||
|
||||
+2
-2
@@ -1,9 +1,9 @@
|
||||
#include "models.h"
|
||||
|
||||
void llama_model_bert::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
switch (hparams.n_layer()) {
|
||||
case 3:
|
||||
type = LLM_TYPE_17M; break; // bge-micro
|
||||
case 6:
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
void llama_model_bitnet::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
switch (hparams.n_layer()) {
|
||||
case 26: type = LLM_TYPE_3B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
void llama_model_bloom::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
switch (hparams.n_layer()) {
|
||||
case 24: type = LLM_TYPE_1B; break;
|
||||
case 30:
|
||||
switch (hparams.n_embd) {
|
||||
|
||||
@@ -6,7 +6,7 @@ void llama_model_chameleon::load_arch_hparams(llama_model_loader & ml) {
|
||||
hparams.f_norm_eps = 1e-5; // eps for qk-norm, torch default
|
||||
ml.get_key(LLM_KV_SWIN_NORM, hparams.swin_norm, false);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
switch (hparams.n_layer()) {
|
||||
case 32: type = LLM_TYPE_7B; break;
|
||||
case 48: type = LLM_TYPE_34B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
|
||||
@@ -2,7 +2,8 @@
|
||||
|
||||
void llama_model_chatglm::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
switch (hparams.n_layer) {
|
||||
|
||||
switch (hparams.n_layer()) {
|
||||
case 28: {
|
||||
if (hparams.n_head(0) == 16) {
|
||||
type = LLM_TYPE_1_5B;
|
||||
|
||||
@@ -2,7 +2,8 @@
|
||||
|
||||
void llama_model_codeshell::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
switch (hparams.n_layer) {
|
||||
|
||||
switch (hparams.n_layer()) {
|
||||
case 42: type = LLM_TYPE_7B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
|
||||
@@ -2,7 +2,8 @@
|
||||
|
||||
void llama_model_cogvlm::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
switch (hparams.n_layer) {
|
||||
|
||||
switch (hparams.n_layer()) {
|
||||
case 32: type = LLM_TYPE_13B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
|
||||
@@ -5,6 +5,7 @@ void llama_model_cohere2::load_arch_hparams(llama_model_loader & ml) {
|
||||
uint32_t swa_period = 4;
|
||||
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
|
||||
hparams.set_swa_pattern(swa_period);
|
||||
|
||||
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
|
||||
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
|
||||
|
||||
@@ -12,7 +13,8 @@ void llama_model_cohere2::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
|
||||
ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
switch (hparams.n_layer) {
|
||||
|
||||
switch (hparams.n_layer()) {
|
||||
case 32: type = LLM_TYPE_8B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
|
||||
@@ -3,7 +3,8 @@
|
||||
void llama_model_command_r::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale, false);
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
switch (hparams.n_layer) {
|
||||
|
||||
switch (hparams.n_layer()) {
|
||||
case 40: type = LLM_TYPE_35B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
|
||||
+6
-6
@@ -1,14 +1,14 @@
|
||||
#include "models.h"
|
||||
|
||||
void llama_model_dbrx::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv);
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 40: type = LLM_TYPE_16x12B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
switch (hparams.n_layer()) {
|
||||
case 40: type = LLM_TYPE_16x12B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void llama_model_dbrx::load_arch_tensors(llama_model_loader &) {
|
||||
LLAMA_LOAD_LOCALS;
|
||||
|
||||
+2
-1
@@ -2,7 +2,8 @@
|
||||
|
||||
void llama_model_deci::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
switch (hparams.n_layer) {
|
||||
|
||||
switch (hparams.n_layer()) {
|
||||
case 32: type = LLM_TYPE_7B; break;
|
||||
case 80: type = LLM_TYPE_70B; break;
|
||||
case 162: type = LLM_TYPE_405B; break;
|
||||
|
||||
@@ -5,7 +5,7 @@ void llama_model_deepseek2::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false);
|
||||
|
||||
// lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B, Kanana-2-30B-A3B
|
||||
const bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26 || (hparams.n_layer == 48 && n_vocab == 128256));
|
||||
const bool is_lite = (hparams.n_layer() == 27 || hparams.n_layer() == 26 || (hparams.n_layer() == 48 && n_vocab == 128256));
|
||||
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
|
||||
@@ -23,7 +23,7 @@ void llama_model_deepseek2::load_arch_hparams(llama_model_loader & ml) {
|
||||
if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
|
||||
// for compatibility with existing DeepSeek V2 and V2.5 GGUFs
|
||||
// that have no expert_gating_func model parameter set
|
||||
if ((hparams.n_layer == 47 || hparams.n_layer == 48) && n_vocab == 154880) {
|
||||
if ((hparams.n_layer() == 47 || hparams.n_layer() == 48) && n_vocab == 154880) {
|
||||
// GLM 4.7 Lite
|
||||
hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
|
||||
} else {
|
||||
@@ -43,7 +43,7 @@ void llama_model_deepseek2::load_arch_hparams(llama_model_loader & ml) {
|
||||
|
||||
hparams.f_attn_temp_offset = 0.0f;
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
switch (hparams.n_layer()) {
|
||||
case 27: type = LLM_TYPE_16B; break;
|
||||
case 47: type = LLM_TYPE_30B_A3B; break;
|
||||
case 60: type = LLM_TYPE_236B; break;
|
||||
@@ -191,8 +191,7 @@ llama_model_deepseek2::graph::graph(const llama_model & model, const llm_graph_p
|
||||
|
||||
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||
|
||||
int effective_n_layers = hparams.n_layer - hparams.nextn_predict_layers;
|
||||
for (int il = 0; il < effective_n_layers; ++il) {
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
ggml_tensor * inpSA = inpL;
|
||||
|
||||
// norm
|
||||
@@ -366,7 +365,7 @@ llama_model_deepseek2::graph::graph(const llama_model & model, const llm_graph_p
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
||||
}
|
||||
}
|
||||
if (il == effective_n_layers - 1 && inp_out_ids) {
|
||||
if (il == n_layer - 1 && inp_out_ids) {
|
||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
||||
}
|
||||
|
||||
@@ -14,7 +14,7 @@ void llama_model_deepseek2ocr::load_arch_hparams(llama_model_loader & ml) {
|
||||
hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
|
||||
}
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
switch (hparams.n_layer()) {
|
||||
case 12: type = LLM_TYPE_3B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
|
||||
@@ -31,7 +31,7 @@ void llama_model_deepseek32::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_ATTENTION_INDEXER_TOP_K, hparams.indexer_top_k);
|
||||
|
||||
// Expert gating function
|
||||
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);
|
||||
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);
|
||||
|
||||
if (ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f)) {
|
||||
// [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
|
||||
@@ -40,13 +40,10 @@ void llama_model_deepseek32::load_arch_hparams(llama_model_loader & ml) {
|
||||
}
|
||||
|
||||
// NextN/MTP parameters
|
||||
ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
|
||||
GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer");
|
||||
ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false);
|
||||
GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer");
|
||||
|
||||
// TODO: when MTP is implemented, this should probably be updated if needed
|
||||
hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
switch (hparams.n_layer()) {
|
||||
case 62: type = LLM_TYPE_685B_A37B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
@@ -82,9 +79,9 @@ void llama_model_deepseek32::load_arch_tensors(llama_model_loader &) {
|
||||
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
||||
}
|
||||
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
for (int i = 0; i < n_layer_all; ++i) {
|
||||
int flags = 0;
|
||||
if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
|
||||
if (i >= n_layer) {
|
||||
// skip all tensors in the NextN layers
|
||||
// TODO @ngxson : TENSOR_NOT_REQUIRED was a hack, need to remove it later
|
||||
flags |= TENSOR_SKIP | TENSOR_NOT_REQUIRED;
|
||||
@@ -142,7 +139,7 @@ void llama_model_deepseek32::load_arch_tensors(llama_model_loader &) {
|
||||
}
|
||||
|
||||
// NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
|
||||
if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
|
||||
if (i >= n_layer) {
|
||||
layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
|
||||
layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
|
||||
layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
|
||||
@@ -205,8 +202,7 @@ llama_model_deepseek32::graph::graph(const llama_model & model, const llm_graph_
|
||||
|
||||
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||
|
||||
int effective_n_layers = hparams.n_layer - hparams.nextn_predict_layers;
|
||||
for (int il = 0; il < effective_n_layers; ++il) {
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
ggml_tensor * inpSA = inpL;
|
||||
|
||||
// norm
|
||||
@@ -427,7 +423,7 @@ llama_model_deepseek32::graph::graph(const llama_model & model, const llm_graph_
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].wv_b, top_k, kq_scale, il);
|
||||
}
|
||||
}
|
||||
if (il == effective_n_layers - 1 && inp_out_ids) {
|
||||
if (il == n_layer - 1 && inp_out_ids) {
|
||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
||||
}
|
||||
|
||||
@@ -8,7 +8,8 @@ void llama_model_dots1::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
|
||||
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
|
||||
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
|
||||
switch (hparams.n_layer) {
|
||||
|
||||
switch (hparams.n_layer()) {
|
||||
case 62: type = LLM_TYPE_142B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
|
||||
@@ -2,8 +2,9 @@
|
||||
|
||||
void llama_model_dream::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
|
||||
// Dream models are primarily 7B with 28 layers
|
||||
switch (hparams.n_layer) {
|
||||
switch (hparams.n_layer()) {
|
||||
case 28:
|
||||
type = LLM_TYPE_7B;
|
||||
break;
|
||||
|
||||
@@ -12,7 +12,7 @@ void llama_model_ernie4_5::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
|
||||
}
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
switch (hparams.n_layer()) {
|
||||
case 18: type = LLM_TYPE_0_3B; break;
|
||||
case 28: type = LLM_TYPE_21B_A3B; break;
|
||||
case 54: type = LLM_TYPE_300B_A47B; break;
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
void llama_model_eurobert::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
|
||||
if (hparams.n_layer == 12) {
|
||||
if (hparams.n_layer() == 12) {
|
||||
type = LLM_TYPE_SMALL; // 0.2B
|
||||
}
|
||||
}
|
||||
|
||||
+10
-12
@@ -20,13 +20,12 @@ void llama_model_exaone_moe::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
|
||||
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
|
||||
|
||||
ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
|
||||
GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer");
|
||||
ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false);
|
||||
GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer_impl");
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
switch (hparams.n_layer()) {
|
||||
case 32: type = LLM_TYPE_30B_A3B; break;
|
||||
case 48:
|
||||
case 49: type = LLM_TYPE_235B_A22B; break;
|
||||
case 48: type = LLM_TYPE_235B_A22B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
}
|
||||
@@ -50,9 +49,9 @@ void llama_model_exaone_moe::load_arch_tensors(llama_model_loader &) {
|
||||
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
||||
}
|
||||
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
for (int i = 0; i < n_layer_all; ++i) {
|
||||
int flags = 0;
|
||||
if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
|
||||
if (i >= n_layer) {
|
||||
// skip all tensors in the NextN layers
|
||||
flags |= TENSOR_SKIP;
|
||||
}
|
||||
@@ -70,7 +69,7 @@ void llama_model_exaone_moe::load_arch_tensors(llama_model_loader &) {
|
||||
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags);
|
||||
|
||||
// dense layers for first n_layer_dense_lead layers or nextn_predict_layers layers at the end
|
||||
if (i < (int) hparams.n_layer_dense_lead || (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers)) {
|
||||
if (i < (int) hparams.n_layer_dense_lead || (i >= n_layer)) {
|
||||
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, flags);
|
||||
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, flags);
|
||||
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, flags);
|
||||
@@ -95,7 +94,7 @@ void llama_model_exaone_moe::load_arch_tensors(llama_model_loader &) {
|
||||
}
|
||||
|
||||
// NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
|
||||
if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
|
||||
if (i >= n_layer) {
|
||||
layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), {2 * n_embd, n_embd}, flags);
|
||||
layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), {n_embd}, flags);
|
||||
layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), {n_embd}, flags);
|
||||
@@ -130,8 +129,7 @@ llama_model_exaone_moe::graph::graph(const llama_model & model, const llm_graph_
|
||||
|
||||
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||
|
||||
const int n_transformer_layers = n_layer - hparams.nextn_predict_layers;
|
||||
for (int il = 0; il < n_transformer_layers; ++il) {
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
ggml_tensor * inpSA = inpL;
|
||||
|
||||
// use RoPE for SWA layers
|
||||
@@ -170,7 +168,7 @@ llama_model_exaone_moe::graph::graph(const llama_model & model, const llm_graph_
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
|
||||
cb(cur, "attn_out", il);
|
||||
}
|
||||
if (il == n_transformer_layers - 1 && inp_out_ids) {
|
||||
if (il == n_layer - 1 && inp_out_ids) {
|
||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
||||
}
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
void llama_model_exaone::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
switch (hparams.n_layer()) {
|
||||
case 32: type = LLM_TYPE_8B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
|
||||
+9
-13
@@ -1,7 +1,7 @@
|
||||
#include "models.h"
|
||||
|
||||
void llama_model_exaone4::load_arch_hparams(llama_model_loader & ml) {
|
||||
if (hparams.n_layer == 64) { // 32B
|
||||
if (hparams.n_layer() == 64) { // 32B
|
||||
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
||||
hparams.n_swa = 4096;
|
||||
uint32_t swa_period = 4;
|
||||
@@ -15,11 +15,11 @@ void llama_model_exaone4::load_arch_hparams(llama_model_loader & ml) {
|
||||
|
||||
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
|
||||
GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer");
|
||||
hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
|
||||
ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer");
|
||||
|
||||
switch (hparams.n_layer()) {
|
||||
case 30: type = LLM_TYPE_1_2B; break;
|
||||
case 64: type = LLM_TYPE_32B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
@@ -40,8 +40,8 @@ void llama_model_exaone4::load_arch_tensors(llama_model_loader &) {
|
||||
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
||||
}
|
||||
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
const bool is_nextn = hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers;
|
||||
for (int i = 0; i < n_layer_all; ++i) {
|
||||
const bool is_nextn = i >= n_layer;
|
||||
int flags = 0;
|
||||
if (is_nextn) {
|
||||
// NextN/MTP layers are preserved in GGUF but are not executed yet.
|
||||
@@ -109,11 +109,7 @@ llama_model_exaone4::graph<iswa>::graph(const llama_model & model, const llm_gra
|
||||
}
|
||||
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||
|
||||
// MTP / NextN tail blocks are loaded for compatibility but not executed (same as exaone-moe).
|
||||
const int n_layer_main = int(n_layer) - int(hparams.nextn_predict_layers);
|
||||
GGML_ASSERT(n_layer_main > 0);
|
||||
|
||||
for (int il = 0; il < n_layer_main; ++il) {
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
ggml_tensor * inpSA = inpL;
|
||||
|
||||
// use RoPE for SWA layers or non-SWA models
|
||||
@@ -149,7 +145,7 @@ llama_model_exaone4::graph<iswa>::graph(const llama_model & model, const llm_gra
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
|
||||
cb(cur, "attn_out", il);
|
||||
}
|
||||
if (il == n_layer_main - 1 && inp_out_ids) {
|
||||
if (il == n_layer - 1 && inp_out_ids) {
|
||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
||||
}
|
||||
|
||||
@@ -13,7 +13,7 @@ void llama_model_falcon_h1::load_arch_hparams(llama_model_loader & ml) {
|
||||
|
||||
std::fill(hparams.is_recr_impl.begin(), hparams.is_recr_impl.end(), true);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
switch (hparams.n_layer()) {
|
||||
case 36:
|
||||
type = LLM_TYPE_0_5B; break;
|
||||
case 24:
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
void llama_model_falcon::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
switch (hparams.n_layer()) {
|
||||
case 32: type = LLM_TYPE_7B; break;
|
||||
case 60: type = LLM_TYPE_40B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
|
||||
@@ -21,7 +21,7 @@ void llama_model_gemma_embedding::load_arch_hparams(llama_model_loader & ml) {
|
||||
GGML_ASSERT((hparams.dense_2_feat_in == 0 || hparams.dense_2_feat_in == hparams.n_embd) && "dense_2_feat_in must be equal to n_embd");
|
||||
GGML_ASSERT((hparams.dense_3_feat_out == 0 || hparams.dense_3_feat_out == hparams.n_embd) && "dense_3_feat_out must be equal to n_embd");
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
switch (hparams.n_layer()) {
|
||||
case 24: type = LLM_TYPE_0_3B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
void llama_model_gemma::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
switch (hparams.n_layer()) {
|
||||
case 18: type = LLM_TYPE_2B; break;
|
||||
case 28: type = LLM_TYPE_7B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
|
||||
@@ -16,7 +16,7 @@ void llama_model_gemma2::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false);
|
||||
ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
switch (hparams.n_layer()) {
|
||||
case 26: type = LLM_TYPE_2B; break;
|
||||
case 42: type = LLM_TYPE_9B; break;
|
||||
case 46: type = LLM_TYPE_27B; break;
|
||||
|
||||
@@ -17,7 +17,7 @@ void llama_model_gemma3::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
switch (hparams.n_layer()) {
|
||||
case 18: type = LLM_TYPE_270M; break;
|
||||
case 26: type = LLM_TYPE_1B; break;
|
||||
case 32: type = LLM_TYPE_8B; break; // Rnj-1
|
||||
|
||||
@@ -6,14 +6,14 @@ void llama_model_gemma3n::load_arch_hparams(llama_model_loader & ml) {
|
||||
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
||||
hparams.set_swa_pattern(swa_period);
|
||||
|
||||
hparams.n_layer_kv_from_start = 20;
|
||||
hparams.f_attention_scale = 1.0f;
|
||||
hparams.n_layer_kv_from_start = 20;
|
||||
hparams.f_attention_scale = 1.0f;
|
||||
|
||||
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
|
||||
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
switch (hparams.n_layer()) {
|
||||
case 30: type = LLM_TYPE_E2B; break;
|
||||
case 35: type = LLM_TYPE_E4B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
|
||||
@@ -2,12 +2,12 @@
|
||||
|
||||
void llama_model_gemma4::load_arch_hparams(llama_model_loader & ml) {
|
||||
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
||||
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer);
|
||||
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer());
|
||||
|
||||
uint32_t n_kv_shared_layers = 0;
|
||||
ml.get_key(LLM_KV_ATTENTION_SHARED_KV_LAYERS, n_kv_shared_layers, false);
|
||||
|
||||
hparams.n_layer_kv_from_start = hparams.n_layer - (int32_t)n_kv_shared_layers;
|
||||
hparams.n_layer_kv_from_start = hparams.n_layer_all - (int32_t)n_kv_shared_layers;
|
||||
hparams.f_attention_scale = 1.0f; // Gemma4 uses self.scaling = 1.0 (no pre-attn scaling)
|
||||
|
||||
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
|
||||
@@ -19,7 +19,7 @@ void llama_model_gemma4::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_SWA, hparams.n_embd_head_v_swa);
|
||||
ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
switch (hparams.n_layer()) {
|
||||
case 30: type = LLM_TYPE_26B_A4B; break;
|
||||
case 35: type = LLM_TYPE_E2B; break;
|
||||
case 42: type = LLM_TYPE_E4B; break;
|
||||
|
||||
+7
-10
@@ -33,13 +33,10 @@ void llama_model_glm_dsa::load_arch_hparams(llama_model_loader & ml) {
|
||||
}
|
||||
|
||||
// NextN/MTP parameters
|
||||
ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
|
||||
GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer");
|
||||
ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false);
|
||||
GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer_impl");
|
||||
|
||||
// TODO: when MTP is implemented, this should probably be updated if needed
|
||||
hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
switch (hparams.n_layer()) {
|
||||
case 79: type = LLM_TYPE_744B_A40B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
@@ -76,9 +73,9 @@ void llama_model_glm_dsa::load_arch_tensors(llama_model_loader &) {
|
||||
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
||||
}
|
||||
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
for (int i = 0; i < n_layer_all; ++i) {
|
||||
int flags = 0;
|
||||
if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
|
||||
if (i >= n_layer) {
|
||||
// skip all tensors in the NextN layers
|
||||
// TODO @ngxson : TENSOR_NOT_REQUIRED was a hack, need to remove it later
|
||||
flags |= TENSOR_SKIP | TENSOR_NOT_REQUIRED;
|
||||
@@ -135,8 +132,8 @@ void llama_model_glm_dsa::load_arch_tensors(llama_model_loader &) {
|
||||
layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, flags);
|
||||
}
|
||||
|
||||
// NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
|
||||
if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
|
||||
// NextN/MTP tensors (preserved but unused) - conditionally load for last n_layer_nextn
|
||||
if (i >= n_layer) {
|
||||
layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
|
||||
layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
|
||||
layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
|
||||
|
||||
+11
-15
@@ -20,16 +20,13 @@ void llama_model_glm4_moe::load_arch_hparams(llama_model_loader & ml) {
|
||||
}
|
||||
|
||||
// NextN/MTP parameters
|
||||
ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
|
||||
GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer");
|
||||
ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false);
|
||||
GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer_impl");
|
||||
|
||||
// TODO: when MTP is implemented, this should probably be updated if needed
|
||||
hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 47: type = LLM_TYPE_106B_A12B; break; // GLM-4.5-Air (46 layers + 1 NextN layer)
|
||||
case 48: type = LLM_TYPE_102B_A12B; break; // Solar Open
|
||||
case 93: type = LLM_TYPE_355B_A32B; break; // GLM-4.5 (92 layers + 1 NextN layer)
|
||||
switch (hparams.n_layer()) {
|
||||
case 46: type = LLM_TYPE_106B_A12B; break; // GLM-4.5-Air
|
||||
case 48: type = LLM_TYPE_102B_A12B; break; // Solar Open
|
||||
case 92: type = LLM_TYPE_355B_A32B; break; // GLM-4.5
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
}
|
||||
@@ -54,9 +51,9 @@ void llama_model_glm4_moe::load_arch_tensors(llama_model_loader &) {
|
||||
|
||||
// Load ALL tensors including NextN layer to satisfy total tensor count
|
||||
// but only PROCESS up to last layer (skipping final NextN layer) in forward pass
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
for (int i = 0; i < n_layer_all; ++i) {
|
||||
int flags = 0;
|
||||
if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
|
||||
if (i >= n_layer) {
|
||||
// skip all tensors in the NextN layers
|
||||
flags |= TENSOR_SKIP;
|
||||
}
|
||||
@@ -116,7 +113,7 @@ void llama_model_glm4_moe::load_arch_tensors(llama_model_loader &) {
|
||||
}
|
||||
|
||||
// NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
|
||||
if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
|
||||
if (i >= n_layer) {
|
||||
layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
|
||||
layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
|
||||
layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
|
||||
@@ -161,8 +158,7 @@ llama_model_glm4_moe::graph::graph(const llama_model & model, const llm_graph_pa
|
||||
|
||||
// Only process up to last layer (skip final NextN layer)
|
||||
// Final layer tensors are loaded but not processed in forward pass
|
||||
const int n_transformer_layers = n_layer - hparams.nextn_predict_layers;
|
||||
for (int il = 0; il < n_transformer_layers; ++il) {
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
ggml_tensor * inpSA = inpL;
|
||||
|
||||
// Pre-attention norm
|
||||
@@ -211,7 +207,7 @@ llama_model_glm4_moe::graph::graph(const llama_model & model, const llm_graph_pa
|
||||
model.layers[il].wo, NULL, model.layers[il].wo_s,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||
}
|
||||
if (il == n_transformer_layers - 1 && inp_out_ids) {
|
||||
if (il == n_layer - 1 && inp_out_ids) {
|
||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
||||
}
|
||||
|
||||
+8
-12
@@ -5,13 +5,10 @@ void llama_model_glm4::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);
|
||||
|
||||
// NextN/MTP parameters (GLM-OCR)
|
||||
ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
|
||||
GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer");
|
||||
ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false);
|
||||
GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer_impl");
|
||||
|
||||
// TODO: when MTP is implemented, this should probably be updated if needed
|
||||
hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
switch (hparams.n_layer()) {
|
||||
case 17: type = LLM_TYPE_1B; break; // GLM-OCR
|
||||
case 40: type = LLM_TYPE_9B; break;
|
||||
case 61: type = LLM_TYPE_32B; break;
|
||||
@@ -32,9 +29,9 @@ void llama_model_glm4::load_arch_tensors(llama_model_loader &) {
|
||||
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
||||
}
|
||||
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
for (int i = 0; i < n_layer_all; ++i) {
|
||||
int flags = 0;
|
||||
if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
|
||||
if (i >= n_layer) {
|
||||
// skip all tensors in the NextN layers
|
||||
flags |= TENSOR_SKIP;
|
||||
}
|
||||
@@ -55,7 +52,7 @@ void llama_model_glm4::load_arch_tensors(llama_model_loader &) {
|
||||
layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, flags);
|
||||
|
||||
// NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
|
||||
if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
|
||||
if (i >= n_layer) {
|
||||
layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
|
||||
layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
|
||||
layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
|
||||
@@ -100,8 +97,7 @@ llama_model_glm4::graph::graph(const llama_model & model, const llm_graph_params
|
||||
|
||||
// Only process up to last layer (skip final NextN layer)
|
||||
// Final layer tensors are loaded but not processed in forward pass
|
||||
const int n_transformer_layers = n_layer - hparams.nextn_predict_layers;
|
||||
for (int il = 0; il < n_transformer_layers; ++il) {
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
ggml_tensor * inpSA = inpL;
|
||||
|
||||
// Pre-attention norm
|
||||
@@ -140,7 +136,7 @@ llama_model_glm4::graph::graph(const llama_model & model, const llm_graph_params
|
||||
model.layers[il].wo, NULL, model.layers[il].wo_s,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
|
||||
}
|
||||
if (il == n_transformer_layers - 1 && inp_out_ids) {
|
||||
if (il == n_layer - 1 && inp_out_ids) {
|
||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
||||
}
|
||||
|
||||
+2
-1
@@ -2,7 +2,8 @@
|
||||
|
||||
void llama_model_gpt2::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
switch (hparams.n_layer) {
|
||||
|
||||
switch (hparams.n_layer()) {
|
||||
case 12: type = LLM_TYPE_SMALL; break;
|
||||
case 24: type = LLM_TYPE_MEDIUM; break;
|
||||
case 36: type = LLM_TYPE_LARGE; break;
|
||||
|
||||
@@ -3,7 +3,8 @@
|
||||
void llama_model_gptneox::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
ml.get_key(LLM_KV_USE_PARALLEL_RESIDUAL, hparams.use_par_res);
|
||||
switch (hparams.n_layer) {
|
||||
|
||||
switch (hparams.n_layer()) {
|
||||
case 6:
|
||||
switch (hparams.n_ff()) {
|
||||
case 512: type = LLM_TYPE_14M; break;
|
||||
|
||||
@@ -19,7 +19,7 @@ void llama_model_granite_hybrid::load_arch_hparams(llama_model_loader & ml) {
|
||||
hparams.rope_finetuned = rope_finetuned;
|
||||
|
||||
// A layer is recurrent IFF the n_head_kv value is set to 0
|
||||
for (uint32_t i = 0; i < hparams.n_layer; ++i) {
|
||||
for (uint32_t i = 0; i < hparams.n_layer(); ++i) {
|
||||
hparams.is_recr_impl[i] = hparams.n_head_kv(i) == 0;
|
||||
}
|
||||
|
||||
|
||||
@@ -12,7 +12,7 @@ void llama_model_granite_moe::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
|
||||
hparams.rope_finetuned = rope_finetuned;
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
switch (hparams.n_layer()) {
|
||||
case 32: type = LLM_TYPE_3B; break;
|
||||
case 40: type = LLM_TYPE_3B; break;
|
||||
// Add additional layer/vocab/etc checks here for other model sizes
|
||||
|
||||
@@ -12,7 +12,7 @@ void llama_model_granite::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
|
||||
hparams.rope_finetuned = rope_finetuned;
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
switch (hparams.n_layer()) {
|
||||
case 32: type = LLM_TYPE_3B; break;
|
||||
case 40: type = LLM_TYPE_3B; break;
|
||||
// Add additional layer/vocab/etc checks here for other model sizes
|
||||
|
||||
+1
-1
@@ -26,7 +26,7 @@ void llama_model_grok::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast, false);
|
||||
ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow, false);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
switch (hparams.n_layer()) {
|
||||
case 64: type = LLM_TYPE_314B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
|
||||
@@ -7,7 +7,7 @@ void llama_model_grovemoe::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_EXPERTS_PER_GROUP, hparams.n_group_experts);
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
switch (hparams.n_layer()) {
|
||||
case 48: type = LLM_TYPE_30B_A3B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
|
||||
@@ -5,7 +5,7 @@ void llama_model_hunyuan_moe::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
||||
ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
switch (hparams.n_layer()) {
|
||||
case 32: type = LLM_TYPE_A13B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
|
||||
@@ -2,7 +2,8 @@
|
||||
|
||||
void llama_model_internlm2::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
switch (hparams.n_layer) {
|
||||
|
||||
switch (hparams.n_layer()) {
|
||||
case 32: type = LLM_TYPE_7B; break;
|
||||
case 48: type = LLM_TYPE_20B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
|
||||
+1
-1
@@ -4,7 +4,7 @@ void llama_model_jais::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias, false);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
switch (hparams.n_layer()) {
|
||||
case 24: type = LLM_TYPE_1_3B; break;
|
||||
case 40: type = LLM_TYPE_13B; break;
|
||||
/* TODO: add variants */
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
void llama_model_jais2::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
switch (hparams.n_layer()) {
|
||||
case 32: type = LLM_TYPE_8B; break;
|
||||
case 68: type = LLM_TYPE_70B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
|
||||
@@ -8,11 +8,11 @@ void llama_model_jamba::load_arch_hparams(llama_model_loader & ml) {
|
||||
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
|
||||
for (uint32_t i = 0; i < hparams.n_layer; ++i) {
|
||||
for (uint32_t i = 0; i < hparams.n_layer(); ++i) {
|
||||
hparams.is_recr_impl[i] = hparams.n_head_kv(i) == 0;
|
||||
}
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
switch (hparams.n_layer()) {
|
||||
// TODO: Jamba layers are a bit heterogeneous, so naming this is hard.
|
||||
case 12: // 900M 8x???M
|
||||
case 32: // 51B 16x?B
|
||||
|
||||
@@ -4,7 +4,7 @@ void llama_model_jina_bert_v2::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
hparams.f_max_alibi_bias = 8.0f;
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
switch (hparams.n_layer()) {
|
||||
case 4: type = LLM_TYPE_33M; break; // jina-embeddings-small
|
||||
case 12: type = LLM_TYPE_137M; break; // jina-embeddings-base
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
void llama_model_jina_bert_v3::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
switch (hparams.n_layer()) {
|
||||
case 24:
|
||||
type = LLM_TYPE_558M; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
|
||||
@@ -14,7 +14,7 @@ void llama_model_kimi_linear::load_arch_hparams(llama_model_loader & ml) {
|
||||
|
||||
// Mark KDA layers as recurrent using n_head_kv pattern (like Jamba)
|
||||
// Set n_head_kv = 0 for KDA layers (recurrent), n_head_kv = n_head for MLA layers (attention)
|
||||
for (uint32_t i = 0; i < hparams.n_layer; ++i) {
|
||||
for (uint32_t i = 0; i < hparams.n_layer(); ++i) {
|
||||
hparams.is_recr_impl[i] = hparams.n_head_kv(i) == 0; // KDA layers are recurrent
|
||||
}
|
||||
|
||||
@@ -25,7 +25,7 @@ void llama_model_kimi_linear::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
|
||||
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
switch (hparams.n_layer()) {
|
||||
case 27: type = LLM_TYPE_48B_A3B; break; // Kimi-Linear-48B-A3B
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
|
||||
+7
-3
@@ -5,10 +5,13 @@
|
||||
void llama_model_lfm2::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_SHORTCONV_L_CACHE, hparams.n_shortconv_l_cache);
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
for (uint32_t il = 0; il < hparams.n_layer; ++il) {
|
||||
|
||||
for (uint32_t il = 0; il < hparams.n_layer(); ++il) {
|
||||
hparams.is_recr_impl[il] = hparams.n_head_kv(il) == 0;
|
||||
}
|
||||
hparams.n_layer_dense_lead = hparams.n_layer;
|
||||
|
||||
hparams.n_layer_dense_lead = hparams.n_layer();
|
||||
|
||||
switch (hparams.n_ff()) {
|
||||
case 4608: type = LLM_TYPE_350M; break;
|
||||
case 6912: type = LLM_TYPE_700M; break;
|
||||
@@ -16,9 +19,10 @@ void llama_model_lfm2::load_arch_hparams(llama_model_loader & ml) {
|
||||
case 10752: type = LLM_TYPE_2_6B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
|
||||
if (const auto is_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false); is_swa && hparams.n_swa > 0) {
|
||||
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
||||
for (uint32_t il = 0; il < hparams.n_layer; ++il) {
|
||||
for (uint32_t il = 0; il < hparams.n_layer(); ++il) {
|
||||
hparams.is_swa_impl[il] = !hparams.is_recr_impl[il];
|
||||
}
|
||||
}
|
||||
|
||||
@@ -9,11 +9,11 @@ void llama_model_lfm2moe::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
||||
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);
|
||||
|
||||
for (uint32_t il = 0; il < hparams.n_layer; ++il) {
|
||||
for (uint32_t il = 0; il < hparams.n_layer(); ++il) {
|
||||
hparams.is_recr_impl[il] = hparams.n_head_kv(il) == 0;
|
||||
}
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
switch (hparams.n_layer()) {
|
||||
case 24: type = LLM_TYPE_8B_A1B; break;
|
||||
case 40: type = LLM_TYPE_24B_A2B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
|
||||
@@ -2,11 +2,12 @@
|
||||
|
||||
void llama_model_llada_moe::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
|
||||
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
|
||||
// diffusion language model uses non-causal attention
|
||||
hparams.causal_attn = false;
|
||||
switch (hparams.n_layer) {
|
||||
|
||||
switch (hparams.n_layer()) {
|
||||
case 16: type = LLM_TYPE_A1_7B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
|
||||
@@ -2,14 +2,16 @@
|
||||
|
||||
void llama_model_llada::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
|
||||
// LLaDA-8B has 32 layers, similar to LLaMA but for diffusion
|
||||
switch (hparams.n_layer) {
|
||||
switch (hparams.n_layer()) {
|
||||
case 32:
|
||||
type = LLM_TYPE_8B;
|
||||
break;
|
||||
default:
|
||||
type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
|
||||
// Set non-causal attention for diffusion models
|
||||
hparams.causal_attn = false;
|
||||
}
|
||||
|
||||
@@ -7,13 +7,13 @@ void llama_model_llama::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
|
||||
if (hparams.n_expert == 8) {
|
||||
switch (hparams.n_layer) {
|
||||
switch (hparams.n_layer()) {
|
||||
case 32: type = LLM_TYPE_8x7B; break;
|
||||
case 56: type = LLM_TYPE_8x22B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
} else {
|
||||
switch (hparams.n_layer) {
|
||||
switch (hparams.n_layer()) {
|
||||
case 16: type = LLM_TYPE_1B; break; // Llama 3.2 1B
|
||||
case 22: type = LLM_TYPE_1B; break;
|
||||
case 26: type = LLM_TYPE_3B; break;
|
||||
|
||||
@@ -8,7 +8,7 @@ void llama_model_llama4::load_arch_hparams(llama_model_loader & ml) {
|
||||
const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
|
||||
if (found_swa && hparams.n_swa == 0) {
|
||||
hparams.swa_type = LLAMA_SWA_TYPE_NONE;
|
||||
hparams.n_no_rope_layer_step = hparams.n_layer; // always use rope
|
||||
hparams.n_no_rope_layer_step = hparams.n_layer(); // always use rope
|
||||
} else {
|
||||
hparams.swa_type = LLAMA_SWA_TYPE_CHUNKED;
|
||||
hparams.n_swa = 8192;
|
||||
|
||||
@@ -2,7 +2,8 @@
|
||||
|
||||
void llama_model_maincoder::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
switch (hparams.n_layer) {
|
||||
|
||||
switch (hparams.n_layer()) {
|
||||
case 32: type = LLM_TYPE_1B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
|
||||
@@ -9,7 +9,7 @@ void llama_model_mamba::load_arch_hparams(llama_model_loader & ml) {
|
||||
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
switch (hparams.n_layer()) {
|
||||
case 24:
|
||||
switch (hparams.n_embd) {
|
||||
case 768: type = LLM_TYPE_SMALL; break;
|
||||
|
||||
@@ -9,7 +9,7 @@ void llama_model_mamba2::load_arch_hparams(llama_model_loader & ml) {
|
||||
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
switch (hparams.n_layer()) {
|
||||
case 24:
|
||||
switch (hparams.n_embd) {
|
||||
case 768: type = LLM_TYPE_SMALL; break;
|
||||
|
||||
@@ -13,7 +13,7 @@ void llama_model_mellum::load_arch_hparams(llama_model_loader & ml) {
|
||||
if (res) {
|
||||
hparams.set_swa_pattern(swa_period);
|
||||
} else {
|
||||
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer);
|
||||
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer());
|
||||
}
|
||||
|
||||
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
|
||||
@@ -24,7 +24,7 @@ void llama_model_mellum::load_arch_hparams(llama_model_loader & ml) {
|
||||
hparams.swa_type = LLAMA_SWA_TYPE_NONE;
|
||||
}
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
switch (hparams.n_layer()) {
|
||||
case 28: type = LLM_TYPE_12B_A2_5B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
|
||||
+8
-14
@@ -9,18 +9,17 @@ void llama_model_mimo2::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
|
||||
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
|
||||
|
||||
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer);
|
||||
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer());
|
||||
|
||||
float value_scale = 0.0f;
|
||||
if (ml.get_key(LLM_KV_ATTENTION_VALUE_SCALE, value_scale, false) && value_scale != 1.0f) {
|
||||
hparams.f_attn_value_scale = value_scale;
|
||||
}
|
||||
|
||||
ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
|
||||
GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer");
|
||||
hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
|
||||
ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false);
|
||||
GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer_impl");
|
||||
|
||||
switch (hparams.n_layer - hparams.nextn_predict_layers) {
|
||||
switch (hparams.n_layer()) {
|
||||
case 48: type = LLM_TYPE_310B_A15B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
@@ -35,16 +34,14 @@ void llama_model_mimo2::load_arch_tensors(llama_model_loader &) {
|
||||
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
||||
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
|
||||
|
||||
const uint32_t n_nextn = hparams.nextn_predict_layers;
|
||||
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
for (int i = 0; i < n_layer_all; ++i) {
|
||||
auto & layer = layers[i];
|
||||
uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i);
|
||||
uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i);
|
||||
uint32_t n_head = hparams.n_head(i);
|
||||
|
||||
// NextN/MTP layers (the last n_nextn blocks) are preserved but disabled pending support
|
||||
const bool is_nextn = (n_nextn > 0) && (static_cast<uint32_t>(i) >= n_layer - n_nextn);
|
||||
const bool is_nextn = i >= n_layer;
|
||||
const int skip = is_nextn ? TENSOR_SKIP : 0;
|
||||
|
||||
create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_k_gqa, n_embd_v_gqa, skip);
|
||||
@@ -93,10 +90,7 @@ llama_model_mimo2::graph::graph(const llama_model & model, const llm_graph_param
|
||||
|
||||
const float v_scale = hparams.f_attn_value_scale;
|
||||
|
||||
// The last hparams.nextn_predict_layers blocks are MTP heads, currently inactive
|
||||
const int n_transformer_layers = n_layer - hparams.nextn_predict_layers;
|
||||
|
||||
for (int il = 0; il < n_transformer_layers; ++il) {
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
ggml_tensor * inpSA = inpL;
|
||||
|
||||
uint32_t n_head_l = hparams.n_head(il);
|
||||
@@ -174,7 +168,7 @@ llama_model_mimo2::graph::graph(const llama_model & model, const llm_graph_param
|
||||
}
|
||||
}
|
||||
|
||||
if (il == n_transformer_layers - 1 && inp_out_ids) {
|
||||
if (il == n_layer - 1 && inp_out_ids) {
|
||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
||||
}
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
void llama_model_minicpm::load_arch_hparams(llama_model_loader & ml) {
|
||||
// Backward-compatible defaults for older MiniCPM GGUFs
|
||||
hparams.f_embedding_scale = 12.0f;
|
||||
hparams.f_residual_scale = 1.4f / sqrtf(float(hparams.n_layer));
|
||||
hparams.f_residual_scale = 1.4f / sqrtf(float(hparams.n_layer()));
|
||||
hparams.f_logit_scale = hparams.n_embd ? (256.0f / float(hparams.n_embd)) : 1.0f;
|
||||
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
@@ -16,7 +16,7 @@ void llama_model_minicpm::load_arch_hparams(llama_model_loader & ml) {
|
||||
// MiniCPM uses rope by default, unlike Granite which uses it as a switch
|
||||
hparams.rope_finetuned = true;
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
switch (hparams.n_layer()) {
|
||||
case 52: type = LLM_TYPE_1B; break;
|
||||
case 40: type = LLM_TYPE_2B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
|
||||
@@ -5,7 +5,7 @@ void llama_model_minicpm3::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
|
||||
ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
switch (hparams.n_layer()) {
|
||||
case 62: type = LLM_TYPE_4B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
|
||||
@@ -5,7 +5,7 @@ void llama_model_minimax_m2::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
||||
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
switch (hparams.n_layer()) {
|
||||
case 62: type = LLM_TYPE_230B_A10B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
|
||||
@@ -18,7 +18,7 @@ void llama_model_mistral3::load_arch_hparams(llama_model_loader & ml) {
|
||||
}
|
||||
}
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
switch (hparams.n_layer()) {
|
||||
case 26: type = LLM_TYPE_3B; break;
|
||||
case 34: type = LLM_TYPE_8B; break;
|
||||
case 40: type = LLM_TYPE_14B; break;
|
||||
|
||||
@@ -22,7 +22,7 @@ void llama_model_modern_bert::load_arch_hparams(llama_model_loader & ml) {
|
||||
hparams.llm_ffn_op = llm_ffn_op_type_from_string(hidden_act, LLM_FFN_GEGLU);
|
||||
}
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
switch (hparams.n_layer()) {
|
||||
case 12:
|
||||
type = LLM_TYPE_47M; break; // granite-embedding-small
|
||||
case 22:
|
||||
|
||||
+1
-1
@@ -5,7 +5,7 @@ void llama_model_mpt::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
|
||||
ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias, false);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
switch (hparams.n_layer()) {
|
||||
case 32: type = LLM_TYPE_7B; break;
|
||||
case 48: type = LLM_TYPE_30B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
|
||||
@@ -9,7 +9,7 @@ void llama_model_nemotron_h::load_arch_hparams(llama_model_loader & ml) {
|
||||
|
||||
// A layer is recurrent IFF the n_head_kv value is set to 0 and
|
||||
// the n_ff value is set to 0
|
||||
for (uint32_t i = 0; i < hparams.n_layer; ++i) {
|
||||
for (uint32_t i = 0; i < hparams.n_layer(); ++i) {
|
||||
hparams.is_recr_impl[i] = (hparams.n_head_kv(i) == 0 && hparams.n_ff(i) == 0);
|
||||
}
|
||||
|
||||
@@ -22,7 +22,7 @@ void llama_model_nemotron_h::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
|
||||
ml.get_key(LLM_KV_MOE_LATENT_SIZE, hparams.moe_latent_size, false);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
switch (hparams.n_layer()) {
|
||||
case 52: type = LLM_TYPE_31B_A3_5B; break; // Nemotron-H_MOE 31B
|
||||
case 56: type = LLM_TYPE_9B; break;
|
||||
case 88: type = LLM_TYPE_120B_A12B; break;
|
||||
|
||||
@@ -2,7 +2,8 @@
|
||||
|
||||
void llama_model_nemotron::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
switch (hparams.n_layer) {
|
||||
|
||||
switch (hparams.n_layer()) {
|
||||
case 32: type = LLM_TYPE_4B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
void llama_model_neo_bert::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
|
||||
if (hparams.n_layer == 28) {
|
||||
if (hparams.n_layer() == 28) {
|
||||
type = LLM_TYPE_250M;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,7 +4,7 @@ void llama_model_nomic_bert_moe::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers, 0);
|
||||
|
||||
if (hparams.n_layer == 12 && hparams.n_embd == 768) {
|
||||
if (hparams.n_layer() == 12 && hparams.n_embd == 768) {
|
||||
if (arch == LLM_ARCH_NOMIC_BERT) {
|
||||
type = LLM_TYPE_137M;
|
||||
} else if (arch == LLM_ARCH_NOMIC_BERT_MOE && hparams.moe_every_n_layers == 2) {
|
||||
|
||||
@@ -4,7 +4,7 @@ void llama_model_nomic_bert::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers, 0);
|
||||
|
||||
if (hparams.n_layer == 12 && hparams.n_embd == 768) {
|
||||
if (hparams.n_layer() == 12 && hparams.n_embd == 768) {
|
||||
if (arch == LLM_ARCH_NOMIC_BERT) {
|
||||
type = LLM_TYPE_137M;
|
||||
} else if (arch == LLM_ARCH_NOMIC_BERT_MOE && hparams.moe_every_n_layers == 2) {
|
||||
|
||||
+1
-1
@@ -4,7 +4,7 @@ void llama_model_olmo::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
switch (hparams.n_layer()) {
|
||||
case 22: type = LLM_TYPE_1B; break;
|
||||
case 32: type = LLM_TYPE_7B; break;
|
||||
case 80: type = LLM_TYPE_70B; break;
|
||||
|
||||
@@ -17,7 +17,7 @@ void llama_model_olmo2::load_arch_hparams(llama_model_loader & ml) {
|
||||
hparams.swa_type = LLAMA_SWA_TYPE_NONE;
|
||||
}
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
switch (hparams.n_layer()) {
|
||||
case 16: type = LLM_TYPE_1B; break;
|
||||
case 32: type = LLM_TYPE_7B; break;
|
||||
case 40: type = LLM_TYPE_13B; break;
|
||||
|
||||
@@ -2,7 +2,8 @@
|
||||
|
||||
void llama_model_olmoe::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
switch (hparams.n_layer) {
|
||||
|
||||
switch (hparams.n_layer()) {
|
||||
case 16: type = LLM_TYPE_A1_7B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
|
||||
@@ -14,7 +14,7 @@ void llama_model_openai_moe::load_arch_hparams(llama_model_loader & ml) {
|
||||
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
|
||||
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
switch (hparams.n_layer()) {
|
||||
case 24: type = LLM_TYPE_20B; break;
|
||||
case 36: type = LLM_TYPE_120B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
|
||||
@@ -3,12 +3,12 @@
|
||||
void llama_model_openelm::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 16: type = LLM_TYPE_270M; break;
|
||||
case 20: type = LLM_TYPE_450M; break;
|
||||
case 28: type = LLM_TYPE_1B; break;
|
||||
case 36: type = LLM_TYPE_3B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
switch (hparams.n_layer()) {
|
||||
case 16: type = LLM_TYPE_270M; break;
|
||||
case 20: type = LLM_TYPE_450M; break;
|
||||
case 28: type = LLM_TYPE_1B; break;
|
||||
case 36: type = LLM_TYPE_3B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
void llama_model_orion::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
switch (hparams.n_layer()) {
|
||||
case 40: type = LLM_TYPE_14B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
|
||||
@@ -2,7 +2,8 @@
|
||||
|
||||
void llama_model_pangu_embed::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
switch (hparams.n_layer) {
|
||||
|
||||
switch (hparams.n_layer()) {
|
||||
case 26: type = LLM_TYPE_1B; break; // openPangu-Embedded-1B-V1.1
|
||||
case 34: type = LLM_TYPE_7B; break; // openPangu-Embedded-7B-V1.1
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
|
||||
+1
-1
@@ -3,7 +3,7 @@
|
||||
void llama_model_phi2::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
switch (hparams.n_layer()) {
|
||||
case 24: type = LLM_TYPE_1B; break;
|
||||
case 32: type = LLM_TYPE_3B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
|
||||
+1
-1
@@ -3,7 +3,7 @@
|
||||
void llama_model_phi3::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
switch (hparams.n_layer()) {
|
||||
case 24: type = LLM_TYPE_1B; break;
|
||||
case 32: type = LLM_TYPE_3B; break;
|
||||
case 40: type = LLM_TYPE_14B; break;
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
void llama_model_phimoe::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
switch (hparams.n_layer()) {
|
||||
case 32: type = LLM_TYPE_16x3_8B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user