mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-09 07:16:44 +02:00
model : support granite multilingual embeddings R2 (ibm-granite/granite-embedding-{97,311}m-multilingual-r2) (#22716)
* Add support for the ibm-granite/granite-embedding-{97m,311m}-multilingual-r2 embedding models:
* Added a variant of the gpt4o tokenizer with a fixed regex (better handling of combining marks) and a different token-merging setting for the 97m model
* Reused gemma4 tokenizer for the 311m model
* granite-embedding-*-multilingual-r2 : add SwiGLU FFN support for Granite Embedding Multilingual R2
* added new GGUF key <arch>.hidden_activation (LLM_KV_HIDDEN_ACT) + writer
* added a forward declaration of llm_ffn_op_type to llama-hparams.h
* added llm_ffn_op in hparams
* added LLM_FFN_NONE = 0 sentinel to llm_ffn_op_type (value-initialization), modern-bert: explicitly assigns LLM_FFN_GEGLU before reading GGUF (unchanged).
* centralized hidden_act mapping in llama-model.cpp, added llm_ffn_op_type_from_string() helper, mirroring rope_scaling_type/llama_rope_scaling_type_from_string()
* modern-bert reads the GGUF key (when present) and uses the resulting op in its FFN graph
* Added granite-embedding-{97m,311m}-multilingual-r2 to the converter code
* Added the hashes for the granite embedding multilingual R2 models
* Set the hidden_activation in the GGUF if the field is present in config.json (such as for the granite embedding models)
This commit is contained in:
@@ -1657,6 +1657,12 @@ class TextModel(ModelBase):
|
||||
if chkhsh == "36f3066e97b7f3994b379aaacde306c1444c6ae84e81a5ae3cd2b7ed3b8c42d4":
|
||||
# ref: https://huggingface.co/openbmb/MiniCPM5-1B
|
||||
res = "minicpm5"
|
||||
if chkhsh == "f241072145675bf8322086f115aebad05e9f869557a238bf2150a2a417d1bf60":
|
||||
# ref: https://huggingface.co/ibm-granite/granite-embedding-97m-multilingual-r2
|
||||
res = "granite-embed-multi-97m"
|
||||
if chkhsh == "789696f5946cc0fc59371f39f6097cafed196b3acded6140432f26bbb1ae1669":
|
||||
# ref: https://huggingface.co/ibm-granite/granite-embedding-311m-multilingual-r2
|
||||
res = "granite-embed-multi-311m"
|
||||
|
||||
if res is None:
|
||||
logger.warning("\n")
|
||||
|
||||
@@ -603,6 +603,12 @@ class ModernBertModel(BertModel):
|
||||
self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern)
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
|
||||
self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
|
||||
# FFN activation: ModernBert uses a GLU pair (ffn_up output is 2*n_ff). The
|
||||
# original ModernBERT uses GELU (-> GeGLU); some derivatives such as IBM
|
||||
# Granite Embedding 97m R2 use SiLU (-> SwiGLU). Persist this so the
|
||||
# llama.cpp graph can pick the matching activation.
|
||||
if hidden_act := self.hparams.get("hidden_activation"):
|
||||
self.gguf_writer.add_hidden_act(hidden_act)
|
||||
|
||||
@classmethod
|
||||
def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
|
||||
|
||||
@@ -158,6 +158,8 @@ models = [
|
||||
{"name": "sarvam-moe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sarvamai/sarvam-30b", },
|
||||
{"name": "talkie", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/lewtun/talkie-1930-13b-it-hf", },
|
||||
{"name": "minicpm5", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openbmb/MiniCPM5-1B"},
|
||||
{"name": "granite-embed-multi-97m", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ibm-granite/granite-embedding-97m-multilingual-r2", },
|
||||
{"name": "granite-embed-multi-311m", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ibm-granite/granite-embedding-311m-multilingual-r2", },
|
||||
]
|
||||
|
||||
# some models are known to be broken upstream, so we will skip them as exceptions
|
||||
|
||||
@@ -150,6 +150,7 @@ class Keys:
|
||||
EMBD_LENGTH_PER_LAYER_INP = "{arch}.embedding_length_per_layer_input"
|
||||
SWIGLU_CLAMP_EXP = "{arch}.swiglu_clamp_exp"
|
||||
SWIGLU_CLAMP_SHEXP = "{arch}.swiglu_clamp_shexp"
|
||||
HIDDEN_ACT = "{arch}.hidden_activation"
|
||||
DENSE_FEAT_IN_SIZE = "{arch}.{dense}_feat_in"
|
||||
DENSE_FEAT_OUT_SIZE = "{arch}.{dense}_feat_out"
|
||||
|
||||
|
||||
@@ -853,6 +853,9 @@ class GGUFWriter:
|
||||
def add_swiglu_clamp_shexp(self, values: Sequence[float]) -> None:
    """Store the shared-expert SwiGLU clamp values as an array.

    The values are written under the arch-scoped `{arch}.swiglu_clamp_shexp` key.
    """
    key = Keys.LLM.SWIGLU_CLAMP_SHEXP.format(arch=self.arch)
    self.add_array(key, values)
|
||||
|
||||
def add_hidden_act(self, value: str) -> None:
    """Store the FFN hidden-activation name as a string.

    The value is written under the arch-scoped `{arch}.hidden_activation` key.
    """
    key = Keys.LLM.HIDDEN_ACT.format(arch=self.arch)
    self.add_string(key, value)
|
||||
|
||||
def add_expert_group_scale(self, value: float) -> None:
    """Store the expert group scale as a float32.

    The value is written under the arch-scoped `Keys.LLM.EXPERT_GROUP_SCALE` key.
    """
    key = Keys.LLM.EXPERT_GROUP_SCALE.format(arch=self.arch)
    self.add_float32(key, value)
|
||||
|
||||
|
||||
@@ -195,6 +195,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
{ LLM_KV_MOE_LATENT_SIZE, "%s.moe_latent_size" },
|
||||
{ LLM_KV_NEXTN_PREDICT_LAYERS, "%s.nextn_predict_layers" },
|
||||
{ LLM_KV_NUM_DEEPSTACK_LAYERS, "%s.n_deepstack_layers" },
|
||||
{ LLM_KV_HIDDEN_ACT, "%s.hidden_activation" },
|
||||
{ LLM_KV_POOLING_TYPE, "%s.pooling_type" },
|
||||
{ LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
|
||||
{ LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
|
||||
|
||||
@@ -199,6 +199,7 @@ enum llm_kv {
|
||||
LLM_KV_MOE_LATENT_SIZE,
|
||||
LLM_KV_NEXTN_PREDICT_LAYERS,
|
||||
LLM_KV_NUM_DEEPSTACK_LAYERS,
|
||||
LLM_KV_HIDDEN_ACT,
|
||||
LLM_KV_POOLING_TYPE,
|
||||
LLM_KV_LOGIT_SCALE,
|
||||
LLM_KV_DECODER_START_TOKEN_ID,
|
||||
|
||||
+2
-1
@@ -36,7 +36,8 @@ enum llm_graph_type {
|
||||
LLM_GRAPH_TYPE_DECODER_MTP,
|
||||
};
|
||||
|
||||
enum llm_ffn_op_type {
|
||||
enum llm_ffn_op_type : int {
|
||||
LLM_FFN_NONE = 0, // sentinel: unset; archs must assign before use
|
||||
LLM_FFN_SILU,
|
||||
LLM_FFN_GELU,
|
||||
LLM_FFN_RELU,
|
||||
|
||||
@@ -23,6 +23,9 @@ enum llama_swa_type {
|
||||
LLAMA_SWA_TYPE_SYMMETRIC = 3,
|
||||
};
|
||||
|
||||
// forward declaration; full definition in llama-graph.h
|
||||
enum llm_ffn_op_type : int;
|
||||
|
||||
struct llama_hparams_posnet {
|
||||
uint32_t n_embd;
|
||||
uint32_t n_layer;
|
||||
@@ -227,6 +230,14 @@ struct llama_hparams {
|
||||
enum llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE;
|
||||
|
||||
|
||||
// Resolved FFN gated activation flavor for archs that read
|
||||
// `<arch>.hidden_activation` from the GGUF (e.g. ModernBert derivatives).
|
||||
// Defaults to LLM_FFN_NONE (sentinel = 0); the mapping from the GGUF
|
||||
// string to a real op is done at hparam-load time via
|
||||
// llm_ffn_op_type_from_string() in llama-model.cpp, mirroring how
|
||||
// rope_scaling_type_train is handled.
|
||||
enum llm_ffn_op_type llm_ffn_op;
|
||||
|
||||
// Step35: optional per-layer clamps for (Swi)GLU
|
||||
std::array<float, LLAMA_MAX_LAYERS> swiglu_clamp_exp; // clamping for expert FFN
|
||||
std::array<float, LLAMA_MAX_LAYERS> swiglu_clamp_shexp; // shared expert
|
||||
|
||||
@@ -822,6 +822,28 @@ static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::st
|
||||
return LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
|
||||
}
|
||||
|
||||
// Maps the GGUF `<arch>.hidden_activation` string to the FFN op type used by the
|
||||
// graph builders. Only gated activations that map cleanly to llm_ffn_op_type are
|
||||
// listed; unrecognized values fall back to GeGLU, which matches the historical
|
||||
// default for ModernBert-style architectures.
|
||||
static const std::map<std::string, llm_ffn_op_type> LLM_FFN_OP_TYPES_FROM_STRING = {
|
||||
{ "gelu", LLM_FFN_GEGLU },
|
||||
{ "geglu", LLM_FFN_GEGLU },
|
||||
{ "silu", LLM_FFN_SWIGLU },
|
||||
{ "swish", LLM_FFN_SWIGLU },
|
||||
{ "swiglu", LLM_FFN_SWIGLU },
|
||||
{ "relu", LLM_FFN_RELU },
|
||||
{ "reglu", LLM_FFN_REGLU },
|
||||
};
|
||||
|
||||
// Resolves a GGUF activation-name string to its llm_ffn_op_type by looking it
// up in LLM_FFN_OP_TYPES_FROM_STRING. The lookup is an exact match on `name`;
// any string without an entry (including the empty string) yields `fallback`.
llm_ffn_op_type llm_ffn_op_type_from_string(const std::string & name, llm_ffn_op_type fallback) {
    const auto match = LLM_FFN_OP_TYPES_FROM_STRING.find(name);
    return match == LLM_FFN_OP_TYPES_FROM_STRING.end() ? fallback : match->second;
}
|
||||
|
||||
// CPU: ACCEL -> GPU host -> CPU extra -> CPU
|
||||
static buft_list_t make_cpu_buft_list(const std::vector<llama_device> & devices, bool use_extra_bufts, bool no_host) {
|
||||
buft_list_t buft_list;
|
||||
|
||||
@@ -145,6 +145,10 @@ enum llm_type {
|
||||
|
||||
std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type);
|
||||
|
||||
// Map a GGUF activation-name string to llm_ffn_op_type. Returns `fallback` if
|
||||
// the string is empty or not recognized.
|
||||
llm_ffn_op_type llm_ffn_op_type_from_string(const std::string & name, llm_ffn_op_type fallback);
|
||||
|
||||
struct llama_layer_posnet {
|
||||
// resnet
|
||||
struct ggml_tensor * norm1 = nullptr;
|
||||
|
||||
+16
-1
@@ -432,6 +432,15 @@ struct llm_tokenizer_bpe : llm_tokenizer {
|
||||
"[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
||||
};
|
||||
break;
|
||||
case LLAMA_VOCAB_PRE_TYPE_GRANITE_EMB_MULTI:
|
||||
// Same lookaheads as GPT4O but with \p{M} added so combining marks
|
||||
// (diacritics) attach to their base letters. Avoids excessive
|
||||
// backtracking on scripts that use them heavily (Bengali, Hindi,
|
||||
// Telugu, Thai, ...). See PR #22716 for benchmarks.
|
||||
regex_exprs = {
|
||||
"[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}\\p{M}])([^a-z]))*((?=[\\p{L}\\p{M}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}\\p{M}])([^a-z]))+((?=[\\p{L}\\p{M}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
||||
};
|
||||
break;
|
||||
case LLAMA_VOCAB_PRE_TYPE_TINY_AYA:
|
||||
regex_exprs = {
|
||||
// original regex from tokenizer.json: "\\d{1,3}(?=(?:\\d{3})*\\b)"
|
||||
@@ -2142,7 +2151,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
tokenizer_pre == "jais-2") {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_JAIS2;
|
||||
} else if (
|
||||
tokenizer_pre == "gemma4") {
|
||||
tokenizer_pre == "gemma4" ||
|
||||
tokenizer_pre == "granite-embed-multi-311m") {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_GEMMA4;
|
||||
escape_whitespaces = true;
|
||||
} else if (
|
||||
@@ -2252,6 +2262,11 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
tokenizer_pre == "talkie") {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_GPT4O;
|
||||
clean_spaces = false;
|
||||
} else if (
|
||||
tokenizer_pre == "granite-embed-multi-97m") {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_GRANITE_EMB_MULTI;
|
||||
clean_spaces = false;
|
||||
ignore_merges = true;
|
||||
} else if (
|
||||
tokenizer_pre == "tiny_aya") {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_TINY_AYA;
|
||||
|
||||
+55
-54
@@ -8,60 +8,61 @@
|
||||
|
||||
// pre-tokenization types
|
||||
enum llama_vocab_pre_type {
|
||||
LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0,
|
||||
LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1,
|
||||
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2,
|
||||
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
|
||||
LLAMA_VOCAB_PRE_TYPE_FALCON = 4,
|
||||
LLAMA_VOCAB_PRE_TYPE_MPT = 5,
|
||||
LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
|
||||
LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
|
||||
LLAMA_VOCAB_PRE_TYPE_REFACT = 8,
|
||||
LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9,
|
||||
LLAMA_VOCAB_PRE_TYPE_STABLELM2 = 10,
|
||||
LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11,
|
||||
LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
|
||||
LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
|
||||
LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
|
||||
LLAMA_VOCAB_PRE_TYPE_PORO = 15,
|
||||
LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16,
|
||||
LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17,
|
||||
LLAMA_VOCAB_PRE_TYPE_VIKING = 18,
|
||||
LLAMA_VOCAB_PRE_TYPE_JAIS = 19,
|
||||
LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20,
|
||||
LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21,
|
||||
LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22,
|
||||
LLAMA_VOCAB_PRE_TYPE_BLOOM = 23,
|
||||
LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
|
||||
LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
|
||||
LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
|
||||
LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
|
||||
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28,
|
||||
LLAMA_VOCAB_PRE_TYPE_GPT4O = 29,
|
||||
LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30,
|
||||
LLAMA_VOCAB_PRE_TYPE_TRILLION = 31,
|
||||
LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32,
|
||||
LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33,
|
||||
LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34,
|
||||
LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35,
|
||||
LLAMA_VOCAB_PRE_TYPE_HUNYUAN = 36,
|
||||
LLAMA_VOCAB_PRE_TYPE_KIMI_K2 = 37,
|
||||
LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE = 38,
|
||||
LLAMA_VOCAB_PRE_TYPE_GROK_2 = 39,
|
||||
LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING = 40,
|
||||
LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2 = 41,
|
||||
LLAMA_VOCAB_PRE_TYPE_AFMOE = 42,
|
||||
LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN = 43,
|
||||
LLAMA_VOCAB_PRE_TYPE_YOUTU = 44,
|
||||
LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE = 45,
|
||||
LLAMA_VOCAB_PRE_TYPE_QWEN35 = 46,
|
||||
LLAMA_VOCAB_PRE_TYPE_TINY_AYA = 47,
|
||||
LLAMA_VOCAB_PRE_TYPE_JOYAI_LLM = 48,
|
||||
LLAMA_VOCAB_PRE_TYPE_JAIS2 = 49,
|
||||
LLAMA_VOCAB_PRE_TYPE_GEMMA4 = 50,
|
||||
LLAMA_VOCAB_PRE_TYPE_SARVAM_MOE = 51,
|
||||
LLAMA_VOCAB_PRE_TYPE_MINICPM5 = 52,
|
||||
LLAMA_VOCAB_PRE_TYPE_WHITESPACE = 53,
|
||||
LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0,
|
||||
LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1,
|
||||
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2,
|
||||
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
|
||||
LLAMA_VOCAB_PRE_TYPE_FALCON = 4,
|
||||
LLAMA_VOCAB_PRE_TYPE_MPT = 5,
|
||||
LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
|
||||
LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
|
||||
LLAMA_VOCAB_PRE_TYPE_REFACT = 8,
|
||||
LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9,
|
||||
LLAMA_VOCAB_PRE_TYPE_STABLELM2 = 10,
|
||||
LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11,
|
||||
LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
|
||||
LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
|
||||
LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
|
||||
LLAMA_VOCAB_PRE_TYPE_PORO = 15,
|
||||
LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16,
|
||||
LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17,
|
||||
LLAMA_VOCAB_PRE_TYPE_VIKING = 18,
|
||||
LLAMA_VOCAB_PRE_TYPE_JAIS = 19,
|
||||
LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20,
|
||||
LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21,
|
||||
LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22,
|
||||
LLAMA_VOCAB_PRE_TYPE_BLOOM = 23,
|
||||
LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
|
||||
LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
|
||||
LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
|
||||
LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
|
||||
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28,
|
||||
LLAMA_VOCAB_PRE_TYPE_GPT4O = 29,
|
||||
LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30,
|
||||
LLAMA_VOCAB_PRE_TYPE_TRILLION = 31,
|
||||
LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32,
|
||||
LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33,
|
||||
LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34,
|
||||
LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35,
|
||||
LLAMA_VOCAB_PRE_TYPE_HUNYUAN = 36,
|
||||
LLAMA_VOCAB_PRE_TYPE_KIMI_K2 = 37,
|
||||
LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE = 38,
|
||||
LLAMA_VOCAB_PRE_TYPE_GROK_2 = 39,
|
||||
LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING = 40,
|
||||
LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2 = 41,
|
||||
LLAMA_VOCAB_PRE_TYPE_AFMOE = 42,
|
||||
LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN = 43,
|
||||
LLAMA_VOCAB_PRE_TYPE_YOUTU = 44,
|
||||
LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE = 45,
|
||||
LLAMA_VOCAB_PRE_TYPE_QWEN35 = 46,
|
||||
LLAMA_VOCAB_PRE_TYPE_TINY_AYA = 47,
|
||||
LLAMA_VOCAB_PRE_TYPE_JOYAI_LLM = 48,
|
||||
LLAMA_VOCAB_PRE_TYPE_JAIS2 = 49,
|
||||
LLAMA_VOCAB_PRE_TYPE_GEMMA4 = 50,
|
||||
LLAMA_VOCAB_PRE_TYPE_SARVAM_MOE = 51,
|
||||
LLAMA_VOCAB_PRE_TYPE_MINICPM5 = 52,
|
||||
LLAMA_VOCAB_PRE_TYPE_WHITESPACE = 53,
|
||||
LLAMA_VOCAB_PRE_TYPE_GRANITE_EMB_MULTI = 54,
|
||||
};
|
||||
|
||||
struct LLM_KV;
|
||||
|
||||
@@ -14,6 +14,14 @@ void llama_model_modern_bert::load_arch_hparams(llama_model_loader & ml) {
|
||||
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
|
||||
// Some ModernBert derivatives (e.g. IBM Granite Embedding 97m R2) use
|
||||
// SiLU/SwiGLU in the FFN instead of the default GELU/GeGLU.
|
||||
hparams.llm_ffn_op = LLM_FFN_GEGLU;
|
||||
std::string hidden_act;
|
||||
if (ml.get_key(LLM_KV_HIDDEN_ACT, hidden_act, false)) {
|
||||
hparams.llm_ffn_op = llm_ffn_op_type_from_string(hidden_act, LLM_FFN_GEGLU);
|
||||
}
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 12:
|
||||
type = LLM_TYPE_47M; break; // granite-embedding-small
|
||||
@@ -144,7 +152,8 @@ llama_model_modern_bert::graph::graph(const llama_model & model, const llm_graph
|
||||
NULL, NULL, NULL,
|
||||
model.layers[il].ffn_down, NULL, NULL,
|
||||
NULL,
|
||||
LLM_FFN_GEGLU, LLM_FFN_SEQ, il);
|
||||
hparams.llm_ffn_op,
|
||||
LLM_FFN_SEQ, il);
|
||||
|
||||
// attentions bypass the intermediate layer
|
||||
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||
|
||||
Reference in New Issue
Block a user