mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-09 07:16:44 +02:00
mtp: support for gemma-4 E2B and E4B assistants (#24282)
* models: update converter to support smaller assistants
* models: add masked_embd tensors to gemma4-assist arch
* gemma-4: remove temp debug for conversion
* gemma-4-mtp: filter out masked_embedding tensors during conversion
This commit is contained in:
@@ -789,6 +789,16 @@ class Gemma4UnifiedModel(Gemma4Model):
|
|||||||
class Gemma4AssistantModel(Gemma4Model):
|
class Gemma4AssistantModel(Gemma4Model):
|
||||||
model_arch = gguf.MODEL_ARCH.GEMMA4_ASSISTANT
|
model_arch = gguf.MODEL_ARCH.GEMMA4_ASSISTANT
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
|
||||||
|
name, gen = item
|
||||||
|
|
||||||
|
if "masked_embedding" in name:
|
||||||
|
logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.")
|
||||||
|
return None
|
||||||
|
|
||||||
|
return super().filter_tensors(item)
|
||||||
|
|
||||||
def set_gguf_parameters(self):
|
def set_gguf_parameters(self):
|
||||||
super().set_gguf_parameters()
|
super().set_gguf_parameters()
|
||||||
self.gguf_writer.add_embedding_length_out(self.hparams["backbone_hidden_size"])
|
self.gguf_writer.add_embedding_length_out(self.hparams["backbone_hidden_size"])
|
||||||
|
|||||||
@@ -538,6 +538,8 @@ class VISION_PROJECTOR_TYPE(IntEnum):
|
|||||||
class MODEL_TENSOR(IntEnum):
|
class MODEL_TENSOR(IntEnum):
|
||||||
TOKEN_EMBD = auto()
|
TOKEN_EMBD = auto()
|
||||||
TOKEN_EMBD_NORM = auto()
|
TOKEN_EMBD_NORM = auto()
|
||||||
|
MASKED_EMBD_CENTROIDS= auto()
|
||||||
|
MASKED_EMBD_ORDERING = auto()
|
||||||
TOKEN_TYPES = auto()
|
TOKEN_TYPES = auto()
|
||||||
POS_EMBD = auto()
|
POS_EMBD = auto()
|
||||||
OUTPUT = auto()
|
OUTPUT = auto()
|
||||||
@@ -1087,6 +1089,8 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
|||||||
MODEL_TENSOR.TOKEN_EMBD: "token_embd",
|
MODEL_TENSOR.TOKEN_EMBD: "token_embd",
|
||||||
MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm",
|
MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm",
|
||||||
MODEL_TENSOR.TOKEN_TYPES: "token_types",
|
MODEL_TENSOR.TOKEN_TYPES: "token_types",
|
||||||
|
MODEL_TENSOR.MASKED_EMBD_CENTROIDS: "masked_embd_centroids",
|
||||||
|
MODEL_TENSOR.MASKED_EMBD_ORDERING: "masked_embd_ordering",
|
||||||
MODEL_TENSOR.POS_EMBD: "position_embd",
|
MODEL_TENSOR.POS_EMBD: "position_embd",
|
||||||
MODEL_TENSOR.OUTPUT_NORM: "output_norm",
|
MODEL_TENSOR.OUTPUT_NORM: "output_norm",
|
||||||
MODEL_TENSOR.OUTPUT: "output",
|
MODEL_TENSOR.OUTPUT: "output",
|
||||||
@@ -2586,6 +2590,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
|||||||
MODEL_ARCH.GEMMA4_ASSISTANT: [
|
MODEL_ARCH.GEMMA4_ASSISTANT: [
|
||||||
MODEL_TENSOR.ROPE_FREQS,
|
MODEL_TENSOR.ROPE_FREQS,
|
||||||
MODEL_TENSOR.TOKEN_EMBD,
|
MODEL_TENSOR.TOKEN_EMBD,
|
||||||
|
MODEL_TENSOR.MASKED_EMBD_CENTROIDS,
|
||||||
|
MODEL_TENSOR.MASKED_EMBD_ORDERING,
|
||||||
MODEL_TENSOR.OUTPUT_NORM,
|
MODEL_TENSOR.OUTPUT_NORM,
|
||||||
MODEL_TENSOR.NEXTN_PROJ_PRE,
|
MODEL_TENSOR.NEXTN_PROJ_PRE,
|
||||||
MODEL_TENSOR.NEXTN_PROJ_POST,
|
MODEL_TENSOR.NEXTN_PROJ_POST,
|
||||||
|
|||||||
@@ -37,6 +37,14 @@ class TensorNameMap:
|
|||||||
"model.embed", # talkie
|
"model.embed", # talkie
|
||||||
),
|
),
|
||||||
|
|
||||||
|
# Masked embeddings
|
||||||
|
MODEL_TENSOR.MASKED_EMBD_CENTROIDS: (
|
||||||
|
"masked_embedding.centroids", # gemma-4 E2B/E4B assistants
|
||||||
|
),
|
||||||
|
MODEL_TENSOR.MASKED_EMBD_ORDERING: (
|
||||||
|
"masked_embedding.token_ordering", # gemma-4 E2B/E4B assistants
|
||||||
|
),
|
||||||
|
|
||||||
# Token type embeddings
|
# Token type embeddings
|
||||||
MODEL_TENSOR.TOKEN_TYPES: (
|
MODEL_TENSOR.TOKEN_TYPES: (
|
||||||
"embeddings.token_type_embeddings", # bert nomic-bert
|
"embeddings.token_type_embeddings", # bert nomic-bert
|
||||||
|
|||||||
@@ -559,6 +559,8 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
|
|||||||
{ LLM_TENSOR_INDEXER_PROJ, "blk.%d.indexer.proj" },
|
{ LLM_TENSOR_INDEXER_PROJ, "blk.%d.indexer.proj" },
|
||||||
{ LLM_TENSOR_INDEXER_ATTN_K, "blk.%d.indexer.attn_k" },
|
{ LLM_TENSOR_INDEXER_ATTN_K, "blk.%d.indexer.attn_k" },
|
||||||
{ LLM_TENSOR_INDEXER_ATTN_Q_B, "blk.%d.indexer.attn_q_b" },
|
{ LLM_TENSOR_INDEXER_ATTN_Q_B, "blk.%d.indexer.attn_q_b" },
|
||||||
|
{ LLM_TENSOR_MASKED_EMBD_CENTROIDS, "masked_embd_centroids" },
|
||||||
|
{ LLM_TENSOR_MASKED_EMBD_ORDERING, "masked_embd_ordering" },
|
||||||
};
|
};
|
||||||
|
|
||||||
// declare information about the model weight tensors:
|
// declare information about the model weight tensors:
|
||||||
@@ -783,6 +785,8 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
|
|||||||
// latent projections feed ggml_mul_mat, the buft probe must use MUL_MAT to keep them on GPU
|
// latent projections feed ggml_mul_mat, the buft probe must use MUL_MAT to keep them on GPU
|
||||||
{LLM_TENSOR_FFN_LATENT_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
{LLM_TENSOR_FFN_LATENT_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||||
{LLM_TENSOR_FFN_LATENT_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
{LLM_TENSOR_FFN_LATENT_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||||
|
{LLM_TENSOR_MASKED_EMBD_CENTROIDS, {LLM_TENSOR_LAYER_INPUT, GGML_OP_NONE}},
|
||||||
|
{LLM_TENSOR_MASKED_EMBD_ORDERING, {LLM_TENSOR_LAYER_INPUT, GGML_OP_NONE}},
|
||||||
};
|
};
|
||||||
|
|
||||||
LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}
|
LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}
|
||||||
|
|||||||
@@ -566,8 +566,11 @@ enum llm_tensor {
|
|||||||
LLM_TENSOR_NEXTN_HNORM,
|
LLM_TENSOR_NEXTN_HNORM,
|
||||||
LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD,
|
LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD,
|
||||||
LLM_TENSOR_NEXTN_SHARED_HEAD_NORM,
|
LLM_TENSOR_NEXTN_SHARED_HEAD_NORM,
|
||||||
|
LLM_TENSOR_MASKED_EMBD_CENTROIDS,
|
||||||
|
LLM_TENSOR_MASKED_EMBD_ORDERING,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
enum llm_tensor_layer {
|
enum llm_tensor_layer {
|
||||||
LLM_TENSOR_LAYER_INPUT,
|
LLM_TENSOR_LAYER_INPUT,
|
||||||
LLM_TENSOR_LAYER_REPEATING,
|
LLM_TENSOR_LAYER_REPEATING,
|
||||||
|
|||||||
@@ -39,6 +39,9 @@ void llama_model_gemma4_assistant::load_arch_tensors(llama_model_loader &) {
|
|||||||
|
|
||||||
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
|
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
|
||||||
|
|
||||||
|
create_tensor(tn(LLM_TENSOR_MASKED_EMBD_CENTROIDS, "weight"), {}, TENSOR_NOT_REQUIRED);
|
||||||
|
create_tensor(tn(LLM_TENSOR_MASKED_EMBD_ORDERING), {}, TENSOR_NOT_REQUIRED);
|
||||||
|
|
||||||
const int64_t n_embd_backbone = hparams.n_embd_inp();
|
const int64_t n_embd_backbone = hparams.n_embd_inp();
|
||||||
nextn_proj_post = create_tensor(tn(LLM_TENSOR_NEXTN_PROJ_POST, "weight"), { n_embd, n_embd_backbone }, 0);
|
nextn_proj_post = create_tensor(tn(LLM_TENSOR_NEXTN_PROJ_POST, "weight"), { n_embd, n_embd_backbone }, 0);
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user