mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-09 07:16:44 +02:00
convert: Fix Gemma 4 Unified conversion (#24118)
* Fix Gemma 4 Unified conversion
* Set audio hidden size to audio_embed_dim
This commit is contained in:
+11
-4
@@ -798,7 +798,8 @@ class Gemma4VisionAudioModel(MmprojModel):
|
||||
# remap audio hparams
|
||||
if self.hparams_audio:
|
||||
self.hparams_audio["feat_in"] = self.hparams_audio.get("input_feat_size", 128)
|
||||
self.hparams_audio["intermediate_size"] = self.hparams_audio["hidden_size"] * 4
|
||||
if "hidden_size" in self.hparams_audio:
|
||||
self.hparams_audio["intermediate_size"] = self.hparams_audio["hidden_size"] * 4
|
||||
else:
|
||||
self.has_audio_encoder = False
|
||||
|
||||
@@ -872,7 +873,7 @@ class Gemma4UnifiedVisionAudioModel(Gemma4VisionAudioModel):
|
||||
assert self.hparams_audio is not None
|
||||
text_embd_dim = self.hparams_vision["mm_embed_dim"]
|
||||
self.hparams_vision["hidden_size"] = text_embd_dim
|
||||
self.hparams_audio["hidden_size"] = text_embd_dim
|
||||
self.hparams_audio["hidden_size"] = self.hparams_audio["audio_embed_dim"]
|
||||
# this is a transformer-less vision tower, the params below are redundant but set to avoid error
|
||||
self.hparams_vision["intermediate_size"] = 0
|
||||
self.hparams_vision["num_layers"] = 0
|
||||
@@ -897,7 +898,10 @@ class Gemma4UnifiedVisionAudioModel(Gemma4VisionAudioModel):
|
||||
# ggml im2col outputs in RR..GG..BB.. (CHW) order, but weight expects RGBRGB.. (HWC).
|
||||
# Permute columns so column i aligns with CHW input position i.
|
||||
assert self.hparams_vision is not None
|
||||
p = self.hparams_vision["model_patch_size"]
|
||||
if "model_patch_size" in self.hparams_vision:
|
||||
p = self.hparams_vision["model_patch_size"]
|
||||
else:
|
||||
p = self.hparams_vision["patch_size"] * self.hparams_vision["pooling_kernel_size"]
|
||||
i = torch.arange(p * p * 3)
|
||||
ch = i // (p * p)
|
||||
row = (i % (p * p)) // p
|
||||
@@ -908,7 +912,10 @@ class Gemma4UnifiedVisionAudioModel(Gemma4VisionAudioModel):
|
||||
elif "patch_ln1.weight" in name or "patch_ln1.bias" in name:
|
||||
# same permutation for patch_ln1 as patch_dense to align with CHW input order
|
||||
assert self.hparams_vision is not None
|
||||
p = self.hparams_vision["model_patch_size"]
|
||||
if "model_patch_size" in self.hparams_vision:
|
||||
p = self.hparams_vision["model_patch_size"]
|
||||
else:
|
||||
p = self.hparams_vision["patch_size"] * self.hparams_vision["pooling_kernel_size"]
|
||||
i = torch.arange(p * p * 3)
|
||||
ch = i // (p * p)
|
||||
row = (i % (p * p)) // p
|
||||
|
||||
Reference in New Issue
Block a user