diff --git a/conversion/__init__.py b/conversion/__init__.py index 2c79580f8a..c670798fc2 100644 --- a/conversion/__init__.py +++ b/conversion/__init__.py @@ -253,6 +253,7 @@ MMPROJ_MODEL_MAP: dict[str, str] = { "Glm4vMoeForConditionalGeneration": "qwen3vl", "GlmOcrForConditionalGeneration": "qwen3vl", "GlmasrModel": "ultravox", + "Granite4VisionForConditionalGeneration": "granite", "GraniteSpeechForConditionalGeneration": "granite", "HunYuanVLForConditionalGeneration": "hunyuan", "Idefics3ForConditionalGeneration": "smolvlm", diff --git a/conversion/granite.py b/conversion/granite.py index 647269ba74..53441fe570 100644 --- a/conversion/granite.py +++ b/conversion/granite.py @@ -1,5 +1,6 @@ from __future__ import annotations +import re from typing import Any, Callable, Iterable, TYPE_CHECKING import torch @@ -13,7 +14,7 @@ from .llama import LlamaModel from .mamba import Mamba2Model -@ModelBase.register("GraniteForCausalLM", "GraniteSpeechForConditionalGeneration") +@ModelBase.register("GraniteForCausalLM") class GraniteModel(LlamaModel): """Conversion for IBM's GraniteForCausalLM""" model_arch = gguf.MODEL_ARCH.GRANITE @@ -46,11 +47,29 @@ class GraniteModel(LlamaModel): self.gguf_writer.add_logit_scale(logits_scale) logger.info("gguf: (granite) logits_scale = %s", logits_scale) + # If being used as the base for Granite4 Vision, add deepstack_layer_arr + if self.hparams.get("spatial_target_layers") or self.hparams.get("deepstack_layer_map"): + normalized_projector_map = Granite4VisionMmprojModel.get_normalized_projector_map(self.hparams) + deepstack_mapping_arr = [-1 for _ in range(self.block_count)] # Populate with -1 sentinels + for proj_idx, (_, llm_layer, _, _) in enumerate(normalized_projector_map): + # Skip the first projector which is handled as the base embedding + # stream like normal + if proj_idx == 0: + continue + deepstack_mapping_arr[llm_layer] = proj_idx + self.gguf_writer.add_deepstack_mapping(deepstack_mapping_arr) + @classmethod def 
@ModelBase.register("Granite4VisionForConditionalGeneration")
class Granite4VisionMmprojModel(MmprojModel):
    """Conversion of the Granite4 Vision multimodal projector (mmproj).

    The checkpoint carries several projectors of two kinds ("layerwise" and
    "spatial"). They are flattened into a single ordered list, and every
    projector tensor is renamed so that its block id (`bid`) is the index of
    its projector in that list.
    """
    has_vision_encoder = True
    has_audio_encoder = False

    # Matches a purely numeric, dot-delimited component of a tensor name
    # (e.g. the ".3." in "model.spatial_projectors.3.query"). Group 1 is the
    # number itself.
    _NUMERIC_ID_RE = re.compile(r"\.([0-9]+)\.")

    @staticmethod
    def get_normalized_projector_map(global_config: dict) -> list[tuple[int, int, str, int]]:
        """Normalize both deepstack and spatial projector maps to the form:
            (vision_layer, llm_layer, proj_type, type_index)

        where proj_type is "layerwise" (from `deepstack_layer_map`) or
        "spatial" (from `spatial_target_layers`) and type_index is the index
        of the projector among the projectors of its own type.

        This is then used to populate the following mappings:
        - vision_feature_layers (mmproj hparam): ordered list of all
            vision_layer values where order corresponds with the order of the
            stacked projector tensors
            NOTE: Values may appear multiple times for spatial projectors
        - tensor_prefix_map (mmproj tensors): mapping from tensor prefixes to
            the index of the corresponding projector in the stacked tensors
        - deepstack mapping (llm hparam): per-text-layer array indicating
            which input vision feature should be injected at that layer
            (-1 if none)

        Args:
            global_config: The full model config. Must contain
                `text_config.num_hidden_layers` and
                `vision_config.num_hidden_layers`; `deepstack_layer_map`,
                `spatial_target_layers` and `spatial_vision_layer` are
                optional.

        Returns:
            List of (vision_layer, llm_layer, proj_type, type_index) tuples
            sorted by llm_layer, with negative layer indices resolved to
            absolute ones.
        """
        deepstack_map = global_config.get("deepstack_layer_map", [])  # [[vis_layer, llm_layer], ...]
        spatial_layers = global_config.get("spatial_target_layers", [])  # [llm_layer, ...]
        n_text_layers = global_config["text_config"]["num_hidden_layers"]
        n_vision_layers = global_config["vision_config"]["num_hidden_layers"]

        normalized_projector_map: list[tuple[int, int, str, int]] = []

        if deepstack_map:
            for deepstack_idx, (vision_layer, llm_layer) in enumerate(sorted(deepstack_map)):
                # Negative values index from the end, like Python sequences
                if vision_layer < 0:
                    vision_layer = n_vision_layers + vision_layer
                if llm_layer < 0:
                    llm_layer = n_text_layers + llm_layer
                normalized_projector_map.append((vision_layer, llm_layer, "layerwise", deepstack_idx))

        if spatial_layers:
            spatial_vision_layer = global_config.get("spatial_vision_layer", -1)
            if spatial_vision_layer < 0:
                spatial_vision_layer = n_vision_layers + spatial_vision_layer
            for spatial_idx, llm_layer in enumerate(spatial_layers):
                # Resolve negative target layers the same way as for the
                # deepstack entries so the final sort by llm_layer (and hence
                # the projector index) reflects the real layer order.
                if llm_layer < 0:
                    llm_layer = n_text_layers + llm_layer
                normalized_projector_map.append((spatial_vision_layer, llm_layer, "spatial", spatial_idx))

        # sorted() is stable, so entries targeting the same llm_layer keep
        # their insertion order (layerwise before spatial).
        return sorted(normalized_projector_map, key=lambda entry: entry[1])

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        normalized_projector_map = self.get_normalized_projector_map(self.global_config)
        self._n_proj = len(normalized_projector_map)

        # NOTE: The trailing "." is required so that e.g. the prefix for
        # projector 1 does not also match the tensors of projector 10.
        self._tensor_prefix_map = {
            f"model.{proj_type}_projectors.{type_idx}.": proj_idx
            for proj_idx, (_, _, proj_type, type_idx) in enumerate(normalized_projector_map)
        }
        self._vision_feature_layers = [vision_layer for vision_layer, _, _, _ in normalized_projector_map]
        # Per projector: its index among the spatial projectors, or -1 if it
        # is not a spatial projector
        self._spatial_offsets = [
            type_idx if proj_type == "spatial" else -1
            for _, _, proj_type, type_idx in normalized_projector_map
        ]

    def set_gguf_parameters(self):
        """Write the Granite4 Vision clip/projector hparams to the GGUF.

        Raises:
            ValueError: If `downsample_rate` is not of the form
                "<query_side>/<window_side>".
        """
        assert self.hparams_vision is not None
        super().set_gguf_parameters()

        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GRANITE4_VISION)

        # SigLIP encoder hparams
        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6))
        self.gguf_writer.add_vision_use_gelu(True)

        # Preprocessor
        self.gguf_writer.add_vision_preproc_image_size(self.hparams.get("image_size", 384))

        # QFormer projector config: "<query_side>/<window_side>"
        ds_rate = self.global_config["downsample_rate"]
        ds_parts = ds_rate.split("/")
        if len(ds_parts) != 2:
            raise ValueError(f"Invalid 'downsample_rate' value: {ds_rate}")
        query_side, window_side = (int(p) for p in ds_parts)
        self.gguf_writer.add_vision_projector_query_side(query_side)
        self.gguf_writer.add_vision_projector_window_side(window_side)

        # Set vision feature layers
        self.gguf_writer.add_vision_feature_layers(self._vision_feature_layers)

        # Set the spatial offsets per projector
        self.gguf_writer.add_vision_spatial_offsets(self._spatial_offsets)

        # Add flattened image grid pinpoints (resolution candidates internally)
        if pinpoints := self.global_config.get("image_grid_pinpoints"):
            # Flatten with h, w -> w, h inversion
            pinpoints = [val for h, w in pinpoints for val in (w, h)]
            self.gguf_writer.add_vision_image_grid_pinpoints(pinpoints)

    @classmethod
    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
        """Drop the vision model head and the LM head; defer everything else
        to the parent filter."""
        name, _ = item
        if "vision_model.head" in name or name.startswith("lm_head"):
            return None
        return super().filter_tensors(item)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Rename projector tensors so their block id is the projector index.

        Tensors that do not belong to a projector are passed to the parent
        unchanged.

        Raises:
            ValueError: If a projector tensor name does not contain exactly
                one or two numeric ids.
        """
        # Detect projector tensors and bin them
        projector_idx = next(
            (proj_idx for prefix, proj_idx in self._tensor_prefix_map.items() if name.startswith(prefix)),
            None,
        )
        if projector_idx is None:
            yield from super().modify_tensors(data_torch, name, bid)
            return

        # If this projector tensor has a block id within the projector,
        # alias the bid to projector_idx
        #
        # TODO: currently, none of the Granite 4 Vision models have
        #   projectors with multiple QFormer layers, so the `layer.{}` index
        #   is always 0. This allows us to simply map to a single `bid` that
        #   matches the projector index. If this changes, we'll need a
        #   convention that merges the two IDs.
        id_matches = list(self._NUMERIC_ID_RE.finditer(name))
        if not 1 <= len(id_matches) <= 2:
            raise ValueError(f"Must have at least 1 and at most 2 ids in tensor names: {name}")

        proj_match = id_matches[0]
        if len(id_matches) == 1:
            # Only the per-type projector id: replace it with the global
            # projector index
            new_name = name[:proj_match.start(1)] + str(projector_idx) + name[proj_match.end(1):]
        else:
            # Projector id + inner layer id: drop the ".<proj id>" component
            # and replace the inner layer id with the global projector index
            layer_match = id_matches[1]
            new_name = (
                name[:proj_match.start(0)]
                + name[proj_match.end(1):layer_match.start(1)]
                + str(projector_idx)
                + name[layer_match.end(1):]
            )
        yield from super().modify_tensors(data_torch, new_name, projector_idx)
If specified, it will ignore --base and load the base model config from the Hugging Face hub (Example: 'meta-llama/Llama-3.2-1B-Instruct')", ) + parser.add_argument( + "--trust-remote-code", default=False, action="store_true", + help="trust remote code in the model", + ) parser.add_argument( "lora_path", type=Path, help="directory containing Hugging Face PEFT LoRA config (adapter_model.json) and weights (adapter_model.safetensors or adapter_model.bin)", @@ -319,11 +323,11 @@ def parse_args() -> argparse.Namespace: return parser.parse_args() -def load_hparams_from_hf(hf_model_id: str) -> tuple[dict[str, Any], Path | None]: +def load_hparams_from_hf(hf_model_id: str, trust_remote_code: bool) -> tuple[dict[str, Any], Path | None]: from huggingface_hub import try_to_load_from_cache # normally, adapter does not come with base model config, we need to load it from AutoConfig - config = AutoConfig.from_pretrained(hf_model_id) + config = AutoConfig.from_pretrained(hf_model_id, trust_remote_code=trust_remote_code) cache_dir = try_to_load_from_cache(hf_model_id, "config.json") cache_dir = Path(cache_dir).parent if isinstance(cache_dir, str) else None @@ -372,13 +376,13 @@ if __name__ == '__main__': # load base model if base_model_id is not None: logger.info(f"Loading base model from Hugging Face: {base_model_id}") - hparams, dir_base_model = load_hparams_from_hf(base_model_id) + hparams, dir_base_model = load_hparams_from_hf(base_model_id, args.trust_remote_code) elif dir_base_model is None: if "base_model_name_or_path" in lparams: model_id = lparams["base_model_name_or_path"] logger.info(f"Loading base model from Hugging Face: {model_id}") try: - hparams, dir_base_model = load_hparams_from_hf(model_id) + hparams, dir_base_model = load_hparams_from_hf(model_id, args.trust_remote_code) except OSError as e: logger.error(f"Failed to load base model config: {e}") logger.error("Please try downloading the base model and add its path to --base") @@ -393,7 +397,9 @@ if __name__ == 
'__main__': with torch.inference_mode(): try: - model_class = get_model_class(hparams["architectures"][0]) + model_arch = hparams.get("text_config", {}).get("architectures", hparams["architectures"])[0] + logger.info("Using model architecture: %s", model_arch) + model_class = get_model_class(model_arch) except NotImplementedError: logger.error(f"Model {hparams['architectures'][0]} is not supported") sys.exit(1) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index ce556ec9b6..814980ce50 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -128,6 +128,7 @@ class Keys: MOE_LATENT_SIZE = "{arch}.moe_latent_size" NEXTN_PREDICT_LAYERS = "{arch}.nextn_predict_layers" NUM_DEEPSTACK_LAYERS = "{arch}.n_deepstack_layers" + DEEPSTACK_MAPPING = "{arch}.deepstack_mapping" POOLING_TYPE = "{arch}.pooling_type" LOGIT_SCALE = "{arch}.logit_scale" DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id" @@ -325,6 +326,8 @@ class Keys: WA_PATTERN_MODE = "clip.vision.wa_pattern_mode" # used by mimovl, per-layer -1/0/1 IS_DEEPSTACK_LAYERS = "clip.vision.is_deepstack_layers" WINDOW_SIZE = "clip.vision.window_size" + FEATURE_LAYERS = "clip.vision.feature_layer" # Granite4 Vision + IMAGE_GRID_PINPOINTS = "clip.vision.image_grid_pinpoints" # Granite4 Vision class Attention: HEAD_COUNT = "clip.vision.attention.head_count" @@ -333,6 +336,9 @@ class Keys: class Projector: SCALE_FACTOR = "clip.vision.projector.scale_factor" + QUERY_SIDE = "clip.vision.projector.query_side" + WINDOW_SIDE = "clip.vision.projector.window_side" + SPATIAL_OFFSETS = "clip.vision.projector.spatial_offsets" class SAM: BLOCK_COUNT = "clip.vision.sam.block_count" @@ -821,6 +827,31 @@ class MODEL_TENSOR(IntEnum): V_RESMPL_QUERY_768 = auto() # Deepseek-OCR-2 V_RESMPL_QUERY_1024 = auto() # Deepseek-OCR-2 + # qformer projector (vision) - Granite4 Vision + V_QF_PROJ_QUERY = auto() + V_QF_PROJ_NORM = auto() + V_QF_PROJ_LINEAR = auto() + V_QF_SELF_ATTN_Q = auto() + V_QF_SELF_ATTN_K = 
auto() + V_QF_SELF_ATTN_V = auto() + V_QF_SELF_ATTN_O = auto() + V_QF_SELF_ATTN_NORM = auto() + V_QF_CROSS_ATTN_Q = auto() + V_QF_CROSS_ATTN_K = auto() + V_QF_CROSS_ATTN_V = auto() + V_QF_CROSS_ATTN_O = auto() + V_QF_CROSS_ATTN_NORM = auto() + V_QF_FFN_UP = auto() + V_QF_FFN_DOWN = auto() + V_QF_FFN_NORM = auto() + V_PROJ_NORM = auto() + # multi-projector (bid => projector id) - Granite4 vision + V_MULTI_PROJ_IMG_POS = auto() + V_MULTI_PROJ_QUERY = auto() + V_MULTI_PROJ_NORM = auto() + V_MULTI_PROJ_LINEAR = auto() + V_MULTI_PROJ_POST_NORM = auto() + # audio (mtmd) A_ENC_EMBD_POS = auto() A_ENC_EMBD_NORM = auto() @@ -885,7 +916,7 @@ class MODEL_TENSOR(IntEnum): A_CTC_OUT = auto() A_CTC_OUT_MID = auto() A_ENC_ATTN_REL_POS_EMB = auto() - # qformer projector + # audio qformer projector A_QF_PROJ_QUERY = auto() A_QF_PROJ_NORM = auto() A_QF_PROJ_LINEAR = auto() @@ -1337,10 +1368,33 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = { MODEL_TENSOR.V_SAM_NECK: "v.sam.neck.{bid}", MODEL_TENSOR.V_SAM_NET_2: "v.sam.net_2", MODEL_TENSOR.V_SAM_NET_3: "v.sam.net_3", - MODEL_TENSOR.V_ENC_EMBD_IMGNL: "v.image_newline", # Deepseek-OCR + MODEL_TENSOR.V_ENC_EMBD_IMGNL: "v.image_newline", # Deepseek-OCR, Granite4Vision MODEL_TENSOR.V_ENC_EMBD_VSEP: "v.view_seperator", # Deepseek-OCR MODEL_TENSOR.V_RESMPL_QUERY_768: "v.resample_query_768", # Deepseek-OCR-2 qwen2 MODEL_TENSOR.V_RESMPL_QUERY_1024: "v.resample_query_1024", # Deepseek-OCR-2 qwen2 + # Granite4 Vision + # qformer layers (bid => proj_id) + # NOTE: Names align with A_QF_* + MODEL_TENSOR.V_QF_SELF_ATTN_Q: "v.proj_blk.{bid}.self_attn_q", + MODEL_TENSOR.V_QF_SELF_ATTN_K: "v.proj_blk.{bid}.self_attn_k", + MODEL_TENSOR.V_QF_SELF_ATTN_V: "v.proj_blk.{bid}.self_attn_v", + MODEL_TENSOR.V_QF_SELF_ATTN_O: "v.proj_blk.{bid}.self_attn_out", + MODEL_TENSOR.V_QF_SELF_ATTN_NORM: "v.proj_blk.{bid}.self_attn_norm", + MODEL_TENSOR.V_QF_CROSS_ATTN_Q: "v.proj_blk.{bid}.cross_attn_q", + MODEL_TENSOR.V_QF_CROSS_ATTN_K: "v.proj_blk.{bid}.cross_attn_k", + 
MODEL_TENSOR.V_QF_CROSS_ATTN_V: "v.proj_blk.{bid}.cross_attn_v", + MODEL_TENSOR.V_QF_CROSS_ATTN_O: "v.proj_blk.{bid}.cross_attn_out", + MODEL_TENSOR.V_QF_CROSS_ATTN_NORM: "v.proj_blk.{bid}.cross_attn_norm", + MODEL_TENSOR.V_QF_FFN_UP: "v.proj_blk.{bid}.ffn_up", + MODEL_TENSOR.V_QF_FFN_DOWN: "v.proj_blk.{bid}.ffn_down", + MODEL_TENSOR.V_QF_FFN_NORM: "v.proj_blk.{bid}.ffn_norm", + # multi-projector (bid => projector ID) + MODEL_TENSOR.V_MULTI_PROJ_IMG_POS: "v.proj_blk.{bid}.img_pos", + MODEL_TENSOR.V_MULTI_PROJ_QUERY: "v.proj_blk.{bid}.query", + MODEL_TENSOR.V_MULTI_PROJ_NORM: "v.proj_blk.{bid}.norm", + MODEL_TENSOR.V_MULTI_PROJ_LINEAR: "v.proj_blk.{bid}.linear", + MODEL_TENSOR.V_MULTI_PROJ_POST_NORM: "v.proj_blk.{bid}.post_norm", + # audio (mtmd) # note: all audio tensor names must use prefix "a." or "mm.a." MODEL_TENSOR.A_ENC_EMBD_POS: "a.position_embd", @@ -1522,6 +1576,29 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.V_SAM_NET_3, MODEL_TENSOR.V_RESMPL_QUERY_768, MODEL_TENSOR.V_RESMPL_QUERY_1024, + MODEL_TENSOR.V_PROJ_NORM, + MODEL_TENSOR.V_QF_PROJ_QUERY, + MODEL_TENSOR.V_QF_PROJ_NORM, + MODEL_TENSOR.V_QF_PROJ_LINEAR, + MODEL_TENSOR.V_QF_SELF_ATTN_Q, + MODEL_TENSOR.V_QF_SELF_ATTN_K, + MODEL_TENSOR.V_QF_SELF_ATTN_V, + MODEL_TENSOR.V_QF_SELF_ATTN_O, + MODEL_TENSOR.V_QF_SELF_ATTN_NORM, + MODEL_TENSOR.V_QF_CROSS_ATTN_Q, + MODEL_TENSOR.V_QF_CROSS_ATTN_K, + MODEL_TENSOR.V_QF_CROSS_ATTN_V, + MODEL_TENSOR.V_QF_CROSS_ATTN_O, + MODEL_TENSOR.V_QF_CROSS_ATTN_NORM, + MODEL_TENSOR.V_QF_FFN_UP, + MODEL_TENSOR.V_QF_FFN_DOWN, + MODEL_TENSOR.V_QF_FFN_NORM, + MODEL_TENSOR.V_QF_PROJ_NORM, + MODEL_TENSOR.V_MULTI_PROJ_IMG_POS, + MODEL_TENSOR.V_MULTI_PROJ_QUERY, + MODEL_TENSOR.V_MULTI_PROJ_LINEAR, + MODEL_TENSOR.V_MULTI_PROJ_NORM, + MODEL_TENSOR.V_MULTI_PROJ_POST_NORM, # audio MODEL_TENSOR.A_ENC_EMBD_POS, MODEL_TENSOR.A_ENC_EMBD_NORM, @@ -4388,6 +4465,7 @@ class VisionProjectorType: MINICPMV4_6 = "minicpmv4_6" GRANITE_SPEECH = "granite_speech" # audio MIMOVL = 
"mimovl" + GRANITE4_VISION = "granite4_vision" # Items here are (block size, type size) diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 875d0f73d9..182c9c54a5 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -959,8 +959,13 @@ class GGUFWriter: self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value.value) def add_num_deepstack_layers(self, count: int) -> None: + """Add scalar deepstack layer count (qwen3vl format)""" self.add_uint32(Keys.LLM.NUM_DEEPSTACK_LAYERS.format(arch=self.arch), count) + def add_deepstack_mapping(self, layers: Sequence[int]) -> None: + """Add per-layer deepstack projector indices (Granite4 Vision format)""" + self.add_array(Keys.LLM.DEEPSTACK_MAPPING.format(arch=self.arch), list(layers)) + def add_rope_dimension_count(self, count: int) -> None: self.add_uint32(Keys.Rope.DIMENSION_COUNT.format(arch=self.arch), count) @@ -1184,6 +1189,15 @@ class GGUFWriter: def add_vision_preproc_image_size(self, value: int) -> None: self.add_uint32(Keys.ClipVision.PREPROC_IMAGE_SIZE, value) + def add_vision_projector_query_side(self, value: int) -> None: + self.add_uint32(Keys.ClipVision.Projector.QUERY_SIDE, value) + + def add_vision_projector_window_side(self, value: int) -> None: + self.add_uint32(Keys.ClipVision.Projector.WINDOW_SIDE, value) + + def add_vision_spatial_offsets(self, layers: Sequence[int]) -> None: + self.add_array(Keys.ClipVision.Projector.SPATIAL_OFFSETS, layers) + def add_vision_image_mean(self, values: Sequence[float]) -> None: self.add_array(Keys.ClipVision.IMAGE_MEAN, values) @@ -1240,6 +1254,12 @@ class GGUFWriter: def add_vision_window_size(self, value: int) -> None: self.add_uint32(Keys.ClipVision.WINDOW_SIZE, value) + def add_vision_feature_layers(self, layers: Sequence[int]) -> None: + self.add_array(Keys.ClipVision.FEATURE_LAYERS, layers) + + def add_vision_image_grid_pinpoints(self, layers: Sequence[Sequence[int]]) -> None: + 
self.add_array(Keys.ClipVision.IMAGE_GRID_PINPOINTS, layers) + def add_vision_sam_layers_count(self, value: int) -> None: self.add_uint32(Keys.ClipVision.SAM.BLOCK_COUNT, value) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 82f26e7b30..3e63b21650 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -1408,6 +1408,7 @@ class TensorNameMap: ), MODEL_TENSOR.V_ENC_EMBD_PATCH: ( + "model.vision_tower.vision_model.embeddings.patch_embedding", # Granite4Vision "vision_tower.vision_model.embeddings.patch_embedding", "model.vision_tower.embeddings.patch_embedding", # minicpmv4_6 "model.vision_tower.embeddings.patch_embeddings.projection", # Intern-S1 @@ -1439,6 +1440,7 @@ class TensorNameMap: ), MODEL_TENSOR.V_ENC_EMBD_POS: ( + "model.vision_tower.vision_model.embeddings.position_embedding", # Granite4Vision "vision_tower.vision_model.embeddings.position_embedding", "model.vision_tower.embeddings.position_embedding", # minicpmv4_6 "model.vision_tower.embeddings.position_embeddings", # Intern-S1 @@ -1456,8 +1458,9 @@ class TensorNameMap: "model.vision_embedder.pos_embedding", # gemma4 unified ), + # TODO: I think these should all be moved to mapping_cfg? 
MODEL_TENSOR.V_ENC_EMBD_IMGNL: ( - "model.image_newline", # Deepseek-OCR + "model.image_newline", # Deepseek-OCR, Granite4Vision "vit.perceive.image_newline", # HunyuanVL ), @@ -1477,6 +1480,7 @@ class TensorNameMap: ), MODEL_TENSOR.V_ENC_ATTN_Q: ( + "model.vision_tower.vision_model.encoder.layers.{bid}.self_attn.q_proj", # Granite4Vision "vision_tower.vision_model.encoder.layers.{bid}.self_attn.q_proj", "model.vision_tower.encoder.layers.{bid}.self_attn.q_proj", # minicpmv4_6 "model.vision_tower.encoder.layer.{bid}.attention.q_proj", # Intern-S1 @@ -1502,6 +1506,7 @@ class TensorNameMap: ), MODEL_TENSOR.V_ENC_ATTN_K: ( + "model.vision_tower.vision_model.encoder.layers.{bid}.self_attn.k_proj", # Granite4Vision "vision_tower.vision_model.encoder.layers.{bid}.self_attn.k_proj", "model.vision_tower.encoder.layers.{bid}.self_attn.k_proj", # minicpmv4_6 "model.vision_tower.encoder.layer.{bid}.attention.k_proj", # Intern-S1 @@ -1527,6 +1532,7 @@ class TensorNameMap: ), MODEL_TENSOR.V_ENC_ATTN_V: ( + "model.vision_tower.vision_model.encoder.layers.{bid}.self_attn.v_proj", # Granite4Vision "vision_tower.vision_model.encoder.layers.{bid}.self_attn.v_proj", "model.vision_tower.encoder.layers.{bid}.self_attn.v_proj", # minicpmv4_6 "model.vision_tower.encoder.layer.{bid}.attention.v_proj", # Intern-S1 @@ -1545,6 +1551,7 @@ class TensorNameMap: ), MODEL_TENSOR.V_ENC_INPUT_NORM: ( + "model.vision_tower.vision_model.encoder.layers.{bid}.layer_norm1", # Granite4Vision "vision_tower.vision_model.encoder.layers.{bid}.layer_norm1", "model.vision_tower.encoder.layers.{bid}.layer_norm1", # minicpmv4_6 "vision_tower.vision_model.encoder.layers.{bid}.norm1", # InternVL @@ -1567,6 +1574,7 @@ class TensorNameMap: ), MODEL_TENSOR.V_ENC_ATTN_O: ( + "model.vision_tower.vision_model.encoder.layers.{bid}.self_attn.out_proj", # Granite4Vision "vision_tower.vision_model.encoder.layers.{bid}.self_attn.out_proj", "model.vision_tower.encoder.layers.{bid}.self_attn.out_proj", # minicpmv4_6 
"vision_tower.vision_model.encoder.layers.{bid}.attn.proj", # InternVL @@ -1595,6 +1603,7 @@ class TensorNameMap: ), MODEL_TENSOR.V_ENC_POST_ATTN_NORM: ( + "model.vision_tower.vision_model.encoder.layers.{bid}.layer_norm2", # Granite4Vision "vision_tower.vision_model.encoder.layers.{bid}.layer_norm2", "model.vision_tower.encoder.layers.{bid}.layer_norm2", # minicpmv4_6 "vision_tower.vision_model.encoder.layers.{bid}.norm2", # InternVL @@ -1618,6 +1627,7 @@ class TensorNameMap: ), MODEL_TENSOR.V_ENC_FFN_UP: ( + "model.vision_tower.vision_model.encoder.layers.{bid}.mlp.fc1", # Granite4Vision "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc1", "model.vision_tower.encoder.layers.{bid}.mlp.fc1", # minicpmv4_6 "model.vision_tower.encoder.layer.{bid}.mlp.fc1", # Intern-S1 @@ -1649,6 +1659,7 @@ class TensorNameMap: ), MODEL_TENSOR.V_ENC_FFN_DOWN: ( + "model.vision_tower.vision_model.encoder.layers.{bid}.mlp.fc2", # Granite4Vision "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc2", "model.vision_tower.encoder.layers.{bid}.mlp.fc2", # minicpmv4_6 "model.vision_tower.encoder.layer.{bid}.mlp.fc2", # Intern-S1 @@ -1706,6 +1717,7 @@ class TensorNameMap: ), MODEL_TENSOR.V_POST_NORM: ( + "model.vision_tower.vision_model.post_layernorm", # Granite4Vision "vision_tower.vision_model.post_layernorm", "model.vision_tower.post_layernorm", # minicpmv4_6 "model.vision_model.post_layernorm", # SmolVLM @@ -1952,6 +1964,82 @@ class TensorNameMap: "model.vision_tower.std_scale", # gemma4 ), + # For these tensors, bid => projector ID + MODEL_TENSOR.V_MULTI_PROJ_IMG_POS: ( + "model.layerwise_projectors.{bid}.image_positions", # Granite4 Vision + "model.spatial_projectors.{bid}.image_positions", # Granite4 Vision + ), + MODEL_TENSOR.V_MULTI_PROJ_QUERY: ( + "model.layerwise_projectors.{bid}.query", # Granite4 Vision + "model.spatial_projectors.{bid}.query", # Granite4 Vision + ), + MODEL_TENSOR.V_MULTI_PROJ_LINEAR: ( + "model.layerwise_projectors.{bid}.out_linear", # Granite4 Vision + 
"model.spatial_projectors.{bid}.out_linear", # Granite4 Vision + ), + MODEL_TENSOR.V_MULTI_PROJ_NORM: ( + "model.layerwise_projectors.{bid}.norm", # Granite4 Vision + "model.spatial_projectors.{bid}.norm", # Granite4 Vision + ), + MODEL_TENSOR.V_MULTI_PROJ_POST_NORM: ( + "model.layerwise_projectors.{bid}.qformer.layernorm", # Granite4 Vision + "model.spatial_projectors.{bid}.qformer.layernorm", # Granite4 Vision + ), + + # For these tensors, bid => proj-id + MODEL_TENSOR.V_QF_SELF_ATTN_Q: ( + "model.layerwise_projectors.qformer.encoder.layer.{bid}.attention.attention.query", # Granite4 Vision + "model.spatial_projectors.qformer.encoder.layer.{bid}.attention.attention.query", # Granite4 Vision + ), + MODEL_TENSOR.V_QF_SELF_ATTN_K: ( + "model.layerwise_projectors.qformer.encoder.layer.{bid}.attention.attention.key", # Granite4 Vision + "model.spatial_projectors.qformer.encoder.layer.{bid}.attention.attention.key", # Granite4 Vision + ), + MODEL_TENSOR.V_QF_SELF_ATTN_V: ( + "model.layerwise_projectors.qformer.encoder.layer.{bid}.attention.attention.value", # Granite4 Vision + "model.spatial_projectors.qformer.encoder.layer.{bid}.attention.attention.value", # Granite4 Vision + ), + MODEL_TENSOR.V_QF_SELF_ATTN_O: ( + "model.layerwise_projectors.qformer.encoder.layer.{bid}.attention.output.dense", # Granite4 Vision + "model.spatial_projectors.qformer.encoder.layer.{bid}.attention.output.dense", # Granite4 Vision + ), + MODEL_TENSOR.V_QF_SELF_ATTN_NORM: ( + "model.layerwise_projectors.qformer.encoder.layer.{bid}.attention.output.LayerNorm", # Granite4 Vision + "model.spatial_projectors.qformer.encoder.layer.{bid}.attention.output.LayerNorm", # Granite4 Vision + ), + MODEL_TENSOR.V_QF_CROSS_ATTN_Q: ( + "model.layerwise_projectors.qformer.encoder.layer.{bid}.crossattention.attention.query", # Granite4 Vision + "model.spatial_projectors.qformer.encoder.layer.{bid}.crossattention.attention.query", # Granite4 Vision + ), + MODEL_TENSOR.V_QF_CROSS_ATTN_K: ( + 
"model.layerwise_projectors.qformer.encoder.layer.{bid}.crossattention.attention.key", # Granite4 Vision + "model.spatial_projectors.qformer.encoder.layer.{bid}.crossattention.attention.key", # Granite4 Vision + ), + MODEL_TENSOR.V_QF_CROSS_ATTN_V: ( + "model.layerwise_projectors.qformer.encoder.layer.{bid}.crossattention.attention.value", # Granite4 Vision + "model.spatial_projectors.qformer.encoder.layer.{bid}.crossattention.attention.value", # Granite4 Vision + ), + MODEL_TENSOR.V_QF_CROSS_ATTN_O: ( + "model.layerwise_projectors.qformer.encoder.layer.{bid}.crossattention.output.dense", # Granite4 Vision + "model.spatial_projectors.qformer.encoder.layer.{bid}.crossattention.output.dense", # Granite4 Vision + ), + MODEL_TENSOR.V_QF_CROSS_ATTN_NORM: ( + "model.layerwise_projectors.qformer.encoder.layer.{bid}.crossattention.output.LayerNorm", # Granite4 Vision + "model.spatial_projectors.qformer.encoder.layer.{bid}.crossattention.output.LayerNorm", # Granite4 Vision + ), + MODEL_TENSOR.V_QF_FFN_UP: ( + "model.layerwise_projectors.qformer.encoder.layer.{bid}.intermediate_query.dense", # Granite4 Vision + "model.spatial_projectors.qformer.encoder.layer.{bid}.intermediate_query.dense", # Granite4 Vision + ), + MODEL_TENSOR.V_QF_FFN_DOWN: ( + "model.layerwise_projectors.qformer.encoder.layer.{bid}.output_query.dense", # Granite4 Vision + "model.spatial_projectors.qformer.encoder.layer.{bid}.output_query.dense", # Granite4 Vision + ), + MODEL_TENSOR.V_QF_FFN_NORM: ( + "model.layerwise_projectors.qformer.encoder.layer.{bid}.output_query.LayerNorm", # Granite4 Vision + "model.spatial_projectors.qformer.encoder.layer.{bid}.output_query.LayerNorm", # Granite4 Vision + ), + # audio (mtmd) MODEL_TENSOR.A_ENC_EMBD_POS: ( diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index fea898deaf..52963f8f1e 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -196,6 +196,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_MOE_LATENT_SIZE, "%s.moe_latent_size" }, { 
LLM_KV_NEXTN_PREDICT_LAYERS, "%s.nextn_predict_layers" }, { LLM_KV_NUM_DEEPSTACK_LAYERS, "%s.n_deepstack_layers" }, + { LLM_KV_DEEPSTACK_MAPPING, "%s.deepstack_mapping" }, { LLM_KV_HIDDEN_ACT, "%s.hidden_activation" }, { LLM_KV_POOLING_TYPE, "%s.pooling_type" }, { LLM_KV_LOGIT_SCALE, "%s.logit_scale" }, diff --git a/src/llama-arch.h b/src/llama-arch.h index f364f6b0ba..dc9bca9bfc 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -200,6 +200,7 @@ enum llm_kv { LLM_KV_MOE_LATENT_SIZE, LLM_KV_NEXTN_PREDICT_LAYERS, LLM_KV_NUM_DEEPSTACK_LAYERS, + LLM_KV_DEEPSTACK_MAPPING, LLM_KV_HIDDEN_ACT, LLM_KV_POOLING_TYPE, LLM_KV_LOGIT_SCALE, diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 172edf24cb..3b8125cde7 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -1859,7 +1859,12 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const { res->t_inp_embd = cur; // For Granite architecture - if (hparams.f_embedding_scale != 0.0f) { + // NOTE: Only apply scale to token inputs. Raw embeddings are assumed to be + // multimodal inputs that should not be scaled. + if (ubatch.token && hparams.f_embedding_scale != 0.0f) { + if (!ggml_is_contiguous(cur)) { + cur = ggml_cont(ctx0, cur); + } cur = ggml_scale(ctx0, cur, hparams.f_embedding_scale); } diff --git a/src/llama-hparams.h b/src/llama-hparams.h index fde6183e87..87db4a0dd3 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -219,8 +219,18 @@ struct llama_hparams { uint32_t indexer_top_k = 0; // qwen3vl deepstack + // When parsed from GGUF, this implies the first N layers consume the first + // N deepstack embeddings. Use deepstack_mapping_arr if you need a more + // complex mapping. If using deepstack_mapping_arr, also make sure to set + // n_deepstack_layers to the number of unique deepstack layers so that + // n_embd_imp is accurate (see granite.cpp). 
uint32_t n_deepstack_layers = 0; + // deepstack layer array (Granite4 Vision) + // -1 => no deepstack + // >=0 => input embedding index for deepstack injection + std::array deepstack_mapping_arr; + // gemma4 per-layer embedding uint32_t n_embd_per_layer = 0; diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index ba08a19ac7..0d1cf3cc33 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -393,6 +393,7 @@ namespace GGUFMeta { } template bool llama_model_loader::get_arr>(enum llm_kv kid, std::vector & result, bool required); + template bool llama_model_loader::get_arr>(enum llm_kv kid, std::array & result, bool required); template bool llama_model_loader::get_key(const std::string & key, T & result, bool required) { diff --git a/src/llama-model-saver.cpp b/src/llama-model-saver.cpp index b052287809..67d4a9df0f 100644 --- a/src/llama-model-saver.cpp +++ b/src/llama-model-saver.cpp @@ -229,6 +229,7 @@ void llama_model_saver::add_kv_from_model() { add_kv(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers); add_kv(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn); add_kv(LLM_KV_NUM_DEEPSTACK_LAYERS, hparams.n_deepstack_layers); + add_kv(LLM_KV_DEEPSTACK_MAPPING, hparams.deepstack_mapping_arr); add_kv(LLM_KV_POOLING_TYPE, uint32_t(hparams.pooling_type)); add_kv(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale); add_kv(LLM_KV_DECODER_START_TOKEN_ID, hparams.dec_start_token_id); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 784deb70af..6808ad044c 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1100,6 +1100,9 @@ void llama_model_base::load_hparams(llama_model_loader & ml) { ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer(), false); ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer(), false); + // Populate deepstack_mapping_arr - initialized to -1 (no deepstack) + std::fill(hparams.deepstack_mapping_arr.begin(), 
hparams.deepstack_mapping_arr.end(), -1); + // n_head_kv is optional, default to n_head hparams.n_head_kv_arr = hparams.n_head_arr; @@ -1678,10 +1681,10 @@ uint64_t llama_model::n_elements() const { void llama_model::print_info() const { const std::string rope_scaling_type = llama_rope_scaling_type_name(hparams.rope_scaling_type_train); - auto print_f = [](const std::function & f, uint32_t n) { + auto print_f = [](const std::function & f, uint32_t n) { bool is_var = false; - std::vector v; + std::vector v; for (uint32_t i = 0; i < n; ++i) { v.push_back(f(i)); if (v[i] != v[0]) { @@ -1755,6 +1758,14 @@ void llama_model::print_info() const { LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn); LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul); LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown"); + if (arch == LLM_ARCH_GRANITE && + std::any_of(hparams.deepstack_mapping_arr.begin(), + hparams.deepstack_mapping_arr.end(), + [](const auto & entry) { return entry >= 0; })) { + LLAMA_LOG_INFO("%s: deepstack_mapping_arr = %s\n", __func__, + print_f([&](uint32_t il) { return hparams.deepstack_mapping_arr[il]; }, + hparams.n_layer).c_str()); + } // MRoPE (Multi-axis Rotary Position Embedding) sections if (const auto & s = hparams.rope_sections; s[0] || s[1] || s[2] || s[3]) { LLAMA_LOG_INFO("%s: mrope sections = [%d, %d, %d, %d]\n", __func__, s[0], s[1], s[2], s[3]); diff --git a/src/models/granite.cpp b/src/models/granite.cpp index 7aff942da0..4a75c5ff3c 100644 --- a/src/models/granite.cpp +++ b/src/models/granite.cpp @@ -1,5 +1,7 @@ #include "models.h" +#include + void llama_model_granite::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale); @@ -7,6 +9,27 @@ void llama_model_granite::load_arch_hparams(llama_model_loader & ml) { 
ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale, false); ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale, false); + // Granite4 Vision uses array deepstack_mapping + ml.get_arr(LLM_KV_DEEPSTACK_MAPPING, hparams.deepstack_mapping_arr, false); + + // Count the unique deepstack input indices + std::unordered_set unique_deepstack_idxs; + for (const auto val : hparams.deepstack_mapping_arr) { + if (val >= 0) { + unique_deepstack_idxs.insert(val); + } + } + hparams.n_deepstack_layers = unique_deepstack_idxs.size(); + + // Ensure all values are valid (avoid overflow attacks) + for (const auto val : unique_deepstack_idxs) { + if (val > hparams.n_deepstack_layers) { + std::stringstream ss; + ss << "Invalid deepstack index: " << val << " > " << hparams.n_deepstack_layers; + throw std::runtime_error(ss.str()); + } + } + // Granite uses rope_finetuned as a switch for rope, so default to true bool rope_finetuned = true; ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false); @@ -112,6 +135,20 @@ llama_model_granite::graph::graph( ggml_tensor * inp_out_ids = build_inp_out_ids(); for (int il = 0; il < n_layer; ++il) { + + // Granite Vision 4.1 deepstack: inject the projector stream that + // targets decoder layer `il` before the decoder runs. 
+ // NOTE: skip the first deepstack layer since that's inpL + const auto & deepstack_emb_idx = hparams.deepstack_mapping_arr[il]; + if (il > 0 && deepstack_emb_idx >= 0) { + ggml_tensor * ds = ggml_view_2d(ctx0, + res->t_inp_embd, n_embd, n_tokens, + res->t_inp_embd->nb[1], + deepstack_emb_idx * n_embd * sizeof(float)); + inpL = ggml_add(ctx0, inpL, ds); + cb(inpL, "deepstack_in", il); + } + ggml_tensor * inpSA = inpL; // norm diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt index 93f005652b..20c5317863 100644 --- a/tools/mtmd/CMakeLists.txt +++ b/tools/mtmd/CMakeLists.txt @@ -25,6 +25,7 @@ add_library(mtmd models/gemma4uv.cpp models/glm4v.cpp models/granite-speech.cpp + models/granite4-vision.cpp models/hunyuanvl.cpp models/internvl.cpp models/kimivl.cpp diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index c055cfb754..393e085f71 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -35,20 +35,22 @@ #define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon" // vision-specific -#define KEY_VISION_PROJ_TYPE "clip.vision.projector_type" // for models with mixed modalities -#define KEY_IMAGE_SIZE "clip.vision.image_size" -#define KEY_IMAGE_MIN_PIXELS "clip.vision.image_min_pixels" -#define KEY_IMAGE_MAX_PIXELS "clip.vision.image_max_pixels" -#define KEY_PREPROC_MIN_TILES "clip.vision.preproc_min_tiles" -#define KEY_PREPROC_MAX_TILES "clip.vision.preproc_max_tiles" -#define KEY_PREPROC_IMAGE_SIZE "clip.vision.preproc_image_size" -#define KEY_PATCH_SIZE "clip.vision.patch_size" -#define KEY_IMAGE_MEAN "clip.vision.image_mean" -#define KEY_IMAGE_STD "clip.vision.image_std" -#define KEY_FEATURE_LAYER "clip.vision.feature_layer" -#define KEY_PROJ_SCALE_FACTOR "clip.vision.projector.scale_factor" -#define KEY_SPATIAL_MERGE_SIZE "clip.vision.spatial_merge_size" -#define KEY_IS_DEEPSTACK_LAYERS "clip.vision.is_deepstack_layers" +#define KEY_VISION_PROJ_TYPE "clip.vision.projector_type" // for models with mixed modalities 
+#define KEY_IMAGE_SIZE "clip.vision.image_size" +#define KEY_IMAGE_MIN_PIXELS "clip.vision.image_min_pixels" +#define KEY_IMAGE_MAX_PIXELS "clip.vision.image_max_pixels" +#define KEY_PREPROC_MIN_TILES "clip.vision.preproc_min_tiles" +#define KEY_PREPROC_MAX_TILES "clip.vision.preproc_max_tiles" +#define KEY_PREPROC_IMAGE_SIZE "clip.vision.preproc_image_size" +#define KEY_PATCH_SIZE "clip.vision.patch_size" +#define KEY_IMAGE_MEAN "clip.vision.image_mean" +#define KEY_IMAGE_STD "clip.vision.image_std" +#define KEY_FEATURE_LAYER "clip.vision.feature_layer" +#define KEY_PROJ_SCALE_FACTOR "clip.vision.projector.scale_factor" +#define KEY_PROJ_SAMPLE_QUERY_SIDE "clip.vision.projector.query_side" +#define KEY_PROJ_SAMPLE_WINDOW_SIDE "clip.vision.projector.window_side" +#define KEY_PROJ_SPATIAL_OFFSETS "clip.vision.projector.spatial_offsets" +#define KEY_SPATIAL_MERGE_SIZE "clip.vision.spatial_merge_size" #define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type" #define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints" @@ -72,7 +74,6 @@ #define KEY_A_PROJ_DOWNSAMPLE_RATE "clip.audio.projector.downsample_rate" #define KEY_A_PROJ_HEAD_COUNT "clip.audio.projector.head_count" - // // tensor name constants // @@ -210,22 +211,28 @@ #define TN_CTC_OUT_MID "a.enc_ctc_out_mid.%s" #define TN_ATTN_REL_POS_EMB "%s.blk.%d.attn_rel_pos_emb" // qformer projector -#define TN_QF_PROJ_QUERY "a.proj_query" -#define TN_QF_PROJ_NORM "a.proj_norm.%s" -#define TN_QF_PROJ_LINEAR "a.proj_linear.%s" -#define TN_QF_SELF_ATTN_Q "a.proj_blk.%d.self_attn_q.%s" -#define TN_QF_SELF_ATTN_K "a.proj_blk.%d.self_attn_k.%s" -#define TN_QF_SELF_ATTN_V "a.proj_blk.%d.self_attn_v.%s" -#define TN_QF_SELF_ATTN_O "a.proj_blk.%d.self_attn_out.%s" -#define TN_QF_SELF_ATTN_N "a.proj_blk.%d.self_attn_norm.%s" -#define TN_QF_CROSS_ATTN_Q "a.proj_blk.%d.cross_attn_q.%s" -#define TN_QF_CROSS_ATTN_K "a.proj_blk.%d.cross_attn_k.%s" -#define TN_QF_CROSS_ATTN_V "a.proj_blk.%d.cross_attn_v.%s" -#define 
TN_QF_CROSS_ATTN_O "a.proj_blk.%d.cross_attn_out.%s" -#define TN_QF_CROSS_ATTN_N "a.proj_blk.%d.cross_attn_norm.%s" -#define TN_QF_FFN_UP "a.proj_blk.%d.ffn_up.%s" -#define TN_QF_FFN_DOWN "a.proj_blk.%d.ffn_down.%s" -#define TN_QF_FFN_NORM "a.proj_blk.%d.ffn_norm.%s" +#define TN_QF_PROJ_QUERY "%s.proj_query" +#define TN_QF_PROJ_NORM "%s.proj_norm.%s" +#define TN_QF_PROJ_LINEAR "%s.proj_linear.%s" +#define TN_QF_SELF_ATTN_Q "%s.proj_blk.%d.self_attn_q.%s" +#define TN_QF_SELF_ATTN_K "%s.proj_blk.%d.self_attn_k.%s" +#define TN_QF_SELF_ATTN_V "%s.proj_blk.%d.self_attn_v.%s" +#define TN_QF_SELF_ATTN_O "%s.proj_blk.%d.self_attn_out.%s" +#define TN_QF_SELF_ATTN_N "%s.proj_blk.%d.self_attn_norm.%s" +#define TN_QF_CROSS_ATTN_Q "%s.proj_blk.%d.cross_attn_q.%s" +#define TN_QF_CROSS_ATTN_K "%s.proj_blk.%d.cross_attn_k.%s" +#define TN_QF_CROSS_ATTN_V "%s.proj_blk.%d.cross_attn_v.%s" +#define TN_QF_CROSS_ATTN_O "%s.proj_blk.%d.cross_attn_out.%s" +#define TN_QF_CROSS_ATTN_N "%s.proj_blk.%d.cross_attn_norm.%s" +#define TN_QF_FFN_UP "%s.proj_blk.%d.ffn_up.%s" +#define TN_QF_FFN_DOWN "%s.proj_blk.%d.ffn_down.%s" +#define TN_QF_FFN_NORM "%s.proj_blk.%d.ffn_norm.%s" +// multi-projector qformer (bid => projector ID) +#define TN_MULTI_PROJ_IMG_POS "v.proj_blk.%d.img_pos" +#define TN_MULTI_PROJ_QUERY "%s.proj_blk.%d.query" +#define TN_MULTI_PROJ_LINEAR "%s.proj_blk.%d.linear.%s" +#define TN_MULTI_PROJ_NORM "%s.proj_blk.%d.norm.%s" +#define TN_MULTI_PROJ_POST_NORM "%s.proj_blk.%d.post_norm.%s" // gemma4 audio conformer #define TN_A_MM_INP_PROJ "mm.a.input_projection.%s" @@ -354,6 +361,7 @@ enum projector_type { PROJECTOR_TYPE_MINICPMV4_6, PROJECTOR_TYPE_GRANITE_SPEECH, PROJECTOR_TYPE_MIMOVL, + PROJECTOR_TYPE_GRANITE4_VISION, PROJECTOR_TYPE_UNKNOWN, }; @@ -407,6 +415,7 @@ static std::map PROJECTOR_TYPE_NAMES = { { PROJECTOR_TYPE_MINICPMV4_6, "minicpmv4_6"}, { PROJECTOR_TYPE_GRANITE_SPEECH, "granite_speech"}, { PROJECTOR_TYPE_MIMOVL, "mimovl"}, + { PROJECTOR_TYPE_GRANITE4_VISION, 
"granite4_vision"}, }; static projector_type clip_projector_type_from_string(const std::string & str) { @@ -438,6 +447,8 @@ struct clip_image_f32 { // marks the global view in e.g., DeepSeek-OCR Models bool add_viewsep = false; + // whether a learned newline token should be appended after the image (eg Granite4 Vision) + bool add_newline = false; }; // diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h index 238f805a9a..48796b6306 100644 --- a/tools/mtmd/clip-model.h +++ b/tools/mtmd/clip-model.h @@ -4,6 +4,7 @@ #include "clip.h" #include "clip-impl.h" +#include #include #include #include @@ -90,7 +91,7 @@ struct clip_hparams { float eps = 1e-6; float rope_theta = 0.0; - std::unordered_set vision_feature_layer; + std::vector vision_feature_layer; int32_t attn_window_size = 0; int32_t n_wa_pattern = 0; std::unordered_set wa_layer_indexes; // explicit layer indexes that use full attention (for irregular patterns like YoutuVL) @@ -101,6 +102,11 @@ struct clip_hparams { int32_t sam_n_head = 0; int32_t sam_n_embd = 0; + // Granite4 Vision + std::vector proj_spatial_offsets; + int32_t downsample_query_side; + int32_t downsample_window_side; + // audio int32_t n_mel_bins = 0; // whisper preprocessor int32_t proj_stack_factor = 0; // ultravox @@ -158,6 +164,10 @@ struct clip_hparams { return false; } + + bool is_vision_feature_layer(int32_t layer) const { + return std::find(vision_feature_layer.begin(), vision_feature_layer.end(), layer) != vision_feature_layer.end(); + } }; struct clip_layer { @@ -325,6 +335,20 @@ struct yasa2_stage { std::vector blocks; }; +// QFormer projector block for models with 1 (or more) QFormer projectors +// Granite Speech, Granite4 Vision +struct qf_block { + ggml_tensor * qf_proj_query = nullptr; + ggml_tensor * qf_proj_norm_w = nullptr; + ggml_tensor * qf_proj_norm_b = nullptr; + ggml_tensor * qf_proj_linear_w = nullptr; + ggml_tensor * qf_proj_linear_b = nullptr; + ggml_tensor * qf_proj_post_norm_w = nullptr; + ggml_tensor * 
qf_proj_post_norm_b = nullptr; + ggml_tensor * qf_proj_img_pos = nullptr; // Vision only + std::vector qf_proj_layers; +}; + struct clip_model { clip_modality modality = CLIP_MODALITY_VISION; projector_type proj_type = PROJECTOR_TYPE_MLP; @@ -589,13 +613,8 @@ struct clip_model { ggml_tensor * ctc_out_b = nullptr; ggml_tensor * ctc_out_mid_w = nullptr; ggml_tensor * ctc_out_mid_b = nullptr; - // qformer projector - ggml_tensor * qf_proj_query = nullptr; - ggml_tensor * qf_proj_norm_w = nullptr; - ggml_tensor * qf_proj_norm_b = nullptr; - ggml_tensor * qf_proj_linear_w = nullptr; - ggml_tensor * qf_proj_linear_b = nullptr; - std::vector qf_proj_layers; + // qformer projector(s) + std::vector qf_proj_blocks; bool audio_has_avgpool() const { return proj_type == PROJECTOR_TYPE_QWEN2A diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 80136ed866..c12c910a1c 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -997,6 +997,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 { builder = std::make_unique(ctx, img); } break; + case PROJECTOR_TYPE_GRANITE4_VISION: + { + builder = std::make_unique(ctx, img); + } break; default: GGML_ABORT("missing cgraph builder"); } @@ -1234,12 +1238,7 @@ struct clip_model_loader { // to form the final visual features. // NOTE: gguf conversions should standardize the values of the vision feature layer to // be non-negative, since we use -1 to mark values as unset here. 
- std::vector vision_feature_layer; - get_arr_int(KEY_FEATURE_LAYER, vision_feature_layer, false); - // convert std::vector to std::unordered_set - for (auto & layer : vision_feature_layer) { - hparams.vision_feature_layer.insert(layer); - } + get_arr_int(KEY_FEATURE_LAYER, hparams.vision_feature_layer, false); // model-specific params switch (model.proj_type) { @@ -1627,6 +1626,23 @@ struct clip_model_loader { hparams.image_pad_color = {127, 127, 127}; hparams.image_resize_algo = RESIZE_ALGO_BILINEAR; } break; + case PROJECTOR_TYPE_GRANITE4_VISION: + { + // SigLIP tower. + hparams.image_resize_algo = RESIZE_ALGO_BICUBIC_PILLOW; + hparams.image_resize_pad = PAD_CEIL; + + get_arr_int(KEY_FEATURE_LAYER, hparams.vision_feature_layer); + get_arr_int(KEY_PROJ_SPATIAL_OFFSETS, hparams.proj_spatial_offsets); + if (hparams.vision_feature_layer.size() != hparams.proj_spatial_offsets.size()) { + throw std::runtime_error(string_format("%s: vision_feature_layer.size() %zu != proj_spatial_offsets.size() %zu", + __func__, hparams.vision_feature_layer.size(), hparams.proj_spatial_offsets.size())); + } + + get_u32(KEY_PROJ_SAMPLE_QUERY_SIDE, hparams.downsample_query_side); + get_u32(KEY_PROJ_SAMPLE_WINDOW_SIDE, hparams.downsample_window_side); + hparams.warmup_image_size = hparams.image_size; + } break; default: throw std::runtime_error(string_format("%s: unknown vision projector type %s\n", __func__, proj_type.c_str())); } @@ -2628,47 +2644,106 @@ struct clip_model_loader { layer.conv_pw2_b = get_tensor(string_format(TN_CONV_PW2, prefix, il, "bias")); } - model.qf_proj_query = get_tensor(TN_QF_PROJ_QUERY); - model.qf_proj_norm_w = get_tensor(string_format(TN_QF_PROJ_NORM, "weight")); - model.qf_proj_norm_b = get_tensor(string_format(TN_QF_PROJ_NORM, "bias")); - model.qf_proj_linear_w = get_tensor(string_format(TN_QF_PROJ_LINEAR, "weight")); - model.qf_proj_linear_b = get_tensor(string_format(TN_QF_PROJ_LINEAR, "bias")); + model.qf_proj_blocks.resize(1); + auto & qf = model.qf_proj_blocks[0];
+ qf.qf_proj_query = get_tensor(string_format(TN_QF_PROJ_QUERY, prefix)); + qf.qf_proj_norm_w = get_tensor(string_format(TN_QF_PROJ_NORM, prefix, "weight")); + qf.qf_proj_norm_b = get_tensor(string_format(TN_QF_PROJ_NORM, prefix, "bias")); + qf.qf_proj_linear_w = get_tensor(string_format(TN_QF_PROJ_LINEAR, prefix, "weight")); + qf.qf_proj_linear_b = get_tensor(string_format(TN_QF_PROJ_LINEAR, prefix, "bias")); const int n_proj_layers = 2; - model.qf_proj_layers.resize(n_proj_layers); + qf.qf_proj_layers.resize(n_proj_layers); for (int il = 0; il < n_proj_layers; ++il) { - auto & pl = model.qf_proj_layers[il]; + auto & pl = qf.qf_proj_layers[il]; - pl.q_w = get_tensor(string_format(TN_QF_SELF_ATTN_Q, il, "weight")); - pl.q_b = get_tensor(string_format(TN_QF_SELF_ATTN_Q, il, "bias")); - pl.k_w = get_tensor(string_format(TN_QF_SELF_ATTN_K, il, "weight")); - pl.k_b = get_tensor(string_format(TN_QF_SELF_ATTN_K, il, "bias")); - pl.v_w = get_tensor(string_format(TN_QF_SELF_ATTN_V, il, "weight")); - pl.v_b = get_tensor(string_format(TN_QF_SELF_ATTN_V, il, "bias")); - pl.o_w = get_tensor(string_format(TN_QF_SELF_ATTN_O, il, "weight")); - pl.o_b = get_tensor(string_format(TN_QF_SELF_ATTN_O, il, "bias")); - pl.ln_1_w = get_tensor(string_format(TN_QF_SELF_ATTN_N, il, "weight")); - pl.ln_1_b = get_tensor(string_format(TN_QF_SELF_ATTN_N, il, "bias")); + pl.q_w = get_tensor(string_format(TN_QF_SELF_ATTN_Q, prefix, il, "weight")); + pl.q_b = get_tensor(string_format(TN_QF_SELF_ATTN_Q, prefix, il, "bias")); + pl.k_w = get_tensor(string_format(TN_QF_SELF_ATTN_K, prefix, il, "weight")); + pl.k_b = get_tensor(string_format(TN_QF_SELF_ATTN_K, prefix, il, "bias")); + pl.v_w = get_tensor(string_format(TN_QF_SELF_ATTN_V, prefix, il, "weight")); + pl.v_b = get_tensor(string_format(TN_QF_SELF_ATTN_V, prefix, il, "bias")); + pl.o_w = get_tensor(string_format(TN_QF_SELF_ATTN_O, prefix, il, "weight")); + pl.o_b = get_tensor(string_format(TN_QF_SELF_ATTN_O, prefix, il, "bias")); + pl.ln_1_w = 
get_tensor(string_format(TN_QF_SELF_ATTN_N, prefix, il, "weight")); + pl.ln_1_b = get_tensor(string_format(TN_QF_SELF_ATTN_N, prefix, il, "bias")); - pl.cross_attn_q_w = get_tensor(string_format(TN_QF_CROSS_ATTN_Q, il, "weight")); - pl.cross_attn_q_b = get_tensor(string_format(TN_QF_CROSS_ATTN_Q, il, "bias")); - pl.cross_attn_k_w = get_tensor(string_format(TN_QF_CROSS_ATTN_K, il, "weight")); - pl.cross_attn_k_b = get_tensor(string_format(TN_QF_CROSS_ATTN_K, il, "bias")); - pl.cross_attn_v_w = get_tensor(string_format(TN_QF_CROSS_ATTN_V, il, "weight")); - pl.cross_attn_v_b = get_tensor(string_format(TN_QF_CROSS_ATTN_V, il, "bias")); - pl.cross_attn_o_w = get_tensor(string_format(TN_QF_CROSS_ATTN_O, il, "weight")); - pl.cross_attn_o_b = get_tensor(string_format(TN_QF_CROSS_ATTN_O, il, "bias")); - pl.cross_attn_norm_w = get_tensor(string_format(TN_QF_CROSS_ATTN_N, il, "weight")); - pl.cross_attn_norm_b = get_tensor(string_format(TN_QF_CROSS_ATTN_N, il, "bias")); + pl.cross_attn_q_w = get_tensor(string_format(TN_QF_CROSS_ATTN_Q, prefix, il, "weight")); + pl.cross_attn_q_b = get_tensor(string_format(TN_QF_CROSS_ATTN_Q, prefix, il, "bias")); + pl.cross_attn_k_w = get_tensor(string_format(TN_QF_CROSS_ATTN_K, prefix, il, "weight")); + pl.cross_attn_k_b = get_tensor(string_format(TN_QF_CROSS_ATTN_K, prefix, il, "bias")); + pl.cross_attn_v_w = get_tensor(string_format(TN_QF_CROSS_ATTN_V, prefix, il, "weight")); + pl.cross_attn_v_b = get_tensor(string_format(TN_QF_CROSS_ATTN_V, prefix, il, "bias")); + pl.cross_attn_o_w = get_tensor(string_format(TN_QF_CROSS_ATTN_O, prefix, il, "weight")); + pl.cross_attn_o_b = get_tensor(string_format(TN_QF_CROSS_ATTN_O, prefix, il, "bias")); + pl.cross_attn_norm_w = get_tensor(string_format(TN_QF_CROSS_ATTN_N, prefix, il, "weight")); + pl.cross_attn_norm_b = get_tensor(string_format(TN_QF_CROSS_ATTN_N, prefix, il, "bias")); - pl.ff_up_w = get_tensor(string_format(TN_QF_FFN_UP, il, "weight")); - pl.ff_up_b = 
get_tensor(string_format(TN_QF_FFN_UP, il, "bias")); - pl.ff_down_w = get_tensor(string_format(TN_QF_FFN_DOWN, il, "weight")); - pl.ff_down_b = get_tensor(string_format(TN_QF_FFN_DOWN, il, "bias")); - pl.ln_2_w = get_tensor(string_format(TN_QF_FFN_NORM, il, "weight")); - pl.ln_2_b = get_tensor(string_format(TN_QF_FFN_NORM, il, "bias")); + pl.ff_up_w = get_tensor(string_format(TN_QF_FFN_UP, prefix, il, "weight")); + pl.ff_up_b = get_tensor(string_format(TN_QF_FFN_UP, prefix, il, "bias")); + pl.ff_down_w = get_tensor(string_format(TN_QF_FFN_DOWN, prefix, il, "weight")); + pl.ff_down_b = get_tensor(string_format(TN_QF_FFN_DOWN, prefix, il, "bias")); + pl.ln_2_w = get_tensor(string_format(TN_QF_FFN_NORM, prefix, il, "weight")); + pl.ln_2_b = get_tensor(string_format(TN_QF_FFN_NORM, prefix, il, "bias")); } } break; + case PROJECTOR_TYPE_GRANITE4_VISION: + { + // image_newline lives at the top-level. + model.image_newline = get_tensor(TN_IMAGE_NEWLINE); + + // Load separate layerwise and spatial projector tensors + const auto projector_count = hparams.vision_feature_layer.size(); + model.qf_proj_blocks.resize(projector_count); + for (size_t bid = 0; bid < projector_count; ++bid) { + auto & b = model.qf_proj_blocks[bid]; + + // non-layerwise tensors + b.qf_proj_img_pos = get_tensor(string_format(TN_MULTI_PROJ_IMG_POS, bid)); + b.qf_proj_query = get_tensor(string_format(TN_MULTI_PROJ_QUERY, prefix, bid)); + b.qf_proj_linear_w = get_tensor(string_format(TN_MULTI_PROJ_LINEAR, prefix, bid, "weight")); + b.qf_proj_linear_b = get_tensor(string_format(TN_MULTI_PROJ_LINEAR, prefix, bid, "bias")); + b.qf_proj_norm_w = get_tensor(string_format(TN_MULTI_PROJ_NORM, prefix, bid, "weight")); + b.qf_proj_norm_b = get_tensor(string_format(TN_MULTI_PROJ_NORM, prefix, bid, "bias")); + b.qf_proj_post_norm_w = get_tensor(string_format(TN_MULTI_PROJ_POST_NORM, prefix, bid, "weight")); + b.qf_proj_post_norm_b = get_tensor(string_format(TN_MULTI_PROJ_POST_NORM, prefix, bid, "bias")); + + // 
layerwise tensors + // NOTE: If any model uses multi-layer qformers, this will need to change + b.qf_proj_layers.resize(1); + auto & pl = b.qf_proj_layers[0]; + + pl.q_w = get_tensor(string_format(TN_QF_SELF_ATTN_Q, prefix, bid, "weight")); + pl.q_b = get_tensor(string_format(TN_QF_SELF_ATTN_Q, prefix, bid, "bias")); + pl.k_w = get_tensor(string_format(TN_QF_SELF_ATTN_K, prefix, bid, "weight")); + pl.k_b = get_tensor(string_format(TN_QF_SELF_ATTN_K, prefix, bid, "bias")); + pl.v_w = get_tensor(string_format(TN_QF_SELF_ATTN_V, prefix, bid, "weight")); + pl.v_b = get_tensor(string_format(TN_QF_SELF_ATTN_V, prefix, bid, "bias")); + pl.o_w = get_tensor(string_format(TN_QF_SELF_ATTN_O, prefix, bid, "weight")); + pl.o_b = get_tensor(string_format(TN_QF_SELF_ATTN_O, prefix, bid, "bias")); + pl.ln_1_w = get_tensor(string_format(TN_QF_SELF_ATTN_N, prefix, bid, "weight")); + pl.ln_1_b = get_tensor(string_format(TN_QF_SELF_ATTN_N, prefix, bid, "bias")); + + pl.cross_attn_q_w = get_tensor(string_format(TN_QF_CROSS_ATTN_Q, prefix, bid, "weight")); + pl.cross_attn_q_b = get_tensor(string_format(TN_QF_CROSS_ATTN_Q, prefix, bid, "bias")); + pl.cross_attn_k_w = get_tensor(string_format(TN_QF_CROSS_ATTN_K, prefix, bid, "weight")); + pl.cross_attn_k_b = get_tensor(string_format(TN_QF_CROSS_ATTN_K, prefix, bid, "bias")); + pl.cross_attn_v_w = get_tensor(string_format(TN_QF_CROSS_ATTN_V, prefix, bid, "weight")); + pl.cross_attn_v_b = get_tensor(string_format(TN_QF_CROSS_ATTN_V, prefix, bid, "bias")); + pl.cross_attn_o_w = get_tensor(string_format(TN_QF_CROSS_ATTN_O, prefix, bid, "weight")); + pl.cross_attn_o_b = get_tensor(string_format(TN_QF_CROSS_ATTN_O, prefix, bid, "bias")); + pl.cross_attn_norm_w = get_tensor(string_format(TN_QF_CROSS_ATTN_N, prefix, bid, "weight")); + pl.cross_attn_norm_b = get_tensor(string_format(TN_QF_CROSS_ATTN_N, prefix, bid, "bias")); + + pl.ff_up_w = get_tensor(string_format(TN_QF_FFN_UP, prefix, bid, "weight")); + pl.ff_up_b =
get_tensor(string_format(TN_QF_FFN_UP, prefix, bid, "bias")); + pl.ff_down_w = get_tensor(string_format(TN_QF_FFN_DOWN, prefix, bid, "weight")); + pl.ff_down_b = get_tensor(string_format(TN_QF_FFN_DOWN, prefix, bid, "bias")); + pl.ln_2_w = get_tensor(string_format(TN_QF_FFN_NORM, prefix, bid, "weight")); + pl.ln_2_b = get_tensor(string_format(TN_QF_FFN_NORM, prefix, bid, "bias")); + } + + } break; default: GGML_ASSERT(false && "unknown projector type"); } @@ -3085,10 +3160,6 @@ void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny memcpy(img->buf.data(), rgb_pixels, img->buf.size()); } -ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) { - return ctx->model.image_newline; -} - void clip_free(clip_ctx * ctx) { if (ctx == nullptr) { return; @@ -3397,6 +3468,23 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im const int ds = ctx->model.hparams.audio_proj_downsample_rate; n_patches = ((img->nx + ws - 1) / ws) * (ws / ds); } break; + case PROJECTOR_TYPE_GRANITE4_VISION: + { + // Per-tile output token count: each projector block outputs + // query_side^2 tokens per window × n^2 windows. + // For 384×384 input: n = 24/8 = 3, query_side = 4 → 144. + const int window_side = ctx->model.hparams.downsample_window_side; + const int query_side = ctx->model.hparams.downsample_query_side; + const int side = img->nx / params.patch_size; + const int n = side / window_side; + n_patches = (query_side * n) * (query_side * n); + if (img->add_newline) { + // For single-tile case: append 1 newline row. + // For multi-tile rowwise: handled by caller, but here we + // report the per-tile count including one trailing newline. 
+ n_patches += 1; + } + } break; default: GGML_ABORT("unsupported projector type"); } @@ -4229,6 +4317,82 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima set_input_f32("attn_mask", mask); } } break; + case PROJECTOR_TYPE_GRANITE4_VISION: + { + // Granite Vision 4.1 uses precomputed permutation index + // tensors to express the _win / _unwin / spatial sampling + // reshapes as ggml_get_rows gathers. The names are set + // by g4v_gather() in models/granite4-vision.cpp. + const int patch_size = model.hparams.patch_size; + const int image_side = imgs.entries.front()->nx / patch_size; + const int window_side = hparams.downsample_window_side; + const int query_side = hparams.downsample_query_side; + const int n = image_side / window_side; + const int new_side = n * query_side; + + // Builds the raster→window permutation indices for a + // (side, side) grid split into (n × n) windows of (win × win) + // tokens each. dst[w * win*win + p] = source raster index. 
+ auto make_win_idx = [](int side, int win) { + const int nn = side / win; + std::vector idx(static_cast(side) * side); + for (int wy = 0; wy < nn; ++wy) { + for (int wx = 0; wx < nn; ++wx) { + for (int iy = 0; iy < win; ++iy) { + for (int ix = 0; ix < win; ++ix) { + const int w = wy * nn + wx; + const int p = iy * win + ix; + const int y = wy * win + iy; + const int x = wx * win + ix; + idx[static_cast(w) * (win*win) + p] = y * side + x; + } + } + } + } + return idx; + }; + + auto make_unwin_idx = [&](int side, int win) { + const std::vector fwd = make_win_idx(side, win); + std::vector inv(fwd.size()); + for (size_t i = 0; i < fwd.size(); ++i) { + inv[fwd[i]] = static_cast(i); + } + return inv; + }; + + auto make_spatial_idx = [](int side, int offset) { + const int off_y = (offset >> 1) & 1; + const int off_x = offset & 1; + const int new_s = side / 2; + std::vector idx(static_cast(new_s) * new_s); + for (int y = 0; y < new_s; ++y) { + for (int x = 0; x < new_s; ++x) { + idx[y * new_s + x] = (y * 2 + off_y) * side + (x * 2 + off_x); + } + } + return idx; + }; + + auto upload = [&](const std::string & name, const std::vector & idx) { + ggml_tensor * t = ggml_graph_get_tensor(gf, name.c_str()); + GGML_ASSERT(t); + ggml_backend_tensor_set(t, idx.data(), 0, idx.size() * sizeof(int32_t)); + }; + + // Upload the permutation index tensors for every projector block; + // spatial_idx is only uploaded for blocks whose spatial offset is >= 0.
+ for (size_t bid = 0; bid < hparams.vision_feature_layer.size(); ++bid) { + const std::string prefix = "g4v_blk" + std::to_string(bid) + "_"; + upload(prefix + "win_idx", make_win_idx(image_side, window_side)); + upload(prefix + "qwin_idx", make_win_idx(new_side, query_side)); + upload(prefix + "unwin_idx", make_unwin_idx(new_side, query_side)); + const auto spatial_offset = hparams.proj_spatial_offsets[bid]; + if (spatial_offset >= 0) { + upload(prefix + "spatial_idx", make_spatial_idx(image_side,spatial_offset)); + } + } + } break; default: GGML_ABORT("Unknown projector type"); } @@ -4384,7 +4548,9 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { case PROJECTOR_TYPE_LFM2A: return ctx->model.position_embeddings->ne[0]; case PROJECTOR_TYPE_GRANITE_SPEECH: - return ctx->model.qf_proj_linear_w->ne[1]; + return ctx->model.qf_proj_blocks[0].qf_proj_linear_w->ne[1]; + case PROJECTOR_TYPE_GRANITE4_VISION: + return ctx->model.qf_proj_blocks.size() * ctx->model.hparams.projection_dim; case PROJECTOR_TYPE_GLM4V: return ctx->model.mm_ffn_down_w->ne[1]; default: diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h index 9b807ffa77..a62c9d6187 100644 --- a/tools/mtmd/clip.h +++ b/tools/mtmd/clip.h @@ -100,8 +100,6 @@ struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch */ void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img); -struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx); - bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec); bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec); diff --git a/tools/mtmd/models/granite-speech.cpp b/tools/mtmd/models/granite-speech.cpp index c7e3794a49..5e66f75d0a 100644 --- a/tools/mtmd/models/granite-speech.cpp +++ b/tools/mtmd/models/granite-speech.cpp @@ -199,8 +199,8 @@ ggml_cgraph * 
clip_graph_granite_speech::build() { ggml_tensor * enc_windows = ggml_reshape_3d(ctx0, cur, n_embd, window_size, nblocks_proj); - ggml_tensor * queries = build_norm(model.qf_proj_query, - model.qf_proj_norm_w, model.qf_proj_norm_b, + ggml_tensor * queries = build_norm(model.qf_proj_blocks[0].qf_proj_query, + model.qf_proj_blocks[0].qf_proj_norm_w, model.qf_proj_blocks[0].qf_proj_norm_b, NORM_TYPE_NORMAL, proj_eps, -1); { ggml_tensor * q_3d = ggml_reshape_3d(ctx0, queries, n_embd, num_queries, 1); @@ -209,8 +209,8 @@ ggml_cgraph * clip_graph_granite_speech::build() { queries = ggml_repeat(ctx0, q_3d, q_shape); } - for (int il = 0; il < (int)model.qf_proj_layers.size(); il++) { - const auto & pl = model.qf_proj_layers[il]; + for (int il = 0; il < (int)model.qf_proj_blocks[0].qf_proj_layers.size(); il++) { + const auto & pl = model.qf_proj_blocks[0].qf_proj_layers[il]; // self-attention { @@ -265,7 +265,7 @@ ggml_cgraph * clip_graph_granite_speech::build() { } cur = ggml_reshape_2d(ctx0, queries, n_embd, num_queries * nblocks_proj); - cur = ggml_add(ctx0, build_mm(model.qf_proj_linear_w, cur), model.qf_proj_linear_b); + cur = ggml_add(ctx0, build_mm(model.qf_proj_blocks[0].qf_proj_linear_w, cur), model.qf_proj_blocks[0].qf_proj_linear_b); cb(cur, "projector_out", -1); } diff --git a/tools/mtmd/models/granite4-vision.cpp b/tools/mtmd/models/granite4-vision.cpp new file mode 100644 index 0000000000..9adb6f0fdb --- /dev/null +++ b/tools/mtmd/models/granite4-vision.cpp @@ -0,0 +1,339 @@ +#include "models.h" +#include "../clip-impl.h" +#include "../clip-model.h" + +#include +#include +#include +#include +#include + +/* + * Granite Vision 4.1 clip graph + * + * Stage 1a: SigLIP vision tower (N layers, post-norm) + * Stage 1b: WindowQFormer blocks (deepstack + spatial) + * Stage 1c: Concatenate and pack outputs + * Stage 1d: Append newline tokens if add_newline is set + */ + +// --------------------------------------------------------------------------- +// Member method 
implementations +// --------------------------------------------------------------------------- + +ggml_tensor * clip_graph_granite4_vision::gather( + ggml_tensor * src, + const std::string & name, + int idx_len) { + ggml_tensor * idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, idx_len); + ggml_set_name(idx, name.c_str()); + ggml_set_input(idx); + return ggml_get_rows(ctx0, src, idx); +} + +ggml_tensor * clip_graph_granite4_vision::interp_down( + ggml_tensor * src, + int side, + int new_side) { + const int n_embd = src->ne[0]; + ggml_tensor * t = ggml_reshape_4d(ctx0, src, n_embd, side, side, 1); + t = ggml_cont(ctx0, ggml_permute(ctx0, t, 2, 0, 1, 3)); + const int kernel = side / new_side; + t = ggml_pool_2d(ctx0, t, GGML_OP_POOL_AVG, kernel, kernel, kernel, kernel, 0, 0); + t = ggml_cont(ctx0, ggml_permute(ctx0, t, 1, 2, 0, 3)); + return ggml_reshape_2d(ctx0, t, n_embd, new_side * new_side); +} + +// --------------------------------------------------------------------------- +// build_block - WindowQFormer block implementation +// --------------------------------------------------------------------------- + +ggml_tensor * clip_graph_granite4_vision::build_block( + const qf_block & blk, + ggml_tensor * h, + int bid, + int spatial_offset, + int image_side, + int window_side, + int query_side, + float qformer_eps) { + + const int n_embd = h->ne[0]; + GGML_ASSERT(h->ne[1] == image_side * image_side); + const int n = image_side / window_side; + const int new_side = n * query_side; + const int n_windows = n * n; + const int enc_len = window_side * window_side; + const int query_len = query_side * query_side; + + auto cbx = [&](ggml_tensor * & t, const char * step) { + const std::string name = "g4v_blk" + std::to_string(bid) + "_" + step; + ggml_set_name(t, name.c_str()); + }; + + // 1. Top-level LN + cbx(h, "inp"); + ggml_tensor * x = build_norm(h, blk.qf_proj_norm_w, blk.qf_proj_norm_b, NORM_TYPE_NORMAL, eps, bid); + cbx(x, "norm"); + + // 2. 
enc = _win(x, image_side, window_side) + ggml_tensor * enc; + { + ggml_tensor * enc_flat = gather(x, + "g4v_blk" + std::to_string(bid) + "_win_idx", + image_side * image_side); + enc = ggml_reshape_3d(ctx0, enc_flat, n_embd, enc_len, n_windows); + } + cbx(enc, "enc"); + + // 3. downsampled = downsampler(x) + ggml_tensor * d; + (void) spatial_offset; + if (spatial_offset >= 0) { + d = gather(x, + "g4v_blk" + std::to_string(bid) + "_spatial_idx", + new_side * new_side); + } else { + d = interp_down(x, image_side, new_side); + } + cbx(d, "downsampled"); + + // 4. query_embeds = query + _win(d, new_side, query_side) + ggml_tensor * q_in; + { + ggml_tensor * dw_flat = gather(d, + "g4v_blk" + std::to_string(bid) + "_qwin_idx", + new_side * new_side); + ggml_tensor * dw = ggml_reshape_3d(ctx0, dw_flat, n_embd, query_len, n_windows); + q_in = ggml_add(ctx0, dw, blk.qf_proj_query); + } + cbx(q_in, "query_embeds"); + + // 5. encoder_embeds = enc + image_positions → (C, enc_len, n_windows) + ggml_tensor * e_in = ggml_add(ctx0, enc, blk.qf_proj_img_pos); + cbx(e_in, "encoder_embeds"); + + // 6. Qformer forward. + ggml_tensor * q = build_norm(q_in, blk.qf_proj_post_norm_w, blk.qf_proj_post_norm_b, NORM_TYPE_NORMAL, qformer_eps, bid); + + // Helper for linear projections with window batching + auto linear = [&](ggml_tensor * x, ggml_tensor * w, ggml_tensor * b) -> ggml_tensor * { + ggml_tensor * t = ggml_reshape_2d(ctx0, x, x->ne[0], x->ne[1] * x->ne[2]); + t = build_mm(w, t); + if (b) t = ggml_add(ctx0, t, b); + return t; + }; + + // Get the single QFormer layer + GGML_ASSERT(blk.qf_proj_layers.size() == 1); + const auto & pl = blk.qf_proj_layers[0]; + + // 6a. 
Self-attention + ggml_tensor * sa_out; + { + const int d_h = 64; + const int n_head = n_embd / d_h; + const int nq = q->ne[1]; + const float scale = 1.0f / std::sqrt((float) d_h); + + ggml_tensor * Q = linear(q, pl.q_w, pl.q_b); + ggml_tensor * K = linear(q, pl.k_w, pl.k_b); + ggml_tensor * V = linear(q, pl.v_w, pl.v_b); + + Q = ggml_reshape_4d(ctx0, Q, d_h, n_head, nq, n_windows); + K = ggml_reshape_4d(ctx0, K, d_h, n_head, nq, n_windows); + V = ggml_reshape_4d(ctx0, V, d_h, n_head, nq, n_windows); + + sa_out = build_attn(pl.o_w, pl.o_b, Q, K, V, nullptr, scale, bid); + sa_out = ggml_reshape_3d(ctx0, sa_out, n_embd, nq, n_windows); + + sa_out = ggml_add(ctx0, sa_out, q); + sa_out = build_norm(sa_out, pl.ln_1_w, pl.ln_1_b, + NORM_TYPE_NORMAL, qformer_eps, bid); + } + cbx(sa_out, "sa_out"); + + // 6b. Cross-attention + ggml_tensor * ca_out; + { + const int d_h = 64; + const int n_head = n_embd / d_h; + const int nq = sa_out->ne[1]; + const int nkv = e_in->ne[1]; + const float scale = 1.0f / std::sqrt((float) d_h); + + ggml_tensor * Q = linear(sa_out, pl.cross_attn_q_w, pl.cross_attn_q_b); + ggml_tensor * K = linear(e_in, pl.cross_attn_k_w, pl.cross_attn_k_b); + ggml_tensor * V = linear(e_in, pl.cross_attn_v_w, pl.cross_attn_v_b); + + Q = ggml_reshape_4d(ctx0, Q, d_h, n_head, nq, n_windows); + K = ggml_reshape_4d(ctx0, K, d_h, n_head, nkv, n_windows); + V = ggml_reshape_4d(ctx0, V, d_h, n_head, nkv, n_windows); + + ca_out = build_attn(pl.cross_attn_o_w, pl.cross_attn_o_b, + Q, K, V, nullptr, scale, bid); + ca_out = ggml_reshape_3d(ctx0, ca_out, n_embd, nq, n_windows); + + ca_out = ggml_add(ctx0, ca_out, sa_out); + ca_out = build_norm(ca_out, pl.cross_attn_norm_w, pl.cross_attn_norm_b, + NORM_TYPE_NORMAL, qformer_eps, bid); + } + cbx(ca_out, "ca_out"); + + // 6c. 
FFN + ggml_tensor * ffn; + { + ggml_tensor * t = ggml_reshape_2d(ctx0, ca_out, n_embd, query_len * n_windows); + t = build_mm(pl.ff_up_w, t); + if (pl.ff_up_b) t = ggml_add(ctx0, t, pl.ff_up_b); + t = ggml_gelu_erf(ctx0, t); + t = build_mm(pl.ff_down_w, t); + if (pl.ff_down_b) t = ggml_add(ctx0, t, pl.ff_down_b); + t = ggml_reshape_3d(ctx0, t, n_embd, query_len, n_windows); + ffn = ggml_add(ctx0, t, ca_out); + ffn = build_norm(ffn, pl.ln_2_w, pl.ln_2_b, NORM_TYPE_NORMAL, qformer_eps, bid); + } + cbx(ffn, "qformer_out"); + + // 7. _unwin back to raster + ggml_tensor * unwinned; + { + ggml_tensor * flat = ggml_reshape_2d(ctx0, ffn, n_embd, query_len * n_windows); + unwinned = gather(flat, + "g4v_blk" + std::to_string(bid) + "_unwin_idx", + new_side * new_side); + } + cbx(unwinned, "unwin"); + + // 8. out_linear + ggml_tensor * out = build_mm(blk.qf_proj_linear_w, unwinned); + if (blk.qf_proj_linear_b) out = ggml_add(ctx0, out, blk.qf_proj_linear_b); + cbx(out, "out"); + + return out; +} + +// --------------------------------------------------------------------------- +// build() - top-level graph +// --------------------------------------------------------------------------- + +// Build the K-tiled, base-scaled newline row tensor. +// Shape: (n_mmproj_embd, 1) +ggml_tensor * clip_graph_granite4_vision::build_newline_row(ggml_context * ctx0) { + const int K = (int) model.qf_proj_blocks.size(); + GGML_ASSERT(K > 0); + GGML_ASSERT(n_mmproj_embd % K == 0); + const int projection_dim = n_mmproj_embd / K; + GGML_ASSERT(model.image_newline != nullptr); + GGML_ASSERT(ggml_nelements(model.image_newline) == projection_dim); + + // Build newline_row[k*projection_dim + d] = nl[d] * (k == 0 ? 
base : 1.0) + ggml_tensor * nl = model.image_newline; // (projection_dim,) + ggml_tensor * nl_first_2d = ggml_reshape_2d(ctx0, nl, projection_dim, 1); + ggml_tensor * nl_row_2d; + if (K == 1) { + nl_row_2d = nl_first_2d; + } else { + ggml_tensor * nl_2d = ggml_reshape_2d(ctx0, nl, projection_dim, 1); + ggml_tensor * rest_template = ggml_new_tensor_2d( + ctx0, GGML_TYPE_F32, projection_dim, K - 1); + ggml_tensor * nl_rest = ggml_repeat(ctx0, nl_2d, rest_template); + nl_row_2d = ggml_concat(ctx0, nl_first_2d, nl_rest, 1); // (projection_dim, K) + } + nl_row_2d = ggml_cont(ctx0, nl_row_2d); + return ggml_reshape_2d(ctx0, nl_row_2d, n_mmproj_embd, 1); +} + +// Append a single newline row at the end of the tile output. +ggml_tensor * clip_graph_granite4_vision::append_rowwise_newlines(ggml_context * ctx0, ggml_tensor * tile_output) { + // For the single-tile case, append one newline row at the end. + // For the multi-tile rowwise case, this will be called per-tile + // (though currently only the single-tile path uses it). 
+ ggml_tensor * nl_row = build_newline_row(ctx0); + return ggml_concat(ctx0, tile_output, nl_row, 1); +} + +ggml_cgraph * clip_graph_granite4_vision::build() { + GGML_ASSERT(model.patch_embeddings_0 != nullptr); + GGML_ASSERT(model.position_embeddings != nullptr); + GGML_ASSERT(model.class_embedding == nullptr); + GGML_ASSERT(!model.qf_proj_blocks.empty()); + + // --- Stage 1a: SigLIP encoder producing intermediate hidden states --- + ggml_tensor * inp = build_inp(); + inp = ggml_add(ctx0, inp, model.position_embeddings); + cb(inp, "pos_embed", -1); + + ggml_tensor * inpL = inp; + std::vector layer_outs(n_layer, nullptr); + + for (int il = 0; il < n_layer; ++il) { + const auto & layer = model.layers[il]; + ggml_tensor * cur = inpL; + + cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il); + + // Self-attention + ggml_tensor * Qcur = build_mm(layer.q_w, cur); + if (layer.q_b) Qcur = ggml_add(ctx0, Qcur, layer.q_b); + ggml_tensor * Kcur = build_mm(layer.k_w, cur); + if (layer.k_b) Kcur = ggml_add(ctx0, Kcur, layer.k_b); + ggml_tensor * Vcur = build_mm(layer.v_w, cur); + if (layer.v_b) Vcur = ggml_add(ctx0, Vcur, layer.v_b); + + Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_patches); + Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_patches); + Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_patches); + + cur = build_attn(layer.o_w, layer.o_b, + Qcur, Kcur, Vcur, nullptr, kq_scale, il); + + cur = ggml_add(ctx0, cur, inpL); + inpL = cur; + + cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il); + cur = build_ffn(cur, + layer.ff_up_w, layer.ff_up_b, + layer.ff_gate_w, layer.ff_gate_b, + layer.ff_down_w, layer.ff_down_b, + hparams.ffn_op, il); + cur = ggml_add(ctx0, inpL, cur); + cb(cur, "layer_out", il); + layer_outs[il] = cur; + inpL = cur; + } + + // --- Stage 1b/1c: WindowQFormer blocks --- + const int projector_count = hparams.vision_feature_layer.size(); + const float qformer_eps = 1e-12f; + + 
ggml_tensor * mmproj = nullptr; + for (int bid = 0; bid < projector_count; ++bid) { + const auto & blk = model.qf_proj_blocks[bid]; + + int vlayer = hparams.vision_feature_layer[bid]; + GGML_ASSERT(vlayer >= 0 && vlayer < n_layer); + ggml_tensor * h = layer_outs[vlayer]; + + ggml_tensor * stream = build_block( + blk, h, bid, + hparams.proj_spatial_offsets[bid], + n_patches_x, + hparams.downsample_window_side, + hparams.downsample_query_side, + qformer_eps); + cb(stream, (std::string("proj_") + std::to_string(bid) + std::string("_v_out")).c_str(), vlayer); + mmproj = mmproj ? ggml_concat(ctx0, mmproj, stream, 0) : stream; + } + + // --- Stage 1d: Append newline tokens if add_newline is set --- + if (add_newline) { + mmproj = append_rowwise_newlines(ctx0, mmproj); + ggml_set_name(mmproj, "g4v_mmproj_out_nl"); + } else { + ggml_set_name(mmproj, "g4v_mmproj_out"); + } + ggml_build_forward_expand(gf, mmproj); + + return gf; +} diff --git a/tools/mtmd/models/llava.cpp b/tools/mtmd/models/llava.cpp index 4af17ccfe8..5aa3d2f0fa 100644 --- a/tools/mtmd/models/llava.cpp +++ b/tools/mtmd/models/llava.cpp @@ -51,7 +51,6 @@ ggml_cgraph * clip_graph_llava::build() { } std::vector embedding_stack; - const auto & vision_feature_layer = hparams.vision_feature_layer; // loop over layers for (int il = 0; il < max_feature_layer; il++) { @@ -60,7 +59,7 @@ ggml_cgraph * clip_graph_llava::build() { // If this is an embedding feature layer, save the output. // NOTE: 0 index here refers to the input to the encoder. 
- if (vision_feature_layer.find(il) != vision_feature_layer.end()) { + if (hparams.is_vision_feature_layer(il)) { embedding_stack.push_back(cur); } @@ -135,7 +134,7 @@ ggml_cgraph * clip_graph_llava::build() { // process vision feature layers (used by granite) { // final layer is a vision feature layer - if (vision_feature_layer.find(max_feature_layer) != vision_feature_layer.end()) { + if (hparams.is_vision_feature_layer(max_feature_layer)) { embedding_stack.push_back(inpL); } diff --git a/tools/mtmd/models/models.h b/tools/mtmd/models/models.h index b882f800dd..d1865103bc 100644 --- a/tools/mtmd/models/models.h +++ b/tools/mtmd/models/models.h @@ -211,3 +211,26 @@ struct clip_graph_exaone4_5 : clip_graph { clip_graph_exaone4_5(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} ggml_cgraph * build() override; }; + +struct clip_graph_granite4_vision : clip_graph { + clip_graph_granite4_vision(clip_ctx * ctx, const clip_image_f32 & img) + : clip_graph(ctx, img), + add_newline(img.add_newline) {} + + ggml_cgraph * build() override; + +private: + // The graph is per-tile since only batch-size 1 is supported in clip. As + // such, this value is set at construct time based on the tile that will be + // encoded, then used during build to determine how to handle newlines. 
+ const bool add_newline; + + ggml_tensor * gather(ggml_tensor * src, const std::string & name, int idx_len); + ggml_tensor * interp_down(ggml_tensor * src, int side, int new_side); + ggml_tensor * build_block(const qf_block & blk, ggml_tensor * h, int bid, + int spatial_offset, int image_side, int window_side, + int query_side, float qformer_eps); + + ggml_tensor * build_newline_row(ggml_context * ctx0); + ggml_tensor * append_rowwise_newlines(ggml_context * ctx0, ggml_tensor * tile_output); +}; diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index 0b5caa6cb5..260f307560 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -513,6 +513,12 @@ struct mtmd_context { img_end = ""; image_preproc = std::make_unique(ctx_v); } break; + case PROJECTOR_TYPE_GRANITE4_VISION: + { + img_beg = ""; + img_end = ""; + image_preproc = std::make_unique(ctx_v); + } break; default: throw std::runtime_error(string_format("%s: unexpected vision projector type %d\n", __func__, proj)); } @@ -808,6 +814,21 @@ struct mtmd_tokenizer { return 2; } + // Annotate llava-next style tiles so clip_n_output_tokens accounts + // for per-tile newline injection. + if (ctx->proj_type_v() == PROJECTOR_TYPE_GRANITE4_VISION) { + if (batch_f32.entries.size() == 1) { + // Single-tile (overview only): append one newline row. + batch_f32.entries[0]->add_newline = true; + } else { + // Multi-tile: overview gets no newline, grid tiles get one. 
+ batch_f32.entries[0]->add_newline = false; + for (size_t i = 1; i < batch_f32.entries.size(); ++i) { + batch_f32.entries[i]->add_newline = true; + } + } + } + // handle llava-uhd style preprocessing const bool has_tiling_grid = batch_f32.grid_x > 0 && batch_f32.grid_y > 0; if ( @@ -872,9 +893,10 @@ struct mtmd_tokenizer { } } else { + size_t n_tokens = 0; - for (const auto & entry : batch_f32.entries) { - n_tokens += clip_n_output_tokens(ctx->ctx_v, entry.get()); + for (const auto & e : batch_f32.entries) { + n_tokens += clip_n_output_tokens(ctx->ctx_v, e.get()); } mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens); @@ -1111,7 +1133,8 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) || proj_type == PROJECTOR_TYPE_MINICPMV || proj_type == PROJECTOR_TYPE_GLM_EDGE || proj_type == PROJECTOR_TYPE_INTERNVL - || proj_type == PROJECTOR_TYPE_DEEPSEEKOCR2) { + || proj_type == PROJECTOR_TYPE_DEEPSEEKOCR2 + || proj_type == PROJECTOR_TYPE_GRANITE4_VISION) { // TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode() const auto & entries = image_tokens->batch_f32.entries; // entries may have different token counts