diff --git a/conversion/__init__.py b/conversion/__init__.py
index 2c79580f8a..c670798fc2 100644
--- a/conversion/__init__.py
+++ b/conversion/__init__.py
@@ -253,6 +253,7 @@ MMPROJ_MODEL_MAP: dict[str, str] = {
     "Glm4vMoeForConditionalGeneration": "qwen3vl",
     "GlmOcrForConditionalGeneration": "qwen3vl",
     "GlmasrModel": "ultravox",
+    "Granite4VisionForConditionalGeneration": "granite",
     "GraniteSpeechForConditionalGeneration": "granite",
     "HunYuanVLForConditionalGeneration": "hunyuan",
     "Idefics3ForConditionalGeneration": "smolvlm",
diff --git a/conversion/granite.py b/conversion/granite.py
index 647269ba74..53441fe570 100644
--- a/conversion/granite.py
+++ b/conversion/granite.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import re
 from typing import Any, Callable, Iterable, TYPE_CHECKING
 
 import torch
@@ -13,7 +14,7 @@ from .llama import LlamaModel
 from .mamba import Mamba2Model
 
 
-@ModelBase.register("GraniteForCausalLM", "GraniteSpeechForConditionalGeneration")
+@ModelBase.register("GraniteForCausalLM")
 class GraniteModel(LlamaModel):
     """Conversion for IBM's GraniteForCausalLM"""
     model_arch = gguf.MODEL_ARCH.GRANITE
@@ -46,11 +47,29 @@ class GraniteModel(LlamaModel):
             self.gguf_writer.add_logit_scale(logits_scale)
             logger.info("gguf: (granite) logits_scale = %s", logits_scale)
 
+        # If being used as the base for Granite4 Vision, add deepstack_layer_arr
+        if self.hparams.get("spatial_target_layers") or self.hparams.get("deepstack_layer_map"):
+            normalized_projector_map = Granite4VisionMmprojModel.get_normalized_projector_map(self.hparams)
+            deepstack_mapping_arr = [-1 for _ in range(self.block_count)] # Populate with -1 sentinels
+            for proj_idx, (_, llm_layer, _, _) in enumerate(normalized_projector_map):
+                # Skip the first projector which is handled as the base embedding
+                # stream like normal
+                if proj_idx == 0:
+                    continue
+                deepstack_mapping_arr[llm_layer] = proj_idx
+            self.gguf_writer.add_deepstack_mapping(deepstack_mapping_arr)
+
     @classmethod
     def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
         name, gen = item
-        if name.startswith("encoder."):
-            return None
+        # Skip multimodal tensors
+        if (
+            name.startswith("encoder.")
+            or "image_" in name
+            or "layerwise_projectors" in name
+            or "spatial_projectors" in name
+        ):
+            return None
         return super().filter_tensors(item)
 
 
@@ -241,7 +260,8 @@ class GraniteHybridModel(Mamba2Model, GraniteMoeModel):
         assert self.d_inner % d_head == 0, f"SSM inner size {self.d_inner} not a multiple of head dim {d_head}"
 
     def set_vocab(self):
-        self.hparams["pad_vocab_size_multiple"] = 8
+        # Only pad the vocab to a multiple of 8 (for mamba2) if the model has ssm layers
+        self.hparams["pad_vocab_size_multiple"] = 8 if self._ssm_layers else 1
         Mamba2Model.set_vocab(self)
 
 
@@ -326,3 +346,133 @@ class GraniteSpeechMmprojModel(MmprojModel):
                 data_torch = data_torch.squeeze(1)
 
         yield from super().modify_tensors(data_torch, name, bid)
+
+
+@ModelBase.register("Granite4VisionForConditionalGeneration")
+class Granite4VisionMmprojModel(MmprojModel):
+    has_vision_encoder = True
+    has_audio_encoder = False
+
+    @staticmethod
+    def get_normalized_projector_map(global_config: dict) -> list[tuple[int, int, str, int]]:
+        """Normalize both deepstack and spatial projector maps to the form:
+        (vision_layer, llm_layer, <type>, type_index)
+
+        Negative layer indices are made absolute; entries are sorted by llm_layer.
+
+        This is then used to populate the following mappings:
+        - vision_feature_layers (mmproj hparam): ordered list of all
+          vision_layer values where order corresponds with the order of the
+          stacked projector tensors
+          NOTE: Values may appear multiple times for spatial projectors
+        - tensor_prefix_map (mmproj tensors): mapping from tensor prefixes to
+          the index of the corresponding projector in the stacked tensors
+        - deepstack_layer_arr (llm hparam): per-text-layer array indicating
+          which input vision feature should be injected at that layer
+          (-1 if none)
+        """
+        deepstack_map = global_config.get("deepstack_layer_map", [])  # [[vis_layer, llm_layer], ...]
+        spatial_layers = global_config.get("spatial_target_layers", [])  # [llm_layer, ...]
+        n_text_layers = global_config["text_config"]["num_hidden_layers"]
+        n_vision_layers = global_config["vision_config"]["num_hidden_layers"]
+
+        def resolve(layer: int, n_layers: int) -> int:
+            return layer + n_layers if layer < 0 else layer
+
+        normalized_projector_map = []
+        if deepstack_map:
+            for deepstack_idx, (vision_layer, llm_layer) in enumerate(sorted(deepstack_map)):
+                vision_layer, llm_layer = resolve(vision_layer, n_vision_layers), resolve(llm_layer, n_text_layers)
+                normalized_projector_map.append((vision_layer, llm_layer, "layerwise", deepstack_idx))
+        if spatial_layers:
+            spatial_vision_layer = resolve(global_config.get("spatial_vision_layer", -1), n_vision_layers)
+            for spatial_idx, llm_layer in enumerate(spatial_layers):
+                # Negative targets must be made absolute too, or the sort below misorders them
+                normalized_projector_map.append((spatial_vision_layer, resolve(llm_layer, n_text_layers), "spatial", spatial_idx))
+        return sorted(normalized_projector_map, key=lambda entry: entry[1])
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        projectors = self.get_normalized_projector_map(self.global_config)
+        self._n_proj = len(projectors)
+
+        # One pass over the projectors (in their normalized order) fills all three lookups
+        self._tensor_prefix_map = {}
+        self._vision_feature_layers = []
+        self._spatial_offsets = []
+        for proj_idx, (vision_layer, _, proj_type, type_idx) in enumerate(projectors):
+            self._tensor_prefix_map[f"model.{proj_type}_projectors.{type_idx}"] = proj_idx
+            self._vision_feature_layers.append(vision_layer)
+            # -1 marks a projector that is not a spatial one
+            self._spatial_offsets.append(type_idx if proj_type == "spatial" else -1)
+
+    def set_gguf_parameters(self):
+        assert self.hparams_vision is not None
+        super().set_gguf_parameters()
+
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GRANITE4_VISION)
+
+        # SigLIP encoder hparams
+        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6))
+        self.gguf_writer.add_vision_use_gelu(True)
+
+        # Preprocessor
+        self.gguf_writer.add_vision_preproc_image_size(self.hparams.get("image_size", 384))
+
+        # QFormer projector config: "downsample_rate" is a "<query_side>/<window_side>" string
+        ds_rate = self.global_config["downsample_rate"]
+        ds_parts = ds_rate.split("/")
+        assert len(ds_parts) == 2, f"Invalid 'downsample_rate' value: {ds_rate}"
+        query_side, window_side = [int(p) for p in ds_parts]
+        self.gguf_writer.add_vision_projector_query_side(query_side)
+        self.gguf_writer.add_vision_projector_window_side(window_side)
+
+        # Set vision feature layers
+        self.gguf_writer.add_vision_feature_layers(self._vision_feature_layers)
+
+        # Set the spatial offsets per projector (-1 for non-spatial projectors)
+        self.gguf_writer.add_vision_spatial_offsets(self._spatial_offsets)
+
+        # Add flattened image grid pinpoints (resolution candidates internally)
+        if pinpoints := self.global_config.get("image_grid_pinpoints"):
+            # Flatten [[h, w], ...] into [w, h, w, h, ...]
+            pinpoints = [val for h, w in pinpoints for val in (w, h)]
+            self.gguf_writer.add_vision_image_grid_pinpoints(pinpoints)
+
+    @classmethod
+    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
+        # The vision head and the LM head yield None; the parent filter decides the rest
+        if item[0].startswith("lm_head") or "vision_model.head" in item[0]:
+            return None
+        return super().filter_tensors(item)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        """Rewrite projector tensor names so their block id is the stacked projector index."""
+        # Detect projector tensors and bin them. The prefix is matched up to
+        # and including the next "." so that "...projectors.1" cannot claim
+        # the tensors of "...projectors.10" and above.
+        projector_idx = next(
+            (proj_idx for prefix, proj_idx in self._tensor_prefix_map.items() if name.startswith(prefix + ".")),
+            None,
+        )
+        if projector_idx is None:
+            yield from super().modify_tensors(data_torch, name, bid)
+            return
+
+        # TODO: currently, none of the Granite 4 Vision models have projectors
+        # with multiple QFormer layers, so the `layer.{}` index is always 0.
+        # This allows us to simply map to a single `bid` that matches the
+        # projector index. If this changes, we'll need a convention that
+        # merges the two IDs.
+        id_matches = list(re.finditer(r"\.([0-9]+)\.", name))
+        assert 1 <= len(id_matches) <= 2, "Must have at least 1 and at most 2 ids in tensor names"
+        first_start, first_end = id_matches[0].span(1)
+        if len(id_matches) == 1:
+            # Only the per-type projector index: replace it in place
+            new_name = name[:first_start] + str(projector_idx) + name[first_end:]
+        else:
+            # Projector index plus QFormer layer index: drop the former and
+            # put the stacked projector index where the layer index was
+            last_start, last_end = id_matches[1].span(1)
+            new_name = name[:first_start - 1] + name[first_end:last_start] + str(projector_idx) + name[last_end:]
+        yield from super().modify_tensors(data_torch, new_name, projector_idx)
diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py
index 9a6437beab..45202b3338 100755
--- a/convert_lora_to_gguf.py
+++ b/convert_lora_to_gguf.py
@@ -311,6 +311,10 @@ def parse_args() -> argparse.Namespace:
         "--base-model-id", type=str,
         help="the model ID of the base model, if it is not available locally or in the adapter config. If specified, it will ignore --base and load the base model config from the Hugging Face hub (Example: 'meta-llama/Llama-3.2-1B-Instruct')",
     )
+    parser.add_argument(
+        "--trust-remote-code", default=False, action="store_true",
+        help="trust remote code in the model",
+    )
     parser.add_argument(
         "lora_path", type=Path,
         help="directory containing Hugging Face PEFT LoRA config (adapter_model.json) and weights (adapter_model.safetensors or adapter_model.bin)",
@@ -319,11 +323,11 @@ def parse_args() -> argparse.Namespace:
     return parser.parse_args()
 
 
-def load_hparams_from_hf(hf_model_id: str) -> tuple[dict[str, Any], Path | None]:
+def load_hparams_from_hf(hf_model_id: str, trust_remote_code: bool) -> tuple[dict[str, Any], Path | None]:
     from huggingface_hub import try_to_load_from_cache
 
     # normally, adapter does not come with base model config, we need to load it from AutoConfig
-    config = AutoConfig.from_pretrained(hf_model_id)
+    config = AutoConfig.from_pretrained(hf_model_id, trust_remote_code=trust_remote_code)
     cache_dir = try_to_load_from_cache(hf_model_id, "config.json")
     cache_dir = Path(cache_dir).parent if isinstance(cache_dir, str) else None
 
@@ -372,13 +376,13 @@ if __name__ == '__main__':
     # load base model
     if base_model_id is not None:
         logger.info(f"Loading base model from Hugging Face: {base_model_id}")
-        hparams, dir_base_model = load_hparams_from_hf(base_model_id)
+        hparams, dir_base_model = load_hparams_from_hf(base_model_id, args.trust_remote_code)
     elif dir_base_model is None:
         if "base_model_name_or_path" in lparams:
             model_id = lparams["base_model_name_or_path"]
             logger.info(f"Loading base model from Hugging Face: {model_id}")
             try:
-                hparams, dir_base_model = load_hparams_from_hf(model_id)
+                hparams, dir_base_model = load_hparams_from_hf(model_id, args.trust_remote_code)
             except OSError as e:
                 logger.error(f"Failed to load base model config: {e}")
                 logger.error("Please try downloading the base model and add its path to --base")
@@ -393,7 +397,9 @@ if __name__ == '__main__':
 
     with torch.inference_mode():
         try:
-            model_class = get_model_class(hparams["architectures"][0])
+            model_arch = ((hparams.get("text_config") or {}).get("architectures") or hparams["architectures"])[0]  # nested key may be absent or None
+            logger.info("Using model architecture: %s", model_arch)
+            model_class = get_model_class(model_arch)
         except NotImplementedError:
-            logger.error(f"Model {hparams['architectures'][0]} is not supported")
+            logger.error(f"Model {model_arch} is not supported")
             sys.exit(1)
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index ce556ec9b6..814980ce50 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -128,6 +128,7 @@ class Keys:
         MOE_LATENT_SIZE                   = "{arch}.moe_latent_size"
         NEXTN_PREDICT_LAYERS              = "{arch}.nextn_predict_layers"
         NUM_DEEPSTACK_LAYERS              = "{arch}.n_deepstack_layers"
+        DEEPSTACK_MAPPING                 = "{arch}.deepstack_mapping"
         POOLING_TYPE                      = "{arch}.pooling_type"
         LOGIT_SCALE                       = "{arch}.logit_scale"
         DECODER_START_TOKEN_ID            = "{arch}.decoder_start_token_id"
@@ -325,6 +326,8 @@ class Keys:
         WA_PATTERN_MODE       = "clip.vision.wa_pattern_mode"  # used by mimovl, per-layer -1/0/1
         IS_DEEPSTACK_LAYERS   = "clip.vision.is_deepstack_layers"
         WINDOW_SIZE           = "clip.vision.window_size"
+        FEATURE_LAYERS        = "clip.vision.feature_layer" # Granite4 Vision
+        IMAGE_GRID_PINPOINTS  = "clip.vision.image_grid_pinpoints" # Granite4 Vision
 
         class Attention:
             HEAD_COUNT      = "clip.vision.attention.head_count"
@@ -333,6 +336,9 @@ class Keys:
 
         class Projector:
             SCALE_FACTOR    = "clip.vision.projector.scale_factor"
+            QUERY_SIDE      = "clip.vision.projector.query_side"
+            WINDOW_SIDE     = "clip.vision.projector.window_side"
+            SPATIAL_OFFSETS = "clip.vision.projector.spatial_offsets"
 
         class SAM:
             BLOCK_COUNT         = "clip.vision.sam.block_count"
@@ -821,6 +827,31 @@ class MODEL_TENSOR(IntEnum):
     V_RESMPL_QUERY_768   = auto() # Deepseek-OCR-2
     V_RESMPL_QUERY_1024  = auto() # Deepseek-OCR-2
 
+    # qformer projector (vision) - Granite4 Vision
+    V_QF_PROJ_QUERY      = auto()
+    V_QF_PROJ_NORM       = auto()
+    V_QF_PROJ_LINEAR     = auto()
+    V_QF_SELF_ATTN_Q     = auto()
+    V_QF_SELF_ATTN_K     = auto()
+    V_QF_SELF_ATTN_V     = auto()
+    V_QF_SELF_ATTN_O     = auto()
+    V_QF_SELF_ATTN_NORM  = auto()
+    V_QF_CROSS_ATTN_Q    = auto()
+    V_QF_CROSS_ATTN_K    = auto()
+    V_QF_CROSS_ATTN_V    = auto()
+    V_QF_CROSS_ATTN_O    = auto()
+    V_QF_CROSS_ATTN_NORM = auto()
+    V_QF_FFN_UP          = auto()
+    V_QF_FFN_DOWN        = auto()
+    V_QF_FFN_NORM        = auto()
+    V_PROJ_NORM          = auto()
+    # multi-projector (bid => projector id) - Granite4 vision
+    V_MULTI_PROJ_IMG_POS   = auto()
+    V_MULTI_PROJ_QUERY     = auto()
+    V_MULTI_PROJ_NORM      = auto()
+    V_MULTI_PROJ_LINEAR    = auto()
+    V_MULTI_PROJ_POST_NORM = auto()
+
     # audio (mtmd)
     A_ENC_EMBD_POS        = auto()
     A_ENC_EMBD_NORM       = auto()
@@ -885,7 +916,7 @@ class MODEL_TENSOR(IntEnum):
     A_CTC_OUT              = auto()
     A_CTC_OUT_MID          = auto()
     A_ENC_ATTN_REL_POS_EMB = auto()
-    # qformer projector
+    # audio qformer projector
     A_QF_PROJ_QUERY        = auto()
     A_QF_PROJ_NORM         = auto()
     A_QF_PROJ_LINEAR       = auto()
@@ -1337,10 +1368,33 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.V_SAM_NECK:                "v.sam.neck.{bid}",
     MODEL_TENSOR.V_SAM_NET_2:               "v.sam.net_2",
     MODEL_TENSOR.V_SAM_NET_3:               "v.sam.net_3",
-    MODEL_TENSOR.V_ENC_EMBD_IMGNL:          "v.image_newline", # Deepseek-OCR
+    MODEL_TENSOR.V_ENC_EMBD_IMGNL:          "v.image_newline", # Deepseek-OCR, Granite4Vision
     MODEL_TENSOR.V_ENC_EMBD_VSEP:           "v.view_seperator", # Deepseek-OCR
     MODEL_TENSOR.V_RESMPL_QUERY_768:        "v.resample_query_768", # Deepseek-OCR-2 qwen2
     MODEL_TENSOR.V_RESMPL_QUERY_1024:       "v.resample_query_1024", # Deepseek-OCR-2 qwen2
+    # Granite4 Vision
+    # qformer layers (bid => proj_id)
+    # NOTE: Names align with A_QF_*
+    MODEL_TENSOR.V_QF_SELF_ATTN_Q:          "v.proj_blk.{bid}.self_attn_q",
+    MODEL_TENSOR.V_QF_SELF_ATTN_K:          "v.proj_blk.{bid}.self_attn_k",
+    MODEL_TENSOR.V_QF_SELF_ATTN_V:          "v.proj_blk.{bid}.self_attn_v",
+    MODEL_TENSOR.V_QF_SELF_ATTN_O:          "v.proj_blk.{bid}.self_attn_out",
+    MODEL_TENSOR.V_QF_SELF_ATTN_NORM:       "v.proj_blk.{bid}.self_attn_norm",
+    MODEL_TENSOR.V_QF_CROSS_ATTN_Q:         "v.proj_blk.{bid}.cross_attn_q",
+    MODEL_TENSOR.V_QF_CROSS_ATTN_K:         "v.proj_blk.{bid}.cross_attn_k",
+    MODEL_TENSOR.V_QF_CROSS_ATTN_V:         "v.proj_blk.{bid}.cross_attn_v",
+    MODEL_TENSOR.V_QF_CROSS_ATTN_O:         "v.proj_blk.{bid}.cross_attn_out",
+    MODEL_TENSOR.V_QF_CROSS_ATTN_NORM:      "v.proj_blk.{bid}.cross_attn_norm",
+    MODEL_TENSOR.V_QF_FFN_UP:               "v.proj_blk.{bid}.ffn_up",
+    MODEL_TENSOR.V_QF_FFN_DOWN:             "v.proj_blk.{bid}.ffn_down",
+    MODEL_TENSOR.V_QF_FFN_NORM:             "v.proj_blk.{bid}.ffn_norm",
+    # multi-projector (bid => projector ID)
+    MODEL_TENSOR.V_MULTI_PROJ_IMG_POS:   "v.proj_blk.{bid}.img_pos",
+    MODEL_TENSOR.V_MULTI_PROJ_QUERY:     "v.proj_blk.{bid}.query",
+    MODEL_TENSOR.V_MULTI_PROJ_NORM:      "v.proj_blk.{bid}.norm",
+    MODEL_TENSOR.V_MULTI_PROJ_LINEAR:    "v.proj_blk.{bid}.linear",
+    MODEL_TENSOR.V_MULTI_PROJ_POST_NORM: "v.proj_blk.{bid}.post_norm",
+
     # audio (mtmd)
     # note: all audio tensor names must use prefix "a." or "mm.a."
     MODEL_TENSOR.A_ENC_EMBD_POS:            "a.position_embd",
@@ -1522,6 +1576,29 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.V_SAM_NET_3,
         MODEL_TENSOR.V_RESMPL_QUERY_768,
         MODEL_TENSOR.V_RESMPL_QUERY_1024,
+        # Granite4 Vision projectors
+        MODEL_TENSOR.V_PROJ_NORM,
+        MODEL_TENSOR.V_QF_PROJ_QUERY,
+        MODEL_TENSOR.V_QF_PROJ_NORM,
+        MODEL_TENSOR.V_QF_PROJ_LINEAR,
+        MODEL_TENSOR.V_QF_SELF_ATTN_Q,
+        MODEL_TENSOR.V_QF_SELF_ATTN_K,
+        MODEL_TENSOR.V_QF_SELF_ATTN_V,
+        MODEL_TENSOR.V_QF_SELF_ATTN_O,
+        MODEL_TENSOR.V_QF_SELF_ATTN_NORM,
+        MODEL_TENSOR.V_QF_CROSS_ATTN_Q,
+        MODEL_TENSOR.V_QF_CROSS_ATTN_K,
+        MODEL_TENSOR.V_QF_CROSS_ATTN_V,
+        MODEL_TENSOR.V_QF_CROSS_ATTN_O,
+        MODEL_TENSOR.V_QF_CROSS_ATTN_NORM,
+        MODEL_TENSOR.V_QF_FFN_UP,
+        MODEL_TENSOR.V_QF_FFN_DOWN,
+        MODEL_TENSOR.V_QF_FFN_NORM,
+        MODEL_TENSOR.V_MULTI_PROJ_IMG_POS,
+        MODEL_TENSOR.V_MULTI_PROJ_QUERY,
+        MODEL_TENSOR.V_MULTI_PROJ_LINEAR,
+        MODEL_TENSOR.V_MULTI_PROJ_NORM,
+        MODEL_TENSOR.V_MULTI_PROJ_POST_NORM,
         # audio
         MODEL_TENSOR.A_ENC_EMBD_POS,
         MODEL_TENSOR.A_ENC_EMBD_NORM,
@@ -4388,6 +4465,7 @@ class VisionProjectorType:
     MINICPMV4_6    = "minicpmv4_6"
     GRANITE_SPEECH = "granite_speech"  # audio
     MIMOVL         = "mimovl"
+    GRANITE4_VISION = "granite4_vision"
 
 
 # Items here are (block size, type size)
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index 875d0f73d9..182c9c54a5 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -959,8 +959,13 @@ class GGUFWriter:
         self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value.value)
 
     def add_num_deepstack_layers(self, count: int) -> None:
+        """Add scalar deepstack layer count (qwen3vl format)"""
         self.add_uint32(Keys.LLM.NUM_DEEPSTACK_LAYERS.format(arch=self.arch), count)
 
+    def add_deepstack_mapping(self, layers: Sequence[int]) -> None:
+        """Add per-layer deepstack projector indices (Granite4 Vision format)"""
+        self.add_array(Keys.LLM.DEEPSTACK_MAPPING.format(arch=self.arch), list(layers))
+
     def add_rope_dimension_count(self, count: int) -> None:
         self.add_uint32(Keys.Rope.DIMENSION_COUNT.format(arch=self.arch), count)
 
@@ -1184,6 +1189,15 @@ class GGUFWriter:
     def add_vision_preproc_image_size(self, value: int) -> None:
         self.add_uint32(Keys.ClipVision.PREPROC_IMAGE_SIZE, value)
 
+    def add_vision_projector_query_side(self, value: int) -> None:
+        self.add_uint32(Keys.ClipVision.Projector.QUERY_SIDE, value)
+
+    def add_vision_projector_window_side(self, value: int) -> None:
+        self.add_uint32(Keys.ClipVision.Projector.WINDOW_SIDE, value)
+
+    def add_vision_spatial_offsets(self, layers: Sequence[int]) -> None:
+        self.add_array(Keys.ClipVision.Projector.SPATIAL_OFFSETS, layers)
+
     def add_vision_image_mean(self, values: Sequence[float]) -> None:
         self.add_array(Keys.ClipVision.IMAGE_MEAN, values)
 
@@ -1240,6 +1254,14 @@ class GGUFWriter:
     def add_vision_window_size(self, value: int) -> None:
         self.add_uint32(Keys.ClipVision.WINDOW_SIZE, value)
 
+    def add_vision_feature_layers(self, layers: Sequence[int]) -> None:
+        """Add the list of vision layer indices stored under clip.vision.feature_layer"""
+        self.add_array(Keys.ClipVision.FEATURE_LAYERS, layers)
+
+    def add_vision_image_grid_pinpoints(self, layers: Sequence[int]) -> None:
+        """Add the image grid pinpoints as one flat sequence of ints"""
+        self.add_array(Keys.ClipVision.IMAGE_GRID_PINPOINTS, layers)
+
     def add_vision_sam_layers_count(self, value: int) -> None:
         self.add_uint32(Keys.ClipVision.SAM.BLOCK_COUNT, value)
 
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 82f26e7b30..3e63b21650 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -1408,6 +1408,7 @@ class TensorNameMap:
         ),
 
         MODEL_TENSOR.V_ENC_EMBD_PATCH: (
+            "model.vision_tower.vision_model.embeddings.patch_embedding", # Granite4Vision
             "vision_tower.vision_model.embeddings.patch_embedding",
             "model.vision_tower.embeddings.patch_embedding", # minicpmv4_6
             "model.vision_tower.embeddings.patch_embeddings.projection", # Intern-S1
@@ -1439,6 +1440,7 @@ class TensorNameMap:
         ),
 
         MODEL_TENSOR.V_ENC_EMBD_POS: (
+            "model.vision_tower.vision_model.embeddings.position_embedding", # Granite4Vision
             "vision_tower.vision_model.embeddings.position_embedding",
             "model.vision_tower.embeddings.position_embedding", # minicpmv4_6
             "model.vision_tower.embeddings.position_embeddings", # Intern-S1
@@ -1456,8 +1458,9 @@ class TensorNameMap:
             "model.vision_embedder.pos_embedding", # gemma4 unified
         ),
 
+        # TODO: I think these should all be moved to mapping_cfg?
         MODEL_TENSOR.V_ENC_EMBD_IMGNL: (
-            "model.image_newline",  # Deepseek-OCR
+            "model.image_newline",  # Deepseek-OCR, Granite4Vision
             "vit.perceive.image_newline", # HunyuanVL
         ),
 
@@ -1477,6 +1480,7 @@ class TensorNameMap:
         ),
 
         MODEL_TENSOR.V_ENC_ATTN_Q: (
+            "model.vision_tower.vision_model.encoder.layers.{bid}.self_attn.q_proj", # Granite4Vision
             "vision_tower.vision_model.encoder.layers.{bid}.self_attn.q_proj",
             "model.vision_tower.encoder.layers.{bid}.self_attn.q_proj", # minicpmv4_6
             "model.vision_tower.encoder.layer.{bid}.attention.q_proj", # Intern-S1
@@ -1502,6 +1506,7 @@ class TensorNameMap:
         ),
 
         MODEL_TENSOR.V_ENC_ATTN_K: (
+            "model.vision_tower.vision_model.encoder.layers.{bid}.self_attn.k_proj", # Granite4Vision
             "vision_tower.vision_model.encoder.layers.{bid}.self_attn.k_proj",
             "model.vision_tower.encoder.layers.{bid}.self_attn.k_proj", # minicpmv4_6
             "model.vision_tower.encoder.layer.{bid}.attention.k_proj", # Intern-S1
@@ -1527,6 +1532,7 @@ class TensorNameMap:
         ),
 
         MODEL_TENSOR.V_ENC_ATTN_V: (
+            "model.vision_tower.vision_model.encoder.layers.{bid}.self_attn.v_proj", # Granite4Vision
             "vision_tower.vision_model.encoder.layers.{bid}.self_attn.v_proj",
             "model.vision_tower.encoder.layers.{bid}.self_attn.v_proj", # minicpmv4_6
             "model.vision_tower.encoder.layer.{bid}.attention.v_proj", # Intern-S1
@@ -1545,6 +1551,7 @@ class TensorNameMap:
         ),
 
         MODEL_TENSOR.V_ENC_INPUT_NORM: (
+            "model.vision_tower.vision_model.encoder.layers.{bid}.layer_norm1", # Granite4Vision
             "vision_tower.vision_model.encoder.layers.{bid}.layer_norm1",
             "model.vision_tower.encoder.layers.{bid}.layer_norm1", # minicpmv4_6
             "vision_tower.vision_model.encoder.layers.{bid}.norm1", # InternVL
@@ -1567,6 +1574,7 @@ class TensorNameMap:
         ),
 
         MODEL_TENSOR.V_ENC_ATTN_O: (
+            "model.vision_tower.vision_model.encoder.layers.{bid}.self_attn.out_proj", # Granite4Vision
             "vision_tower.vision_model.encoder.layers.{bid}.self_attn.out_proj",
             "model.vision_tower.encoder.layers.{bid}.self_attn.out_proj", # minicpmv4_6
             "vision_tower.vision_model.encoder.layers.{bid}.attn.proj", # InternVL
@@ -1595,6 +1603,7 @@ class TensorNameMap:
         ),
 
         MODEL_TENSOR.V_ENC_POST_ATTN_NORM: (
+            "model.vision_tower.vision_model.encoder.layers.{bid}.layer_norm2", # Granite4Vision
             "vision_tower.vision_model.encoder.layers.{bid}.layer_norm2",
             "model.vision_tower.encoder.layers.{bid}.layer_norm2", # minicpmv4_6
             "vision_tower.vision_model.encoder.layers.{bid}.norm2", # InternVL
@@ -1618,6 +1627,7 @@ class TensorNameMap:
         ),
 
         MODEL_TENSOR.V_ENC_FFN_UP: (
+            "model.vision_tower.vision_model.encoder.layers.{bid}.mlp.fc1", # Granite4Vision
             "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc1",
             "model.vision_tower.encoder.layers.{bid}.mlp.fc1", # minicpmv4_6
             "model.vision_tower.encoder.layer.{bid}.mlp.fc1", # Intern-S1
@@ -1649,6 +1659,7 @@ class TensorNameMap:
         ),
 
         MODEL_TENSOR.V_ENC_FFN_DOWN: (
+            "model.vision_tower.vision_model.encoder.layers.{bid}.mlp.fc2", # Granite4Vision
             "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc2",
             "model.vision_tower.encoder.layers.{bid}.mlp.fc2", # minicpmv4_6
             "model.vision_tower.encoder.layer.{bid}.mlp.fc2", # Intern-S1
@@ -1706,6 +1717,7 @@ class TensorNameMap:
         ),
 
         MODEL_TENSOR.V_POST_NORM: (
+            "model.vision_tower.vision_model.post_layernorm", # Granite4Vision
             "vision_tower.vision_model.post_layernorm",
             "model.vision_tower.post_layernorm", # minicpmv4_6
             "model.vision_model.post_layernorm", # SmolVLM
@@ -1952,6 +1964,82 @@ class TensorNameMap:
             "model.vision_tower.std_scale", # gemma4
         ),
 
+        # For these tensors, bid => projector ID
+        MODEL_TENSOR.V_MULTI_PROJ_IMG_POS: (
+            "model.layerwise_projectors.{bid}.image_positions", # Granite4 Vision
+            "model.spatial_projectors.{bid}.image_positions",   # Granite4 Vision
+        ),
+        MODEL_TENSOR.V_MULTI_PROJ_QUERY: (
+            "model.layerwise_projectors.{bid}.query", # Granite4 Vision
+            "model.spatial_projectors.{bid}.query",   # Granite4 Vision
+        ),
+        MODEL_TENSOR.V_MULTI_PROJ_LINEAR: (
+            "model.layerwise_projectors.{bid}.out_linear", # Granite4 Vision
+            "model.spatial_projectors.{bid}.out_linear",   # Granite4 Vision
+        ),
+        MODEL_TENSOR.V_MULTI_PROJ_NORM: (
+            "model.layerwise_projectors.{bid}.norm", # Granite4 Vision
+            "model.spatial_projectors.{bid}.norm",   # Granite4 Vision
+        ),
+        MODEL_TENSOR.V_MULTI_PROJ_POST_NORM: (
+            "model.layerwise_projectors.{bid}.qformer.layernorm", # Granite4 Vision
+            "model.spatial_projectors.{bid}.qformer.layernorm",   # Granite4 Vision
+        ),
+
+        # For these tensors, bid => proj-id
+        MODEL_TENSOR.V_QF_SELF_ATTN_Q: (
+            "model.layerwise_projectors.qformer.encoder.layer.{bid}.attention.attention.query", # Granite4 Vision
+            "model.spatial_projectors.qformer.encoder.layer.{bid}.attention.attention.query",   # Granite4 Vision
+        ),
+        MODEL_TENSOR.V_QF_SELF_ATTN_K: (
+            "model.layerwise_projectors.qformer.encoder.layer.{bid}.attention.attention.key", # Granite4 Vision
+            "model.spatial_projectors.qformer.encoder.layer.{bid}.attention.attention.key",   # Granite4 Vision
+        ),
+        MODEL_TENSOR.V_QF_SELF_ATTN_V: (
+            "model.layerwise_projectors.qformer.encoder.layer.{bid}.attention.attention.value", # Granite4 Vision
+            "model.spatial_projectors.qformer.encoder.layer.{bid}.attention.attention.value",   # Granite4 Vision
+        ),
+        MODEL_TENSOR.V_QF_SELF_ATTN_O: (
+            "model.layerwise_projectors.qformer.encoder.layer.{bid}.attention.output.dense", # Granite4 Vision
+            "model.spatial_projectors.qformer.encoder.layer.{bid}.attention.output.dense",   # Granite4 Vision
+        ),
+        MODEL_TENSOR.V_QF_SELF_ATTN_NORM: (
+            "model.layerwise_projectors.qformer.encoder.layer.{bid}.attention.output.LayerNorm", # Granite4 Vision
+            "model.spatial_projectors.qformer.encoder.layer.{bid}.attention.output.LayerNorm",   # Granite4 Vision
+        ),
+        MODEL_TENSOR.V_QF_CROSS_ATTN_Q: (
+            "model.layerwise_projectors.qformer.encoder.layer.{bid}.crossattention.attention.query", # Granite4 Vision
+            "model.spatial_projectors.qformer.encoder.layer.{bid}.crossattention.attention.query",   # Granite4 Vision
+        ),
+        MODEL_TENSOR.V_QF_CROSS_ATTN_K: (
+            "model.layerwise_projectors.qformer.encoder.layer.{bid}.crossattention.attention.key", # Granite4 Vision
+            "model.spatial_projectors.qformer.encoder.layer.{bid}.crossattention.attention.key",   # Granite4 Vision
+        ),
+        MODEL_TENSOR.V_QF_CROSS_ATTN_V: (
+            "model.layerwise_projectors.qformer.encoder.layer.{bid}.crossattention.attention.value", # Granite4 Vision
+            "model.spatial_projectors.qformer.encoder.layer.{bid}.crossattention.attention.value",   # Granite4 Vision
+        ),
+        MODEL_TENSOR.V_QF_CROSS_ATTN_O: (
+            "model.layerwise_projectors.qformer.encoder.layer.{bid}.crossattention.output.dense", # Granite4 Vision
+            "model.spatial_projectors.qformer.encoder.layer.{bid}.crossattention.output.dense",   # Granite4 Vision
+        ),
+        MODEL_TENSOR.V_QF_CROSS_ATTN_NORM: (
+            "model.layerwise_projectors.qformer.encoder.layer.{bid}.crossattention.output.LayerNorm", # Granite4 Vision
+            "model.spatial_projectors.qformer.encoder.layer.{bid}.crossattention.output.LayerNorm",   # Granite4 Vision
+        ),
+        MODEL_TENSOR.V_QF_FFN_UP: (
+            "model.layerwise_projectors.qformer.encoder.layer.{bid}.intermediate_query.dense", # Granite4 Vision
+            "model.spatial_projectors.qformer.encoder.layer.{bid}.intermediate_query.dense",   # Granite4 Vision
+        ),
+        MODEL_TENSOR.V_QF_FFN_DOWN: (
+            "model.layerwise_projectors.qformer.encoder.layer.{bid}.output_query.dense", # Granite4 Vision
+            "model.spatial_projectors.qformer.encoder.layer.{bid}.output_query.dense",   # Granite4 Vision
+        ),
+        MODEL_TENSOR.V_QF_FFN_NORM: (
+            "model.layerwise_projectors.qformer.encoder.layer.{bid}.output_query.LayerNorm", # Granite4 Vision
+            "model.spatial_projectors.qformer.encoder.layer.{bid}.output_query.LayerNorm",   # Granite4 Vision
+        ),
+
         # audio (mtmd)
 
         MODEL_TENSOR.A_ENC_EMBD_POS: (
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index fea898deaf..52963f8f1e 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -196,6 +196,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_MOE_LATENT_SIZE,                   "%s.moe_latent_size"                   },
     { LLM_KV_NEXTN_PREDICT_LAYERS,              "%s.nextn_predict_layers"              },
     { LLM_KV_NUM_DEEPSTACK_LAYERS,              "%s.n_deepstack_layers"                },
+    { LLM_KV_DEEPSTACK_MAPPING,                 "%s.deepstack_mapping"                 },
     { LLM_KV_HIDDEN_ACT,                        "%s.hidden_activation"                 },
     { LLM_KV_POOLING_TYPE,                      "%s.pooling_type"                      },
     { LLM_KV_LOGIT_SCALE,                       "%s.logit_scale"                       },
diff --git a/src/llama-arch.h b/src/llama-arch.h
index f364f6b0ba..dc9bca9bfc 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -200,6 +200,7 @@ enum llm_kv {
     LLM_KV_MOE_LATENT_SIZE,
     LLM_KV_NEXTN_PREDICT_LAYERS,
     LLM_KV_NUM_DEEPSTACK_LAYERS,
+    LLM_KV_DEEPSTACK_MAPPING,
     LLM_KV_HIDDEN_ACT,
     LLM_KV_POOLING_TYPE,
     LLM_KV_LOGIT_SCALE,
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index 172edf24cb..3b8125cde7 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -1859,7 +1859,12 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
     res->t_inp_embd = cur;
 
     // For Granite architecture
-    if (hparams.f_embedding_scale != 0.0f) {
+    // NOTE: Only apply scale to token inputs. Raw embeddings are assumed to be
+    //  multimodal inputs that should not be scaled.
+    if (ubatch.token && hparams.f_embedding_scale != 0.0f) {
+        if (!ggml_is_contiguous(cur)) {
+            cur = ggml_cont(ctx0, cur);
+        }
         cur = ggml_scale(ctx0, cur, hparams.f_embedding_scale);
     }
 
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index fde6183e87..87db4a0dd3 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -219,8 +219,18 @@ struct llama_hparams {
     uint32_t indexer_top_k     = 0;
 
     // qwen3vl deepstack
+    // When parsed from GGUF, this implies the first N layers consume the first
+    // N deepstack embeddings. Use deepstack_mapping_arr if you need a more
+    // complex mapping. If using deepstack_mapping_arr, also make sure to set
+    // n_deepstack_layers to the number of unique deepstack layers so that
+    // n_embd_inp is accurate (see granite.cpp).
     uint32_t n_deepstack_layers = 0;
 
+    // deepstack layer array (Granite4 Vision)
+    // -1  => no deepstack
+    // >=0 => input embedding index for deepstack injection
+    std::array<int32_t, LLAMA_MAX_LAYERS> deepstack_mapping_arr;
+
     // gemma4 per-layer embedding
     uint32_t n_embd_per_layer = 0;
 
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index ba08a19ac7..0d1cf3cc33 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -393,6 +393,7 @@ namespace GGUFMeta {
     }
 
     template bool llama_model_loader::get_arr<std::vector<std::string>>(enum llm_kv kid, std::vector<std::string> & result, bool required);
+    template bool llama_model_loader::get_arr<std::array<int32_t, 512>>(enum llm_kv kid, std::array<int32_t, 512> & result, bool required);
 
     template<typename T>
     bool llama_model_loader::get_key(const std::string & key, T & result, bool required) {
diff --git a/src/llama-model-saver.cpp b/src/llama-model-saver.cpp
index b052287809..67d4a9df0f 100644
--- a/src/llama-model-saver.cpp
+++ b/src/llama-model-saver.cpp
@@ -229,6 +229,7 @@ void llama_model_saver::add_kv_from_model() {
     add_kv(LLM_KV_MOE_EVERY_N_LAYERS,                hparams.moe_every_n_layers);
     add_kv(LLM_KV_NEXTN_PREDICT_LAYERS,              hparams.n_layer_nextn);
     add_kv(LLM_KV_NUM_DEEPSTACK_LAYERS,              hparams.n_deepstack_layers);
+    add_kv(LLM_KV_DEEPSTACK_MAPPING,                 hparams.deepstack_mapping_arr);
     add_kv(LLM_KV_POOLING_TYPE,                      uint32_t(hparams.pooling_type));
     add_kv(LLM_KV_LOGIT_SCALE,                       hparams.f_logit_scale);
     add_kv(LLM_KV_DECODER_START_TOKEN_ID,            hparams.dec_start_token_id);
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 784deb70af..6808ad044c 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1100,6 +1100,9 @@ void llama_model_base::load_hparams(llama_model_loader & ml) {
     ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH,  hparams.n_ff_arr,   hparams.n_layer(), false);
     ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer(), false);
 
+    // Populate deepstack_mapping_arr - initialized to -1 (no deepstack)
+    std::fill(hparams.deepstack_mapping_arr.begin(), hparams.deepstack_mapping_arr.end(), -1);
+
     // n_head_kv is optional, default to n_head
     hparams.n_head_kv_arr = hparams.n_head_arr;
 
@@ -1678,10 +1681,10 @@ uint64_t llama_model::n_elements() const {
 void llama_model::print_info() const {
     const std::string rope_scaling_type = llama_rope_scaling_type_name(hparams.rope_scaling_type_train);
 
-    auto print_f = [](const std::function<uint32_t(uint32_t)> & f, uint32_t n) {
+    auto print_f = [](const std::function<int32_t(uint32_t)> & f, uint32_t n) {
         bool is_var = false;
 
-        std::vector<uint32_t> v;
+        std::vector<int32_t> v;
         for (uint32_t i = 0; i < n; ++i) {
             v.push_back(f(i));
             if (v[i] != v[0]) {
@@ -1755,6 +1758,14 @@ void llama_model::print_info() const {
         LLAMA_LOG_INFO("%s: n_ctx_orig_yarn       = %u\n",     __func__, hparams.n_ctx_orig_yarn);
         LLAMA_LOG_INFO("%s: rope_yarn_log_mul     = %.4f\n",   __func__, hparams.rope_yarn_log_mul);
         LLAMA_LOG_INFO("%s: rope_finetuned        = %s\n",     __func__, hparams.rope_finetuned ? "yes" : "unknown");
+        if (arch == LLM_ARCH_GRANITE &&
+            std::any_of(hparams.deepstack_mapping_arr.begin(),
+                        hparams.deepstack_mapping_arr.end(),
+                        [](const auto & entry) { return entry >= 0; })) {
+            LLAMA_LOG_INFO("%s: deepstack_mapping_arr = %s\n", __func__,
+                           print_f([&](uint32_t il) { return hparams.deepstack_mapping_arr[il]; },
+                           hparams.n_layer).c_str());
+        }
         // MRoPE (Multi-axis Rotary Position Embedding) sections
         if (const auto & s = hparams.rope_sections; s[0] || s[1] || s[2] || s[3]) {
             LLAMA_LOG_INFO("%s: mrope sections        = [%d, %d, %d, %d]\n", __func__, s[0], s[1], s[2], s[3]);
diff --git a/src/models/granite.cpp b/src/models/granite.cpp
index 7aff942da0..4a75c5ff3c 100644
--- a/src/models/granite.cpp
+++ b/src/models/granite.cpp
@@ -1,5 +1,7 @@
 #include "models.h"
 
+#include <sstream>
+
 void llama_model_granite::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
     ml.get_key(LLM_KV_LOGIT_SCALE,                 hparams.f_logit_scale);
@@ -7,6 +9,27 @@ void llama_model_granite::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_EMBEDDING_SCALE,             hparams.f_embedding_scale, false);
     ml.get_key(LLM_KV_ATTENTION_SCALE,             hparams.f_attention_scale, false);
 
+    // Granite4 Vision uses array deepstack_mapping
+    ml.get_arr(LLM_KV_DEEPSTACK_MAPPING, hparams.deepstack_mapping_arr, false);
+
+    // Count the unique deepstack input indices
+    std::unordered_set<uint32_t> unique_deepstack_idxs;
+    for (const auto val : hparams.deepstack_mapping_arr) {
+        if (val >= 0) {
+            unique_deepstack_idxs.insert(val);
+        }
+    }
+    hparams.n_deepstack_layers = unique_deepstack_idxs.size();
+
+    // Ensure all values are valid (avoid overflow attacks)
+    for (const auto val : unique_deepstack_idxs) {
+        if (val > hparams.n_deepstack_layers) {
+            std::stringstream ss;
+            ss << "Invalid deepstack index: " << val << " > " << hparams.n_deepstack_layers;
+            throw std::runtime_error(ss.str());
+        }
+    }
+
     // Granite uses rope_finetuned as a switch for rope, so default to true
     bool rope_finetuned = true;
     ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
@@ -112,6 +135,20 @@ llama_model_granite::graph::graph(
     ggml_tensor * inp_out_ids = build_inp_out_ids();
 
     for (int il = 0; il < n_layer; ++il) {
+
+        // Granite Vision 4.1 deepstack: inject the projector stream that
+        // targets decoder layer `il` before the decoder runs.
+        // NOTE: skip the first deepstack layer since that's inpL
+        const auto & deepstack_emb_idx = hparams.deepstack_mapping_arr[il];
+        if (il > 0 && deepstack_emb_idx >= 0) {
+            ggml_tensor * ds = ggml_view_2d(ctx0,
+                res->t_inp_embd, n_embd, n_tokens,
+                res->t_inp_embd->nb[1],
+                deepstack_emb_idx * n_embd * sizeof(float));
+            inpL = ggml_add(ctx0, inpL, ds);
+            cb(inpL, "deepstack_in", il);
+        }
+
         ggml_tensor * inpSA = inpL;
 
         // norm
diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt
index 93f005652b..20c5317863 100644
--- a/tools/mtmd/CMakeLists.txt
+++ b/tools/mtmd/CMakeLists.txt
@@ -25,6 +25,7 @@ add_library(mtmd
             models/gemma4uv.cpp
             models/glm4v.cpp
             models/granite-speech.cpp
+            models/granite4-vision.cpp
             models/hunyuanvl.cpp
             models/internvl.cpp
             models/kimivl.cpp
diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h
index c055cfb754..393e085f71 100644
--- a/tools/mtmd/clip-impl.h
+++ b/tools/mtmd/clip-impl.h
@@ -35,20 +35,22 @@
 #define KEY_LAYER_NORM_EPS      "clip.%s.attention.layer_norm_epsilon"
 
 // vision-specific
-#define KEY_VISION_PROJ_TYPE    "clip.vision.projector_type" // for models with mixed modalities
-#define KEY_IMAGE_SIZE          "clip.vision.image_size"
-#define KEY_IMAGE_MIN_PIXELS    "clip.vision.image_min_pixels"
-#define KEY_IMAGE_MAX_PIXELS    "clip.vision.image_max_pixels"
-#define KEY_PREPROC_MIN_TILES   "clip.vision.preproc_min_tiles"
-#define KEY_PREPROC_MAX_TILES   "clip.vision.preproc_max_tiles"
-#define KEY_PREPROC_IMAGE_SIZE  "clip.vision.preproc_image_size"
-#define KEY_PATCH_SIZE          "clip.vision.patch_size"
-#define KEY_IMAGE_MEAN          "clip.vision.image_mean"
-#define KEY_IMAGE_STD           "clip.vision.image_std"
-#define KEY_FEATURE_LAYER       "clip.vision.feature_layer"
-#define KEY_PROJ_SCALE_FACTOR   "clip.vision.projector.scale_factor"
-#define KEY_SPATIAL_MERGE_SIZE  "clip.vision.spatial_merge_size"
-#define KEY_IS_DEEPSTACK_LAYERS "clip.vision.is_deepstack_layers"
+#define KEY_VISION_PROJ_TYPE        "clip.vision.projector_type" // for models with mixed modalities
+#define KEY_IMAGE_SIZE              "clip.vision.image_size"
+#define KEY_IMAGE_MIN_PIXELS        "clip.vision.image_min_pixels"
+#define KEY_IMAGE_MAX_PIXELS        "clip.vision.image_max_pixels"
+#define KEY_PREPROC_MIN_TILES       "clip.vision.preproc_min_tiles"
+#define KEY_PREPROC_MAX_TILES       "clip.vision.preproc_max_tiles"
+#define KEY_PREPROC_IMAGE_SIZE      "clip.vision.preproc_image_size"
+#define KEY_PATCH_SIZE              "clip.vision.patch_size"
+#define KEY_IMAGE_MEAN              "clip.vision.image_mean"
+#define KEY_IMAGE_STD               "clip.vision.image_std"
+#define KEY_FEATURE_LAYER           "clip.vision.feature_layer"
+#define KEY_PROJ_SCALE_FACTOR       "clip.vision.projector.scale_factor"
+#define KEY_PROJ_SAMPLE_QUERY_SIDE  "clip.vision.projector.query_side"
+#define KEY_PROJ_SAMPLE_WINDOW_SIDE "clip.vision.projector.window_side"
+#define KEY_PROJ_SPATIAL_OFFSETS    "clip.vision.projector.spatial_offsets"
+#define KEY_SPATIAL_MERGE_SIZE      "clip.vision.spatial_merge_size"
 
 #define KEY_MM_PATCH_MERGE_TYPE    "clip.vision.mm_patch_merge_type"
 #define KEY_IMAGE_GRID_PINPOINTS   "clip.vision.image_grid_pinpoints"
@@ -72,7 +74,6 @@
 #define KEY_A_PROJ_DOWNSAMPLE_RATE "clip.audio.projector.downsample_rate"
 #define KEY_A_PROJ_HEAD_COUNT      "clip.audio.projector.head_count"
 
-
 //
 // tensor name constants
 //
@@ -210,22 +211,28 @@
 #define TN_CTC_OUT_MID     "a.enc_ctc_out_mid.%s"
 #define TN_ATTN_REL_POS_EMB "%s.blk.%d.attn_rel_pos_emb"
 // qformer projector
-#define TN_QF_PROJ_QUERY   "a.proj_query"
-#define TN_QF_PROJ_NORM    "a.proj_norm.%s"
-#define TN_QF_PROJ_LINEAR  "a.proj_linear.%s"
-#define TN_QF_SELF_ATTN_Q  "a.proj_blk.%d.self_attn_q.%s"
-#define TN_QF_SELF_ATTN_K  "a.proj_blk.%d.self_attn_k.%s"
-#define TN_QF_SELF_ATTN_V  "a.proj_blk.%d.self_attn_v.%s"
-#define TN_QF_SELF_ATTN_O  "a.proj_blk.%d.self_attn_out.%s"
-#define TN_QF_SELF_ATTN_N  "a.proj_blk.%d.self_attn_norm.%s"
-#define TN_QF_CROSS_ATTN_Q "a.proj_blk.%d.cross_attn_q.%s"
-#define TN_QF_CROSS_ATTN_K "a.proj_blk.%d.cross_attn_k.%s"
-#define TN_QF_CROSS_ATTN_V "a.proj_blk.%d.cross_attn_v.%s"
-#define TN_QF_CROSS_ATTN_O "a.proj_blk.%d.cross_attn_out.%s"
-#define TN_QF_CROSS_ATTN_N "a.proj_blk.%d.cross_attn_norm.%s"
-#define TN_QF_FFN_UP       "a.proj_blk.%d.ffn_up.%s"
-#define TN_QF_FFN_DOWN     "a.proj_blk.%d.ffn_down.%s"
-#define TN_QF_FFN_NORM     "a.proj_blk.%d.ffn_norm.%s"
+#define TN_QF_PROJ_QUERY   "%s.proj_query"
+#define TN_QF_PROJ_NORM    "%s.proj_norm.%s"
+#define TN_QF_PROJ_LINEAR  "%s.proj_linear.%s"
+#define TN_QF_SELF_ATTN_Q  "%s.proj_blk.%d.self_attn_q.%s"
+#define TN_QF_SELF_ATTN_K  "%s.proj_blk.%d.self_attn_k.%s"
+#define TN_QF_SELF_ATTN_V  "%s.proj_blk.%d.self_attn_v.%s"
+#define TN_QF_SELF_ATTN_O  "%s.proj_blk.%d.self_attn_out.%s"
+#define TN_QF_SELF_ATTN_N  "%s.proj_blk.%d.self_attn_norm.%s"
+#define TN_QF_CROSS_ATTN_Q "%s.proj_blk.%d.cross_attn_q.%s"
+#define TN_QF_CROSS_ATTN_K "%s.proj_blk.%d.cross_attn_k.%s"
+#define TN_QF_CROSS_ATTN_V "%s.proj_blk.%d.cross_attn_v.%s"
+#define TN_QF_CROSS_ATTN_O "%s.proj_blk.%d.cross_attn_out.%s"
+#define TN_QF_CROSS_ATTN_N "%s.proj_blk.%d.cross_attn_norm.%s"
+#define TN_QF_FFN_UP       "%s.proj_blk.%d.ffn_up.%s"
+#define TN_QF_FFN_DOWN     "%s.proj_blk.%d.ffn_down.%s"
+#define TN_QF_FFN_NORM     "%s.proj_blk.%d.ffn_norm.%s"
+// multi-projector qformer (bid => projector ID)
+#define TN_MULTI_PROJ_IMG_POS   "v.proj_blk.%d.img_pos"
+#define TN_MULTI_PROJ_QUERY     "%s.proj_blk.%d.query"
+#define TN_MULTI_PROJ_LINEAR    "%s.proj_blk.%d.linear.%s"
+#define TN_MULTI_PROJ_NORM      "%s.proj_blk.%d.norm.%s"
+#define TN_MULTI_PROJ_POST_NORM "%s.proj_blk.%d.post_norm.%s"
 
 // gemma4 audio conformer
 #define TN_A_MM_INP_PROJ     "mm.a.input_projection.%s"
@@ -354,6 +361,7 @@ enum projector_type {
     PROJECTOR_TYPE_MINICPMV4_6,
     PROJECTOR_TYPE_GRANITE_SPEECH,
     PROJECTOR_TYPE_MIMOVL,
+    PROJECTOR_TYPE_GRANITE4_VISION,
     PROJECTOR_TYPE_UNKNOWN,
 };
 
@@ -407,6 +415,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_MINICPMV4_6, "minicpmv4_6"},
     { PROJECTOR_TYPE_GRANITE_SPEECH, "granite_speech"},
     { PROJECTOR_TYPE_MIMOVL,     "mimovl"},
+    { PROJECTOR_TYPE_GRANITE4_VISION, "granite4_vision"},
 };
 
 static projector_type clip_projector_type_from_string(const std::string & str) {
@@ -438,6 +447,8 @@ struct clip_image_f32 {
 
     // marks the global view in e.g., DeepSeek-OCR Models
     bool add_viewsep = false;
+    // whether a learned newline token should be appended after the image (eg Granite4 Vision)
+    bool add_newline = false;
 };
 
 //
diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h
index 238f805a9a..48796b6306 100644
--- a/tools/mtmd/clip-model.h
+++ b/tools/mtmd/clip-model.h
@@ -4,6 +4,7 @@
 #include "clip.h"
 #include "clip-impl.h"
 
+#include <algorithm>
 #include <array>
 #include <vector>
 #include <unordered_set>
@@ -90,7 +91,7 @@ struct clip_hparams {
 
     float eps = 1e-6;
     float rope_theta = 0.0;
-    std::unordered_set<int32_t> vision_feature_layer;
+    std::vector<int32_t> vision_feature_layer;
     int32_t attn_window_size = 0;
     int32_t n_wa_pattern = 0;
     std::unordered_set<int32_t> wa_layer_indexes; // explicit layer indexes that use full attention (for irregular patterns like YoutuVL)
@@ -101,6 +102,11 @@ struct clip_hparams {
     int32_t sam_n_head  = 0;
     int32_t sam_n_embd  = 0;
 
+    // Granite4 Vision (scalars default to 0 like the other hparams; the loader overwrites them from GGUF)
+    std::vector<int32_t> proj_spatial_offsets;   // read from KEY_PROJ_SPATIAL_OFFSETS
+    int32_t downsample_query_side  = 0;          // read from KEY_PROJ_SAMPLE_QUERY_SIDE
+    int32_t downsample_window_side = 0;          // read from KEY_PROJ_SAMPLE_WINDOW_SIDE
+
     // audio
     int32_t n_mel_bins = 0; // whisper preprocessor
     int32_t proj_stack_factor = 0; // ultravox
@@ -158,6 +164,10 @@ struct clip_hparams {
 
         return false;
     }
+
+    bool is_vision_feature_layer(int32_t layer) const { // true when `layer` is listed in vision_feature_layer
+        return std::find(vision_feature_layer.begin(), vision_feature_layer.end(), layer) != vision_feature_layer.end();
+    }
 };
 
 struct clip_layer {
@@ -325,6 +335,20 @@ struct yasa2_stage {
     std::vector<yasa2_block> blocks;
 };
 
+// QFormer projector block for models with 1 (or more) QFormer projectors
+// Granite Speech, Granite4 Vision (clip_model holds these in qf_proj_blocks)
+struct qf_block {
+    ggml_tensor * qf_proj_query       = nullptr;
+    ggml_tensor * qf_proj_norm_w      = nullptr;
+    ggml_tensor * qf_proj_norm_b      = nullptr;
+    ggml_tensor * qf_proj_linear_w    = nullptr;
+    ggml_tensor * qf_proj_linear_b    = nullptr;
+    ggml_tensor * qf_proj_post_norm_w = nullptr; // only loaded by the Granite4 Vision loader (TN_MULTI_PROJ_POST_NORM)
+    ggml_tensor * qf_proj_post_norm_b = nullptr; // only loaded by the Granite4 Vision loader (TN_MULTI_PROJ_POST_NORM)
+    ggml_tensor * qf_proj_img_pos     = nullptr; // Vision only
+    std::vector<clip_layer> qf_proj_layers; // per-layer self-attn / cross-attn / FFN tensors; sized by the loader
+};
+
 struct clip_model {
     clip_modality modality = CLIP_MODALITY_VISION;
     projector_type proj_type = PROJECTOR_TYPE_MLP;
@@ -589,13 +613,8 @@ struct clip_model {
     ggml_tensor * ctc_out_b     = nullptr;
     ggml_tensor * ctc_out_mid_w = nullptr;
     ggml_tensor * ctc_out_mid_b = nullptr;
-    // qformer projector
-    ggml_tensor * qf_proj_query    = nullptr;
-    ggml_tensor * qf_proj_norm_w   = nullptr;
-    ggml_tensor * qf_proj_norm_b   = nullptr;
-    ggml_tensor * qf_proj_linear_w = nullptr;
-    ggml_tensor * qf_proj_linear_b = nullptr;
-    std::vector<clip_layer> qf_proj_layers;
+    // qformer projector(s)
+    std::vector<qf_block> qf_proj_blocks;
 
     bool audio_has_avgpool() const {
         return proj_type == PROJECTOR_TYPE_QWEN2A
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 80136ed866..c12c910a1c 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -997,6 +997,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             {
                 builder = std::make_unique<clip_graph_yasa2>(ctx, img);
             } break;
+        case PROJECTOR_TYPE_GRANITE4_VISION:
+            {
+                builder = std::make_unique<clip_graph_granite4_vision>(ctx, img);
+            } break;
         default:
             GGML_ABORT("missing cgraph builder");
     }
@@ -1234,12 +1238,7 @@ struct clip_model_loader {
             // to form the final visual features.
             // NOTE: gguf conversions should standardize the values of the vision feature layer to
             // be non-negative, since we use -1 to mark values as unset here.
-            std::vector<int> vision_feature_layer;
-            get_arr_int(KEY_FEATURE_LAYER, vision_feature_layer, false);
-            // convert std::vector to std::unordered_set
-            for (auto & layer : vision_feature_layer) {
-                hparams.vision_feature_layer.insert(layer);
-            }
+            get_arr_int(KEY_FEATURE_LAYER, hparams.vision_feature_layer, false);
 
             // model-specific params
             switch (model.proj_type) {
@@ -1627,6 +1626,23 @@ struct clip_model_loader {
                         hparams.image_pad_color   = {127, 127, 127};
                         hparams.image_resize_algo = RESIZE_ALGO_BILINEAR;
                     } break;
+                case PROJECTOR_TYPE_GRANITE4_VISION:
+                    {
+                        // SigLIP tower.
+                        hparams.image_resize_algo = RESIZE_ALGO_BICUBIC_PILLOW;
+                        hparams.image_resize_pad = PAD_CEIL;
+
+                        get_arr_int(KEY_FEATURE_LAYER, hparams.vision_feature_layer); // re-read without `false`, unlike the generic load above
+                        get_arr_int(KEY_PROJ_SPATIAL_OFFSETS, hparams.proj_spatial_offsets);
+                        if (hparams.vision_feature_layer.size() != hparams.proj_spatial_offsets.size()) { // the two arrays must have equal length
+                            throw std::runtime_error(string_format("%s: vision_feature_layer.size() %zu != proj_spatial_offsets.size() %zu",
+                                                                   __func__, hparams.vision_feature_layer.size(), hparams.proj_spatial_offsets.size()));
+                        }
+
+                        get_u32(KEY_PROJ_SAMPLE_QUERY_SIDE,  hparams.downsample_query_side);
+                        get_u32(KEY_PROJ_SAMPLE_WINDOW_SIDE, hparams.downsample_window_side);
+                        hparams.warmup_image_size = hparams.image_size;
+                    } break;
                 default:
                     throw std::runtime_error(string_format("%s: unknown vision projector type %s\n", __func__, proj_type.c_str()));
             }
@@ -2628,47 +2644,106 @@ struct clip_model_loader {
                         layer.conv_pw2_b  = get_tensor(string_format(TN_CONV_PW2,  prefix, il, "bias"));
                     }
 
-                    model.qf_proj_query    = get_tensor(TN_QF_PROJ_QUERY);
-                    model.qf_proj_norm_w   = get_tensor(string_format(TN_QF_PROJ_NORM, "weight"));
-                    model.qf_proj_norm_b   = get_tensor(string_format(TN_QF_PROJ_NORM, "bias"));
-                    model.qf_proj_linear_w = get_tensor(string_format(TN_QF_PROJ_LINEAR, "weight"));
-                    model.qf_proj_linear_b = get_tensor(string_format(TN_QF_PROJ_LINEAR, "bias"));
+                    model.qf_proj_blocks.resize(1);
+                    auto & qf = model.qf_proj_blocks[0];
+                    qf.qf_proj_query    = get_tensor(string_format(TN_QF_PROJ_QUERY, prefix));
+                    qf.qf_proj_norm_w   = get_tensor(string_format(TN_QF_PROJ_NORM, prefix, "weight"));
+                    qf.qf_proj_norm_b   = get_tensor(string_format(TN_QF_PROJ_NORM, prefix, "bias"));
+                    qf.qf_proj_linear_w = get_tensor(string_format(TN_QF_PROJ_LINEAR, prefix, "weight"));
+                    qf.qf_proj_linear_b = get_tensor(string_format(TN_QF_PROJ_LINEAR, prefix, "bias"));
 
                     const int n_proj_layers = 2;
-                    model.qf_proj_layers.resize(n_proj_layers);
+                    qf.qf_proj_layers.resize(n_proj_layers);
                     for (int il = 0; il < n_proj_layers; ++il) {
-                        auto & pl = model.qf_proj_layers[il];
+                        auto & pl = qf.qf_proj_layers[il];
 
-                        pl.q_w    = get_tensor(string_format(TN_QF_SELF_ATTN_Q, il, "weight"));
-                        pl.q_b    = get_tensor(string_format(TN_QF_SELF_ATTN_Q, il, "bias"));
-                        pl.k_w    = get_tensor(string_format(TN_QF_SELF_ATTN_K, il, "weight"));
-                        pl.k_b    = get_tensor(string_format(TN_QF_SELF_ATTN_K, il, "bias"));
-                        pl.v_w    = get_tensor(string_format(TN_QF_SELF_ATTN_V, il, "weight"));
-                        pl.v_b    = get_tensor(string_format(TN_QF_SELF_ATTN_V, il, "bias"));
-                        pl.o_w    = get_tensor(string_format(TN_QF_SELF_ATTN_O, il, "weight"));
-                        pl.o_b    = get_tensor(string_format(TN_QF_SELF_ATTN_O, il, "bias"));
-                        pl.ln_1_w = get_tensor(string_format(TN_QF_SELF_ATTN_N, il, "weight"));
-                        pl.ln_1_b = get_tensor(string_format(TN_QF_SELF_ATTN_N, il, "bias"));
+                        pl.q_w    = get_tensor(string_format(TN_QF_SELF_ATTN_Q, prefix, il, "weight"));
+                        pl.q_b    = get_tensor(string_format(TN_QF_SELF_ATTN_Q, prefix, il, "bias"));
+                        pl.k_w    = get_tensor(string_format(TN_QF_SELF_ATTN_K, prefix, il, "weight"));
+                        pl.k_b    = get_tensor(string_format(TN_QF_SELF_ATTN_K, prefix, il, "bias"));
+                        pl.v_w    = get_tensor(string_format(TN_QF_SELF_ATTN_V, prefix, il, "weight"));
+                        pl.v_b    = get_tensor(string_format(TN_QF_SELF_ATTN_V, prefix, il, "bias"));
+                        pl.o_w    = get_tensor(string_format(TN_QF_SELF_ATTN_O, prefix, il, "weight"));
+                        pl.o_b    = get_tensor(string_format(TN_QF_SELF_ATTN_O, prefix, il, "bias"));
+                        pl.ln_1_w = get_tensor(string_format(TN_QF_SELF_ATTN_N, prefix, il, "weight"));
+                        pl.ln_1_b = get_tensor(string_format(TN_QF_SELF_ATTN_N, prefix, il, "bias"));
 
-                        pl.cross_attn_q_w    = get_tensor(string_format(TN_QF_CROSS_ATTN_Q, il, "weight"));
-                        pl.cross_attn_q_b    = get_tensor(string_format(TN_QF_CROSS_ATTN_Q, il, "bias"));
-                        pl.cross_attn_k_w    = get_tensor(string_format(TN_QF_CROSS_ATTN_K, il, "weight"));
-                        pl.cross_attn_k_b    = get_tensor(string_format(TN_QF_CROSS_ATTN_K, il, "bias"));
-                        pl.cross_attn_v_w    = get_tensor(string_format(TN_QF_CROSS_ATTN_V, il, "weight"));
-                        pl.cross_attn_v_b    = get_tensor(string_format(TN_QF_CROSS_ATTN_V, il, "bias"));
-                        pl.cross_attn_o_w    = get_tensor(string_format(TN_QF_CROSS_ATTN_O, il, "weight"));
-                        pl.cross_attn_o_b    = get_tensor(string_format(TN_QF_CROSS_ATTN_O, il, "bias"));
-                        pl.cross_attn_norm_w = get_tensor(string_format(TN_QF_CROSS_ATTN_N, il, "weight"));
-                        pl.cross_attn_norm_b = get_tensor(string_format(TN_QF_CROSS_ATTN_N, il, "bias"));
+                        pl.cross_attn_q_w    = get_tensor(string_format(TN_QF_CROSS_ATTN_Q, prefix, il, "weight"));
+                        pl.cross_attn_q_b    = get_tensor(string_format(TN_QF_CROSS_ATTN_Q, prefix, il, "bias"));
+                        pl.cross_attn_k_w    = get_tensor(string_format(TN_QF_CROSS_ATTN_K, prefix, il, "weight"));
+                        pl.cross_attn_k_b    = get_tensor(string_format(TN_QF_CROSS_ATTN_K, prefix, il, "bias"));
+                        pl.cross_attn_v_w    = get_tensor(string_format(TN_QF_CROSS_ATTN_V, prefix, il, "weight"));
+                        pl.cross_attn_v_b    = get_tensor(string_format(TN_QF_CROSS_ATTN_V, prefix, il, "bias"));
+                        pl.cross_attn_o_w    = get_tensor(string_format(TN_QF_CROSS_ATTN_O, prefix, il, "weight"));
+                        pl.cross_attn_o_b    = get_tensor(string_format(TN_QF_CROSS_ATTN_O, prefix, il, "bias"));
+                        pl.cross_attn_norm_w = get_tensor(string_format(TN_QF_CROSS_ATTN_N, prefix, il, "weight"));
+                        pl.cross_attn_norm_b = get_tensor(string_format(TN_QF_CROSS_ATTN_N, prefix, il, "bias"));
 
-                        pl.ff_up_w   = get_tensor(string_format(TN_QF_FFN_UP,   il, "weight"));
-                        pl.ff_up_b   = get_tensor(string_format(TN_QF_FFN_UP,   il, "bias"));
-                        pl.ff_down_w = get_tensor(string_format(TN_QF_FFN_DOWN, il, "weight"));
-                        pl.ff_down_b = get_tensor(string_format(TN_QF_FFN_DOWN, il, "bias"));
-                        pl.ln_2_w    = get_tensor(string_format(TN_QF_FFN_NORM, il, "weight"));
-                        pl.ln_2_b    = get_tensor(string_format(TN_QF_FFN_NORM, il, "bias"));
+                        pl.ff_up_w   = get_tensor(string_format(TN_QF_FFN_UP,   prefix, il, "weight"));
+                        pl.ff_up_b   = get_tensor(string_format(TN_QF_FFN_UP,   prefix, il, "bias"));
+                        pl.ff_down_w = get_tensor(string_format(TN_QF_FFN_DOWN, prefix, il, "weight"));
+                        pl.ff_down_b = get_tensor(string_format(TN_QF_FFN_DOWN, prefix, il, "bias"));
+                        pl.ln_2_w    = get_tensor(string_format(TN_QF_FFN_NORM, prefix, il, "weight"));
+                        pl.ln_2_b    = get_tensor(string_format(TN_QF_FFN_NORM, prefix, il, "bias"));
                     }
                 } break;
+            case PROJECTOR_TYPE_GRANITE4_VISION:
+                {
+                    // image_newline lives at the top-level.
+                    model.image_newline = get_tensor(TN_IMAGE_NEWLINE);
+
+                    // Load separate layerwise and spatial projector tensors
+                    const auto projector_count = hparams.vision_feature_layer.size();
+                    model.qf_proj_blocks.resize(projector_count);
+                    for (size_t bid = 0; bid < projector_count; ++bid) {
+                        auto & b = model.qf_proj_blocks[bid];
+
+                        // non-layerwise tensors
+                        b.qf_proj_img_pos     = get_tensor(string_format(TN_MULTI_PROJ_IMG_POS,           bid));
+                        b.qf_proj_query       = get_tensor(string_format(TN_MULTI_PROJ_QUERY,     prefix, bid));
+                        b.qf_proj_linear_w    = get_tensor(string_format(TN_MULTI_PROJ_LINEAR,    prefix, bid, "weight"));
+                        b.qf_proj_linear_b    = get_tensor(string_format(TN_MULTI_PROJ_LINEAR,    prefix, bid, "bias"));
+                        b.qf_proj_norm_w      = get_tensor(string_format(TN_MULTI_PROJ_NORM,      prefix, bid, "weight"));
+                        b.qf_proj_norm_b      = get_tensor(string_format(TN_MULTI_PROJ_NORM,      prefix, bid, "bias"));
+                        b.qf_proj_post_norm_w = get_tensor(string_format(TN_MULTI_PROJ_POST_NORM, prefix, bid, "weight"));
+                        b.qf_proj_post_norm_b = get_tensor(string_format(TN_MULTI_PROJ_POST_NORM, prefix, bid, "bias"));
+
+                        // layerwise tensors
+                        // NOTE: If any model uses multi-layer qformers, this will need to change
+                        b.qf_proj_layers.resize(1);
+                        auto & pl = b.qf_proj_layers[0];
+
+                        pl.q_w    = get_tensor(string_format(TN_QF_SELF_ATTN_Q, prefix, bid, "weight"));
+                        pl.q_b    = get_tensor(string_format(TN_QF_SELF_ATTN_Q, prefix, bid, "bias"));
+                        pl.k_w    = get_tensor(string_format(TN_QF_SELF_ATTN_K, prefix, bid, "weight"));
+                        pl.k_b    = get_tensor(string_format(TN_QF_SELF_ATTN_K, prefix, bid, "bias"));
+                        pl.v_w    = get_tensor(string_format(TN_QF_SELF_ATTN_V, prefix, bid, "weight"));
+                        pl.v_b    = get_tensor(string_format(TN_QF_SELF_ATTN_V, prefix, bid, "bias"));
+                        pl.o_w    = get_tensor(string_format(TN_QF_SELF_ATTN_O, prefix, bid, "weight"));
+                        pl.o_b    = get_tensor(string_format(TN_QF_SELF_ATTN_O, prefix, bid, "bias"));
+                        pl.ln_1_w = get_tensor(string_format(TN_QF_SELF_ATTN_N, prefix, bid, "weight"));
+                        pl.ln_1_b = get_tensor(string_format(TN_QF_SELF_ATTN_N, prefix, bid, "bias"));
+
+                        pl.cross_attn_q_w    = get_tensor(string_format(TN_QF_CROSS_ATTN_Q, prefix, bid, "weight"));
+                        pl.cross_attn_q_b    = get_tensor(string_format(TN_QF_CROSS_ATTN_Q, prefix, bid, "bias"));
+                        pl.cross_attn_k_w    = get_tensor(string_format(TN_QF_CROSS_ATTN_K, prefix, bid, "weight"));
+                        pl.cross_attn_k_b    = get_tensor(string_format(TN_QF_CROSS_ATTN_K, prefix, bid, "bias"));
+                        pl.cross_attn_v_w    = get_tensor(string_format(TN_QF_CROSS_ATTN_V, prefix, bid, "weight"));
+                        pl.cross_attn_v_b    = get_tensor(string_format(TN_QF_CROSS_ATTN_V, prefix, bid, "bias"));
+                        pl.cross_attn_o_w    = get_tensor(string_format(TN_QF_CROSS_ATTN_O, prefix, bid, "weight"));
+                        pl.cross_attn_o_b    = get_tensor(string_format(TN_QF_CROSS_ATTN_O, prefix, bid, "bias"));
+                        pl.cross_attn_norm_w = get_tensor(string_format(TN_QF_CROSS_ATTN_N, prefix, bid, "weight"));
+                        pl.cross_attn_norm_b = get_tensor(string_format(TN_QF_CROSS_ATTN_N, prefix, bid, "bias"));
+
+                        pl.ff_up_w   = get_tensor(string_format(TN_QF_FFN_UP,   prefix, bid, "weight"));
+                        pl.ff_up_b   = get_tensor(string_format(TN_QF_FFN_UP,   prefix, bid, "bias"));
+                        pl.ff_down_w = get_tensor(string_format(TN_QF_FFN_DOWN, prefix, bid, "weight"));
+                        pl.ff_down_b = get_tensor(string_format(TN_QF_FFN_DOWN, prefix, bid, "bias"));
+                        pl.ln_2_w    = get_tensor(string_format(TN_QF_FFN_NORM, prefix, bid, "weight"));
+                        pl.ln_2_b    = get_tensor(string_format(TN_QF_FFN_NORM, prefix, bid, "bias"));
+                    }
+
+                } break;
             default:
                 GGML_ASSERT(false && "unknown projector type");
         }
@@ -3085,10 +3160,6 @@ void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny
     memcpy(img->buf.data(), rgb_pixels, img->buf.size());
 }
 
-ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) {
-    return ctx->model.image_newline;
-}
-
 void clip_free(clip_ctx * ctx) {
     if (ctx == nullptr) {
         return;
@@ -3397,6 +3468,23 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
                 const int ds = ctx->model.hparams.audio_proj_downsample_rate;
                 n_patches = ((img->nx + ws - 1) / ws) * (ws / ds);
             } break;
+        case PROJECTOR_TYPE_GRANITE4_VISION:
+            {
+                // Per-tile output token count: each projector block outputs
+                // query_side^2 tokens per window × n^2 windows.
+                // For 384×384 input: n = 24/8 = 3, query_side = 4 → 144.
+                const int window_side = ctx->model.hparams.downsample_window_side;
+                const int query_side  = ctx->model.hparams.downsample_query_side;
+                const int side        = img->nx / params.patch_size;
+                const int n           = side / window_side;
+                n_patches             = (query_side * n) * (query_side * n);
+                if (img->add_newline) {
+                    // For single-tile case: append 1 newline row.
+                    // For multi-tile rowwise: handled by caller, but here we
+                    // report the per-tile count including one trailing newline.
+                    n_patches += 1;
+                }
+            } break;
         default:
             GGML_ABORT("unsupported projector type");
     }
@@ -4229,6 +4317,82 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
                     set_input_f32("attn_mask", mask);
                 }
             } break;
+        case PROJECTOR_TYPE_GRANITE4_VISION:
+            {
+                // Granite Vision 4.1 uses precomputed permutation index
+                // tensors to express the _win / _unwin / spatial sampling
+                // reshapes as ggml_get_rows gathers. The input tensors are created and
+                // named by clip_graph_granite4_vision::gather() in models/granite4-vision.cpp.
+                const int patch_size  = model.hparams.patch_size;
+                const int image_side  = imgs.entries.front()->nx / patch_size;
+                const int window_side = hparams.downsample_window_side;
+                const int query_side  = hparams.downsample_query_side;
+                const int n           = image_side / window_side;
+                const int new_side    = n * query_side;
+
+                // Builds the raster→window permutation indices for a
+                // (side, side) grid split into (n × n) windows of (win × win)
+                // tokens each.  dst[w * win*win + p] = source raster index.
+                auto make_win_idx = [](int side, int win) {
+                    const int nn = side / win;
+                    std::vector<int32_t> idx(static_cast<size_t>(side) * side);
+                    for (int wy = 0; wy < nn; ++wy) {
+                        for (int wx = 0; wx < nn; ++wx) {
+                            for (int iy = 0; iy < win; ++iy) {
+                                for (int ix = 0; ix < win; ++ix) {
+                                    const int w  = wy * nn + wx;
+                                    const int p  = iy * win + ix;
+                                    const int y  = wy * win + iy;
+                                    const int x  = wx * win + ix;
+                                    idx[static_cast<size_t>(w) * (win*win) + p] = y * side + x;
+                                }
+                            }
+                        }
+                    }
+                    return idx;
+                };
+
+                auto make_unwin_idx = [&](int side, int win) {
+                    const std::vector<int32_t> fwd = make_win_idx(side, win);
+                    std::vector<int32_t> inv(fwd.size());
+                    for (size_t i = 0; i < fwd.size(); ++i) {
+                        inv[fwd[i]] = static_cast<int32_t>(i);
+                    }
+                    return inv;
+                };
+
+                auto make_spatial_idx = [](int side, int offset) {
+                    const int off_y = (offset >> 1) & 1;
+                    const int off_x = offset & 1;
+                    const int new_s = side / 2;
+                    std::vector<int32_t> idx(static_cast<size_t>(new_s) * new_s);
+                    for (int y = 0; y < new_s; ++y) {
+                        for (int x = 0; x < new_s; ++x) {
+                            idx[y * new_s + x] = (y * 2 + off_y) * side + (x * 2 + off_x);
+                        }
+                    }
+                    return idx;
+                };
+
+                auto upload = [&](const std::string & name, const std::vector<int32_t> & idx) {
+                    ggml_tensor * t = ggml_graph_get_tensor(gf, name.c_str());
+                    GGML_ASSERT(t);
+                    ggml_backend_tensor_set(t, idx.data(), 0, idx.size() * sizeof(int32_t));
+                };
+
+                // Upload the index tensors for every projector block; spatial_idx
+                // is uploaded only for blocks with a non-negative spatial offset.
+                for (size_t bid = 0; bid < hparams.vision_feature_layer.size(); ++bid) {
+                    const std::string prefix = "g4v_blk" + std::to_string(bid) + "_";
+                    upload(prefix + "win_idx",     make_win_idx(image_side, window_side));
+                    upload(prefix + "qwin_idx",    make_win_idx(new_side, query_side));
+                    upload(prefix + "unwin_idx",   make_unwin_idx(new_side, query_side));
+                    const auto spatial_offset = hparams.proj_spatial_offsets[bid];
+                    if (spatial_offset >= 0) {
+                        upload(prefix + "spatial_idx", make_spatial_idx(image_side,spatial_offset));
+                    }
+                }
+            } break;
         default:
             GGML_ABORT("Unknown projector type");
     }
@@ -4384,7 +4548,9 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
         case PROJECTOR_TYPE_LFM2A:
             return ctx->model.position_embeddings->ne[0];
         case PROJECTOR_TYPE_GRANITE_SPEECH:
-            return ctx->model.qf_proj_linear_w->ne[1];
+            return ctx->model.qf_proj_blocks[0].qf_proj_linear_w->ne[1];
+        case PROJECTOR_TYPE_GRANITE4_VISION:
+            return ctx->model.qf_proj_blocks.size() * ctx->model.hparams.projection_dim;
         case PROJECTOR_TYPE_GLM4V:
             return ctx->model.mm_ffn_down_w->ne[1];
         default:
diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h
index 9b807ffa77..a62c9d6187 100644
--- a/tools/mtmd/clip.h
+++ b/tools/mtmd/clip.h
@@ -100,8 +100,6 @@ struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch
  */
 void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img);
 
-struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);
-
 bool clip_image_encode      (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
 bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);
 
diff --git a/tools/mtmd/models/granite-speech.cpp b/tools/mtmd/models/granite-speech.cpp
index c7e3794a49..5e66f75d0a 100644
--- a/tools/mtmd/models/granite-speech.cpp
+++ b/tools/mtmd/models/granite-speech.cpp
@@ -199,8 +199,8 @@ ggml_cgraph * clip_graph_granite_speech::build() {
 
         ggml_tensor * enc_windows = ggml_reshape_3d(ctx0, cur, n_embd, window_size, nblocks_proj);
 
-        ggml_tensor * queries = build_norm(model.qf_proj_query,
-            model.qf_proj_norm_w, model.qf_proj_norm_b,
+        ggml_tensor * queries = build_norm(model.qf_proj_blocks[0].qf_proj_query,
+            model.qf_proj_blocks[0].qf_proj_norm_w, model.qf_proj_blocks[0].qf_proj_norm_b,
             NORM_TYPE_NORMAL, proj_eps, -1);
         {
             ggml_tensor * q_3d    = ggml_reshape_3d(ctx0, queries, n_embd, num_queries, 1);
@@ -209,8 +209,8 @@ ggml_cgraph * clip_graph_granite_speech::build() {
             queries = ggml_repeat(ctx0, q_3d, q_shape);
         }
 
-        for (int il = 0; il < (int)model.qf_proj_layers.size(); il++) {
-            const auto & pl = model.qf_proj_layers[il];
+        for (int il = 0; il < (int)model.qf_proj_blocks[0].qf_proj_layers.size(); il++) {
+            const auto & pl = model.qf_proj_blocks[0].qf_proj_layers[il];
 
             // self-attention
             {
@@ -265,7 +265,7 @@ ggml_cgraph * clip_graph_granite_speech::build() {
         }
 
         cur = ggml_reshape_2d(ctx0, queries, n_embd, num_queries * nblocks_proj);
-        cur = ggml_add(ctx0, build_mm(model.qf_proj_linear_w, cur), model.qf_proj_linear_b);
+        cur = ggml_add(ctx0, build_mm(model.qf_proj_blocks[0].qf_proj_linear_w, cur), model.qf_proj_blocks[0].qf_proj_linear_b);
         cb(cur, "projector_out", -1);
     }
 
diff --git a/tools/mtmd/models/granite4-vision.cpp b/tools/mtmd/models/granite4-vision.cpp
new file mode 100644
index 0000000000..9adb6f0fdb
--- /dev/null
+++ b/tools/mtmd/models/granite4-vision.cpp
@@ -0,0 +1,339 @@
+#include "models.h"
+#include "../clip-impl.h"
+#include "../clip-model.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstring>
+#include <string>
+#include <vector>
+
+/*
+ * Granite Vision 4.1 clip graph
+ *
+ *   Stage 1a: SigLIP vision tower (N layers, post-norm)
+ *   Stage 1b: WindowQFormer blocks (deepstack + spatial)
+ *   Stage 1c: Concatenate and pack outputs
+ *   Stage 1d: Append newline tokens if add_newline is set
+ */
+
+// ---------------------------------------------------------------------------
+// Member method implementations
+// ---------------------------------------------------------------------------
+
+ggml_tensor * clip_graph_granite4_vision::gather( // returns ggml_get_rows(src, idx) for a freshly created, named I32 index input
+        ggml_tensor * src, // tensor whose rows are selected
+        const std::string & name, // graph name given to the index tensor so its data can be looked up and set by name later
+        int idx_len) { // number of indices, i.e. number of rows in the result
+    ggml_tensor * idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, idx_len);
+    ggml_set_name(idx, name.c_str());
+    ggml_set_input(idx); // marked as a graph input: no index data is written here
+    return ggml_get_rows(ctx0, src, idx);
+}
+
+ggml_tensor * clip_graph_granite4_vision::interp_down( // average-pools a side x side token grid down to new_side x new_side tokens
+        ggml_tensor * src, // reshaped below as (n_embd, side, side, 1), so it must hold side*side tokens of width ne[0]
+        int side, // tokens per side of the input grid
+        int new_side) { // tokens per side of the output grid
+    const int n_embd = src->ne[0];
+    ggml_tensor * t = ggml_reshape_4d(ctx0, src, n_embd, side, side, 1);
+    t = ggml_cont(ctx0, ggml_permute(ctx0, t, 2, 0, 1, 3)); // move the embedding axis to dim 2 so pooling runs over the two grid axes
+    const int kernel = side / new_side; // integer division; assumes side is a multiple of new_side — TODO confirm
+    t = ggml_pool_2d(ctx0, t, GGML_OP_POOL_AVG, kernel, kernel, kernel, kernel, 0, 0); // kernel == stride, no padding
+    t = ggml_cont(ctx0, ggml_permute(ctx0, t, 1, 2, 0, 3)); // move the embedding axis back to dim 0
+    return ggml_reshape_2d(ctx0, t, n_embd, new_side * new_side);
+}
+
+// ---------------------------------------------------------------------------
+// build_block - one WindowQFormer block: per-window self/cross-attention + FFN over query tokens, then out_linear
+// ---------------------------------------------------------------------------
+
+ggml_tensor * clip_graph_granite4_vision::build_block(
+        const qf_block & blk, // weights of this projector block
+        ggml_tensor * h, // input tokens; ne[1] must equal image_side^2 (asserted below)
+        int bid, // block index, used in tensor names ("g4v_blk<bid>_...") and passed to build_norm/build_attn
+        int spatial_offset, // >= 0: queries come from an index gather; < 0: queries come from average pooling
+        int image_side, // tokens per side of the input grid
+        int window_side, // tokens per side of one encoder window
+        int query_side, // query tokens per side of one window
+        float qformer_eps) { // epsilon for the norms from step 6 onward
+
+    const int n_embd = h->ne[0];
+    GGML_ASSERT(h->ne[1] == image_side * image_side);
+    const int n = image_side / window_side; // windows per side (integer division)
+    const int new_side = n * query_side; // tokens per side of the query grid
+    const int n_windows = n * n;
+    const int enc_len = window_side * window_side; // encoder tokens per window
+    const int query_len = query_side * query_side; // query tokens per window
+
+    auto cbx = [&](ggml_tensor * & t, const char * step) { // names t "g4v_blk<bid>_<step>"
+        const std::string name = "g4v_blk" + std::to_string(bid) + "_" + step;
+        ggml_set_name(t, name.c_str());
+    };
+
+    // 1. Top-level LN
+    cbx(h, "inp"); // note: this renames the tensor passed in by the caller
+    ggml_tensor * x = build_norm(h, blk.qf_proj_norm_w, blk.qf_proj_norm_b, NORM_TYPE_NORMAL, eps, bid); // NOTE(review): uses eps, not the qformer_eps parameter — confirm intended
+    cbx(x, "norm");
+
+    // 2. enc = _win(x, image_side, window_side)
+    ggml_tensor * enc;
+    {
+        ggml_tensor * enc_flat = gather(x,
+            "g4v_blk" + std::to_string(bid) + "_win_idx",
+            image_side * image_side);
+        enc = ggml_reshape_3d(ctx0, enc_flat, n_embd, enc_len, n_windows);
+    }
+    cbx(enc, "enc");
+
+    // 3. downsampled = downsampler(x)
+    ggml_tensor * d;
+    (void) spatial_offset; // NOTE(review): redundant, spatial_offset is read on the next line
+    if (spatial_offset >= 0) {
+        d = gather(x, // NOTE(review): gather length is new_side^2; confirm the uploaded spatial_idx has exactly that many entries
+            "g4v_blk" + std::to_string(bid) + "_spatial_idx",
+            new_side * new_side);
+    } else {
+        d = interp_down(x, image_side, new_side);
+    }
+    cbx(d, "downsampled");
+
+    // 4. query_embeds = query + _win(d, new_side, query_side)
+    ggml_tensor * q_in;
+    {
+        ggml_tensor * dw_flat = gather(d,
+            "g4v_blk" + std::to_string(bid) + "_qwin_idx",
+            new_side * new_side);
+        ggml_tensor * dw = ggml_reshape_3d(ctx0, dw_flat, n_embd, query_len, n_windows);
+        q_in = ggml_add(ctx0, dw, blk.qf_proj_query); // presumably qf_proj_query is broadcast across windows — TODO confirm its shape
+    }
+    cbx(q_in, "query_embeds");
+
+    // 5. encoder_embeds = enc + image_positions → (C, enc_len, n_windows)
+    ggml_tensor * e_in = ggml_add(ctx0, enc, blk.qf_proj_img_pos);
+    cbx(e_in, "encoder_embeds");
+
+    // 6. Qformer forward: norm the queries, then self-attention, cross-attention and FFN (each followed by residual add + norm).
+    ggml_tensor * q = build_norm(q_in, blk.qf_proj_post_norm_w, blk.qf_proj_post_norm_b, NORM_TYPE_NORMAL, qformer_eps, bid);
+
+    // Helper for linear projections: flattens dims 1 and 2 of x into one, applies w, adds b when present (2-D result)
+    auto linear = [&](ggml_tensor * x, ggml_tensor * w, ggml_tensor * b) -> ggml_tensor * {
+        ggml_tensor * t = ggml_reshape_2d(ctx0, x, x->ne[0], x->ne[1] * x->ne[2]);
+        t = build_mm(w, t);
+        if (b) t = ggml_add(ctx0, t, b);
+        return t;
+    };
+
+    // Get the single QFormer layer (exactly one layer per block is supported, asserted here)
+    GGML_ASSERT(blk.qf_proj_layers.size() == 1);
+    const auto & pl = blk.qf_proj_layers[0];
+
+    // 6a. Self-attention (Q, K and V are all projected from the normed queries q)
+    ggml_tensor * sa_out;
+    {
+        const int d_h = 64; // NOTE(review): head size is hard-coded; assumes n_embd is a multiple of 64
+        const int n_head = n_embd / d_h;
+        const int nq = q->ne[1];
+        const float scale = 1.0f / std::sqrt((float) d_h);
+
+        ggml_tensor * Q = linear(q, pl.q_w, pl.q_b);
+        ggml_tensor * K = linear(q, pl.k_w, pl.k_b);
+        ggml_tensor * V = linear(q, pl.v_w, pl.v_b);
+
+        Q = ggml_reshape_4d(ctx0, Q, d_h, n_head, nq, n_windows);
+        K = ggml_reshape_4d(ctx0, K, d_h, n_head, nq, n_windows);
+        V = ggml_reshape_4d(ctx0, V, d_h, n_head, nq, n_windows);
+
+        sa_out = build_attn(pl.o_w, pl.o_b, Q, K, V, nullptr, scale, bid); // nullptr: no attention mask
+        sa_out = ggml_reshape_3d(ctx0, sa_out, n_embd, nq, n_windows);
+
+        sa_out = ggml_add(ctx0, sa_out, q); // residual, then norm
+        sa_out = build_norm(sa_out, pl.ln_1_w, pl.ln_1_b,
+                            NORM_TYPE_NORMAL, qformer_eps, bid);
+    }
+    cbx(sa_out, "sa_out");
+
+    // 6b. Cross-attention (queries from sa_out, keys/values from encoder_embeds)
+    ggml_tensor * ca_out;
+    {
+        const int d_h = 64; // NOTE(review): same hard-coded head size as in 6a
+        const int n_head = n_embd / d_h;
+        const int nq = sa_out->ne[1];
+        const int nkv = e_in->ne[1];
+        const float scale = 1.0f / std::sqrt((float) d_h);
+
+        ggml_tensor * Q = linear(sa_out, pl.cross_attn_q_w, pl.cross_attn_q_b);
+        ggml_tensor * K = linear(e_in, pl.cross_attn_k_w, pl.cross_attn_k_b);
+        ggml_tensor * V = linear(e_in, pl.cross_attn_v_w, pl.cross_attn_v_b);
+
+        Q = ggml_reshape_4d(ctx0, Q, d_h, n_head, nq, n_windows);
+        K = ggml_reshape_4d(ctx0, K, d_h, n_head, nkv, n_windows);
+        V = ggml_reshape_4d(ctx0, V, d_h, n_head, nkv, n_windows);
+
+        ca_out = build_attn(pl.cross_attn_o_w, pl.cross_attn_o_b,
+                            Q, K, V, nullptr, scale, bid);
+        ca_out = ggml_reshape_3d(ctx0, ca_out, n_embd, nq, n_windows);
+
+        ca_out = ggml_add(ctx0, ca_out, sa_out); // residual, then norm
+        ca_out = build_norm(ca_out, pl.cross_attn_norm_w, pl.cross_attn_norm_b,
+                            NORM_TYPE_NORMAL, qformer_eps, bid);
+    }
+    cbx(ca_out, "ca_out");
+
+    // 6c. FFN (up -> GELU(erf) -> down), then residual add and norm
+    ggml_tensor * ffn;
+    {
+        ggml_tensor * t = ggml_reshape_2d(ctx0, ca_out, n_embd, query_len * n_windows);
+        t = build_mm(pl.ff_up_w, t);
+        if (pl.ff_up_b) t = ggml_add(ctx0, t, pl.ff_up_b);
+        t = ggml_gelu_erf(ctx0, t);
+        t = build_mm(pl.ff_down_w, t);
+        if (pl.ff_down_b) t = ggml_add(ctx0, t, pl.ff_down_b);
+        t = ggml_reshape_3d(ctx0, t, n_embd, query_len, n_windows);
+        ffn = ggml_add(ctx0, t, ca_out);
+        ffn = build_norm(ffn, pl.ln_2_w, pl.ln_2_b, NORM_TYPE_NORMAL, qformer_eps, bid);
+    }
+    cbx(ffn, "qformer_out");
+
+    // 7. _unwin back to raster (row gather with the "unwin_idx" index input)
+    ggml_tensor * unwinned;
+    {
+        ggml_tensor * flat = ggml_reshape_2d(ctx0, ffn, n_embd, query_len * n_windows);
+        unwinned = gather(flat,
+            "g4v_blk" + std::to_string(bid) + "_unwin_idx",
+            new_side * new_side);
+    }
+    cbx(unwinned, "unwin");
+
+    // 8. out_linear (bias is optional)
+    ggml_tensor * out = build_mm(blk.qf_proj_linear_w, unwinned);
+    if (blk.qf_proj_linear_b) out = ggml_add(ctx0, out, blk.qf_proj_linear_b);
+    cbx(out, "out");
+
+    return out;
+}
+
+// ---------------------------------------------------------------------------
+// build() - top-level graph
+// ---------------------------------------------------------------------------
+
+// Build the newline row: model.image_newline repeated K times, K = number of projector blocks.
+// Shape: (n_mmproj_embd, 1)
+ggml_tensor * clip_graph_granite4_vision::build_newline_row(ggml_context * ctx0) {
+    const int K = (int) model.qf_proj_blocks.size();
+    GGML_ASSERT(K > 0);
+    GGML_ASSERT(n_mmproj_embd % K == 0);
+    const int projection_dim = n_mmproj_embd / K; // width of one projector stream
+    GGML_ASSERT(model.image_newline != nullptr);
+    GGML_ASSERT(ggml_nelements(model.image_newline) == projection_dim);
+
+    // Build newline_row[k*projection_dim + d] = nl[d] for every k. NOTE(review): no scale factor is applied to any copy — confirm none is needed
+    ggml_tensor * nl = model.image_newline; // (projection_dim,)
+    ggml_tensor * nl_first_2d = ggml_reshape_2d(ctx0, nl, projection_dim, 1);
+    ggml_tensor * nl_row_2d;
+    if (K == 1) {
+        nl_row_2d = nl_first_2d;
+    } else {
+        ggml_tensor * nl_2d = ggml_reshape_2d(ctx0, nl, projection_dim, 1); // same reshape as nl_first_2d
+        ggml_tensor * rest_template = ggml_new_tensor_2d( // only passed to ggml_repeat as the target shape
+            ctx0, GGML_TYPE_F32, projection_dim, K - 1);
+        ggml_tensor * nl_rest = ggml_repeat(ctx0, nl_2d, rest_template);
+        nl_row_2d = ggml_concat(ctx0, nl_first_2d, nl_rest, 1); // (projection_dim, K)
+    }
+    nl_row_2d = ggml_cont(ctx0, nl_row_2d);
+    return ggml_reshape_2d(ctx0, nl_row_2d, n_mmproj_embd, 1);
+}
+
+// Append a single newline row at the end of the tile output (concatenation along dim 1).
+ggml_tensor * clip_graph_granite4_vision::append_rowwise_newlines(ggml_context * ctx0, ggml_tensor * tile_output) {
+    // For the single-tile case, append one newline row at the end.
+    // For the multi-tile rowwise case, this will be called per-tile
+    // (the graph is built per tile, so each call sees exactly one tile).
+    ggml_tensor * nl_row = build_newline_row(ctx0);
+    return ggml_concat(ctx0, tile_output, nl_row, 1);
+}
+
+ggml_cgraph * clip_graph_granite4_vision::build() { // per-tile graph: vision layers -> projector blocks -> optional newline row
+    GGML_ASSERT(model.patch_embeddings_0 != nullptr);
+    GGML_ASSERT(model.position_embeddings != nullptr);
+    GGML_ASSERT(model.class_embedding == nullptr); // a class embedding is not handled by this graph
+    GGML_ASSERT(!model.qf_proj_blocks.empty());
+
+    // --- Stage 1a: SigLIP encoder producing intermediate hidden states ---
+    ggml_tensor * inp = build_inp();
+    inp = ggml_add(ctx0, inp, model.position_embeddings);
+    cb(inp, "pos_embed", -1);
+
+    ggml_tensor * inpL = inp;
+    std::vector<ggml_tensor *> layer_outs(n_layer, nullptr); // output of every layer, read by the projector blocks below
+
+    for (int il = 0; il < n_layer; ++il) {
+        const auto & layer = model.layers[il];
+        ggml_tensor * cur = inpL;
+
+        cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il);
+
+        // Self-attention (biases are optional)
+        ggml_tensor * Qcur = build_mm(layer.q_w, cur);
+        if (layer.q_b) Qcur = ggml_add(ctx0, Qcur, layer.q_b);
+        ggml_tensor * Kcur = build_mm(layer.k_w, cur);
+        if (layer.k_b) Kcur = ggml_add(ctx0, Kcur, layer.k_b);
+        ggml_tensor * Vcur = build_mm(layer.v_w, cur);
+        if (layer.v_b) Vcur = ggml_add(ctx0, Vcur, layer.v_b);
+
+        Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_patches);
+        Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_patches);
+        Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_patches);
+
+        cur = build_attn(layer.o_w, layer.o_b,
+                         Qcur, Kcur, Vcur, nullptr, kq_scale, il);
+
+        cur = ggml_add(ctx0, cur, inpL); // residual around attention
+        inpL = cur;
+
+        cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il);
+        cur = build_ffn(cur,
+                        layer.ff_up_w, layer.ff_up_b,
+                        layer.ff_gate_w, layer.ff_gate_b,
+                        layer.ff_down_w, layer.ff_down_b,
+                        hparams.ffn_op, il);
+        cur = ggml_add(ctx0, inpL, cur); // residual around the FFN
+        cb(cur, "layer_out", il);
+        layer_outs[il] = cur;
+        inpL = cur;
+    }
+
+    // --- Stage 1b/1c: WindowQFormer blocks, outputs concatenated along dim 0 ---
+    const int projector_count = hparams.vision_feature_layer.size(); // NOTE(review): qf_proj_blocks and proj_spatial_offsets are indexed by the same bid — assumes both have at least this many entries
+    const float qformer_eps = 1e-12f;
+
+    ggml_tensor * mmproj = nullptr;
+    for (int bid = 0; bid < projector_count; ++bid) {
+        const auto & blk = model.qf_proj_blocks[bid];
+
+        int vlayer = hparams.vision_feature_layer[bid]; // index of the vision layer whose output feeds this block
+        GGML_ASSERT(vlayer >= 0 && vlayer < n_layer);
+        ggml_tensor * h = layer_outs[vlayer];
+
+        ggml_tensor * stream = build_block(
+            blk, h, bid,
+            hparams.proj_spatial_offsets[bid],
+            n_patches_x,
+            hparams.downsample_window_side,
+            hparams.downsample_query_side,
+            qformer_eps);
+        cb(stream, (std::string("proj_") + std::to_string(bid) + std::string("_v_out")).c_str(), vlayer);
+        mmproj = mmproj ? ggml_concat(ctx0, mmproj, stream, 0) : stream; // first stream starts the result, later ones are appended on dim 0
+    }
+
+    // --- Stage 1d: Append newline tokens if add_newline is set ---
+    if (add_newline) {
+        mmproj = append_rowwise_newlines(ctx0, mmproj);
+        ggml_set_name(mmproj, "g4v_mmproj_out_nl");
+    } else {
+        ggml_set_name(mmproj, "g4v_mmproj_out");
+    }
+    ggml_build_forward_expand(gf, mmproj);
+
+    return gf;
+}
diff --git a/tools/mtmd/models/llava.cpp b/tools/mtmd/models/llava.cpp
index 4af17ccfe8..5aa3d2f0fa 100644
--- a/tools/mtmd/models/llava.cpp
+++ b/tools/mtmd/models/llava.cpp
@@ -51,7 +51,6 @@ ggml_cgraph * clip_graph_llava::build() {
     }
 
     std::vector<ggml_tensor *> embedding_stack;
-    const auto & vision_feature_layer = hparams.vision_feature_layer;
 
     // loop over layers
     for (int il = 0; il < max_feature_layer; il++) {
@@ -60,7 +59,7 @@ ggml_cgraph * clip_graph_llava::build() {
 
         // If this is an embedding feature layer, save the output.
         // NOTE: 0 index here refers to the input to the encoder.
-        if (vision_feature_layer.find(il) != vision_feature_layer.end()) {
+        if (hparams.is_vision_feature_layer(il)) {
             embedding_stack.push_back(cur);
         }
 
@@ -135,7 +134,7 @@ ggml_cgraph * clip_graph_llava::build() {
     // process vision feature layers (used by granite)
     {
         // final layer is a vision feature layer
-        if (vision_feature_layer.find(max_feature_layer) != vision_feature_layer.end()) {
+        if (hparams.is_vision_feature_layer(max_feature_layer)) {
             embedding_stack.push_back(inpL);
         }
 
diff --git a/tools/mtmd/models/models.h b/tools/mtmd/models/models.h
index b882f800dd..d1865103bc 100644
--- a/tools/mtmd/models/models.h
+++ b/tools/mtmd/models/models.h
@@ -211,3 +211,26 @@ struct clip_graph_exaone4_5 : clip_graph {
     clip_graph_exaone4_5(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
     ggml_cgraph * build() override;
 };
+
+struct clip_graph_granite4_vision : clip_graph { // graph builder for Granite Vision 4.1 (models/granite4-vision.cpp)
+    clip_graph_granite4_vision(clip_ctx * ctx, const clip_image_f32 & img)
+        : clip_graph(ctx, img),
+          add_newline(img.add_newline) {} // copies the per-tile newline flag from the image
+
+    ggml_cgraph * build() override;
+
+private:
+    // The graph is per-tile since only batch-size 1 is supported in clip. As
+    // such, this value is set at construct time based on the tile that will be
+    // encoded, then used during build to determine how to handle newlines.
+    const bool add_newline;
+
+    ggml_tensor * gather(ggml_tensor * src, const std::string & name, int idx_len); // row gather through a named I32 index input
+    ggml_tensor * interp_down(ggml_tensor * src, int side, int new_side); // average-pool a side x side token grid to new_side x new_side
+    ggml_tensor * build_block(const qf_block & blk, ggml_tensor * h, int bid, // one WindowQFormer projector block
+                              int spatial_offset, int image_side, int window_side,
+                              int query_side, float qformer_eps);
+
+    ggml_tensor * build_newline_row(ggml_context * ctx0); // newline embedding repeated once per projector block
+    ggml_tensor * append_rowwise_newlines(ggml_context * ctx0, ggml_tensor * tile_output); // concatenates one newline row after tile_output
+};
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index 0b5caa6cb5..260f307560 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -513,6 +513,12 @@ struct mtmd_context {
                     img_end = "</vision>";
                     image_preproc = std::make_unique<mtmd_image_preprocessor_dyn_size>(ctx_v);
                 } break;
+            case PROJECTOR_TYPE_GRANITE4_VISION:
+                {
+                    img_beg = "<image>";
+                    img_end = "";
+                    image_preproc = std::make_unique<mtmd_image_preprocessor_llava_uhd>(ctx_v);
+                } break;
             default:
                 throw std::runtime_error(string_format("%s: unexpected vision projector type %d\n", __func__, proj));
         }
@@ -808,6 +814,21 @@ struct mtmd_tokenizer {
                 return 2;
             }
 
+            // Annotate llava-next style tiles so clip_n_output_tokens accounts
+            // for per-tile newline injection.
+            if (ctx->proj_type_v() == PROJECTOR_TYPE_GRANITE4_VISION) {
+                if (batch_f32.entries.size() == 1) {
+                    // Single-tile (overview only): append one newline row.
+                    batch_f32.entries[0]->add_newline = true;
+                } else {
+                    // Multi-tile: overview gets no newline, grid tiles get one.
+                    batch_f32.entries[0]->add_newline = false;
+                    for (size_t i = 1; i < batch_f32.entries.size(); ++i) {
+                        batch_f32.entries[i]->add_newline = true;
+                    }
+                }
+            }
+
             // handle llava-uhd style preprocessing
             const bool has_tiling_grid = batch_f32.grid_x > 0 && batch_f32.grid_y > 0;
             if (
@@ -872,9 +893,10 @@ struct mtmd_tokenizer {
                 }
 
             } else {
+
                 size_t n_tokens = 0;
-                for (const auto & entry : batch_f32.entries) {
-                    n_tokens += clip_n_output_tokens(ctx->ctx_v, entry.get());
+                for (const auto & e : batch_f32.entries) {
+                    n_tokens += clip_n_output_tokens(ctx->ctx_v, e.get());
                 }
 
                 mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
@@ -1111,7 +1133,8 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens)
         || proj_type == PROJECTOR_TYPE_MINICPMV
         || proj_type == PROJECTOR_TYPE_GLM_EDGE
         || proj_type == PROJECTOR_TYPE_INTERNVL
-        || proj_type == PROJECTOR_TYPE_DEEPSEEKOCR2) {
+        || proj_type == PROJECTOR_TYPE_DEEPSEEKOCR2
+        || proj_type == PROJECTOR_TYPE_GRANITE4_VISION) {
         // TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
         const auto & entries = image_tokens->batch_f32.entries;
         // entries may have different token counts