mtmd: llava_uhd should no longer use batch dim (#24732)

2026-06-18 03:37:39 +02:00 · 2026-06-17 22:40:50 +02:00
3 changed files with 33 additions and 25 deletions
@@ -1105,6 +1105,8 @@ bool mtmd_image_preprocessor_internvl::preprocess(const clip_image_u8 & img, cli
        img_u8_to_f32(*imgs[i], *res, hparams.image_mean, hparams.image_std);
        output.entries.push_back(std::move(res));
    }
+    output.grid_x = inst.grid_size.width;
+    output.grid_y = inst.grid_size.height;
    return true;
 }

@@ -1558,3 +1560,22 @@ bool mtmd_image_preprocessor_youtuvl::preprocess(const clip_image_u8 & img, clip
    output.entries.push_back(std::move(img_f32));
    return true;
 }
+
+bool mtmd_image_preprocessor_granite::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
+    // run the llava_uhd base preprocessing first; this override only sets add_newline on the entries it produced
+    bool ok = mtmd_image_preprocessor_llava_uhd::preprocess(img, output);
+    if (!ok) {
+        return false;
+    }
+    if (output.entries.size() == 1) {
+        // Single entry (overview only): flag it so one newline row is appended.
+        output.entries[0]->add_newline = true;
+    } else {
+        // Multiple entries: entry 0 (overview) gets no newline, each later grid tile gets one. NOTE(review): an empty batch would also reach entries[0] here - presumably the base never succeeds with zero entries; confirm.
+        output.entries[0]->add_newline = false;
+        for (size_t i = 1; i < output.entries.size(); ++i) {
+            output.entries[i]->add_newline = true;
+        }
+    }
+    return true;
+}
@@ -197,3 +197,9 @@ struct mtmd_image_preprocessor_youtuvl : mtmd_image_preprocessor {
    mtmd_image_preprocessor_youtuvl(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
    bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
 };
+
+// granite variant of the llava_uhd preprocessor: same construction, but overrides preprocess() to also set add_newline on the output entries
+struct mtmd_image_preprocessor_granite : mtmd_image_preprocessor_llava_uhd {
+    mtmd_image_preprocessor_granite(const clip_ctx * ctx) : mtmd_image_preprocessor_llava_uhd(ctx) {}
+    bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
+};
@@ -639,7 +639,7 @@ struct mtmd_context {
                {
                    img_beg = "<image>";
                    img_end = "";
-                    image_preproc = std::make_unique<mtmd_image_preprocessor_llava_uhd>(ctx_v);
+                    image_preproc = std::make_unique<mtmd_image_preprocessor_granite>(ctx_v);
                } break;
            default:
                throw std::runtime_error(string_format("%s: unexpected vision projector type %d\n", __func__, proj));
@@ -1033,7 +1033,10 @@ struct mtmd_tokenizer {
    int32_t add_media(std::vector<const mtmd_bitmap *> & bitmaps) {
        GGML_ASSERT(!bitmaps.empty());

-        if (!bitmaps[0]->is_audio) {
+        // note: only one type of media is supported per call, caller should enforce this
+        const bool is_vision = !bitmaps[0]->is_audio;
+
+        if (is_vision) {
            // handle image

            if (!ctx->ctx_v) {
@@ -1085,31 +1088,9 @@ struct mtmd_tokenizer {
                batch_f32.grid_y = tmp_batch.grid_y;
            }

-            // Annotate llava-next style tiles so clip_n_output_tokens accounts
-            // for per-tile newline injection.
-            if (ctx->proj_type_v() == PROJECTOR_TYPE_GRANITE4_VISION) {
-                if (batch_f32.entries.size() == 1) {
-                    // Single-tile (overview only): append one newline row.
-                    batch_f32.entries[0]->add_newline = true;
-                } else {
-                    // Multi-tile: overview gets no newline, grid tiles get one.
-                    batch_f32.entries[0]->add_newline = false;
-                    for (size_t i = 1; i < batch_f32.entries.size(); ++i) {
-                        batch_f32.entries[i]->add_newline = true;
-                    }
-                }
-            }
-
            // handle llava-uhd style preprocessing
            const bool has_tiling_grid = batch_f32.grid_x > 0 && batch_f32.grid_y > 0;
-            if (
-                ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5
-                || ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6
-                || ctx->slice_tmpl == MTMD_SLICE_TMPL_LLAMA4
-                || ctx->slice_tmpl == MTMD_SLICE_TMPL_IDEFICS3
-                || ctx->slice_tmpl == MTMD_SLICE_TMPL_STEP3VL
-                || (ctx->slice_tmpl == MTMD_SLICE_TMPL_LFM2 && has_tiling_grid)
-            ) {
+            if (has_tiling_grid) {
                // [QWEN_VIDEO] we do not support "frame merging" for llama-uhd style, so no batching for now
                GGML_ASSERT(bitmaps.size() == 1);