mtmd: refactor preprocessor, add mtmd_image_preproc_out (#24736 )

* add mtmd_image_preproc_out * add dev docs * remove unused clip API * rm unused clip_image_f32_batch::grid * change preprocess() call signature
2026-06-18 19:57:46 +02:00 · 2026-06-18 12:04:39 +02:00
8 changed files with 279 additions and 370 deletions
@@ -1,10 +1,11 @@
 # Multimodal

 llama.cpp supports multimodal input via `libmtmd`. Currently, there are 2 tools support this feature:
- [llama-mtmd-cli](../tools/mtmd/README.md)
+- [llama-cli](../tools/cli/README.md)
 - [llama-server](../tools/server/README.md) via OpenAI-compatible `/chat/completions` API
+- [llama-mtmd-cli](../tools/mtmd/README.md), for testing and development

-Currently, we support **image** and **audio** input. Audio is highly experimental and may have reduced quality.
+Currently, we support **image**, **audio** and **video** input.

 To enable it, you can use one of the 2 methods below:

@@ -0,0 +1,35 @@
+# libmtmd dev guide
+
+## History
+
+Please refer to [multimodal.md](../../docs/multimodal.md) for a broader context.
+
+In short:
+- `libmtmd` started as a wrapper around `libllava` / `clip.cpp`
+- Various components that used to be in `clip.cpp` are moved progressively to mtmd. For example, preprocessor is now part of mtmd
+
+## Terminologies
+
+- mtmd: **M**ul**T**i**M**o**D**al
+- bitmap: representing a raw input data, for example: RGB image, PCM audio
+- tiles / slices: for llava-uhd-style models, the preprocessor breaks a large input into smaller square images called tiles or slices
+- chunk: a mtmd_input_chunk represents a preprocessed input that can then be passed through `mtmd_encode()`
+
+## Pipeline
+
+A typical pipeline of the core libmtmd is as follows:
+- A bitmap (RGB image or PCM audio) is created
+- Bitmap and the text prompt is provided to `mtmd_tokenize()` that breaks the input into chunks
+    - The tokenizer function first expands a "lazy" bitmap if it finds one. Typically, this is used by video, so that one media token corresponds to one input bitmap
+    - For models that support "fused" temporal frames like Qwen-VL, the tokenizer tries to merge pair of consecutive frames into one batch
+    - The preprocessor will then be called, which produces a list of chunks
+    - Depending on the model itself, special tokens will be injected to separate image chunks (i.e. llava-uhd-style models)
+- Multiple bitmaps may be batched together to form a larger `mtmd_batch()`
+- Single image or batch is encoded, via `mtmd_encode()` or `mtmd_batch_encode()`
+- Get the output embeddings
+
+## Helper
+
+We provide a set of helper functions via `mtmd_helper` to make using libmtmd easier. The helper provides:
+- Image, audio and video file decoding (for example, decode raw JPEG into RGB bitmap)
+- Manage `llama_batch` and calls to `llama_decode`
@@ -367,56 +367,56 @@ enum projector_type {
 };

 static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
-    { PROJECTOR_TYPE_MLP,       "mlp" },
-    { PROJECTOR_TYPE_LDP,       "ldp" },
-    { PROJECTOR_TYPE_LDPV2,     "ldpv2"},
-    { PROJECTOR_TYPE_MINICPMV,  "resampler"},
-    { PROJECTOR_TYPE_GLM_EDGE,  "adapter"},
-    { PROJECTOR_TYPE_QWEN2VL,   "qwen2vl_merger"},
-    { PROJECTOR_TYPE_QWEN25VL,  "qwen2.5vl_merger"},
-    { PROJECTOR_TYPE_QWEN3VL,   "qwen3vl_merger"},
-    { PROJECTOR_TYPE_STEP3VL,   "step3vl"},
-    { PROJECTOR_TYPE_GEMMA3,    "gemma3"},
-    { PROJECTOR_TYPE_GEMMA3NV,  "gemma3nv"},
-    { PROJECTOR_TYPE_GEMMA3NA,  "gemma3na"},
-    { PROJECTOR_TYPE_GEMMA4V,   "gemma4v"},
-    { PROJECTOR_TYPE_GEMMA4A,   "gemma4a"},
-    { PROJECTOR_TYPE_GEMMA4UV,  "gemma4uv"},
-    { PROJECTOR_TYPE_GEMMA4UA,  "gemma4ua"},
-    { PROJECTOR_TYPE_PHI4,      "phi4"},
-    { PROJECTOR_TYPE_IDEFICS3,  "idefics3"},
-    { PROJECTOR_TYPE_PIXTRAL,   "pixtral"},
-    { PROJECTOR_TYPE_ULTRAVOX,  "ultravox"},
-    { PROJECTOR_TYPE_INTERNVL,  "internvl"},
-    { PROJECTOR_TYPE_LLAMA4,    "llama4"},
-    { PROJECTOR_TYPE_QWEN2A,    "qwen2a"},
-    { PROJECTOR_TYPE_QWEN3A,    "qwen3a"},
-    { PROJECTOR_TYPE_GLMA,      "glma"},
-    { PROJECTOR_TYPE_QWEN25O,   "qwen2.5o"},
-    { PROJECTOR_TYPE_VOXTRAL,   "voxtral"},
-    { PROJECTOR_TYPE_MERALION,  "meralion"},
-    { PROJECTOR_TYPE_MUSIC_FLAMINGO, "musicflamingo"},
-    { PROJECTOR_TYPE_LFM2,      "lfm2"},
-    { PROJECTOR_TYPE_KIMIVL,    "kimivl"},
-    { PROJECTOR_TYPE_PADDLEOCR, "paddleocr"},
-    { PROJECTOR_TYPE_LIGHTONOCR,"lightonocr"},
-    { PROJECTOR_TYPE_COGVLM,    "cogvlm"},
-    { PROJECTOR_TYPE_JANUS_PRO, "janus_pro"},
-    { PROJECTOR_TYPE_DOTS_OCR,  "dots_ocr"},
-    { PROJECTOR_TYPE_DEEPSEEKOCR,"deepseekocr"},
-    { PROJECTOR_TYPE_DEEPSEEKOCR2,"deepseekocr2"},
-    { PROJECTOR_TYPE_LFM2A,     "lfm2a"},
-    { PROJECTOR_TYPE_GLM4V,     "glm4v"},
-    { PROJECTOR_TYPE_YOUTUVL,   "youtuvl"},
-    { PROJECTOR_TYPE_YASA2,     "yasa2"},
-    { PROJECTOR_TYPE_KIMIK25,   "kimik25"},
-    { PROJECTOR_TYPE_NEMOTRON_V2_VL, "nemotron_v2_vl"},
-    { PROJECTOR_TYPE_EXAONE4_5, "exaone4_5"},
-    { PROJECTOR_TYPE_HUNYUANVL,  "hunyuanvl"},
-    { PROJECTOR_TYPE_MINICPMV4_6, "minicpmv4_6"},
-    { PROJECTOR_TYPE_GRANITE_SPEECH, "granite_speech"},
-    { PROJECTOR_TYPE_MIMOVL,     "mimovl"},
-    { PROJECTOR_TYPE_GRANITE4_VISION, "granite4_vision"},
+    { PROJECTOR_TYPE_MLP,               "mlp" },
+    { PROJECTOR_TYPE_LDP,               "ldp" },
+    { PROJECTOR_TYPE_LDPV2,             "ldpv2"},
+    { PROJECTOR_TYPE_MINICPMV,          "resampler"},
+    { PROJECTOR_TYPE_GLM_EDGE,          "adapter"},
+    { PROJECTOR_TYPE_QWEN2VL,           "qwen2vl_merger"},
+    { PROJECTOR_TYPE_QWEN25VL,          "qwen2.5vl_merger"},
+    { PROJECTOR_TYPE_QWEN3VL,           "qwen3vl_merger"},
+    { PROJECTOR_TYPE_STEP3VL,           "step3vl"},
+    { PROJECTOR_TYPE_GEMMA3,            "gemma3"},
+    { PROJECTOR_TYPE_GEMMA3NV,          "gemma3nv"},
+    { PROJECTOR_TYPE_GEMMA3NA,          "gemma3na"},
+    { PROJECTOR_TYPE_GEMMA4V,           "gemma4v"},
+    { PROJECTOR_TYPE_GEMMA4A,           "gemma4a"},
+    { PROJECTOR_TYPE_GEMMA4UV,          "gemma4uv"},
+    { PROJECTOR_TYPE_GEMMA4UA,          "gemma4ua"},
+    { PROJECTOR_TYPE_PHI4,              "phi4"},
+    { PROJECTOR_TYPE_IDEFICS3,          "idefics3"},
+    { PROJECTOR_TYPE_PIXTRAL,           "pixtral"},
+    { PROJECTOR_TYPE_ULTRAVOX,          "ultravox"},
+    { PROJECTOR_TYPE_INTERNVL,          "internvl"},
+    { PROJECTOR_TYPE_LLAMA4,            "llama4"},
+    { PROJECTOR_TYPE_QWEN2A,            "qwen2a"},
+    { PROJECTOR_TYPE_QWEN3A,            "qwen3a"},
+    { PROJECTOR_TYPE_GLMA,              "glma"},
+    { PROJECTOR_TYPE_QWEN25O,           "qwen2.5o"},
+    { PROJECTOR_TYPE_VOXTRAL,           "voxtral"},
+    { PROJECTOR_TYPE_MERALION,          "meralion"},
+    { PROJECTOR_TYPE_MUSIC_FLAMINGO,    "musicflamingo"},
+    { PROJECTOR_TYPE_LFM2,              "lfm2"},
+    { PROJECTOR_TYPE_KIMIVL,            "kimivl"},
+    { PROJECTOR_TYPE_PADDLEOCR,         "paddleocr"},
+    { PROJECTOR_TYPE_LIGHTONOCR,        "lightonocr"},
+    { PROJECTOR_TYPE_COGVLM,            "cogvlm"},
+    { PROJECTOR_TYPE_JANUS_PRO,         "janus_pro"},
+    { PROJECTOR_TYPE_DOTS_OCR,          "dots_ocr"},
+    { PROJECTOR_TYPE_DEEPSEEKOCR,       "deepseekocr"},
+    { PROJECTOR_TYPE_DEEPSEEKOCR2,      "deepseekocr2"},
+    { PROJECTOR_TYPE_LFM2A,             "lfm2a"},
+    { PROJECTOR_TYPE_GLM4V,             "glm4v"},
+    { PROJECTOR_TYPE_YOUTUVL,           "youtuvl"},
+    { PROJECTOR_TYPE_YASA2,             "yasa2"},
+    { PROJECTOR_TYPE_KIMIK25,           "kimik25"},
+    { PROJECTOR_TYPE_NEMOTRON_V2_VL,    "nemotron_v2_vl"},
+    { PROJECTOR_TYPE_EXAONE4_5,         "exaone4_5"},
+    { PROJECTOR_TYPE_HUNYUANVL,         "hunyuanvl"},
+    { PROJECTOR_TYPE_MINICPMV4_6,       "minicpmv4_6"},
+    { PROJECTOR_TYPE_GRANITE_SPEECH,    "granite_speech"},
+    { PROJECTOR_TYPE_MIMOVL,            "mimovl"},
+    { PROJECTOR_TYPE_GRANITE4_VISION,   "granite4_vision"},
 };

 static projector_type clip_projector_type_from_string(const std::string & str) {
@@ -640,47 +640,18 @@ static void clip_log_internal(enum ggml_log_level level, const char * format, ..
 // cpp wrappers
 //

-// wrapper for clip_image_size
-struct clip_image_size_deleter {
-    void operator()(clip_image_size * val) { clip_image_size_free(val); }
-};
-typedef std::unique_ptr<clip_image_size, clip_image_size_deleter> clip_image_size_ptr;
-
-// wrapper for clip_image_u8
-struct clip_image_u8_deleter {
-    void operator()(clip_image_u8 * val) { clip_image_u8_free(val); }
-};
-typedef std::unique_ptr<clip_image_u8, clip_image_u8_deleter> clip_image_u8_ptr;
-
-// wrapper for clip_image_f32
-struct clip_image_f32_deleter {
-    void operator()(clip_image_f32 * val) { clip_image_f32_free(val); }
-};
-typedef std::unique_ptr<clip_image_f32, clip_image_f32_deleter> clip_image_f32_ptr;
-
-struct clip_image_u8_batch {
-    std::vector<clip_image_u8_ptr> entries;
-};
-
 struct clip_image_f32_batch {
-    std::vector<clip_image_f32_ptr> entries;
+    std::vector<clip_image_f32> entries;
    bool is_audio = false;

-    // for llava-uhd style models, we need to know the grid size
-    // note: entries.size() == grid_x * grid_y + 1 (one overview image)
-    int grid_x = 0;
-    int grid_y = 0;
-
    clip_image_f32_batch clone() const {
        clip_image_f32_batch new_batch{
            /* entries  */ {},
            /* is_audio */ is_audio,
-            /* grid_x   */ grid_x,
-            /* grid_y   */ grid_y,
        };
        new_batch.entries.reserve(entries.size());
        for (const auto & entry : entries) {
-            new_batch.entries.emplace_back(new clip_image_f32(*entry));
+            new_batch.entries.emplace_back(entry); // copy
        }
        return new_batch;
    }
@@ -865,7 +865,7 @@ ggml_tensor * clip_graph::build_patch_merge_permute(ggml_tensor * cur, int scale
 }

 static std::unique_ptr<clip_graph> clip_get_graph_builder(clip_ctx * ctx, const clip_image_f32_batch & imgs) {
-    const clip_image_f32 & img = *imgs.entries[0];
+    const clip_image_f32 & img = imgs.entries[0];
    std::unique_ptr<clip_graph> builder;

    switch (ctx->proj_type()) {
@@ -2825,16 +2825,16 @@ struct clip_model_loader {
        // create a fake batch
        const auto & hparams = ctx_clip.model.hparams;
        clip_image_f32_batch batch;
-        clip_image_f32_ptr img(clip_image_f32_init());
+        clip_image_f32 img;
        if (ctx_clip.model.modality == CLIP_MODALITY_VISION) {
            const int sz = hparams.warmup_image_size;
-            img->set_size({sz, sz}, false, false);
+            img.set_size({sz, sz}, false, false);
            LOG_INF("%s: warmup with image size = %d x %d\n", __func__, sz, sz);
        } else {
-            img->set_size({hparams.warmup_audio_size, hparams.n_mel_bins}, false, false);
+            img.set_size({hparams.warmup_audio_size, hparams.n_mel_bins}, false, false);
            LOG_INF("%s: warmup with audio size = %d\n", __func__, hparams.warmup_audio_size);
        }
-        batch.entries.push_back(std::move(img));
+        batch.entries.push_back(img);
        return batch;
    }

@@ -3124,64 +3124,6 @@ struct clip_cap clip_get_cap(const char * fname) {
    return res;
 }

-struct clip_image_size * clip_image_size_init() {
-    struct clip_image_size * load_image_size = new struct clip_image_size();
-    load_image_size->width = 448;
-    load_image_size->height = 448;
-    return load_image_size;
-}
-
-struct clip_image_u8 * clip_image_u8_init() {
-    return new clip_image_u8();
-}
-
-struct clip_image_f32 * clip_image_f32_init() {
-    return new clip_image_f32();
-}
-
-struct clip_image_f32_batch * clip_image_f32_batch_init() {
-    return new clip_image_f32_batch();
-}
-
-void clip_image_size_free(struct clip_image_size * load_image_size) {
-    if (load_image_size == nullptr) {
-        return;
-    }
-    delete load_image_size;
-}
-void clip_image_u8_free(struct clip_image_u8  * img) { delete img; }
-void clip_image_f32_free(struct clip_image_f32 * img) { delete img; }
-void clip_image_u8_batch_free(struct clip_image_u8_batch * batch) { delete batch; }
-void clip_image_f32_batch_free(struct clip_image_f32_batch * batch) { delete batch; }
-
-size_t clip_image_f32_batch_n_images(const struct clip_image_f32_batch * batch) {
-    return batch->entries.size();
-}
-
-size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int idx) {
-    if (idx < 0 || idx >= (int)batch->entries.size()) {
-        LOG_ERR("%s: invalid index %d\n", __func__, idx);
-        return 0;
-    }
-    return batch->entries[idx]->nx();
-}
-
-size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx) {
-    if (idx < 0 || idx >= (int)batch->entries.size()) {
-        LOG_ERR("%s: invalid index %d\n", __func__, idx);
-        return 0;
-    }
-    return batch->entries[idx]->ny();
-}
-
-clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx) {
-    if (idx < 0 || idx >= (int)batch->entries.size()) {
-        LOG_ERR("%s: invalid index %d\n", __func__, idx);
-        return nullptr;
-    }
-    return batch->entries[idx].get();
-}
-
 void clip_free(clip_ctx * ctx) {
    if (ctx == nullptr) {
        return;
@@ -3189,23 +3131,11 @@ void clip_free(clip_ctx * ctx) {
    delete ctx;
 }

-int32_t clip_get_image_size(const struct clip_ctx * ctx) {
-    return ctx->model.hparams.image_size;
-}
-
-int32_t clip_get_patch_size(const struct clip_ctx * ctx) {
-    return ctx->model.hparams.patch_size;
-}
-
-int32_t clip_get_hidden_size(const struct clip_ctx * ctx) {
-    return ctx->model.hparams.n_embd;
-}
-
 const char * clip_patch_merge_type(const struct clip_ctx * ctx) {
    return ctx->model.hparams.mm_patch_merge_type == PATCH_MERGE_SPATIAL_UNPAD ? "spatial_unpad" : "flat";
 }

-int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
+int clip_n_output_tokens_x(const clip_ctx * ctx, const clip_image_f32 * img) {
    const auto & params = ctx->model.hparams;
    const int n_total = clip_n_output_tokens(ctx, img);
    const auto & proj = ctx->proj_type();
@@ -3228,7 +3158,7 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 *
    return n_total;
 }

-int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
+int clip_n_output_tokens_y(const clip_ctx * ctx, const clip_image_f32 * img) {
    const auto & params = ctx->model.hparams;
    const auto & proj = ctx->proj_type();
    switch (proj) {
@@ -3250,7 +3180,7 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 *
    return 1;
 }

-int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
+int clip_n_output_tokens(const clip_ctx * ctx, const clip_image_f32 * img) {
    const auto & params = ctx->model.hparams;

    // for models with fixed size image, the input image is already pre-processed and resized to square
@@ -3500,16 +3430,15 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
    return n_patches;
 }

-bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, std::vector<float> & out_vec) {
+bool clip_image_encode(struct clip_ctx * ctx, int n_threads, const clip_image_f32 * img, std::vector<float> & out_vec) {
    clip_image_f32_batch imgs;
-    clip_image_f32_ptr img_copy(clip_image_f32_init());
-    *img_copy = *img;
+    clip_image_f32 img_copy = *img;
    imgs.entries.push_back(std::move(img_copy));

    return clip_image_batch_encode(ctx, n_threads, &imgs, out_vec);
 }

-bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, std::vector<float> & out_batch_embd) {
+bool clip_image_batch_encode(clip_ctx * ctx, int n_threads, const clip_image_f32_batch * imgs_c_ptr, std::vector<float> & out_batch_embd) {
    const clip_image_f32_batch & imgs = *imgs_c_ptr;
    int n_batch_cur = imgs.entries.size();

@@ -3533,8 +3462,8 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
    const auto & model   = ctx->model;
    const auto & hparams = model.hparams;

-    const int image_size_width  = imgs.entries[0]->nx();
-    const int image_size_height = imgs.entries[0]->ny();
+    const int image_size_width  = imgs.entries[0].nx();
+    const int image_size_height = imgs.entries[0].ny();

    const int patch_size    = hparams.patch_size;
    const int num_patches   = ((image_size_width / patch_size) * (image_size_height / patch_size));
@@ -3572,7 +3501,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
    if (!imgs.is_audio) {
        size_t nelem = 0;
        for (const auto & img : imgs.entries) {
-            nelem += img->nx() * img->ny() * 3;
+            nelem += img.nx() * img.ny() * 3;
        }
        std::vector<float> inp_raw(nelem);

@@ -3590,13 +3519,13 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
        // IMPORTANT: [QWEN_VIDEO] the batch dim is currently used for temporal dim in Qwen-VL models
        // All entries must have the same spatial size (enforced by can_batch_with() during merging)
        {
-            const int nx = imgs.entries[0]->nx();
-            const int ny = imgs.entries[0]->ny();
+            const int nx = imgs.entries[0].nx();
+            const int ny = imgs.entries[0].ny();
            const int n  = nx * ny;

            for (int b = 0; b < n_batch_cur; b++) {
                LOG_DBG("%s: copying image %d/%d to input buffer (nx=%d, ny=%d)\n", __func__, b+1, n_batch_cur, nx, ny);
-                const auto & buf = imgs.entries[b]->get_ro_buf();
+                const auto & buf = imgs.entries[b].get_ro_buf();
                float * batch_entry = inp_raw.data() + b * (3*n);
                for (int y = 0; y < ny; y++) {
                    for (int x = 0; x < nx; x++) {
@@ -3616,9 +3545,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
        GGML_ASSERT(imgs.entries.size() == 1);

        const auto & mel_inp = imgs.entries[0];
-        const auto & buf = mel_inp->get_ro_buf();
-        const int n_step = mel_inp->nx();
-        const int n_mel  = mel_inp->ny();
+        const auto & buf = mel_inp.get_ro_buf();
+        const int n_step = mel_inp.nx();
+        const int n_mel  = mel_inp.ny();
        GGML_ASSERT((size_t)n_step * n_mel == buf.size());

        set_input_f32("inp_raw", buf);
@@ -4232,7 +4161,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
                GGML_ASSERT(imgs.entries.size() == 1);
                const auto & img0 = imgs.entries.front();
                // Compute n_pos matching SSCP output: two stride-2 convs
-                int n_pos = img0->nx();
+                int n_pos = img0.nx();
                for (int i = 0; i < 2; i++) { n_pos = (n_pos - 1) / 2 + 1; }

                // Chunked local attention: blocked causal mask and RPE
@@ -4280,7 +4209,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
        case PROJECTOR_TYPE_LFM2A:
            {
                GGML_ASSERT(imgs.entries.size() == 1);
-                const auto n_frames = clip_n_output_tokens(ctx, imgs.entries.front().get());
+                const auto n_frames = clip_n_output_tokens(ctx, &imgs.entries.front());

                auto d_model = 512;
                auto seq_len = n_frames * 2 - 1;
@@ -4338,7 +4267,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
                // reshapes as ggml_get_rows gathers. The names are set
                // by g4v_gather() in models/granite4-vision.cpp.
                const int patch_size  = model.hparams.patch_size;
-                const int image_side  = imgs.entries.front()->nx() / patch_size;
+                const int image_side  = imgs.entries.front().nx() / patch_size;
                const int window_side = hparams.downsample_window_side;
                const int query_side  = hparams.downsample_query_side;
                const int n           = image_side / window_side;
@@ -4432,7 +4361,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima

    // sanity check (assuming that all images in batch have the same number of tokens, so we only check the first one)
    const int n_tokens_out = embeddings->ne[1];
-    const int expected_n_tokens_out = clip_n_output_tokens(ctx, imgs.entries[0].get());
+    const int expected_n_tokens_out = clip_n_output_tokens(ctx, &imgs.entries[0]);
    if (n_tokens_out != expected_n_tokens_out) {
        LOG_ERR("%s: expected output %d tokens, got %d\n", __func__, expected_n_tokens_out, n_tokens_out);
        GGML_ABORT("Invalid number of output tokens");
@@ -29,7 +29,6 @@ struct clip_image_size {
 };

 struct clip_image_f32;
-struct clip_image_u8_batch;
 struct clip_image_f32_batch;

 enum clip_modality {
@@ -63,41 +62,21 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params

 void clip_free(struct clip_ctx * ctx);

-int32_t clip_get_image_size (const struct clip_ctx * ctx);
-int32_t clip_get_patch_size (const struct clip_ctx * ctx);
-int32_t clip_get_hidden_size(const struct clip_ctx * ctx);
-
 // TODO: should be enum, not string
 const char * clip_patch_merge_type(const struct clip_ctx * ctx);

-int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img);
+int clip_n_output_tokens(const clip_ctx * ctx, const clip_image_f32 * img);

 // for M-RoPE, this will be the number of token positions in X and Y directions
 // for other models, X will be the total number of tokens and Y will be 1
-int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img);
-int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img);
+int clip_n_output_tokens_x(const clip_ctx * ctx, const clip_image_f32 * img);
+int clip_n_output_tokens_y(const clip_ctx * ctx, const clip_image_f32 * img);

 // this should be equal to the embedding dimension of the text model
 int clip_n_mmproj_embd(const struct clip_ctx * ctx);

-struct clip_image_size      * clip_image_size_init(void);
-struct clip_image_u8        * clip_image_u8_init (void);
-struct clip_image_f32       * clip_image_f32_init(void);
-struct clip_image_f32_batch * clip_image_f32_batch_init(void); // only used by libllava
-
-void clip_image_size_free (struct clip_image_size * img_size);
-void clip_image_u8_free (struct clip_image_u8  * img);
-void clip_image_f32_free(struct clip_image_f32 * img);
-void clip_image_u8_batch_free (struct clip_image_u8_batch  * batch);
-void clip_image_f32_batch_free(struct clip_image_f32_batch * batch);
-
-// use for accessing underlay data of clip_image_f32_batch
-size_t clip_image_f32_batch_n_images(const struct clip_image_f32_batch * batch); // equivalent to batch->size()
-size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->nx
-size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->ny
-struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->data
-
-bool clip_image_encode      (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, std::vector<float> & out_vec);
+// TODO: remove clip_image_encode() and always use batched version
+bool clip_image_encode      (struct clip_ctx * ctx, int n_threads, const clip_image_f32 * img, std::vector<float> & out_vec);
 bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, std::vector<float> & out_batch_embd);

 bool clip_is_llava(const struct clip_ctx * ctx);
@@ -4,17 +4,26 @@
 #include <cmath>
 #include <vector>

-//
-// base implementation
-//
-
-void mtmd_image_preprocessor::img_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst, const float mean[3], const float std[3]) {
-    dst.from_u8(src);
-    dst.normalize(mean, std);
+void mtmd_image_preproc_out::append(const clip_hparams & hparams, const clip_image_u8 & img, bool normalized) {
+    clip_image_f32 dst;
+    dst.from_u8(img);
+    if (normalized) {
+        dst.normalize(hparams.image_mean, hparams.image_std);
+    }
+    entries.push_back(std::move(dst));
 }

-void mtmd_image_preprocessor::img_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst) {
-    dst.from_u8(src);
+void mtmd_image_preproc_out::append(const clip_hparams & hparams, const std::vector<clip_image_u8> & imgs, bool normalized) {
+    for (const auto & img : imgs) {
+        append(hparams, img, normalized);
+    }
+}
+
+void mtmd_image_preproc_out::append(const clip_hparams & hparams, clip_image_f32 & img, bool normalized) {
+    if (normalized) {
+        img.normalize(hparams.image_mean, hparams.image_std);
+    }
+    entries.push_back(std::move(img));
 }

 // set of tools to manipulate images
@@ -595,21 +604,17 @@ private:
 // mtmd_image_preprocessor_llava_uhd
 //

-bool mtmd_image_preprocessor_llava_uhd::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
+mtmd_image_preproc_out mtmd_image_preprocessor_llava_uhd::preprocess(const clip_image_u8 & img) {
    const clip_image_size original_size = img.get_size();
    auto const inst = get_slice_instructions(original_size);
-    std::vector<clip_image_u8_ptr> imgs = slice_image(img, inst);
-
-    for (size_t i = 0; i < imgs.size(); ++i) {
-        // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp");
-        clip_image_f32_ptr res(clip_image_f32_init());
-        img_u8_to_f32(*imgs[i], *res, hparams.image_mean, hparams.image_std);
-        output.entries.push_back(std::move(res));
-    }
+    std::vector<clip_image_u8> imgs = slice_image(img, inst);

+    mtmd_image_preproc_out output;
+    output.append(hparams, imgs, true);
    output.grid_x = inst.grid_size.width;
    output.grid_y = inst.grid_size.height;
-    return true;
+
+    return output;
 }

 mtmd_image_preprocessor_llava_uhd::slice_instructions mtmd_image_preprocessor_llava_uhd::get_slice_instructions(const clip_image_size & original_size) {
@@ -717,28 +722,28 @@ mtmd_image_preprocessor_llava_uhd::slice_instructions mtmd_image_preprocessor_ll
    return res;
 }

-std::vector<clip_image_u8_ptr> mtmd_image_preprocessor_llava_uhd::slice_image(const clip_image_u8 & img, const mtmd_image_preprocessor_llava_uhd::slice_instructions & inst, bool overview_first) {
-    std::vector<clip_image_u8_ptr> output;
+std::vector<clip_image_u8> mtmd_image_preprocessor_llava_uhd::slice_image(const clip_image_u8 & img, const mtmd_image_preprocessor_llava_uhd::slice_instructions & inst, bool overview_first) {
+    std::vector<clip_image_u8> output;

    // resize to overview size
-    clip_image_u8_ptr resized_img(clip_image_u8_init());
-    img_tool::resize(img, *resized_img, inst.overview_size, hparams.image_resize_algo_ov,
+    clip_image_u8 resized_img;
+    img_tool::resize(img, resized_img, inst.overview_size, hparams.image_resize_algo_ov,
                        hparams.image_pad_ov, hparams.image_pad_color_ov);
    if (overview_first) {
-        output.push_back(std::move(resized_img));
+        output.push_back(resized_img);
    }

    if (inst.slices.empty()) {
        // no slices, just return the resized image
        if (!overview_first) {
-            output.push_back(std::move(resized_img));
+            output.push_back(resized_img);
        }
        return output;
    }

    // resize to refined size
-    clip_image_u8_ptr refined_img(clip_image_u8_init());
-    img_tool::resize(img, *refined_img, inst.refined_size, hparams.image_resize_algo_rf,
+    clip_image_u8 refined_img;
+    img_tool::resize(img, refined_img, inst.refined_size, hparams.image_resize_algo_rf,
                        hparams.image_pad_rf, hparams.image_pad_color_rf);

    // create slices
@@ -748,13 +753,13 @@ std::vector<clip_image_u8_ptr> mtmd_image_preprocessor_llava_uhd::slice_image(co
        int w = slice.size.width;
        int h = slice.size.height;

-        clip_image_u8_ptr img_slice(clip_image_u8_init());
-        img_tool::crop(*refined_img, *img_slice, x, y, w, h);
+        clip_image_u8 img_slice;
+        img_tool::crop(refined_img, img_slice, x, y, w, h);
        output.push_back(std::move(img_slice));
    }

    if (!overview_first) {
-        output.push_back(std::move(resized_img));
+        output.push_back(resized_img);
    }

    return output;
@@ -871,24 +876,23 @@ clip_image_size mtmd_image_preprocessor_llava_uhd::get_best_grid(const int max_s
 // mtmd_image_preprocessor_fixed_size
 //

-bool mtmd_image_preprocessor_fixed_size::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
+mtmd_image_preproc_out mtmd_image_preprocessor_fixed_size::preprocess(const clip_image_u8 & img) {
    clip_image_u8 resized_image;
    int sz = hparams.image_size;
    img_tool::resize(img, resized_image, {sz, sz},
                        hparams.image_resize_algo,
                        hparams.image_resize_pad,
                        hparams.image_pad_color);
-    clip_image_f32_ptr img_f32(clip_image_f32_init());
-    img_u8_to_f32(resized_image, *img_f32, hparams.image_mean, hparams.image_std);
-    output.entries.push_back(std::move(img_f32));
-    return true;
+    mtmd_image_preproc_out output;
+    output.append(hparams, resized_image, true);
+    return output;
 }

 //
 // mtmd_image_preprocessor_dyn_size
 //

-bool mtmd_image_preprocessor_dyn_size::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
+mtmd_image_preproc_out mtmd_image_preprocessor_dyn_size::preprocess(const clip_image_u8 & img) {
    GGML_ASSERT(hparams.image_min_pixels > 0 && hparams.image_max_pixels > 0);
    clip_image_u8 resized_image;
    const clip_image_size original_size = img.get_size();
@@ -903,17 +907,16 @@ bool mtmd_image_preprocessor_dyn_size::preprocess(const clip_image_u8 & img, cli
                        hparams.image_resize_algo,
                        hparams.image_resize_pad,
                        hparams.image_pad_color);
-    clip_image_f32_ptr img_f32(clip_image_f32_init());
-    img_u8_to_f32(resized_image, *img_f32, hparams.image_mean, hparams.image_std);
-    output.entries.push_back(std::move(img_f32));
-    return true;
+    mtmd_image_preproc_out output;
+    output.append(hparams, resized_image, true);
+    return output;
 }

 //
 // mtmd_image_preprocessor_longest_edge
 //

-bool mtmd_image_preprocessor_longest_edge::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
+mtmd_image_preproc_out mtmd_image_preprocessor_longest_edge::preprocess(const clip_image_u8 & img) {
    GGML_ASSERT(hparams.image_longest_edge > 0);
    clip_image_u8 resized_image;
    const clip_image_size original_size = img.get_size();
@@ -927,10 +930,9 @@ bool mtmd_image_preprocessor_longest_edge::preprocess(const clip_image_u8 & img,
                        hparams.image_resize_algo,
                        hparams.image_resize_pad,
                        hparams.image_pad_color);
-    clip_image_f32_ptr img_f32(clip_image_f32_init());
-    img_u8_to_f32(resized_image, *img_f32, hparams.image_mean, hparams.image_std);
-    output.entries.push_back(std::move(img_f32));
-    return true;
+    mtmd_image_preproc_out output;
+    output.append(hparams, resized_image, true);
+    return output;
 }

 //
@@ -1040,7 +1042,7 @@ clip_image_size mtmd_image_preprocessor_lfm2::get_grid_layout(int height, int wi
 // mtmd_image_preprocessor_idefics3
 //

-bool mtmd_image_preprocessor_idefics3::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
+mtmd_image_preproc_out mtmd_image_preprocessor_idefics3::preprocess(const clip_image_u8 & img) {
    // The refined size has two steps:
    // 1. Resize w/ aspect-ratio preserving such that the longer side is
    //      the preprocessor longest size
@@ -1077,44 +1079,35 @@ bool mtmd_image_preprocessor_idefics3::preprocess(const clip_image_u8 & img, cli
    }
    auto imgs = slice_image(img, instructions);

-    // cast and normalize to f32
-    for (size_t i = 0; i < imgs.size(); ++i) {
-        // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp");
-        clip_image_f32_ptr res(clip_image_f32_init());
-        img_u8_to_f32(*imgs[i], *res, hparams.image_mean, hparams.image_std);
-        output.entries.push_back(std::move(res));
-    }
-
+    mtmd_image_preproc_out output;
+    output.append(hparams, imgs, true);
    output.grid_x = instructions.grid_size.width;
    output.grid_y = instructions.grid_size.height;
-    return true;
+    return output;
 }

 //
 // mtmd_image_preprocessor_internvl
 //

-bool mtmd_image_preprocessor_internvl::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
+mtmd_image_preproc_out mtmd_image_preprocessor_internvl::preprocess(const clip_image_u8 & img) {
    GGML_ASSERT(!hparams.image_res_candidates.empty());
    const clip_image_size original_size = img.get_size();
    auto const inst = get_slice_instructions(original_size);
-    std::vector<clip_image_u8_ptr> imgs = slice_image(img, inst, false);
+    std::vector<clip_image_u8> imgs = slice_image(img, inst, false);

-    for (size_t i = 0; i < imgs.size(); ++i) {
-        clip_image_f32_ptr res(clip_image_f32_init());
-        img_u8_to_f32(*imgs[i], *res, hparams.image_mean, hparams.image_std);
-        output.entries.push_back(std::move(res));
-    }
+    mtmd_image_preproc_out output;
+    output.append(hparams, imgs, true);
    output.grid_x = inst.grid_size.width;
    output.grid_y = inst.grid_size.height;
-    return true;
+    return output;
 }

 //
 // mtmd_image_preprocessor_deepseekocr
 //

-bool mtmd_image_preprocessor_deepseekocr::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
+mtmd_image_preproc_out mtmd_image_preprocessor_deepseekocr::preprocess(const clip_image_u8 & img) {
    static constexpr int native_resolutions[] = { 1024 /* base */, 1280 /* large */ };
    // TODO: support 512 (tiny) and 640 (small) once we have eval data for them

@@ -1137,14 +1130,11 @@ bool mtmd_image_preprocessor_deepseekocr::preprocess(const clip_image_u8 & img,
    clip_image_u8 padded;
    img_tool::resize(img, padded, {image_size, image_size}, RESIZE_ALGO_BICUBIC_PILLOW,
                     PAD_NEAREST, hparams.image_pad_color);
-
-    clip_image_f32_ptr res(clip_image_f32_init());
-    img_u8_to_f32(padded, *res, hparams.image_mean, hparams.image_std);
-    output.entries.push_back(std::move(res));
-
+    mtmd_image_preproc_out output;
+    output.append(hparams, padded, true);
    output.grid_x = 1;
    output.grid_y = 1;
-    return true;
+    return output;
 }

 //
@@ -1207,10 +1197,11 @@ clip_image_size mtmd_image_preprocessor_deepseekocr2::find_closest_aspect_ratio(
    return best_ratio;
 }

-bool mtmd_image_preprocessor_deepseekocr2::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
+mtmd_image_preproc_out mtmd_image_preprocessor_deepseekocr2::preprocess(const clip_image_u8 & img) {
    // emit 768x768 local tiles when the image is larger than a tile in either
    // dimension, then always a 1024x1024 global view. order: [tiles..., global].

+    mtmd_image_preproc_out output;
    const auto img_size = img.get_size();
    if (img_size.width > tile_size || img_size.height > tile_size) {
        const float           aspect_ratio  = static_cast<float>(img_size.width) / img_size.height;
@@ -1226,9 +1217,7 @@ bool mtmd_image_preprocessor_deepseekocr2::preprocess(const clip_image_u8 & img,
            for (int col = 0; col < grid.width; col++) {
                clip_image_u8 tile;
                img_tool::crop(refined, tile, col * tile_size, row * tile_size, tile_size, tile_size);
-                clip_image_f32_ptr res(clip_image_f32_init());
-                img_u8_to_f32(tile, *res, hparams.image_mean, hparams.image_std);
-                output.entries.push_back(std::move(res));
+                output.append(hparams, tile, true);
            }
        }
    }
@@ -1237,14 +1226,11 @@ bool mtmd_image_preprocessor_deepseekocr2::preprocess(const clip_image_u8 & img,
    clip_image_u8 padded;
    img_tool::resize(img, padded, { base_size, base_size }, RESIZE_ALGO_BICUBIC_PILLOW,
                     PAD_NEAREST, hparams.image_pad_color);
-    clip_image_f32_ptr global(clip_image_f32_init());
-    img_u8_to_f32(padded, *global, hparams.image_mean, hparams.image_std);
-    global->add_viewsep = true;
-    output.entries.push_back(std::move(global));
-
+    output.append(hparams, padded, true);
+    output.entries.back().add_viewsep = true;
    output.grid_x = 1;
    output.grid_y = 1;
-    return true;
+    return output;
 }

 //
@@ -1260,7 +1246,8 @@ void mtmd_image_preprocessor_step3vl::img_u8_resize_bilinear_to_f32(
        const float std[3]) {
    const auto src_size = src.get_size();
    if (src_size.width == target_width && src_size.height == target_height) {
-        img_u8_to_f32(src, dst, mean, std);
+        dst.from_u8(src);
+        dst.normalize(mean, std);
        return;
    }

@@ -1455,24 +1442,25 @@ mtmd_image_preprocessor_step3vl::slice_instructions mtmd_image_preprocessor_step
    return instructions;
 }

-bool mtmd_image_preprocessor_step3vl::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
+mtmd_image_preproc_out mtmd_image_preprocessor_step3vl::preprocess(const clip_image_u8 & img) {
    clip_image_u8 prepared = prepare_image(img, hparams);
    const auto instructions = build_slice_instructions(hparams, prepared.get_size());

-    clip_image_f32_ptr overview_f32(clip_image_f32_init());
+    mtmd_image_preproc_out output;
+    clip_image_f32 overview_f32;
    img_u8_resize_bilinear_to_f32(
        prepared,
-        *overview_f32,
+        overview_f32,
        hparams.image_size,
        hparams.image_size,
        hparams.image_mean,
        hparams.image_std);
-    output.entries.push_back(std::move(overview_f32));
+    output.append(hparams, overview_f32, false);

    if (instructions.slices.empty()) {
        output.grid_x = 0;
        output.grid_y = 0;
-        return true;
+        return output;
    }

    clip_image_u8 img_for_crop = prepared;
@@ -1488,28 +1476,28 @@ bool mtmd_image_preprocessor_step3vl::preprocess(const clip_image_u8 & img, clip
        // If the requested patch extends past the source image, pad the out-of-bounds area with black.
        clip_image_u8 patch = crop_with_black_padding(img_for_crop, slice.x, slice.y, slice.size.width, slice.size.height);

-        clip_image_f32_ptr patch_f32(clip_image_f32_init());
+        clip_image_f32 patch_f32;
        img_u8_resize_bilinear_to_f32(
            patch,
-            *patch_f32,
+            patch_f32,
            crop_size,
            crop_size,
            hparams.image_mean,
            hparams.image_std);
-        output.entries.push_back(std::move(patch_f32));
+        output.append(hparams, patch_f32, false);
    }

    output.grid_x = instructions.grid_size.width;
    output.grid_y = instructions.grid_size.height;

-    return true;
+    return output;
 }

 //
 // mtmd_image_preprocessor_youtuvl
 //

-bool mtmd_image_preprocessor_youtuvl::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
+mtmd_image_preproc_out mtmd_image_preprocessor_youtuvl::preprocess(const clip_image_u8 & img) {
    const int patch_size = hparams.patch_size;   // typically 16
    const int merge_size = hparams.n_merge;      // typically 2
    const int align_size = patch_size * merge_size;  // 32
@@ -1553,29 +1541,22 @@ bool mtmd_image_preprocessor_youtuvl::preprocess(const clip_image_u8 & img, clip
    clip_image_u8 resized;
    img_tool::resize(img, resized, new_size, hparams.image_resize_algo, hparams.image_resize_pad);

-    // Normalize to float32
-    clip_image_f32_ptr img_f32(clip_image_f32_init());
-    img_u8_to_f32(resized, *img_f32, hparams.image_mean, hparams.image_std);
-    // Add to results
-    output.entries.push_back(std::move(img_f32));
-    return true;
+    mtmd_image_preproc_out output;
+    output.append(hparams, resized, true);
+    return output;
 }

-bool mtmd_image_preprocessor_granite::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
-    // call super class preprocessor
-    bool ok = mtmd_image_preprocessor_llava_uhd::preprocess(img, output);
-    if (!ok) {
-        return false;
-    }
+mtmd_image_preproc_out mtmd_image_preprocessor_granite::preprocess(const clip_image_u8 & img) {
+    auto output = mtmd_image_preprocessor_llava_uhd::preprocess(img);
    if (output.entries.size() == 1) {
        // Single-tile (overview only): append one newline row.
-        output.entries[0]->add_newline = true;
+        output.entries[0].add_newline = true;
    } else {
        // Multi-tile: overview gets no newline, grid tiles get one.
-        output.entries[0]->add_newline = false;
+        output.entries[0].add_newline = false;
        for (size_t i = 1; i < output.entries.size(); ++i) {
-            output.entries[i]->add_newline = true;
+            output.entries[i].add_newline = true;
        }
    }
-    return true;
+    return output;
 }
@@ -8,6 +8,16 @@

 #define MTMD_INTERNAL_HEADER

+struct mtmd_image_preproc_out {
+    std::vector<clip_image_f32> entries;
+    // grid size is required for llava-uhd style models
+    int grid_x = 0;
+    int grid_y = 0;
+    void append(const clip_hparams & hparams, const clip_image_u8 & img, bool normalized = true);
+    void append(const clip_hparams & hparams, const std::vector<clip_image_u8> & imgs, bool normalized = true);
+    void append(const clip_hparams & hparams, clip_image_f32 & img, bool normalized = true);
+};
+
 // base class, models must inherit from this class
 struct mtmd_image_preprocessor {
    const clip_hparams & hparams;
@@ -15,10 +25,7 @@ struct mtmd_image_preprocessor {
    mtmd_image_preprocessor(const clip_ctx * ctx): hparams(*clip_get_hparams(ctx)) {}

    virtual ~mtmd_image_preprocessor() = default;
-    virtual bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) = 0;
-
-    void img_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst, const float mean[3], const float std[3]);
-    void img_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst);
+    virtual mtmd_image_preproc_out preprocess(const clip_image_u8 & img) = 0;
 };

 /**
@@ -42,7 +49,7 @@ struct mtmd_image_preprocessor {
 */
 struct mtmd_image_preprocessor_llava_uhd : mtmd_image_preprocessor {
    mtmd_image_preprocessor_llava_uhd(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
-    bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
+    mtmd_image_preproc_out preprocess(const clip_image_u8 & img) override;

    struct slice_coordinates {
        int x;
@@ -60,7 +67,7 @@ struct mtmd_image_preprocessor_llava_uhd : mtmd_image_preprocessor {
    // LFM2 override this function to implement its custom slicing logic
    virtual slice_instructions get_slice_instructions(const clip_image_size & original_size);

-    std::vector<clip_image_u8_ptr> slice_image(const clip_image_u8 & img, const slice_instructions & inst, bool overview_first = true);
+    std::vector<clip_image_u8> slice_image(const clip_image_u8 & img, const slice_instructions & inst, bool overview_first = true);

 private:
    clip_image_size get_best_resize(const clip_image_size & original_size, int scale_resolution, int patch_size, bool allow_upscale = false);
@@ -91,7 +98,7 @@ private:
 // downscale or upscale the input image to fixed size
 struct mtmd_image_preprocessor_fixed_size : mtmd_image_preprocessor {
    mtmd_image_preprocessor_fixed_size(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
-    bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
+    mtmd_image_preproc_out preprocess(const clip_image_u8 & img) override;
 };

 // resize image to multiple of patch_size*n_merge, while preserving aspect ratio
@@ -99,13 +106,13 @@ struct mtmd_image_preprocessor_fixed_size : mtmd_image_preprocessor {
 // this is used by models with native support for dynamic image size, for example: Qwen-VL, Pixtral, Kimi-VL, etc
 struct mtmd_image_preprocessor_dyn_size : mtmd_image_preprocessor {
    mtmd_image_preprocessor_dyn_size(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
-    bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
+    mtmd_image_preproc_out preprocess(const clip_image_u8 & img) override;
 };

 // similar to mtmd_image_preprocessor_dyn_size, but resize the image to have longest edge equal to hparams.image_longest_edge, while preserving aspect ratio
 struct mtmd_image_preprocessor_longest_edge : mtmd_image_preprocessor {
    mtmd_image_preprocessor_longest_edge(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
-    bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
+    mtmd_image_preproc_out preprocess(const clip_image_u8 & img) override;
 };

 // custom llava-uhd slicing logic for LFM2
@@ -131,17 +138,17 @@ private:

 struct mtmd_image_preprocessor_idefics3 : mtmd_image_preprocessor_llava_uhd {
    mtmd_image_preprocessor_idefics3(const clip_ctx * ctx) : mtmd_image_preprocessor_llava_uhd(ctx) {}
-    bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
+    mtmd_image_preproc_out preprocess(const clip_image_u8 & img) override;
 };

 struct mtmd_image_preprocessor_internvl : mtmd_image_preprocessor_llava_uhd {
    mtmd_image_preprocessor_internvl(const clip_ctx * ctx) : mtmd_image_preprocessor_llava_uhd(ctx) {}
-    bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
+    mtmd_image_preproc_out preprocess(const clip_image_u8 & img) override;
 };

 struct mtmd_image_preprocessor_deepseekocr : mtmd_image_preprocessor {
    mtmd_image_preprocessor_deepseekocr(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
-    bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
+    mtmd_image_preproc_out preprocess(const clip_image_u8 & img) override;
 };

 // DeepSeek-OCR-2: a 1024x1024 global view, plus InternVL-style 768x768 local
@@ -153,7 +160,7 @@ struct mtmd_image_preprocessor_deepseekocr2 : mtmd_image_preprocessor {
    static constexpr int max_tiles = 6;

    mtmd_image_preprocessor_deepseekocr2(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
-    bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
+    mtmd_image_preproc_out preprocess(const clip_image_u8 & img) override;

 private:
    static std::vector<clip_image_size> get_target_ratios();
@@ -168,7 +175,7 @@ private:
 // ref: https://huggingface.co/stepfun-ai/Step3-VL-10B/blob/main/processing_step3.py
 struct mtmd_image_preprocessor_step3vl : mtmd_image_preprocessor_llava_uhd {
    mtmd_image_preprocessor_step3vl(const clip_ctx * ctx) : mtmd_image_preprocessor_llava_uhd(ctx) {}
-    bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
+    mtmd_image_preproc_out preprocess(const clip_image_u8 & img) override;
    static slice_instructions build_slice_instructions(const clip_hparams & params, const clip_image_size & prepared_size);

 private:
@@ -195,11 +202,11 @@ private:

 struct mtmd_image_preprocessor_youtuvl : mtmd_image_preprocessor {
    mtmd_image_preprocessor_youtuvl(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
-    bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
+    mtmd_image_preproc_out preprocess(const clip_image_u8 & img) override;
 };

 // similar to llava_uhd, but has add_newline
 struct mtmd_image_preprocessor_granite : mtmd_image_preprocessor_llava_uhd {
    mtmd_image_preprocessor_granite(const clip_ctx * ctx) : mtmd_image_preprocessor_llava_uhd(ctx) {}
-    bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
+    mtmd_image_preproc_out preprocess(const clip_image_u8 & img) override;
 };
@@ -114,7 +114,7 @@ struct mtmd_image_tokens {
    // true if one of entries in batch_f32 is a placeholder
    bool is_placeholder() const {
        for (const auto & entry : batch_f32.entries) {
-            if (entry->is_placeholder()) {
+            if (entry.is_placeholder()) {
                return true;
            }
        }
@@ -147,7 +147,7 @@ struct mtmd_audio_tokens {
    // true if one of entries in batch_f32 is a placeholder
    bool is_placeholder() const {
        for (const auto & entry : batch_f32.entries) {
-            if (entry->is_placeholder()) {
+            if (entry.is_placeholder()) {
                return true;
            }
        }
@@ -1050,7 +1050,7 @@ struct mtmd_tokenizer {

            // TODO @ngxson : this is quite hacky because preprocessor only support batch with one single element, that need to be fixed in the future (e.g. by changing the preprocessor interface always take single input)

-            clip_image_f32_batch batch_f32;
+            mtmd_image_preproc_out preproc_out;

            for (const auto * bmp : bitmaps) {
                // sanity check
@@ -1063,42 +1063,40 @@ struct mtmd_tokenizer {
                }

                // convert mtmd_bitmap to clip_image_u8
-                clip_image_u8_ptr img_u8(clip_image_u8_init());
-                img_u8->set_size(
+                clip_image_u8 img_u8;
+                img_u8.set_size(
                    {(int)bmp->nx, (int)bmp->ny},
                    bmp->is_placeholder());
-                img_u8->cpy_buf(bmp->get_ro_buf());
+                img_u8.cpy_buf(bmp->get_ro_buf());

                // preprocess image
-                clip_image_f32_batch tmp_batch;
-                bool ok = ctx->image_preproc->preprocess(*img_u8, tmp_batch);
-                if (!ok) {
-                    LOG_ERR("Unable to preprocess image\n");
-                    return 2;
-                }
+                mtmd_image_preproc_out tmp_preproc_out = ctx->image_preproc->preprocess(img_u8);

-                // move entries and grid dimensions to the "global" batch_f32
-                for (auto & entry : tmp_batch.entries) {
-                    batch_f32.entries.emplace_back(std::move(entry));
+                // move entries and grid dimensions to the "global" preproc_out
+                for (auto & entry : tmp_preproc_out.entries) {
+                    preproc_out.entries.emplace_back(std::move(entry));
                }

                // for llava-uhd style, we need to handle grid too
-                // we don't care about overwriting these values for now because llama-uhd doesn't support batching anyway
-                batch_f32.grid_x = tmp_batch.grid_x;
-                batch_f32.grid_y = tmp_batch.grid_y;
+                // we don't care about overwriting these values for now because the case where bitmaps.size() > 1 is only for frame merging (qwen-vl), not supported by llava-uhd
+                if (tmp_preproc_out.grid_x > 0 && tmp_preproc_out.grid_y > 0) {
+                    GGML_ASSERT(bitmaps.size() == 1);
+                    preproc_out.grid_x = tmp_preproc_out.grid_x;
+                    preproc_out.grid_y = tmp_preproc_out.grid_y;
+                }
            }

            // handle llava-uhd style preprocessing
-            const bool has_tiling_grid = batch_f32.grid_x > 0 && batch_f32.grid_y > 0;
+            const bool has_tiling_grid = preproc_out.grid_x > 0 && preproc_out.grid_y > 0;
            if (has_tiling_grid) {
                // [QWEN_VIDEO] we do not support "frame merging" for llama-uhd style, so no batching for now
                GGML_ASSERT(bitmaps.size() == 1);

-                const int n_col = batch_f32.grid_x;
-                const int n_row = batch_f32.grid_y;
+                const int n_col = preproc_out.grid_x;
+                const int n_row = preproc_out.grid_y;
                // split batch into chunks of single images
-                // NOTE: batch_f32 will be invalidated after this call
-                auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmaps[0]->id);
+                // NOTE: preproc_out will be invalidated after this call
+                auto chunks = split_batch_to_chunk(std::move(preproc_out), bitmaps[0]->id);
                GGML_ASSERT(chunks.size() > 0);

                auto ov_chunk = std::move(chunks.front());
@@ -1150,8 +1148,8 @@ struct mtmd_tokenizer {
            } else {

                size_t n_tokens = 0;
-                for (const auto & e : batch_f32.entries) {
-                    n_tokens += clip_n_output_tokens(ctx->ctx_v, e.get());
+                for (auto & e : preproc_out.entries) {
+                    n_tokens += clip_n_output_tokens(ctx->ctx_v, &e);
                    if (clip_model_n_temporal_merge(ctx->ctx_v) == 2) {
                        // [QWEN_VIDEO] pair input is merged to the same embd, so only count as one image
                        break;
@@ -1165,8 +1163,8 @@ struct mtmd_tokenizer {

                if (mtmd_decode_use_mrope(ctx)) {
                    // for Qwen2VL, we need this information for M-RoPE decoding positions
-                    image_tokens->nx = clip_n_output_tokens_x(ctx->ctx_v, batch_f32.entries[0].get());
-                    image_tokens->ny = clip_n_output_tokens_y(ctx->ctx_v, batch_f32.entries[0].get());
+                    image_tokens->nx = clip_n_output_tokens_x(ctx->ctx_v, &preproc_out.entries[0]);
+                    image_tokens->ny = clip_n_output_tokens_y(ctx->ctx_v, &preproc_out.entries[0]);
                } else {
                    // other models, we only need the total number of tokens
                    image_tokens->nx = n_tokens;
@@ -1181,6 +1179,12 @@ struct mtmd_tokenizer {
                    image_tokens->image_idx = n_images_added;
                    GGML_ASSERT(n_tokens == (size_t)image_tokens->n_tokens());
                }
+
+                clip_image_f32_batch batch_f32;
+                batch_f32.is_audio = false;
+                batch_f32.entries = std::move(preproc_out.entries);
+                // do NOT use preproc_out from this point on, it's moved
+
                image_tokens->batch_f32 = std::move(batch_f32);
                image_tokens->id = bitmaps[0]->id; // optional

@@ -1260,13 +1264,13 @@ struct mtmd_tokenizer {
            for (auto & mel_spec : mel_spec_chunks) {
                const bool is_placeholder = mel_spec.data.empty();

-                clip_image_f32_ptr mel_f32(clip_image_f32_init());
-                mel_f32->set_size(
+                clip_image_f32 mel_f32;
+                mel_f32.set_size(
                    {mel_spec.n_len, mel_spec.n_mel},
                    is_placeholder, /* is_audio */ true);
-                mel_f32->cpy_buf(mel_spec.data);
+                mel_f32.cpy_buf(mel_spec.data);

-                size_t n_tokens = clip_n_output_tokens(ctx->ctx_a, mel_f32.get());
+                size_t n_tokens = clip_n_output_tokens(ctx->ctx_a, &mel_f32);

                clip_image_f32_batch batch_f32;
                batch_f32.is_audio = true;
@@ -1296,12 +1300,12 @@ struct mtmd_tokenizer {
        return 0;
    }

-    std::vector<mtmd_input_chunk> split_batch_to_chunk(clip_image_f32_batch && batch_f32, const std::string & id) {
+    std::vector<mtmd_input_chunk> split_batch_to_chunk(mtmd_image_preproc_out && preproc_out, const std::string & id) {
        std::vector<mtmd_input_chunk> chunks;

-        for (auto & entry : batch_f32.entries) {
+        for (auto & entry : preproc_out.entries) {
            mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
-            image_tokens->nx = clip_n_output_tokens(ctx->ctx_v, entry.get());
+            image_tokens->nx = clip_n_output_tokens(ctx->ctx_v, &entry);
            image_tokens->ny = 1;
            image_tokens->batch_f32.entries.push_back(std::move(entry));
            image_tokens->id = id;
@@ -1406,16 +1410,16 @@ static int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * im
        // e.g., DeepSeek-OCR-2: 144 per tile views, 257 for the global view
        size_t offset = 0;
        for (size_t i = 0; i < entries.size(); i++) {
-            if (entries[i]->is_placeholder()) {
+            if (entries[i].is_placeholder()) {
                LOG_ERR("%s: image tokens batch entry %zu is placeholder\n", __func__, i);
                return 1;
            }
-            int n_tokens_per_image = clip_n_output_tokens(ctx_clip, entries[i].get());
+            int n_tokens_per_image = clip_n_output_tokens(ctx_clip, &entries[i]);
            std::vector<float> tmp_embd((size_t)n_tokens_per_image * n_embd_out);
            bool ok_i = clip_image_encode(
                ctx_clip,
                ctx->n_threads,
-                entries[i].get(),
+                &entries[i],
                tmp_embd);
            if (!ok_i) {
                LOG_ERR("%s: failed to encode image %zu\n", __func__, i);
@@ -2063,16 +2067,18 @@ void mtmd_debug_preprocess_image(mtmd_context * ctx, const std::vector<uint8_t>
    clip_image_u8 img_u8;
    img_u8.set_size({nx, ny}, false);
    img_u8.cpy_buf(rgb_values);
-    clip_image_f32_batch batch_f32;
    GGML_ASSERT(ctx->image_preproc != nullptr);
-    bool ok = ctx->image_preproc->preprocess(img_u8, batch_f32);
-    if (!ok) {
-        LOG_ERR("%s: failed to preprocess image\n", __func__);
-        return;
+    mtmd_image_preproc_out preproc_out = ctx->image_preproc->preprocess(img_u8);
+
+    clip_image_f32_batch batch_f32;
+    batch_f32.is_audio = false;
+    for (auto & entry : preproc_out.entries) {
+        batch_f32.entries.push_back(std::move(entry));
    }
+
    LOG_INF("%s: preprocessed image to batch_f32 with %d entries\n", __func__, (int)batch_f32.entries.size());
    for (size_t i = 0; i < batch_f32.entries.size(); i++) {
-        LOG_INF("%s: entry %zu has nx=%d, ny=%d\n", __func__, i, batch_f32.entries[i]->nx(), batch_f32.entries[i]->ny());
+        LOG_INF("%s: entry %zu has nx=%d, ny=%d\n", __func__, i, batch_f32.entries[i].nx(), batch_f32.entries[i].ny());
        // TODO: better way to dump entry content?
    }
 }