mtmd, server: add "placeholder bitmap" for counting tokens, add */input_tokens API (#23913)

* mtmd: add "placeholder bitmap" for counting tokens w/o preprocessing * fast path skip preproc for placeholder * fix build * correct the api * add server endpoint + tests * add object name * update docs * add proxy handling * fix build * fix audio input path * use is_placeholder in process_mtmd_prompt() * nits * nits (2) * docs: clarify chat/completions/input_tokens is not official * fix merge problem
2026-06-09 07:16:44 +02:00 · 2026-06-06 11:06:51 +02:00
parent 5a69c97439
commit f5c6ae1827
26 changed files with 732 additions and 422 deletions
@@ -4,6 +4,7 @@
 #include "gguf.h"
 #include "clip.h"

+#include <array>
 #include <climits>
 #include <cstdarg>
 #include <cinttypes>
@@ -429,10 +430,68 @@ static projector_type clip_projector_type_from_string(const std::string & str) {

 // RGB uint8 image
 struct clip_image_u8 {
-    int nx;
-    int ny;
+    // dimensions as { width = nx, height = ny }; also valid for a placeholder
+    clip_image_size get_size() const {
+        return { nx, ny };
+    }

+    // Store the dimensions and (re)size the pixel storage.
+    // - is_placeholder == true : keep only the dimensions, drop any pixel data
+    // - is_placeholder == false: resize the buffer to nx * ny * 3 bytes (interleaved RGB)
+    // note: a zero-area image ends up with an empty buffer, so it also reports
+    //       is_placeholder() == true
+    void set_size(clip_image_size size, bool is_placeholder) {
+        nx = size.width;
+        ny = size.height;
+        if (is_placeholder) {
+            buf.clear();
+        } else {
+            buf.resize((size_t) nx * (size_t) ny * 3);
+        }
+    }
+
+    // Replace the pixel storage with a copy of new_buf.
+    // The dimensions are left untouched and the length is NOT validated: the caller
+    // must pass nx * ny * 3 bytes, otherwise get_pixel / set_pixel index out of bounds.
+    void cpy_buf(const std::vector<uint8_t> & new_buf) {
+        buf = new_buf;
+    }
+
+    // read-only access to the raw bytes; throws std::runtime_error for a placeholder
+    const std::vector<uint8_t> & get_ro_buf() const {
+        if (is_placeholder()) {
+            throw std::runtime_error("this clip_image_u8 is a placeholder");
+        }
+        return buf;
+    }
+
+    // note to contributors: NEVER add a get_rw_buf(), it is a DANGEROUS pattern. always use get_pixel / set_pixel for buffer manipulation
+
+    // a placeholder carries dimensions but no pixel data (empty buffer)
+    bool is_placeholder() const {
+        return buf.empty();
+    }
+
+    // RGB triple at (x, y); the coordinates are not bounds-checked
+    std::array<uint8_t, 3> get_pixel(int x, int y) const {
+        if (is_placeholder()) {
+            // return a dummy value, so that legacy code can still process image without errors
+            return { 0, 0, 0 };
+        }
+        const size_t idx = pixel_offset(x, y);
+        return { buf[idx], buf[idx + 1], buf[idx + 2] };
+    }
+
+    // write the RGB triple at (x, y); the coordinates are not bounds-checked
+    void set_pixel(int x, int y, const std::array<uint8_t, 3> & rgb) {
+        if (is_placeholder()) {
+            return; // no-op
+        }
+        const size_t idx = pixel_offset(x, y);
+        buf[idx] = rgb[0];
+        buf[idx + 1] = rgb[1];
+        buf[idx + 2] = rgb[2];
+    }
+
+    // nx * ny, computed from the dimensions (non-zero for a placeholder as well)
+    size_t n_pixels() const {
+        return (size_t) nx * (size_t) ny;
+    }
+
+    // number of bytes a fully populated RGB buffer of this size holds
+    size_t n_elements() const {
+        return n_pixels() * 3;
+    }
+
+  private:
+    // offset of the first channel of pixel (x, y); computed in size_t because
+    // (y * nx + x) * 3 overflows int once the image holds more than INT_MAX / 3 pixels
+    size_t pixel_offset(int x, int y) const {
+        return ((size_t) y * (size_t) nx + (size_t) x) * 3;
+    }
+
    std::vector<uint8_t> buf;
+    int nx = 0;
+    int ny = 0;
 };

 // For images, buf.size() == nx*ny*3
@@ -440,15 +499,87 @@ struct clip_image_u8 {
 // For audio, only one channel is used, buf.size() == nx*ny
 //     nx will be n_frames and ny will be n_mel
 struct clip_image_f32 {
-    int nx;
-    int ny;
-
-    std::vector<float> buf;
-
    // marks the global view in e.g., DeepSeek-OCR Models
    bool add_viewsep = false;
-    // whether a learned newline token should be appended after the image (eg Granite4 Vision)
+    // whether a learned newline (or EOI) token should be appended after the image (eg Granite4 Vision)
    bool add_newline = false;
+
+    // dimensions as { width = nx, height = ny }; also valid for a placeholder
+    clip_image_size get_size() const {
+        return { nx_, ny_ };
+    }
+
+    int nx() const { return nx_; }
+    int ny() const { return ny_; }
+
+    // Store the dimensions and (re)size the storage.
+    // - is_placeholder == true : keep only the dimensions, drop any data
+    // - otherwise              : nx * ny floats for audio, nx * ny * 3 floats for images
+    // note: a zero-area input ends up with an empty buffer, so it also reports
+    //       is_placeholder() == true
+    void set_size(clip_image_size size, bool is_placeholder, bool is_audio) {
+        nx_ = size.width;
+        ny_ = size.height;
+        if (is_placeholder) {
+            buf.clear();
+        } else {
+            if (is_audio) {
+                buf.resize((size_t) nx_ * (size_t) ny_);
+            } else {
+                buf.resize((size_t) nx_ * (size_t) ny_ * 3);
+            }
+        }
+    }
+
+    // Replace the storage with a copy of new_buf; the dimensions are left untouched
+    // and the length is NOT validated against them.
+    void cpy_buf(const std::vector<float> & new_buf) {
+        buf = new_buf;
+    }
+
+    // Take over the size of img and convert its bytes to floats in [0, 1].
+    // A placeholder source yields a placeholder of the same size.
+    void from_u8(const clip_image_u8 & img) {
+        auto size = img.get_size();
+        nx_ = size.width;
+        ny_ = size.height;
+        if (img.is_placeholder()) {
+            buf.clear();
+            return; // nothing to convert: keep the size, hold no data
+        }
+        // size the destination from the real source buffer rather than from the
+        // dimensions, so a source filled through cpy_buf() is never read out of bounds
+        const auto & u8_buf = img.get_ro_buf();
+        buf.resize(u8_buf.size());
+        for (size_t i = 0; i < u8_buf.size(); ++i) {
+            buf[i] = (float) u8_buf[i] / 255.0f;
+        }
+    }
+
+    // nx * ny, computed from the dimensions (non-zero for a placeholder as well)
+    size_t n_pixels() const {
+        return (size_t) nx_ * (size_t) ny_;
+    }
+
+    // element count of a 3-channel image of this size; a single-channel audio
+    // buffer holds only n_pixels() floats, so do not use this to index audio data
+    size_t n_elements() const {
+        return n_pixels() * 3;
+    }
+
+    // Per-channel (x - mean) / std over interleaved 3-channel data; no-op for a placeholder.
+    // The pixel count is taken from the buffer itself, so a buffer that does not hold
+    // nx * ny * 3 floats (e.g. single-channel audio) is never overrun.
+    void normalize(const float mean[3], const float std[3]) {
+        if (is_placeholder()) {
+            return; // no-op
+        }
+        const size_t n_px = buf.size() / 3;
+        for (size_t i = 0; i < n_px; ++i) {
+            buf[i * 3 + 0] = (buf[i * 3 + 0] - mean[0]) / std[0];
+            buf[i * 3 + 1] = (buf[i * 3 + 1] - mean[1]) / std[1];
+            buf[i * 3 + 2] = (buf[i * 3 + 2] - mean[2]) / std[2];
+        }
+    }
+
+    // read-only access to the raw floats; throws std::runtime_error for a placeholder
+    const std::vector<float> & get_ro_buf() const {
+        if (is_placeholder()) {
+            throw std::runtime_error("this clip_image_f32 is a placeholder");
+        }
+        return buf;
+    }
+
+    // note to contributors: NEVER add a get_rw_buf(), it is a DANGEROUS pattern
+
+    // a placeholder carries dimensions but no data (empty buffer)
+    bool is_placeholder() const {
+        return buf.empty();
+    }
+
+  private:
+    std::vector<float> buf;
+    int nx_ = 0;
+    int ny_ = 0;
 };

 //
@@ -39,12 +39,14 @@ static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::s
    }

    // PPM header: P6 format, width, height, and max color value
-    file << "P6\n" << img.nx << " " << img.ny << "\n255\n";
+    const auto ppm_size = img.get_size();
+    file << "P6\n" << ppm_size.width << " " << ppm_size.height << "\n255\n";

    // Write pixel data
-    for (size_t i = 0; i < img.buf.size(); i += 3) {
+    const auto & ppm_buf = img.get_ro_buf();
+    for (size_t i = 0; i < ppm_buf.size(); i += 3) {
        // PPM expects binary data in RGB format, which matches our image buffer
-        file.write(reinterpret_cast<const char*>(&img.buf[i]), 3);
+        file.write(reinterpret_cast<const char*>(&ppm_buf[i]), 3);
    }

    file.close();
@@ -57,9 +59,10 @@ static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string&
        return;
    }

-    int fileSize = 54 + 3 * img.nx * img.ny; // File header + info header + pixel data
+    const auto bmp_size = img.get_size();
+    int fileSize = 54 + 3 * bmp_size.width * bmp_size.height; // File header + info header + pixel data
    int bytesPerPixel = 3;
-    int widthInBytes = img.nx * bytesPerPixel;
+    int widthInBytes = bmp_size.width * bytesPerPixel;
    int paddingAmount = (4 - (widthInBytes % 4)) % 4;
    int stride = widthInBytes + paddingAmount;

@@ -72,7 +75,7 @@ static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string&
    };

    // Total file size
-    fileSize = 54 + (stride * img.ny);
+    fileSize = 54 + (stride * bmp_size.height);
    fileHeader[2] = (unsigned char)(fileSize);
    fileHeader[3] = (unsigned char)(fileSize >> 8);
    fileHeader[4] = (unsigned char)(fileSize >> 16);
@@ -94,14 +97,14 @@ static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string&
    };

    // Width and height in the information header
-    infoHeader[4] = (unsigned char)(img.nx);
-    infoHeader[5] = (unsigned char)(img.nx >> 8);
-    infoHeader[6] = (unsigned char)(img.nx >> 16);
-    infoHeader[7] = (unsigned char)(img.nx >> 24);
-    infoHeader[8] = (unsigned char)(img.ny);
-    infoHeader[9] = (unsigned char)(img.ny >> 8);
-    infoHeader[10] = (unsigned char)(img.ny >> 16);
-    infoHeader[11] = (unsigned char)(img.ny >> 24);
+    infoHeader[4] = (unsigned char)(bmp_size.width);
+    infoHeader[5] = (unsigned char)(bmp_size.width >> 8);
+    infoHeader[6] = (unsigned char)(bmp_size.width >> 16);
+    infoHeader[7] = (unsigned char)(bmp_size.width >> 24);
+    infoHeader[8] = (unsigned char)(bmp_size.height);
+    infoHeader[9] = (unsigned char)(bmp_size.height >> 8);
+    infoHeader[10] = (unsigned char)(bmp_size.height >> 16);
+    infoHeader[11] = (unsigned char)(bmp_size.height >> 24);

    // Write file headers
    file.write(reinterpret_cast<char*>(fileHeader), sizeof(fileHeader));
@@ -109,14 +112,14 @@ static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string&

    // Pixel data
    std::vector<unsigned char> padding(3, 0); // Max padding size to be added to each row
-    for (int y = img.ny - 1; y >= 0; --y) { // BMP files are stored bottom-to-top
-        for (int x = 0; x < img.nx; ++x) {
+    for (int y = bmp_size.height - 1; y >= 0; --y) { // BMP files are stored bottom-to-top
+        for (int x = 0; x < bmp_size.width; ++x) {
            // Each pixel
-            size_t pixelIndex = (y * img.nx + x) * 3;
+            const auto px = img.get_pixel(x, y);
            unsigned char pixel[3] = {
-                img.buf[pixelIndex + 2], // BMP stores pixels in BGR format
-                img.buf[pixelIndex + 1],
-                img.buf[pixelIndex]
+                px[2], // BMP stores pixels in BGR format
+                px[1],
+                px[0]
            };
            file.write(reinterpret_cast<char*>(pixel), 3);
        }
@@ -129,12 +132,13 @@ static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string&

 // debug function to convert f32 to u8
 static void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst) {
-    dst.nx = src.nx;
-    dst.ny = src.ny;
-    dst.buf.resize(3 * src.nx * src.ny);
-    for (size_t i = 0; i < src.buf.size(); ++i) {
-        dst.buf[i] = static_cast<uint8_t>(std::min(std::max(int(src.buf[i] * 255.0f), 0), 255));
+    // fetch the source first: get_ro_buf() throws for a placeholder, and dst must stay untouched in that case
+    const auto & src_buf = src.get_ro_buf();
+    // zero-initialised and sized for a 3-channel image of the source dimensions
+    std::vector<uint8_t> dst_buf(src.n_elements());
+    // never read past the real source data, even if its length disagrees with the dimensions
+    const size_t n_copy = std::min(src_buf.size(), dst_buf.size());
+    for (size_t i = 0; i < n_copy; ++i) {
+        // scale to 0..255 and clamp
+        dst_buf[i] = static_cast<uint8_t>(std::min(std::max(int(src_buf[i] * 255.0f), 0), 255));
    }
+    dst.set_size(src.get_size(), false);
+    dst.cpy_buf(dst_buf);
 }
 #endif

@@ -241,8 +245,8 @@ clip_graph::clip_graph(clip_ctx * ctx, const clip_image_f32 & img) :
        proj_type(ctx->proj_type()),
        img(img),
        patch_size(hparams.patch_size),
-        n_patches_x(img.nx / patch_size),
-        n_patches_y(img.ny / patch_size),
+        n_patches_x(img.nx() / patch_size),
+        n_patches_y(img.ny() / patch_size),
        n_patches(n_patches_x * n_patches_y),
        n_embd(hparams.n_embd),
        n_head(hparams.n_head),
@@ -278,8 +282,8 @@ void clip_graph::cb(ggml_tensor * cur, const char * name, int il) const {
 // siglip2 naflex
 ggml_tensor * clip_graph::resize_position_embeddings(uint32_t interpolation_mode) {
    ggml_tensor * pos_embd = model.position_embeddings;
-    const int height       = img.ny / patch_size;
-    const int width        = img.nx / patch_size;
+    const int height       = img.ny() / patch_size;
+    const int width        = img.nx() / patch_size;
    const uint32_t mode    = interpolation_mode;
    const int n_per_side   = (int)std::sqrt(pos_embd->ne[1]);

@@ -523,7 +527,7 @@ ggml_tensor * clip_graph::build_inp() {
 }

 ggml_tensor * clip_graph::build_inp_raw(int channels) {
-    ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, img.nx, img.ny, channels);
+    ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, img.nx(), img.ny(), channels);
    ggml_set_name(inp_raw, "inp_raw");
    ggml_set_input(inp_raw);
    return inp_raw;
@@ -816,8 +820,8 @@ ggml_tensor * clip_graph::build_patch_merge_permute(ggml_tensor * cur, int scale
    GGML_ASSERT(scale_factor > 1);

    const int n_embd = cur->ne[0];
-    int width  = img.nx / patch_size;
-    int height = img.ny / patch_size;
+    int width  = img.nx() / patch_size;
+    int height = img.ny() / patch_size;

    // pad width and height to factor
    const int64_t pad_width  = CLIP_ALIGN(width,  scale_factor) - width;
@@ -2805,13 +2809,12 @@ struct clip_model_loader {
        clip_image_f32_batch batch;
        clip_image_f32_ptr img(clip_image_f32_init());
        if (ctx_clip.model.modality == CLIP_MODALITY_VISION) {
-            img->nx = hparams.warmup_image_size;
-            img->ny = hparams.warmup_image_size;
-            LOG_INF("%s: warmup with image size = %d x %d\n", __func__, img->nx, img->ny);
+            const int sz = hparams.warmup_image_size;
+            img->set_size({sz, sz}, false, false);
+            LOG_INF("%s: warmup with image size = %d x %d\n", __func__, sz, sz);
        } else {
-            img->nx = hparams.warmup_audio_size;
-            img->ny = hparams.n_mel_bins;
-            LOG_INF("%s: warmup with audio size = %d\n", __func__, img->nx);
+            img->set_size({hparams.warmup_audio_size, hparams.n_mel_bins}, false, false);
+            LOG_INF("%s: warmup with audio size = %d\n", __func__, hparams.warmup_audio_size);
        }
        batch.entries.push_back(std::move(img));
        warmup(ctx_clip, batch);
@@ -3108,12 +3111,6 @@ struct clip_image_f32_batch * clip_image_f32_batch_init() {
    return new clip_image_f32_batch();
 }

-unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny) {
-    if (nx) *nx = img->nx;
-    if (ny) *ny = img->ny;
-    return img->buf.data();
-}
-
 void clip_image_size_free(struct clip_image_size * load_image_size) {
    if (load_image_size == nullptr) {
        return;
@@ -3134,7 +3131,7 @@ size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int id
        LOG_ERR("%s: invalid index %d\n", __func__, idx);
        return 0;
    }
-    return batch->entries[idx]->nx;
+    return batch->entries[idx]->nx();
 }

 size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx) {
@@ -3142,7 +3139,7 @@ size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int id
        LOG_ERR("%s: invalid index %d\n", __func__, idx);
        return 0;
    }
-    return batch->entries[idx]->ny;
+    return batch->entries[idx]->ny();
 }

 clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx) {
@@ -3153,13 +3150,6 @@ clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batc
    return batch->entries[idx].get();
 }

-void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, clip_image_u8 * img) {
-    img->nx = nx;
-    img->ny = ny;
-    img->buf.resize(3 * nx * ny);
-    memcpy(img->buf.data(), rgb_pixels, img->buf.size());
-}
-
 void clip_free(clip_ctx * ctx) {
    if (ctx == nullptr) {
        return;
@@ -3167,20 +3157,6 @@ void clip_free(clip_ctx * ctx) {
    delete ctx;
 }

-// deprecated
-size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
-    const int32_t nx = ctx->model.hparams.image_size;
-    const int32_t ny = ctx->model.hparams.image_size;
-    return clip_embd_nbytes_by_img(ctx, nx, ny);
-}
-
-size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h) {
-    clip_image_f32 img;
-    img.nx = img_w;
-    img.ny = img_h;
-    return clip_n_output_tokens(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float);
-}
-
 int32_t clip_get_image_size(const struct clip_ctx * ctx) { // accessor: the image_size hyperparameter of the context's model
    return ctx->model.hparams.image_size;
 }
@@ -3211,9 +3187,9 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 *
        case PROJECTOR_TYPE_PADDLEOCR:
        case PROJECTOR_TYPE_HUNYUANVL:
        case PROJECTOR_TYPE_YOUTUVL:
-            return (img->nx / params.patch_size) / 2;
+            return (img->nx() / params.patch_size) / 2;
        case PROJECTOR_TYPE_STEP3VL:
-            return img->nx / (params.patch_size * params.n_merge);
+            return img->nx() / (params.patch_size * params.n_merge);
        default:
            break;
    }
@@ -3233,9 +3209,9 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 *
        case PROJECTOR_TYPE_PADDLEOCR:
        case PROJECTOR_TYPE_HUNYUANVL:
        case PROJECTOR_TYPE_YOUTUVL:
-            return (img->ny / params.patch_size) / 2;
+            return (img->ny() / params.patch_size) / 2;
        case PROJECTOR_TYPE_STEP3VL:
-            return img->ny / (params.patch_size * params.n_merge);
+            return img->ny() / (params.patch_size * params.n_merge);
        default:
            break;
    }
@@ -3247,7 +3223,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im

    // for models with fixed size image, the input image is already pre-processed and resized to square
    int patch_size = params.patch_size;
-    int n_patches = (img->nx / patch_size) * (img->ny / patch_size);
+    int n_patches = (img->nx() / patch_size) * (img->ny() / patch_size);

    projector_type proj = ctx->proj_type();

@@ -3313,14 +3289,14 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
        case PROJECTOR_TYPE_YOUTUVL:
            {
                // dynamic size (2 conv, so double patch size)
-                int x_patch = img->nx / (params.patch_size * 2);
-                int y_patch = img->ny / (params.patch_size * 2);
+                int x_patch = img->nx() / (params.patch_size * 2);
+                int y_patch = img->ny() / (params.patch_size * 2);
                n_patches = x_patch * y_patch;
            } break;
        case PROJECTOR_TYPE_STEP3VL:
            {
-                int x_patch = img->nx / (params.patch_size * params.n_merge);
-                int y_patch = img->ny / (params.patch_size * params.n_merge);
+                int x_patch = img->nx() / (params.patch_size * params.n_merge);
+                int y_patch = img->ny() / (params.patch_size * params.n_merge);
                n_patches = x_patch * y_patch;
            } break;
        case PROJECTOR_TYPE_GEMMA3:
@@ -3347,8 +3323,8 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
            {
                // dynamic size
                int out_patch_size = params.patch_size * ctx->model.hparams.n_merge;
-                int x_patch = CLIP_ALIGN(img->nx, out_patch_size) / out_patch_size;
-                int y_patch = CLIP_ALIGN(img->ny, out_patch_size) / out_patch_size;
+                int x_patch = CLIP_ALIGN(img->nx(), out_patch_size) / out_patch_size;
+                int y_patch = CLIP_ALIGN(img->ny(), out_patch_size) / out_patch_size;
                n_patches = x_patch * y_patch;
            } break;
        case PROJECTOR_TYPE_PADDLEOCR:
@@ -3364,8 +3340,8 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
            {
                // dynamic size
                int n_merge = ctx->model.hparams.n_merge;
-                int n_patches_x = img->nx / patch_size / (n_merge > 0 ? n_merge : 1);
-                int n_patches_y = img->ny / patch_size / (n_merge > 0 ? n_merge : 1);
+                int n_patches_x = img->nx() / patch_size / (n_merge > 0 ? n_merge : 1);
+                int n_patches_y = img->ny() / patch_size / (n_merge > 0 ? n_merge : 1);
                if (ctx->model.token_embd_img_break) {
                    n_patches = n_patches_y * n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row
                } else {
@@ -3378,7 +3354,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
        case PROJECTOR_TYPE_MERALION:
        case PROJECTOR_TYPE_MUSIC_FLAMINGO:
            {
-                n_patches = img->nx;
+                n_patches = img->nx();

                const int proj_stack_factor = ctx->model.hparams.proj_stack_factor;
                if (ctx->model.audio_has_stack_frames()) {
@@ -3400,11 +3376,11 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
                // chunk_size=100 frames --> 3x stride-2 conv2d --> 13 tokens per chunk
                const int chunk_size       = 100;
                const int tokens_per_chunk = 13;
-                n_patches = (img->nx / chunk_size) * tokens_per_chunk;
+                n_patches = (img->nx() / chunk_size) * tokens_per_chunk;
            } break;
        case PROJECTOR_TYPE_GLMA:
            {
-                n_patches = img->nx;
+                n_patches = img->nx();
                // whisper downscales input token by half after conv1d
                n_patches /= 2;
                // reshape by merge_factor
@@ -3431,8 +3407,8 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
        case PROJECTOR_TYPE_HUNYUANVL:
            {
                int merge = ctx->model.hparams.n_merge;
-                int ow = (img->nx / patch_size) / merge;
-                int oh = (img->ny / patch_size) / merge;
+                int ow = (img->nx() / patch_size) / merge;
+                int oh = (img->ny() / patch_size) / merge;
                n_patches = (ow + 1) * oh + 2;
            } break;
        case PROJECTOR_TYPE_DEEPSEEKOCR2:
@@ -3446,13 +3422,13 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
        } break;
        case PROJECTOR_TYPE_LFM2A:
            {
-                n_patches = ((((img->nx + 1) / 2) + 1) / 2 + 1) / 2;
+                n_patches = ((((img->nx() + 1) / 2) + 1) / 2 + 1) / 2;
            } break;
        case PROJECTOR_TYPE_GEMMA4A:
            {
                // Two Conv2D stride-2: O = floor((I + 2p - k) / s) + 1, p=1, k=3, s=2
                // O = floor((I - 1) / 2) + 1
-                int n = img->nx;
+                int n = img->nx();
                for (int i = 0; i < 2; i++) {
                    n = (n - 1) / 2 + 1;
                }
@@ -3460,13 +3436,13 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
            } break;
        case PROJECTOR_TYPE_GEMMA4UA:
            {
-                n_patches = img->nx;  // no downsampling: one token per raw waveform frame
+                n_patches = img->nx();  // no downsampling: one token per raw waveform frame
            } break;
        case PROJECTOR_TYPE_GRANITE_SPEECH:
            {
                const int ws = ctx->model.hparams.audio_proj_window_size;
                const int ds = ctx->model.hparams.audio_proj_downsample_rate;
-                n_patches = ((img->nx + ws - 1) / ws) * (ws / ds);
+                n_patches = ((img->nx() + ws - 1) / ws) * (ws / ds);
            } break;
        case PROJECTOR_TYPE_GRANITE4_VISION:
            {
@@ -3475,7 +3451,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
                // For 384×384 input: n = 24/8 = 3, query_side = 4 → 144.
                const int window_side = ctx->model.hparams.downsample_window_side;
                const int query_side  = ctx->model.hparams.downsample_query_side;
-                const int side        = img->nx / params.patch_size;
+                const int side        = img->nx() / params.patch_size;
                const int n           = side / window_side;
                n_patches             = (query_side * n) * (query_side * n);
                if (img->add_newline) {
@@ -3525,8 +3501,8 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
    const auto & model   = ctx->model;
    const auto & hparams = model.hparams;

-    const int image_size_width  = imgs.entries[0]->nx;
-    const int image_size_height = imgs.entries[0]->ny;
+    const int image_size_width  = imgs.entries[0]->nx();
+    const int image_size_height = imgs.entries[0]->ny();

    const int patch_size    = hparams.patch_size;
    const int num_patches   = ((image_size_width / patch_size) * (image_size_height / patch_size));
@@ -3546,7 +3522,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
        return inp;
    };

-    auto set_input_f32 = [&get_inp_tensor](const char * name, std::vector<float> & values) {
+    auto set_input_f32 = [&get_inp_tensor](const char * name, const std::vector<float> & values) {
        ggml_tensor * cur = get_inp_tensor(name);
        GGML_ASSERT(cur->type == GGML_TYPE_F32);
        GGML_ASSERT(ggml_nelements(cur) == (int64_t)values.size());
@@ -3564,7 +3540,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
    if (!imgs.is_audio) {
        size_t nelem = 0;
        for (const auto & img : imgs.entries) {
-            nelem += img->nx * img->ny * 3;
+            nelem += img->nx() * img->ny() * 3;
        }
        std::vector<float> inp_raw(nelem);

@@ -3580,19 +3556,20 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
        //   ──────┘ x B

        for (size_t i = 0; i < imgs.entries.size(); i++) {
-            const int nx = imgs.entries[i]->nx;
-            const int ny = imgs.entries[i]->ny;
+            const int nx = imgs.entries[i]->nx();
+            const int ny = imgs.entries[i]->ny();
            const int n = nx * ny;

            for (int b = 0; b < batch_size; b++) {
+                const auto & buf = imgs.entries[b]->get_ro_buf();
                float * batch_entry = inp_raw.data() + b * (3*n);
                for (int y = 0; y < ny; y++) {
                    for (int x = 0; x < nx; x++) {
                        size_t base_src = 3*(y * nx + x); // idx of the first channel
                        size_t base_dst =    y * nx + x;  // idx of the first channel
-                        batch_entry[      base_dst] = imgs.entries[b]->buf[base_src    ];
-                        batch_entry[1*n + base_dst] = imgs.entries[b]->buf[base_src + 1];
-                        batch_entry[2*n + base_dst] = imgs.entries[b]->buf[base_src + 2];
+                        batch_entry[      base_dst] = buf[base_src    ];
+                        batch_entry[1*n + base_dst] = buf[base_src + 1];
+                        batch_entry[2*n + base_dst] = buf[base_src + 2];
                    }
                }
            }
@@ -3602,12 +3579,14 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
    } else {
        // audio input
        GGML_ASSERT(imgs.entries.size() == 1);
+
        const auto & mel_inp = imgs.entries[0];
-        const int n_step = mel_inp->nx;
-        const int n_mel  = mel_inp->ny;
-        std::vector<float> inp_raw(n_step * n_mel);
-        std::memcpy(inp_raw.data(), mel_inp->buf.data(), n_step * n_mel * sizeof(float));
-        set_input_f32("inp_raw", inp_raw);
+        const auto & buf = mel_inp->get_ro_buf();
+        const int n_step = mel_inp->nx();
+        const int n_mel  = mel_inp->ny();
+        GGML_ASSERT((size_t)n_step * n_mel == buf.size());
+
+        set_input_f32("inp_raw", buf);
    }

    // set input per projector
@@ -4218,7 +4197,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
                GGML_ASSERT(imgs.entries.size() == 1);
                const auto & img0 = imgs.entries.front();
                // Compute n_pos matching SSCP output: two stride-2 convs
-                int n_pos = img0->nx;
+                int n_pos = img0->nx();
                for (int i = 0; i < 2; i++) { n_pos = (n_pos - 1) / 2 + 1; }

                // Chunked local attention: blocked causal mask and RPE
@@ -4324,7 +4303,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
                // reshapes as ggml_get_rows gathers. The names are set
                // by g4v_gather() in models/granite4-vision.cpp.
                const int patch_size  = model.hparams.patch_size;
-                const int image_side  = imgs.entries.front()->nx / patch_size;
+                const int image_side  = imgs.entries.front()->nx() / patch_size;
                const int window_side = hparams.downsample_window_side;
                const int query_side  = hparams.downsample_query_side;
                const int n           = image_side / window_side;
@@ -4570,19 +4549,6 @@ bool clip_has_audio_encoder(const struct clip_ctx * ctx) {
    return ctx->model.modality == CLIP_MODALITY_AUDIO;
 }

-bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
-    clip_image_f32 clip_img;
-    clip_img.buf.resize(h * w * 3);
-    for (int i = 0; i < h*w*3; i++)
-    {
-        clip_img.buf[i] = img[i];
-    }
-    clip_img.nx = w;
-    clip_img.ny = h;
-    clip_image_encode(ctx, n_threads, &clip_img, vec);
-    return true;
-}
-
 //
 // API used internally with mtmd
 //
@@ -4591,17 +4557,6 @@ projector_type clip_get_projector_type(const struct clip_ctx * ctx) {
    return ctx->proj_type();
 }

-void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel, int n_frames, float * mel) {
-    clip_image_f32 * audio = new clip_image_f32;
-    audio->nx = n_frames;
-    audio->ny = n_mel;
-    audio->buf.resize(n_frames * n_mel);
-    std::memcpy(audio->buf.data(), mel, n_frames * n_mel * sizeof(float));
-
-    batch->entries.push_back(clip_image_f32_ptr(audio));
-    batch->is_audio = true;
-}
-
 const clip_hparams * clip_get_hparams(const struct clip_ctx * ctx) { // read-only pointer to the hparams stored in the context's model (no copy is made)
    return &ctx->model.hparams; // points into ctx — presumably only valid while ctx is alive
 }
@@ -17,6 +17,9 @@ struct clip_ctx;
 struct clip_image_size { // plain width/height pair
    int width;
    int height;
+    bool operator==(const clip_image_size & other) const { // equal only when both dimensions match
+        return width == other.width && height == other.height;
+    }
 };

 struct clip_image_f32;
@@ -54,9 +57,6 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params

 void clip_free(struct clip_ctx * ctx);

-size_t clip_embd_nbytes(const struct clip_ctx * ctx);
-size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h);
-
 int32_t clip_get_image_size (const struct clip_ctx * ctx);
 int32_t clip_get_patch_size (const struct clip_ctx * ctx);
 int32_t clip_get_hidden_size(const struct clip_ctx * ctx);
@@ -79,9 +79,6 @@ struct clip_image_u8        * clip_image_u8_init (void);
 struct clip_image_f32       * clip_image_f32_init(void);
 struct clip_image_f32_batch * clip_image_f32_batch_init(void); // only used by libllava

-// nx, ny are the output image dimensions
-unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny);
-
 void clip_image_size_free (struct clip_image_size * img_size);
 void clip_image_u8_free (struct clip_image_u8  * img);
 void clip_image_f32_free(struct clip_image_f32 * img);
@@ -94,12 +91,6 @@ size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int id
 size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->ny
 struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->data

-/**
- * Build image from pixels decoded by other libraries instead of stb_image.h for better performance.
- * The memory layout is RGBRGBRGB..., input buffer length must be 3*nx*ny bytes
- */
-void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img);
-
 bool clip_image_encode      (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
 bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);

@@ -107,11 +98,6 @@ bool clip_is_llava(const struct clip_ctx * ctx);
 // note for contributor: this clip_is_(model) pattern is deprecated
 //                       do NOT add new functions like this

-bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);
-
-// use by audio input
-void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel, int n_frames, float * mel);
-
 bool clip_has_vision_encoder(const struct clip_ctx * ctx);
 bool clip_has_audio_encoder(const struct clip_ctx * ctx);

@@ -1,7 +1,7 @@
 #include "models.h"

 ggml_cgraph * clip_graph_conformer::build() {
-    const int n_frames   = img.nx;
+    const int n_frames   = img.nx();
    const int n_pos      = n_frames / 2;
    const int n_pos_embd = (((((n_frames + 1) / 2) + 1) / 2 + 1) / 2) * 2 - 1;
    GGML_ASSERT(model.position_embeddings->ne[1] >= n_pos);
@@ -22,8 +22,8 @@ ggml_cgraph * clip_graph_exaone4_5::build() {
    ggml_tensor * inp_raw = build_inp_raw();
    ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);

-    GGML_ASSERT(img.nx % (patch_size * 2) == 0);
-    GGML_ASSERT(img.ny % (patch_size * 2) == 0);
+    GGML_ASSERT(img.nx() % (patch_size * 2) == 0);
+    GGML_ASSERT(img.ny() % (patch_size * 2) == 0);

    {
        ggml_tensor * inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
@@ -16,8 +16,8 @@ ggml_cgraph * clip_graph_glm4v::build() {
    ggml_set_name(positions, "positions");
    ggml_set_input(positions);

-    GGML_ASSERT(img.nx % (patch_size * 2) == 0);
-    GGML_ASSERT(img.ny % (patch_size * 2) == 0);
+    GGML_ASSERT(img.nx() % (patch_size * 2) == 0);
+    GGML_ASSERT(img.ny() % (patch_size * 2) == 0);

    // second conv dimension
    {
@@ -1,7 +1,7 @@
 #include "models.h"

 ggml_cgraph * clip_graph_granite_speech::build() {
-    const int n_frames     = img.nx;
+    const int n_frames     = img.nx();
    const int context_size = hparams.audio_chunk_size;
    const int ctc_layer    = n_layer / 2;
    const int conv_kernel  = hparams.audio_conv_kernel_size;
@@ -7,8 +7,8 @@
 // with a w*h? Also the permute is a bit different at (2, 1, 0, 3) instead of (2, 0, 1, 3).
 ggml_tensor * clip_graph_kimik25::resize_position_embeddings_3d(uint32_t interpolation_mode) {
    ggml_tensor * pos_embd = model.position_embeddings;
-    const int height       = img.ny / patch_size;
-    const int width        = img.nx / patch_size;
+    const int height       = img.ny() / patch_size;
+    const int width        = img.nx() / patch_size;
    const uint32_t mode    = interpolation_mode;

    GGML_ASSERT(pos_embd);
@@ -56,8 +56,8 @@ ggml_cgraph * clip_graph_mimovl::build() {
                                           patch_size, patch_size, 0, 0, 1, 1);
        inp = ggml_add(ctx0, inp, inp_1);

-        GGML_ASSERT(img.nx % (patch_size * 2) == 0);
-        GGML_ASSERT(img.ny % (patch_size * 2) == 0);
+        GGML_ASSERT(img.nx() % (patch_size * 2) == 0);
+        GGML_ASSERT(img.ny() % (patch_size * 2) == 0);

        inp = ggml_permute(ctx0, inp, 1, 2, 0, 3);  // [w,h,c,b] -> [c,w,h,b]
        inp = ggml_cont_4d(ctx0, inp, n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
@@ -19,8 +19,8 @@ ggml_cgraph * clip_graph_qwen2vl::build() {
    ggml_tensor * inp_raw = build_inp_raw();
    ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);

-    GGML_ASSERT(img.nx % (patch_size * 2) == 0);
-    GGML_ASSERT(img.ny % (patch_size * 2) == 0);
+    GGML_ASSERT(img.nx() % (patch_size * 2) == 0);
+    GGML_ASSERT(img.ny() % (patch_size * 2) == 0);

    // second conv dimension
    {
@@ -16,8 +16,8 @@ ggml_cgraph * clip_graph_qwen3vl::build() {
    ggml_tensor * inp_raw = build_inp_raw();
    ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);

-    GGML_ASSERT(img.nx % (patch_size * 2) == 0);
-    GGML_ASSERT(img.ny % (patch_size * 2) == 0);
+    GGML_ASSERT(img.nx() % (patch_size * 2) == 0);
+    GGML_ASSERT(img.ny() % (patch_size * 2) == 0);

    // second conv dimension
    {
@@ -1,7 +1,7 @@
 #include "models.h"

 ggml_cgraph * clip_graph_whisper_enc::build() {
-    const int n_frames = img.nx;
+    const int n_frames = img.nx();
    const int n_pos    = n_frames / 2;
    GGML_ASSERT(model.position_embeddings->ne[1] >= n_pos);

@@ -166,7 +166,7 @@ struct mtmd_cli_context {
    }

    bool load_media(const std::string & fname) {
-        mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(ctx_vision.get(), fname.c_str()));
+        mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(ctx_vision.get(), fname.c_str(), false));
        if (!bmp.ptr) {
            return false;
        }
@@ -478,7 +478,7 @@ static bool decode_audio_from_buf(const unsigned char * buf_in, size_t len, int

 } // namespace audio_helpers

-mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len) {
+mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len, bool placeholder) {
    if (audio_helpers::is_audio_file((const char *)buf, len)) {
        std::vector<float> pcmf32;
        const int sample_rate = mtmd_get_audio_sample_rate(ctx);
@@ -490,7 +490,7 @@ mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigne
            LOG_ERR("Unable to read WAV audio file from buffer\n");
            return nullptr;
        }
-        return mtmd_bitmap_init_from_audio(pcmf32.size(), pcmf32.data());
+        return mtmd_bitmap_init_from_audio(pcmf32.size(), placeholder ? nullptr : pcmf32.data());
    }

    // otherwise, we assume it's an image
@@ -502,13 +502,13 @@ mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigne
            LOG_ERR("%s: failed to decode image bytes\n", __func__);
            return nullptr;
        }
-        result = mtmd_bitmap_init(nx, ny, data);
+        result = mtmd_bitmap_init(nx, ny, placeholder ? nullptr : data);
        stbi_image_free(data);
    }
    return result;
 }

-mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname) {
+mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname, bool placeholder) {
    std::vector<unsigned char> buf;
    FILE * f = fopen(fname, "rb");
    if (!f) {
@@ -533,5 +533,6 @@ mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char *
        return nullptr;
    }

-    return mtmd_helper_bitmap_init_from_buf(ctx, buf.data(), buf.size());
+    return mtmd_helper_bitmap_init_from_buf(ctx, buf.data(), buf.size(), placeholder);
 }
+
@@ -29,7 +29,7 @@ MTMD_API void mtmd_helper_log_set(ggml_log_callback log_callback, void * user_da
 // it calls mtmd_helper_bitmap_init_from_buf() internally
 // returns nullptr on failure
 // this function is thread-safe
-MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname);
+MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname, bool placeholder);

 // helper function to construct a mtmd_bitmap from a buffer containing a file
 // supported formats:
@@ -38,7 +38,7 @@ MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, con
 // note: audio files will be auto-detected based on magic bytes
 // returns nullptr on failure
 // this function is thread-safe
-MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len);
+MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len, bool placeholder);

 // helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache
 MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks);
@@ -9,25 +9,12 @@
 //

 void mtmd_image_preprocessor::img_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst, const float mean[3], const float std[3]) {
-    dst.nx = src.nx;
-    dst.ny = src.ny;
-    dst.buf.resize(src.buf.size());
-
-    // TODO @ngxson : seems like this could be done more efficiently on cgraph
-    for (size_t i = 0; i < src.buf.size(); ++i) {
-        int c = i % 3; // rgb
-        dst.buf[i] = (static_cast<float>(src.buf[i]) / 255.0f - mean[c]) / std[c];
-    }
+    dst.from_u8(src);
+    dst.normalize(mean, std);
 }

 void mtmd_image_preprocessor::img_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst) {
-    dst.nx = src.nx;
-    dst.ny = src.ny;
-    dst.buf.resize(src.buf.size());
-
-    for (size_t i = 0; i < src.buf.size(); ++i) {
-        dst.buf[i] = static_cast<float>(src.buf[i]);
-    }
+    dst.from_u8(src);
 }

 // set of tools to manipulate images
@@ -40,13 +27,16 @@ struct img_tool {
            resize_algo algo,
            pad_style padding = PAD_CEIL,
            std::array<uint8_t, 3> pad_color = {0, 0, 0}) {
-        dst.nx = target_resolution.width;
-        dst.ny = target_resolution.height;
-        dst.buf.resize(3 * dst.nx * dst.ny);
+        dst.set_size(target_resolution, src.is_placeholder());

-        if (dst.nx == src.nx && dst.ny == src.ny) {
+        if (src.is_placeholder()) {
+            // no-op for placeholder image, just set the size and return
+            return;
+        }
+
+        if (dst.get_size() == src.get_size()) {
            // no resize needed, simple copy
-            dst.buf = src.buf;
+            dst.cpy_buf(src.get_ro_buf());
            return;
        }

@@ -68,17 +58,17 @@ struct img_tool {
        } else {
            // resize with padding
            clip_image_u8 resized_image;
-            float scale_w = static_cast<float>(target_resolution.width) / src.nx;
-            float scale_h = static_cast<float>(target_resolution.height) / src.ny;
+            float scale_w = static_cast<float>(target_resolution.width) / src.get_size().width;
+            float scale_h = static_cast<float>(target_resolution.height) / src.get_size().height;
            float scale = std::min(scale_w, scale_h);

            int new_width, new_height;
            if (padding == PAD_NEAREST) {
-                new_width  = std::min(static_cast<int>(std::round(src.nx * scale)), target_resolution.width);
-                new_height = std::min(static_cast<int>(std::round(src.ny * scale)), target_resolution.height);
+                new_width  = std::min(static_cast<int>(std::round(src.get_size().width * scale)), target_resolution.width);
+                new_height = std::min(static_cast<int>(std::round(src.get_size().height * scale)), target_resolution.height);
            } else {
-                new_width  = std::min(static_cast<int>(std::ceil(src.nx * scale)), target_resolution.width);
-                new_height = std::min(static_cast<int>(std::ceil(src.ny * scale)), target_resolution.height);
+                new_width  = std::min(static_cast<int>(std::ceil(src.get_size().width * scale)), target_resolution.width);
+                new_height = std::min(static_cast<int>(std::ceil(src.get_size().height * scale)), target_resolution.height);
            }

            switch (algo) {
@@ -112,18 +102,17 @@ struct img_tool {

    static void crop(const clip_image_u8 & image, clip_image_u8 & dst, int x, int y, int w, int h) {
        GGML_ASSERT(x >= 0 && y >= 0 && w > 0 && h > 0);
-        GGML_ASSERT(x + w <= image.nx && y + h <= image.ny);
-        dst.nx = w;
-        dst.ny = h;
-        dst.buf.resize(3 * w * h);
+        GGML_ASSERT(x + w <= image.get_size().width && y + h <= image.get_size().height);
+        dst.set_size({w, h}, image.is_placeholder());
+
+        if (image.is_placeholder()) {
+            // no-op for placeholder image, just set the size and return
+            return;
+        }

        for (int i = 0; i < h; ++i) {
            for (int j = 0; j < w; ++j) {
-                int src_idx = 3 * ((y + i)*image.nx + (x + j));
-                int dst_idx = 3 * (i*w + j);
-                dst.buf[dst_idx]     = image.buf[src_idx];
-                dst.buf[dst_idx + 1] = image.buf[src_idx + 1];
-                dst.buf[dst_idx + 2] = image.buf[src_idx + 2];
+                dst.set_pixel(j, i, image.get_pixel(x + j, y + i));
            }
        }
    }
@@ -181,81 +170,101 @@ struct img_tool {

    // draw src image into dst image at offset (offset_x, offset_y)
    static void composite(clip_image_u8 & dst, const clip_image_u8 & src, int offset_x, int offset_y) {
-        for (int y = 0; y < src.ny; ++y) {
-            for (int x = 0; x < src.nx; ++x) {
+        if (src.is_placeholder()) {
+            // no-op for placeholder image
+            return;
+        }
+
+        const auto src_size = src.get_size();
+        const auto dst_size = dst.get_size();
+        for (int y = 0; y < src_size.height; ++y) {
+            for (int x = 0; x < src_size.width; ++x) {
                int dx = x + offset_x;
                int dy = y + offset_y;
                // skip pixels that would be out of bounds in the destination
-                if (dx < 0 || dy < 0 || dx >= dst.nx || dy >= dst.ny) {
+                if (dx < 0 || dy < 0 || dx >= dst_size.width || dy >= dst_size.height) {
                    continue;
                }
-                size_t dst_idx = 3 * (static_cast<size_t>(dy) * dst.nx + static_cast<size_t>(dx));
-                size_t src_idx = 3 * (static_cast<size_t>(y) * src.nx + static_cast<size_t>(x));
-                dst.buf[dst_idx + 0] = src.buf[src_idx + 0];
-                dst.buf[dst_idx + 1] = src.buf[src_idx + 1];
-                dst.buf[dst_idx + 2] = src.buf[src_idx + 2];
+                dst.set_pixel(dx, dy, src.get_pixel(x, y));
            }
        }
    }

    // fill the image with a solid color
    static void fill(clip_image_u8 & img, const std::array<uint8_t, 3> & color) {
-        for (size_t i = 0; i < img.buf.size(); i += 3) {
-            img.buf[i]     = color[0];
-            img.buf[i + 1] = color[1];
-            img.buf[i + 2] = color[2];
+        if (img.is_placeholder()) {
+            // no-op for placeholder image
+            return;
+        }
+
+        const auto size = img.get_size();
+        for (int y = 0; y < size.height; ++y) {
+            for (int x = 0; x < size.width; ++x) {
+                img.set_pixel(x, y, color);
+            }
        }
    }

 private:
    // Bilinear resize function
    static void resize_bilinear(const clip_image_u8 & src, clip_image_u8 & dst, int target_width, int target_height) {
-        if (src.nx == 0 || src.ny == 0) { dst.nx = dst.ny = 0; dst.buf.clear(); return; }
+        const auto src_size = src.get_size();
+        if (src_size.width == 0 || src_size.height == 0) { dst.set_size({0, 0}, false); return; }
        if (target_width  <= 0) target_width  = 1;
        if (target_height <= 0) target_height = 1;

-        dst.nx = target_width;
-        dst.ny = target_height;
-        dst.buf.resize(3 * target_width * target_height);
+        dst.set_size({target_width, target_height}, src.is_placeholder());

-        float x_ratio = target_width  > 1 ? static_cast<float>(src.nx - 1) / (target_width  - 1) : 0.0f;
-        float y_ratio = target_height > 1 ? static_cast<float>(src.ny - 1) / (target_height - 1) : 0.0f;
+        if (src.is_placeholder()) {
+            // placeholder source: dst stays a placeholder too (size only, no pixel buffer allocated)
+            return;
+        }
+
+        float x_ratio = target_width  > 1 ? static_cast<float>(src_size.width  - 1) / (target_width  - 1) : 0.0f;
+        float y_ratio = target_height > 1 ? static_cast<float>(src_size.height - 1) / (target_height - 1) : 0.0f;

        for (int y = 0; y < target_height; ++y) {
            for (int x = 0; x < target_width; ++x) {
                float px = x * x_ratio;
                float py = y * y_ratio;

-                int x0 = std::min(static_cast<int>(px), src.nx - 1);
-                int y0 = std::min(static_cast<int>(py), src.ny - 1);
-                int x1 = std::min(x0 + 1, src.nx - 1);
-                int y1 = std::min(y0 + 1, src.ny - 1);
+                int x0 = std::min(static_cast<int>(px), src_size.width  - 1);
+                int y0 = std::min(static_cast<int>(py), src_size.height - 1);
+                int x1 = std::min(x0 + 1, src_size.width  - 1);
+                int y1 = std::min(y0 + 1, src_size.height - 1);

                float xf = px - x0;
                float yf = py - y0;

+                const auto p00 = src.get_pixel(x0, y0);
+                const auto p10 = src.get_pixel(x1, y0);
+                const auto p01 = src.get_pixel(x0, y1);
+                const auto p11 = src.get_pixel(x1, y1);
+
+                std::array<uint8_t, 3> pixel;
                for (int c = 0; c < 3; ++c) {
-                    float top    = lerp(static_cast<float>(src.buf[3 * (y0 * src.nx + x0) + c]),
-                                        static_cast<float>(src.buf[3 * (y0 * src.nx + x1) + c]),
-                                        xf);
-                    float bottom = lerp(static_cast<float>(src.buf[3 * (y1 * src.nx + x0) + c]),
-                                        static_cast<float>(src.buf[3 * (y1 * src.nx + x1) + c]),
-                                        xf);
-                    dst.buf[3 * (y * target_width + x) + c] = static_cast<uint8_t>(lerp(top, bottom, yf));
+                    float top    = lerp(static_cast<float>(p00[c]), static_cast<float>(p10[c]), xf);
+                    float bottom = lerp(static_cast<float>(p01[c]), static_cast<float>(p11[c]), xf);
+                    pixel[c] = static_cast<uint8_t>(lerp(top, bottom, yf));
                }
+                dst.set_pixel(x, y, pixel);
            }
        }
    }

    // Bicubic resize function
    // part of image will be cropped if the aspect ratio is different
-    static bool resize_bicubic(const clip_image_u8 & img, clip_image_u8 & dst, int target_width, int target_height) {
-        const int nx = img.nx;
-        const int ny = img.ny;
+    static void resize_bicubic(const clip_image_u8 & img, clip_image_u8 & dst, int target_width, int target_height) {
+        const auto img_size = img.get_size();
+        const int nx = img_size.width;
+        const int ny = img_size.height;

-        dst.nx = target_width;
-        dst.ny = target_height;
-        dst.buf.resize(3 * target_width * target_height);
+        dst.set_size({target_width, target_height}, false);
+
+        if (img.is_placeholder()) {
+            // no-op for placeholder image, just set the size and return
+            return;
+        }

        float Cc;
        float C[5] = {};
@@ -280,12 +289,13 @@ private:
                dx = tx * j - x;
                dy = ty * i - y;

+                std::array<uint8_t, 3> pixel;
                for (k = 0; k < 3; k++) {
                    for (jj = 0; jj <= 3; jj++) {
-                        d0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x - 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
-                        d2 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
-                        d3 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 2, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
-                        a0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
+                        d0 = img.get_pixel(clip(x - 1, 0, nx - 1), clip(y - 1 + jj, 0, ny - 1))[k] - img.get_pixel(clip(x, 0, nx - 1), clip(y - 1 + jj, 0, ny - 1))[k];
+                        d2 = img.get_pixel(clip(x + 1, 0, nx - 1), clip(y - 1 + jj, 0, ny - 1))[k] - img.get_pixel(clip(x, 0, nx - 1), clip(y - 1 + jj, 0, ny - 1))[k];
+                        d3 = img.get_pixel(clip(x + 2, 0, nx - 1), clip(y - 1 + jj, 0, ny - 1))[k] - img.get_pixel(clip(x, 0, nx - 1), clip(y - 1 + jj, 0, ny - 1))[k];
+                        a0 = img.get_pixel(clip(x, 0, nx - 1), clip(y - 1 + jj, 0, ny - 1))[k];

                        a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3;
                        a2 =  1.0 / 2 * d0 +      1.0 / 2 * d2;
@@ -303,13 +313,12 @@ private:
                        Cc = a0 + a1 * dy + a2 * dy * dy + a3 * dy * dy * dy;

                        const uint8_t Cc2 = std::min(std::max(std::round(Cc), 0.0f), 255.0f);
-                        dst.buf[(i * target_width + j) * 3 + k] = float(Cc2);
+                        pixel[k] = Cc2;
                    }
                }
+                dst.set_pixel(j, i, pixel);
            }
        }
-
-        return true;
    }

    // Bicubic resize function using Pillow's ImagingResample algorithm
@@ -455,16 +464,17 @@ private:
        };

        // Horizontal resampling pass
-        // Resizes width from imIn.nx to imOut.nx, preserving height
+        // Resizes width from imIn to out_nx, preserving height
        auto resample_horizontal = [&](const clip_image_u8 & imIn, clip_image_u8 & imOut,
+                                       int out_nx,
                                       int ksize, const std::vector<int> & bounds, const std::vector<int32_t> & weights) {
-            imOut.ny = imIn.ny;
-            imOut.buf.resize(3 * imOut.nx * imOut.ny);
+            const int in_ny = imIn.get_size().height;
+            imOut.set_size({out_nx, in_ny}, false);

            // Process each row independently
-            for (int yy = 0; yy < imOut.ny; yy++) {
+            for (int yy = 0; yy < in_ny; yy++) {
                // For each output pixel in this row
-                for (int xx = 0; xx < imOut.nx; xx++) {
+                for (int xx = 0; xx < out_nx; xx++) {
                    // Get the range of input pixels and filter coefficients
                    int xmin = bounds[xx * 2 + 0];  // First input pixel index
                    int xcnt = bounds[xx * 2 + 1];  // Number of input pixels
@@ -476,36 +486,36 @@ private:

                    // Convolve: sum weighted input pixels
                    for (int x = 0; x < xcnt; x++) {
-                        int src_idx = ((yy * imIn.nx) + (x + xmin)) * 3;
-                        ss0 += static_cast<uint8_t>(imIn.buf[src_idx + 0]) * weights[xx * ksize + x];  // R channel
-                        ss1 += static_cast<uint8_t>(imIn.buf[src_idx + 1]) * weights[xx * ksize + x];  // G channel
-                        ss2 += static_cast<uint8_t>(imIn.buf[src_idx + 2]) * weights[xx * ksize + x];  // B channel
+                        const auto src_px = imIn.get_pixel(x + xmin, yy);
+                        ss0 += src_px[0] * weights[xx * ksize + x];  // R channel
+                        ss1 += src_px[1] * weights[xx * ksize + x];  // G channel
+                        ss2 += src_px[2] * weights[xx * ksize + x];  // B channel
                    }

                    // Convert back from fixed-point (divide by 2^PRECISION_BITS) and clamp to [0,255]
-                    int dst_idx = (yy * imOut.nx + xx) * 3;
-                    imOut.buf[dst_idx + 0] = clip8(ss0 >> PRECISION_BITS);
-                    imOut.buf[dst_idx + 1] = clip8(ss1 >> PRECISION_BITS);
-                    imOut.buf[dst_idx + 2] = clip8(ss2 >> PRECISION_BITS);
+                    imOut.set_pixel(xx, yy, {clip8(ss0 >> PRECISION_BITS),
+                                             clip8(ss1 >> PRECISION_BITS),
+                                             clip8(ss2 >> PRECISION_BITS)});
                }
            }
        };

        // Vertical resampling pass
-        // Resizes height from imIn.ny to imOut.ny, preserving width
+        // Resizes height from imIn to out_ny, preserving width
        auto resample_vertical = [&](const clip_image_u8 & imIn, clip_image_u8 & imOut,
+                                     int out_ny,
                                     int ksize, const std::vector<int> & bounds, const std::vector<int32_t> & weight) {
-            imOut.nx = imIn.nx;
-            imOut.buf.resize(3 * imOut.nx * imOut.ny);
+            const int in_nx = imIn.get_size().width;
+            imOut.set_size({in_nx, out_ny}, false);

            // For each output row
-            for (int yy = 0; yy < imOut.ny; yy++) {
+            for (int yy = 0; yy < out_ny; yy++) {
                // Get the range of input rows and filter coefficients
                int ymin = bounds[yy * 2 + 0];  // First input row index
                int ycnt = bounds[yy * 2 + 1];  // Number of input rows

                // Process each column in this output row
-                for (int xx = 0; xx < imOut.nx; xx++) {
+                for (int xx = 0; xx < in_nx; xx++) {
                    // Initialize accumulators for RGB channels with rounding bias
                    int32_t ss0 = 1 << (PRECISION_BITS - 1);
                    int32_t ss1 = 1 << (PRECISION_BITS - 1);
@@ -513,27 +523,23 @@ private:

                    // Convolve: sum weighted input pixels vertically
                    for (int y = 0; y < ycnt; y++) {
-                        int src_idx = ((y + ymin) * imIn.nx + xx) * 3;
-                        ss0 += static_cast<uint8_t>(imIn.buf[src_idx + 0]) * weight[yy * ksize + y];  // R channel
-                        ss1 += static_cast<uint8_t>(imIn.buf[src_idx + 1]) * weight[yy * ksize + y];  // G channel
-                        ss2 += static_cast<uint8_t>(imIn.buf[src_idx + 2]) * weight[yy * ksize + y];  // B channel
+                        const auto src_px = imIn.get_pixel(xx, y + ymin);
+                        ss0 += src_px[0] * weight[yy * ksize + y];  // R channel
+                        ss1 += src_px[1] * weight[yy * ksize + y];  // G channel
+                        ss2 += src_px[2] * weight[yy * ksize + y];  // B channel
                    }

                    // Convert back from fixed-point and clamp to [0,255]
-                    int dst_idx = (yy * imOut.nx + xx) * 3;
-                    imOut.buf[dst_idx + 0] = clip8(ss0 >> PRECISION_BITS);
-                    imOut.buf[dst_idx + 1] = clip8(ss1 >> PRECISION_BITS);
-                    imOut.buf[dst_idx + 2] = clip8(ss2 >> PRECISION_BITS);
+                    imOut.set_pixel(xx, yy, {clip8(ss0 >> PRECISION_BITS),
+                                             clip8(ss1 >> PRECISION_BITS),
+                                             clip8(ss2 >> PRECISION_BITS)});
                }
            }
        };

        // Main resampling logic using separable two-pass approach
-        const int src_width = img.nx;
-        const int src_height = img.ny;
-
-        dst.nx = target_width;
-        dst.ny = target_height;
+        const int src_width  = img.get_size().width;
+        const int src_height = img.get_size().height;

        bool need_horizontal = (target_width != src_width);
        bool need_vertical = (target_height != src_height);
@@ -555,18 +561,20 @@ private:
        if (need_horizontal && need_vertical) {
            // Both horizontal and vertical
            clip_image_u8 temp;
-            temp.nx = target_width;
-            resample_horizontal(img, temp, ksize_horiz, bounds_horiz, weights_horiz);
-            resample_vertical(temp, dst, ksize_vert, bounds_vert, weights_vert);
+            resample_horizontal(img, temp, target_width, ksize_horiz, bounds_horiz, weights_horiz);
+            resample_vertical(temp, dst, target_height, ksize_vert, bounds_vert, weights_vert);
        } else if (need_horizontal) {
            // Only horizontal
-            resample_horizontal(img, dst, ksize_horiz, bounds_horiz, weights_horiz);
+            resample_horizontal(img, dst, target_width, ksize_horiz, bounds_horiz, weights_horiz);
        } else if (need_vertical) {
            // Only vertical
-            resample_vertical(img, dst, ksize_vert, bounds_vert, weights_vert);
+            resample_vertical(img, dst, target_height, ksize_vert, bounds_vert, weights_vert);
        } else {
            // No resizing needed - direct copy
-            dst.buf = img.buf;
+            dst.set_size(img.get_size(), img.is_placeholder());
+            if (!img.is_placeholder()) {
+                dst.cpy_buf(img.get_ro_buf());
+            }
        }

        return true;
@@ -588,7 +596,7 @@ private:
 //

 bool mtmd_image_preprocessor_llava_uhd::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
-    const clip_image_size original_size{img.nx, img.ny};
+    const clip_image_size original_size = img.get_size();
    auto const inst = get_slice_instructions(original_size);
    std::vector<clip_image_u8_ptr> imgs = slice_image(img, inst);

@@ -883,7 +891,7 @@ bool mtmd_image_preprocessor_fixed_size::preprocess(const clip_image_u8 & img, c
 bool mtmd_image_preprocessor_dyn_size::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
    GGML_ASSERT(hparams.image_min_pixels > 0 && hparams.image_max_pixels > 0);
    clip_image_u8 resized_image;
-    const clip_image_size original_size{img.nx, img.ny};
+    const clip_image_size original_size = img.get_size();
    // the original pixtral model doesn't have n_merge
    const int cur_merge = hparams.n_merge == 0 ? 1 : hparams.n_merge;
    const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
@@ -908,7 +916,7 @@ bool mtmd_image_preprocessor_dyn_size::preprocess(const clip_image_u8 & img, cli
 bool mtmd_image_preprocessor_longest_edge::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
    GGML_ASSERT(hparams.image_longest_edge > 0);
    clip_image_u8 resized_image;
-    const clip_image_size original_size{img.nx, img.ny};
+    const clip_image_size original_size = img.get_size();
    // the original pixtral model doesn't have n_merge
    const int cur_merge = hparams.n_merge == 0 ? 1 : hparams.n_merge;
    const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
@@ -1040,7 +1048,7 @@ bool mtmd_image_preprocessor_idefics3::preprocess(const clip_image_u8 & img, cli
    //      multiples of image_size (always rounding up)
    //
    // CITE: https://github.com/huggingface/transformers/blob/main/src/transformers/models/idefics3/image_processing_idefics3.py#L737
-    const clip_image_size original_size{img.nx, img.ny};
+    const clip_image_size original_size = img.get_size();
    const clip_image_size refined_size = img_tool::calc_size_preserved_ratio(
        original_size, hparams.image_size, hparams.image_longest_edge);
    // LOG_INF("%s: original size: %d x %d, refined size: %d x %d\n",
@@ -1088,7 +1096,7 @@ bool mtmd_image_preprocessor_idefics3::preprocess(const clip_image_u8 & img, cli

 bool mtmd_image_preprocessor_internvl::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
    GGML_ASSERT(!hparams.image_res_candidates.empty());
-    const clip_image_size original_size{img.nx, img.ny};
+    const clip_image_size original_size = img.get_size();
    auto const inst = get_slice_instructions(original_size);
    std::vector<clip_image_u8_ptr> imgs = slice_image(img, inst, false);

@@ -1108,7 +1116,7 @@ bool mtmd_image_preprocessor_deepseekocr::preprocess(const clip_image_u8 & img,
    static constexpr int native_resolutions[] = { 1024 /* base */, 1280 /* large */ };
    // TODO: support 512 (tiny) and 640 (small) once we have eval data for them

-    const int64_t orig_area = static_cast<int64_t>(img.nx) * img.ny;
+    const int64_t orig_area = static_cast<int64_t>(img.n_pixels());

    size_t  mode_i   = 0;
    int64_t min_diff = std::numeric_limits<int64_t>::max();
@@ -1201,10 +1209,11 @@ bool mtmd_image_preprocessor_deepseekocr2::preprocess(const clip_image_u8 & img,
    // emit 768x768 local tiles when the image is larger than a tile in either
    // dimension, then always a 1024x1024 global view. order: [tiles..., global].

-    if (img.nx > tile_size || img.ny > tile_size) {
-        const float           aspect_ratio  = static_cast<float>(img.nx) / img.ny;
+    const auto img_size = img.get_size();
+    if (img_size.width > tile_size || img_size.height > tile_size) {
+        const float           aspect_ratio  = static_cast<float>(img_size.width) / img_size.height;
        const auto            target_ratios = get_target_ratios();
-        const clip_image_size grid          = find_closest_aspect_ratio(aspect_ratio, target_ratios, img.nx, img.ny);
+        const clip_image_size grid          = find_closest_aspect_ratio(aspect_ratio, target_ratios, img_size.width, img_size.height);

        // stretch onto the grid (no aspect preserve), then crop tiles row-major.
        clip_image_u8 refined;
@@ -1247,50 +1256,57 @@ void mtmd_image_preprocessor_step3vl::img_u8_resize_bilinear_to_f32(
        int target_height,
        const float mean[3],
        const float std[3]) {
-    if (src.nx == target_width && src.ny == target_height) {
+    const auto src_size = src.get_size();
+    if (src_size.width == target_width && src_size.height == target_height) {
        img_u8_to_f32(src, dst, mean, std);
        return;
    }

-    dst.nx = target_width;
-    dst.ny = target_height;
-    dst.buf.resize(3 * target_width * target_height);
+    dst.set_size({target_width, target_height}, false, false);

-    const float scale_x = static_cast<float>(src.nx) / target_width;
-    const float scale_y = static_cast<float>(src.ny) / target_height;
+    if (src.is_placeholder()) {
+        // no-op for placeholder image, just set the size and return
+        return;
+    }
+
+    const float scale_x = static_cast<float>(src_size.width)  / target_width;
+    const float scale_y = static_cast<float>(src_size.height) / target_height;
+
+    std::vector<float> local_buf(3 * target_width * target_height);

    for (int y = 0; y < target_height; ++y) {
        const float src_y = (static_cast<float>(y) + 0.5f) * scale_y - 0.5f;
        const int y0_floor = static_cast<int>(std::floor(src_y));
-        const int y0 = std::max(0, std::min(y0_floor, src.ny - 1));
-        const int y1 = std::max(0, std::min(y0_floor + 1, src.ny - 1));
+        const int y0 = std::max(0, std::min(y0_floor,     src_size.height - 1));
+        const int y1 = std::max(0, std::min(y0_floor + 1, src_size.height - 1));
        const float ly = src_y - y0_floor;

        for (int x = 0; x < target_width; ++x) {
            const float src_x = (static_cast<float>(x) + 0.5f) * scale_x - 0.5f;
            const int x0_floor = static_cast<int>(std::floor(src_x));
-            const int x0 = std::max(0, std::min(x0_floor, src.nx - 1));
-            const int x1 = std::max(0, std::min(x0_floor + 1, src.nx - 1));
+            const int x0 = std::max(0, std::min(x0_floor,     src_size.width - 1));
+            const int x1 = std::max(0, std::min(x0_floor + 1, src_size.width - 1));
            const float lx = src_x - x0_floor;

-            const size_t idx00 = 3 * (y0 * src.nx + x0);
-            const size_t idx01 = 3 * (y0 * src.nx + x1);
-            const size_t idx10 = 3 * (y1 * src.nx + x0);
-            const size_t idx11 = 3 * (y1 * src.nx + x1);
-            const size_t idx_dst = 3 * (y * target_width + x);
+            const auto p00 = src.get_pixel(x0, y0);
+            const auto p01 = src.get_pixel(x1, y0);
+            const auto p10 = src.get_pixel(x0, y1);
+            const auto p11 = src.get_pixel(x1, y1);

+            const size_t idx_dst = 3 * (y * target_width + x);
            for (int c = 0; c < 3; ++c) {
-                const float v00 = (static_cast<float>(src.buf[idx00 + c]) / 255.0f - mean[c]) / std[c];
-                const float v01 = (static_cast<float>(src.buf[idx01 + c]) / 255.0f - mean[c]) / std[c];
-                const float v10 = (static_cast<float>(src.buf[idx10 + c]) / 255.0f - mean[c]) / std[c];
-                const float v11 = (static_cast<float>(src.buf[idx11 + c]) / 255.0f - mean[c]) / std[c];
+                const float v00 = (static_cast<float>(p00[c]) / 255.0f - mean[c]) / std[c];
+                const float v01 = (static_cast<float>(p01[c]) / 255.0f - mean[c]) / std[c];
+                const float v10 = (static_cast<float>(p10[c]) / 255.0f - mean[c]) / std[c];
+                const float v11 = (static_cast<float>(p11[c]) / 255.0f - mean[c]) / std[c];

                const float top = v00 + (v01 - v00) * lx;
                const float bot = v10 + (v11 - v10) * lx;
-                dst.buf[idx_dst + c] = top + (bot - top) * ly;
+                local_buf[idx_dst + c] = top + (bot - top) * ly;
            }
        }
    }
+    dst.cpy_buf(local_buf);
 }

 int mtmd_image_preprocessor_step3vl::get_image_longest_edge(const clip_hparams & params) {
@@ -1341,26 +1357,26 @@ std::vector<int> mtmd_image_preprocessor_step3vl::calc_grid(int length, int wind

 clip_image_u8 mtmd_image_preprocessor_step3vl::prepare_image(const clip_image_u8 & img, const clip_hparams & params) {
    clip_image_u8 resized = img;
-    const float aspect_ratio = img.ny > 0 ? static_cast<float>(img.nx) / img.ny : 1.0f;
-    if (std::min(img.nx, img.ny) < 32 &&
+    const auto img_size = img.get_size();
+    const float aspect_ratio = img_size.height > 0 ? static_cast<float>(img_size.width) / img_size.height : 1.0f;
+    if (std::min(img_size.width, img_size.height) < 32 &&
        (aspect_ratio > wide_aspect_ratio_limit ||
         aspect_ratio < 1.0f / wide_aspect_ratio_limit)) {
-        const int square_size = std::max(img.nx, img.ny);
+        const int square_size = std::max(img_size.width, img_size.height);
        clip_image_u8 padded;
-        padded.nx = square_size;
-        padded.ny = square_size;
-        padded.buf.resize(3 * square_size * square_size);
+        padded.set_size({square_size, square_size}, false);
        img_tool::fill(padded, {0, 0, 0});
        img_tool::composite(padded, img, 0, 0);
        resized = std::move(padded);
    }

    const int max_image_size = get_image_longest_edge(params);
-    if (std::max(resized.nx, resized.ny) > max_image_size) {
-        const float scale = static_cast<float>(max_image_size) / std::max(resized.nx, resized.ny);
+    const auto resized_size = resized.get_size();
+    if (std::max(resized_size.width, resized_size.height) > max_image_size) {
+        const float scale = static_cast<float>(max_image_size) / std::max(resized_size.width, resized_size.height);
        const clip_image_size new_size = {
-            std::max(1, static_cast<int>(std::floor(resized.nx * scale))),
-            std::max(1, static_cast<int>(std::floor(resized.ny * scale))),
+            std::max(1, static_cast<int>(std::floor(resized_size.width  * scale))),
+            std::max(1, static_cast<int>(std::floor(resized_size.height * scale))),
        };
        clip_image_u8 scaled;
        img_tool::resize(resized, scaled, new_size, RESIZE_ALGO_BILINEAR, PAD_NONE);
@@ -1372,14 +1388,14 @@ clip_image_u8 mtmd_image_preprocessor_step3vl::prepare_image(const clip_image_u8

 clip_image_u8 mtmd_image_preprocessor_step3vl::crop_with_black_padding(const clip_image_u8 & image, int x, int y, int w, int h) {
    clip_image_u8 dst;
-    dst.nx = w;
-    dst.ny = h;
-    dst.buf.resize(3 * w * h, 0);
+    dst.set_size({w, h}, false);
+    img_tool::fill(dst, {0, 0, 0});

+    const auto img_size = image.get_size();
    const int src_x0 = std::max(0, x);
    const int src_y0 = std::max(0, y);
-    const int src_x1 = std::min(image.nx, x + w);
-    const int src_y1 = std::min(image.ny, y + h);
+    const int src_x1 = std::min(img_size.width,  x + w);
+    const int src_y1 = std::min(img_size.height, y + h);

    if (src_x0 >= src_x1 || src_y0 >= src_y1) {
        return dst;
@@ -1390,11 +1406,7 @@ clip_image_u8 mtmd_image_preprocessor_step3vl::crop_with_black_padding(const cli

    for (int yy = 0; yy < src_y1 - src_y0; ++yy) {
        for (int xx = 0; xx < src_x1 - src_x0; ++xx) {
-            const int src_idx = 3 * ((src_y0 + yy) * image.nx + (src_x0 + xx));
-            const int dst_idx = 3 * ((dst_y0 + yy) * w + (dst_x0 + xx));
-            dst.buf[dst_idx + 0] = image.buf[src_idx + 0];
-            dst.buf[dst_idx + 1] = image.buf[src_idx + 1];
-            dst.buf[dst_idx + 2] = image.buf[src_idx + 2];
+            dst.set_pixel(dst_x0 + xx, dst_y0 + yy, image.get_pixel(src_x0 + xx, src_y0 + yy));
        }
    }

@@ -1443,7 +1455,7 @@ mtmd_image_preprocessor_step3vl::slice_instructions mtmd_image_preprocessor_step

 bool mtmd_image_preprocessor_step3vl::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
    clip_image_u8 prepared = prepare_image(img, hparams);
-    const auto instructions = build_slice_instructions(hparams, {prepared.nx, prepared.ny});
+    const auto instructions = build_slice_instructions(hparams, prepared.get_size());

    clip_image_f32_ptr overview_f32(clip_image_f32_init());
    img_u8_resize_bilinear_to_f32(
@@ -1462,7 +1474,8 @@ bool mtmd_image_preprocessor_step3vl::preprocess(const clip_image_u8 & img, clip
    }

    clip_image_u8 img_for_crop = prepared;
-    if (instructions.refined_size.width != prepared.nx || instructions.refined_size.height != prepared.ny) {
+    const auto prepared_size = prepared.get_size();
+    if (instructions.refined_size.width != prepared_size.width || instructions.refined_size.height != prepared_size.height) {
        clip_image_u8 refined;
        img_tool::resize(prepared, refined, instructions.refined_size, RESIZE_ALGO_BILINEAR, PAD_NONE);
        img_for_crop = std::move(refined);
@@ -1503,9 +1516,10 @@ bool mtmd_image_preprocessor_youtuvl::preprocess(const clip_image_u8 & img, clip
        hparams.image_max_pixels / (patch_size * patch_size) : 256;

    // Linear search for optimal scale to fit within max_num_patches
+    const auto img_size = img.get_size();
    float scale = 1.0f;
-    int target_height = img.ny;
-    int target_width  = img.nx;
+    int target_height = img_size.height;
+    int target_width  = img_size.width;

    auto get_scaled_image_size = [align_size](float scale, int size) -> int {
        float scaled_size = size * scale;
@@ -1517,8 +1531,8 @@ bool mtmd_image_preprocessor_youtuvl::preprocess(const clip_image_u8 & img, clip

    // Linear search with 0.02 step size
    while (scale > 0.0f) {
-        target_height = get_scaled_image_size(scale, img.ny);
-        target_width  = get_scaled_image_size(scale, img.nx);
+        target_height = get_scaled_image_size(scale, img_size.height);
+        target_width  = get_scaled_image_size(scale, img_size.width);

        int num_patches_h = target_height / patch_size;
        int num_patches_w = target_width / patch_size;
@@ -26,12 +26,46 @@

 // represents raw image data, layout is RGBRGBRGB...
 // length of data must be nx * ny * 3
+// for audio bitmap: nx = sample count, ny = 1, layout is F32 F32 F32 ...
+// length of data must be nx * sizeof(float); a placeholder bitmap (image or audio) keeps nx/ny but has empty data
 struct mtmd_bitmap {
-    uint32_t nx;
-    uint32_t ny;
-    std::vector<unsigned char> data;
+    uint32_t nx = 0;
+    uint32_t ny = 0;
    std::string id; // optional user-defined id, for ex: can be set to image hash, useful for KV cache tracking
    bool is_audio = false; // true if the bitmap is audio
+
+    mtmd_bitmap(const unsigned char * data, uint32_t nx, uint32_t ny)
+        : nx(nx), ny(ny) {
+        if (data) {
+            size_t data_size = (size_t)nx * ny * 3;
+            this->data.resize(data_size);
+            std::memcpy(this->data.data(), data, data_size);
+        }
+    }
+
+    mtmd_bitmap(const unsigned char * data, uint32_t n_samples)
+        : nx(n_samples), ny(1), is_audio(true) {
+        if (data) {
+            size_t data_size = (size_t)nx * sizeof(float);
+            this->data.resize(data_size);
+            std::memcpy(this->data.data(), data, data_size);
+        }
+    }
+
+    const std::vector<unsigned char> & get_ro_buf() const {
+        return data;
+    }
+
+    bool is_placeholder() const {
+        return data.empty();
+    }
+
+    size_t n_bytes() const {
+        return data.size();
+    }
+
+  private:
+    std::vector<unsigned char> data;
 };

 // position indexing for decoder model
@@ -42,8 +76,8 @@ enum mtmd_pos_type {
 };

 struct mtmd_image_tokens {
-    uint32_t nx; // number of tokens in x direction
-    uint32_t ny; // number of tokens in y direction
+    uint32_t nx = 0; // number of tokens in x direction
+    uint32_t ny = 0; // number of tokens in y direction
    mtmd_pos_type pos = MTMD_POS_TYPE_NORMAL;
    uint32_t image_idx = 0; // 0-based position of this image among image chunks in the prompt(used by pos == MTMD_POS_TYPE_HUNYUANVL)
    uint32_t n_tokens() const {
@@ -56,6 +90,16 @@ struct mtmd_image_tokens {
    clip_image_f32_batch batch_f32; // preprocessed image patches
    std::string id; // optional user-defined ID, useful for KV cache tracking

+    // true if any entry in batch_f32 is a placeholder
+    bool is_placeholder() const {
+        for (const auto & entry : batch_f32.entries) {
+            if (entry->is_placeholder()) {
+                return true;
+            }
+        }
+        return false;
+    }
+
    mtmd_image_tokens clone() {
        return mtmd_image_tokens{
            nx,
@@ -70,10 +114,20 @@ struct mtmd_image_tokens {
 using mtmd_image_tokens_ptr = std::unique_ptr<mtmd_image_tokens>;

 struct mtmd_audio_tokens {
-    uint32_t n_tokens; // number of tokens
+    uint32_t n_tokens = 0; // number of tokens
    clip_image_f32_batch batch_f32; // preprocessed image patches
    std::string id; // optional user-defined ID, useful for KV cache tracking

+    // true if any entry in batch_f32 is a placeholder
+    bool is_placeholder() const {
+        for (const auto & entry : batch_f32.entries) {
+            if (entry->is_placeholder()) {
+                return true;
+            }
+        }
+        return false;
+    }
+
    mtmd_audio_tokens clone() {
        return mtmd_audio_tokens{
            n_tokens,
@@ -795,16 +849,19 @@ struct mtmd_tokenizer {
            }

            // sanity check
-            GGML_ASSERT(bitmap->nx > 0 && bitmap->ny > 0);
-            GGML_ASSERT(bitmap->data.size() == (size_t)bitmap->nx * bitmap->ny * 3);
+            if (bitmap->nx <= 0 || bitmap->ny <= 0) {
+                LOG_ERR("%s: error: invalid bitmap dimensions: nx = %d, ny = %d\n",
+                        __func__, bitmap->nx, bitmap->ny);
+                return 2;
+            }
            GGML_ASSERT(ctx->image_preproc != nullptr);

            // convert mtmd_bitmap to clip_image_u8
            clip_image_u8_ptr img_u8(clip_image_u8_init());
-            img_u8->nx = bitmap->nx;
-            img_u8->ny = bitmap->ny;
-            img_u8->buf.resize(bitmap->data.size());
-            std::memcpy(img_u8->buf.data(), bitmap->data.data(), img_u8->nx * img_u8->ny * 3);
+            img_u8->set_size(
+                {(int)bitmap->nx, (int)bitmap->ny},
+                bitmap->is_placeholder());
+            img_u8->cpy_buf(bitmap->get_ro_buf());

            // preprocess image
            clip_image_f32_batch batch_f32;
@@ -949,7 +1006,7 @@ struct mtmd_tokenizer {
                return 2;
            }

-            if (bitmap->data.size() == 0) {
+            if (bitmap->nx == 0) {
                LOG_ERR("%s: error: empty audio data\n", __func__);
                return 2;
            }
@@ -960,26 +1017,46 @@ struct mtmd_tokenizer {

            // sanity check
            GGML_ASSERT(ctx->audio_preproc != nullptr);
-            GGML_ASSERT(bitmap->data.size() > sizeof(float));
-            GGML_ASSERT(bitmap->data.size() % sizeof(float) == 0);

            // preprocess audio
            std::vector<mtmd_audio_mel> mel_spec_chunks;
-            const float * samples = (const float *)bitmap->data.data();
-            size_t n_samples = bitmap->data.size() / sizeof(float);
-            bool ok = ctx->audio_preproc->preprocess(samples, n_samples, mel_spec_chunks);
-            if (!ok) {
-                LOG_ERR("Unable to preprocess audio\n");
-                return 2;
+            {
+                std::vector<float> dummy;
+                const float * samples = nullptr;
+                size_t n_samples = 0;
+                if (bitmap->is_placeholder()) {
+                    // TODO @ngxson : skip the underlying processing if the bitmap is a placeholder
+                    GGML_ASSERT(bitmap->ny == 1);
+
+                    dummy.resize(bitmap->nx);
+                    samples = dummy.data();
+                    n_samples = dummy.size();
+                } else {
+                    const auto & buf = bitmap->get_ro_buf();
+                    GGML_ASSERT(buf.size() > sizeof(float));
+                    GGML_ASSERT(buf.size() % sizeof(float) == 0);
+
+                    samples = (const float *)buf.data();
+                    n_samples = buf.size() / sizeof(float);
+                }
+                bool ok = ctx->audio_preproc->preprocess(samples, n_samples, mel_spec_chunks);
+                if (!ok) {
+                    LOG_ERR("Unable to preprocess audio\n");
+                    return 2;
+                }
            }

            // consider each mel_spec as a separate audio chunk
            // TODO: maybe support batching, but this may come with memory cost
            for (auto & mel_spec : mel_spec_chunks) {
+                const bool is_placeholder = mel_spec.data.empty();
+
                clip_image_f32_ptr mel_f32(clip_image_f32_init());
-                mel_f32->nx  = mel_spec.n_len;
-                mel_f32->ny  = mel_spec.n_mel;
-                mel_f32->buf = std::move(mel_spec.data);
+                mel_f32->set_size(
+                    {mel_spec.n_len, mel_spec.n_mel},
+                    is_placeholder, /* is_audio */ true);
+                mel_f32->cpy_buf(mel_spec.data);
+
                size_t n_tokens = clip_n_output_tokens(ctx->ctx_a, mel_f32.get());

                clip_image_f32_batch batch_f32;
@@ -1098,12 +1175,28 @@ int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) {
            LOG_ERR("%s: model does not support vision input\n", __func__);
            return 1;
        }
+        if (chunk->tokens_image == nullptr) {
+            LOG_ERR("%s: image tokens are null\n", __func__);
+            return 1;
+        }
+        if (chunk->tokens_image->is_placeholder()) {
+            LOG_ERR("%s: image tokens batch is placeholder\n", __func__);
+            return 1;
+        }
        return mtmd_encode(ctx, chunk->tokens_image.get());
    } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
        if (!ctx->ctx_a) {
            LOG_ERR("%s: model does not support audio input\n", __func__);
            return 1;
        }
+        if (chunk->tokens_audio == nullptr) {
+            LOG_ERR("%s: audio tokens are null\n", __func__);
+            return 1;
+        }
+        if (chunk->tokens_audio->is_placeholder()) {
+            LOG_ERR("%s: audio tokens batch is placeholder\n", __func__);
+            return 1;
+        }
        int n_mmproj_embd = ctx->n_embd_text;
        ctx->image_embd_v.resize(chunk->tokens_audio->n_tokens * n_mmproj_embd);
        bool ok = clip_image_batch_encode(
@@ -1141,6 +1234,10 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens)
        // e.g., DeepSeek-OCR-2: 144 per tile views, 257 for the global view
        size_t offset = 0;
        for (size_t i = 0; i < entries.size(); i++) {
+            if (entries[i]->is_placeholder()) {
+                LOG_ERR("%s: image tokens batch entry %zu is placeholder\n", __func__, i);
+                return 1;
+            }
            int n_tokens_per_image = clip_n_output_tokens(ctx_clip, entries[i].get());
            ok = clip_image_encode(
                ctx_clip,
@@ -1150,6 +1247,10 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens)
            offset += static_cast<size_t>(n_mmproj_embd) * n_tokens_per_image;
        }
    } else {
+        if (image_tokens->is_placeholder()) {
+            LOG_ERR("%s: image tokens batch is placeholder\n", __func__);
+            return 1;
+        }
        ok = clip_image_batch_encode(
            ctx_clip,
            ctx->n_threads,
@@ -1207,24 +1308,17 @@ int mtmd_get_audio_sample_rate(const mtmd_context * ctx) {
 mtmd_bitmap * mtmd_bitmap_init(uint32_t nx,
                               uint32_t ny,
                               const unsigned char * data) {
-    mtmd_bitmap * bitmap = new mtmd_bitmap;
-    bitmap->nx = nx;
-    bitmap->ny = ny;
-    size_t data_size = (size_t)nx * ny * 3;
-    bitmap->data.resize(data_size);
-    std::memcpy(bitmap->data.data(), data, data_size);
+    mtmd_bitmap * bitmap = new mtmd_bitmap(data, nx, ny);
    return bitmap;
 }

 mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples,
                                          const float * data) {
-    mtmd_bitmap * bitmap = new mtmd_bitmap;
-    bitmap->nx = n_samples;
-    bitmap->ny = 1;
-    bitmap->is_audio = true;
-    size_t data_size = n_samples * sizeof(float);
-    bitmap->data.resize(data_size);
-    std::memcpy(bitmap->data.data(), data, data_size);
+    mtmd_bitmap * bitmap = new mtmd_bitmap((const unsigned char *)data, n_samples);
+    GGML_ASSERT(bitmap->is_audio);
+    if (!bitmap->is_placeholder()) {
+        GGML_ASSERT(bitmap->get_ro_buf().size() == n_samples * sizeof(float));
+    }
    return bitmap;
 }

@@ -1237,11 +1331,11 @@ uint32_t mtmd_bitmap_get_ny(const mtmd_bitmap * bitmap) {
 }

 const unsigned char * mtmd_bitmap_get_data(const mtmd_bitmap * bitmap) {
-    return bitmap->data.data();
+    return bitmap->get_ro_buf().data();
 }

 size_t mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap) {
-    return bitmap->data.size();
+    return bitmap->get_ro_buf().size();
 }

 bool mtmd_bitmap_is_audio(const mtmd_bitmap * bitmap) {
@@ -1535,14 +1629,16 @@ void mtmd_debug_encode_image(mtmd_context * ctx, const std::vector<std::vector<f
        LOG_ERR("%s: model does not support vision input\n", __func__);
        return;
    }
-    clip_image_f32 inp_image;
-    inp_image.nx = image.size();
-    inp_image.ny = inp_image.nx;
-    inp_image.buf.reserve(inp_image.nx * inp_image.ny);
+    const int img_sz = (int)image.size();
+    std::vector<float> img_buf;
+    img_buf.reserve(img_sz * img_sz);
    for (const auto & row : image) {
-        inp_image.buf.insert(inp_image.buf.end(), row.begin(), row.end());
+        img_buf.insert(img_buf.end(), row.begin(), row.end());
    }
-    LOG_INF("%s: created input image with nx=%d, ny=%d\n", __func__, inp_image.nx, inp_image.ny);
+    clip_image_f32 inp_image;
+    inp_image.set_size({img_sz, img_sz}, false, false);
+    inp_image.cpy_buf(img_buf);
+    LOG_INF("%s: created input image with nx=%d, ny=%d\n", __func__, img_sz, img_sz);
    mtmd_debug_encode_impl(ctx, ctx->ctx_v, inp_image);
 }

@@ -1552,16 +1648,17 @@ void mtmd_debug_encode_audio(mtmd_context * ctx, const std::vector<float> & inpu
        return;
    }
    int n_mel = clip_get_hparams(ctx->ctx_a)->n_mel_bins;
-    clip_image_f32 inp_audio;
-    inp_audio.nx = input.size();
-    inp_audio.ny = n_mel;
-    inp_audio.buf.resize(input.size() * n_mel);
-    for (size_t i = 0; i < input.size(); i++) {
+    const int audio_nx = (int)input.size();
+    std::vector<float> audio_buf(audio_nx * n_mel);
+    for (int i = 0; i < audio_nx; i++) {
        for (int j = 0; j < n_mel; j++) {
-            inp_audio.buf[j * inp_audio.nx + i] = input[i];
+            audio_buf[j * audio_nx + i] = input[i];
        }
    }
-    LOG_INF("%s: created input audio with nx=%d, ny=%d\n", __func__, inp_audio.nx, inp_audio.ny);
+    clip_image_f32 inp_audio;
+    inp_audio.set_size({audio_nx, n_mel}, false, true);
+    inp_audio.cpy_buf(audio_buf);
+    LOG_INF("%s: created input audio with nx=%d, ny=%d\n", __func__, audio_nx, n_mel);
    mtmd_debug_encode_impl(ctx, ctx->ctx_a, inp_audio);
 }

@@ -1571,9 +1668,8 @@ void mtmd_debug_preprocess_image(mtmd_context * ctx, const std::vector<uint8_t>
        return;
    }
    clip_image_u8 img_u8;
-    img_u8.nx = nx;
-    img_u8.ny = ny;
-    img_u8.buf = rgb_values;
+    img_u8.set_size({nx, ny}, false);
+    img_u8.cpy_buf(rgb_values);
    clip_image_f32_batch batch_f32;
    GGML_ASSERT(ctx->image_preproc != nullptr);
    bool ok = ctx->image_preproc->preprocess(img_u8, batch_f32);
@@ -1583,7 +1679,7 @@ void mtmd_debug_preprocess_image(mtmd_context * ctx, const std::vector<uint8_t>
    }
    LOG_INF("%s: preprocessed image to batch_f32 with %d entries\n", __func__, (int)batch_f32.entries.size());
    for (size_t i = 0; i < batch_f32.entries.size(); i++) {
-        LOG_INF("%s: entry %zu has nx=%d, ny=%d\n", __func__, i, batch_f32.entries[i]->nx, batch_f32.entries[i]->ny);
+        LOG_INF("%s: entry %zu has nx=%d, ny=%d\n", __func__, i, batch_f32.entries[i]->nx(), batch_f32.entries[i]->ny());
        // TODO: better way to dump entry content?
    }
 }
@@ -136,6 +136,11 @@ MTMD_API int mtmd_get_audio_sample_rate(const mtmd_context * ctx);
 // if bitmap is audio:
 //     length of data must be n_samples * sizeof(float)
 //     the data is in float format (PCM F32)
+//
+// if data == nullptr:
+//     the bitmap is considered "empty", and will be treated as a placeholder for counting tokens
+//     you can pass the bitmap to mtmd_tokenize(), then call mtmd_*_get_n_tokens() to count the tokens
+//     note: chunks tokenized from a placeholder bitmap cannot be encoded; mtmd_encode() / mtmd_encode_chunk() will return an error
 MTMD_API mtmd_bitmap *         mtmd_bitmap_init           (uint32_t nx, uint32_t ny, const unsigned char * data);
 MTMD_API mtmd_bitmap *         mtmd_bitmap_init_from_audio(size_t n_samples,         const float         * data);
 MTMD_API uint32_t              mtmd_bitmap_get_nx     (const mtmd_bitmap * bitmap);
@@ -1447,6 +1447,36 @@ See [OpenAI Embeddings API documentation](https://platform.openai.com/docs/api-r
  }'
  ```

+### POST `/v1/responses/input_tokens`: Token Counting
+
+Counts the input tokens of a Responses API request without generating a response. Similar to the OpenAI [Response input token counts API](https://developers.openai.com/api/reference/python/resources/responses/subresources/input_tokens/methods/count).
+
+Example response:
+
+```json
+{
+  "object": "response.input_tokens",
+  "input_tokens": 11
+}
+```
+
+### POST `/v1/chat/completions/input_tokens`: Token Counting
+
+Counts the input tokens of a chat completion request without generating a response. Similar to the OpenAI [Response input token counts API](https://developers.openai.com/api/reference/python/resources/responses/subresources/input_tokens/methods/count), but accepts a chat completion body as input.
+
+Note: This is not an official OpenAI endpoint; it is provided for completeness and convenience.
+
+Example response:
+
+```json
+{
+  "object": "response.input_tokens",
+  "input_tokens": 11
+}
+```
+
+## Anthropic-compatible API Endpoints
+
 ### POST `/v1/messages`: Anthropic-compatible Messages API

 Given a list of `messages`, returns the assistant's response. Streaming is supported via Server-Sent Events. While no strong claims of compatibility with the Anthropic API spec are made, in our experience it suffices to support many apps.
@@ -713,10 +713,10 @@ static std::string fnv_hash(const uint8_t * data, size_t len) {
    return std::to_string(hash);
 }

-server_tokens process_mtmd_prompt(mtmd_context * mctx, std::string prompt, std::vector<raw_buffer> files) {
+server_tokens process_mtmd_prompt(mtmd_context * mctx, const std::string & prompt, const std::vector<raw_buffer> & files, bool is_placeholder) {
    mtmd::bitmaps bitmaps;
    for (auto & file : files) {
-        mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(mctx, file.data(), file.size()));
+        mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(mctx, file.data(), file.size(), is_placeholder));
        if (!bmp.ptr) {
            throw std::runtime_error("Failed to load image or audio file");
        }
@@ -258,7 +258,8 @@ llama_tokens tokenize_mixed(const llama_vocab * vocab, const json & json_prompt,
 size_t validate_utf8(const std::string& text);

 // process mtmd prompt, return the server_tokens containing both text tokens and media chunks
-server_tokens process_mtmd_prompt(mtmd_context * mctx, std::string prompt, std::vector<raw_buffer> files);
+// if is_placeholder is true, media files are loaded as placeholder bitmaps for token counting only; the returned tokens are not usable for actual inference (e.g. for submitting a task to server_queue)
+server_tokens process_mtmd_prompt(mtmd_context * mctx, const std::string & prompt, const std::vector<raw_buffer> & files, bool is_placeholder = false);

 /**
 * break the input "prompt" object into multiple prompt if needed, then tokenize them
@@ -4333,6 +4333,10 @@ void server_routes::init_routes() {
            TASK_RESPONSE_TYPE_OAI_CHAT);
    };

+    this->post_chat_completions_tok = [this](const server_http_req & req) {
+        return handle_count_tokens(ctx_server.vocab, ctx_server.mctx, req, TASK_RESPONSE_TYPE_OAI_CHAT);
+    };
+
    this->post_control = [this](const server_http_req & req) {
        auto res = create_response();
        const json body = json::parse(req.body);
@@ -4388,6 +4392,10 @@ void server_routes::init_routes() {
            TASK_RESPONSE_TYPE_OAI_RESP);
    };

+    this->post_responses_tok_oai = [this](const server_http_req & req) {
+        return handle_count_tokens(ctx_server.vocab, ctx_server.mctx, req, TASK_RESPONSE_TYPE_OAI_RESP);
+    };
+
    this->post_transcriptions_oai = [this](const server_http_req & req) {
        auto res = create_response();

@@ -4435,20 +4443,7 @@ void server_routes::init_routes() {
    };

    this->post_anthropic_count_tokens = [this](const server_http_req & req) {
-        auto res = create_response();
-        std::vector<raw_buffer> files;
-        json body = server_chat_convert_anthropic_to_oai(json::parse(req.body));
-        SRV_DBG("%s\n", "Request converted: Anthropic -> OpenAI Chat Completions");
-        SRV_DBG("converted request: %s\n", body.dump().c_str());
-        json body_parsed = oaicompat_chat_params_parse(
-            body,
-            meta->chat_params,
-            files);
-
-        json prompt = body_parsed.at("prompt");
-        llama_tokens tokens = tokenize_mixed(ctx_server.vocab, prompt, true, true);
-        res->ok({{"input_tokens", static_cast<int>(tokens.size())}});
-        return res;
+        return handle_count_tokens(ctx_server.vocab, ctx_server.mctx, req, TASK_RESPONSE_TYPE_ANTHROPIC);
    };

    // same with handle_chat_completions, but without inference part
@@ -4928,3 +4923,54 @@ std::unique_ptr<server_res_generator> server_routes::handle_embeddings_impl(cons
    res->ok(root);
    return res;
 }
+
+std::unique_ptr<server_res_generator> server_routes::handle_count_tokens(const llama_vocab * vocab, mtmd_context * mctx, const server_http_req & req, task_response_type res_type) {
+    auto res = create_response();
+    std::vector<raw_buffer> files;
+    json body = json::parse(req.body);
+    bool is_oai = false;
+
+    switch (res_type) {
+        case TASK_RESPONSE_TYPE_OAI_CHAT:
+            {
+                is_oai = true;
+            } break;
+        case TASK_RESPONSE_TYPE_OAI_RESP:
+            {
+                is_oai = true;
+                body = server_chat_convert_responses_to_chatcmpl(body);
+            } break;
+        case TASK_RESPONSE_TYPE_ANTHROPIC:
+            {
+                body = server_chat_convert_anthropic_to_oai(body);
+            } break;
+        default:
+            res->error(format_error_response("invalid res_type", ERROR_TYPE_INVALID_REQUEST));
+            return res;
+    }
+
+    json body_parsed = oaicompat_chat_params_parse(
+            body,
+            meta->chat_params,
+            files);
+    json prompt = body_parsed.at("prompt");
+    // SRV_DBG("prompt = %s\n", prompt.dump().c_str());
+
+    // TODO @ngxson : refactor this code block, move this to server-common and reuse it in other places
+    size_t n_tokens;
+    if (mctx != nullptr) {
+        if (!prompt.is_string()) {
+            throw std::runtime_error("for mtmd, input prompt must be a string.");
+        }
+        n_tokens = process_mtmd_prompt(mctx, prompt.get<std::string>(), files, true).size();
+    } else {
+        n_tokens = tokenize_mixed(vocab, prompt, true, true).size();
+    }
+
+    json response = {{"input_tokens", static_cast<int>(n_tokens)}};
+    if (is_oai) {
+        response["object"] = "response.input_tokens";
+    }
+    res->ok(response);
+    return res;
+}
@@ -110,8 +110,10 @@ struct server_routes {
    server_http_context::handler_t post_completions;
    server_http_context::handler_t post_completions_oai;
    server_http_context::handler_t post_chat_completions;
+    server_http_context::handler_t post_chat_completions_tok;
    server_http_context::handler_t post_control;
    server_http_context::handler_t post_responses_oai;
+    server_http_context::handler_t post_responses_tok_oai;
    server_http_context::handler_t post_transcriptions_oai;
    server_http_context::handler_t post_anthropic_messages;
    server_http_context::handler_t post_anthropic_count_tokens;
@@ -139,6 +141,7 @@ private:
    std::unique_ptr<server_res_generator> handle_slots_restore(const server_http_req & req, int id_slot);
    std::unique_ptr<server_res_generator> handle_slots_erase(const server_http_req &, int id_slot);
    std::unique_ptr<server_res_generator> handle_embeddings_impl(const server_http_req & req, task_response_type res_type);
+    std::unique_ptr<server_res_generator> handle_count_tokens(const llama_vocab * vocab, mtmd_context * mctx, const server_http_req & req, task_response_type res_type);

    // using unique_ptr to allow late initialization of const
    std::unique_ptr<const server_context_meta> meta;
@@ -161,6 +161,8 @@ int llama_server(int argc, char ** argv) {
        routes.post_tokenize               = models_routes->proxy_post;
        routes.post_detokenize             = models_routes->proxy_post;
        routes.post_apply_template         = models_routes->proxy_post;
+        routes.post_chat_completions_tok   = models_routes->proxy_post;
+        routes.post_responses_tok_oai      = models_routes->proxy_post;
        routes.get_lora_adapters           = models_routes->proxy_get;
        routes.post_lora_adapters          = models_routes->proxy_post;
        routes.get_slots                   = models_routes->proxy_get;
@@ -192,7 +194,6 @@ int llama_server(int argc, char ** argv) {
    ctx_http.post("/v1/audio/transcriptions",  ex_wrapper(routes.post_transcriptions_oai));
    ctx_http.post("/audio/transcriptions",     ex_wrapper(routes.post_transcriptions_oai));
    ctx_http.post("/v1/messages",              ex_wrapper(routes.post_anthropic_messages)); // anthropic messages API
-    ctx_http.post("/v1/messages/count_tokens", ex_wrapper(routes.post_anthropic_count_tokens)); // anthropic token counting
    ctx_http.post("/infill",                   ex_wrapper(routes.post_infill));
    ctx_http.post("/embedding",                ex_wrapper(routes.post_embeddings)); // legacy
    ctx_http.post("/embeddings",               ex_wrapper(routes.post_embeddings));
@@ -204,6 +205,12 @@ int llama_server(int argc, char ** argv) {
    ctx_http.post("/tokenize",                 ex_wrapper(routes.post_tokenize));
    ctx_http.post("/detokenize",               ex_wrapper(routes.post_detokenize));
    ctx_http.post("/apply-template",           ex_wrapper(routes.post_apply_template));
+    // token counting
+    ctx_http.post("/chat/completions/input_tokens",    ex_wrapper(routes.post_chat_completions_tok));
+    ctx_http.post("/v1/chat/completions/input_tokens", ex_wrapper(routes.post_chat_completions_tok));
+    ctx_http.post("/responses/input_tokens",           ex_wrapper(routes.post_responses_tok_oai));
+    ctx_http.post("/v1/responses/input_tokens",        ex_wrapper(routes.post_responses_tok_oai));
+    ctx_http.post("/v1/messages/count_tokens",         ex_wrapper(routes.post_anthropic_count_tokens)); // anthropic token counting
    // LoRA adapters hotswap
    ctx_http.get ("/lora-adapters",            ex_wrapper(routes.get_lora_adapters));
    ctx_http.post("/lora-adapters",            ex_wrapper(routes.post_lora_adapters));
@@ -573,3 +573,19 @@ def test_chat_completions_multiple_choices():
        for choice in res.body["choices"]:
            assert "assistant" == choice["message"]["role"]
            assert choice["finish_reason"] == "length"
+
+
+def test_chat_completions_token_count():
+    global server
+    server.start()
+    # POST the same text-only chat request twice to the token-counting endpoint; each reply must be 200 with input_tokens > 5
+    # NOTE(review): unclear why two iterations are needed - the cache-reuse rationale of https://github.com/ggml-org/llama.cpp/pull/18663 may not apply here, confirm
+    for _ in range(2):
+        res = server.make_request("POST", "/chat/completions/input_tokens", data={
+            "messages": [
+                {"role": "system", "content": "Book"},
+                {"role": "user", "content": "What is the best book"},
+            ],
+        })
+        assert res.status_code == 200
+        assert res.body["input_tokens"] > 5
@@ -98,6 +98,25 @@ def test_vision_chat_completion(prompt, image_url, success, re_content):
        assert res.status_code != 200


+def test_vision_chat_completion_token_count():  # token counting for a chat request mixing a text part and an image_url part
+    global server
+    server.start()
+    res = server.make_request("POST", "/chat/completions/input_tokens", data={
+        "temperature": 0.0,  # NOTE(review): sampling params are presumably ignored when only counting tokens - confirm
+        "top_k": 1,
+        "messages": [
+            {"role": "user", "content": [
+                {"type": "text", "text": "What is this:"},
+                {"type": "image_url", "image_url": {
+                    "url": get_img_url("IMG_URL_0"),
+                }},
+            ]},
+        ],
+    })
+    assert res.status_code == 200
+    assert res.body["input_tokens"] > 10  # NOTE(review): presumably text tokens plus image tokens; exact count depends on the model - confirm
+
+
+
@pytest.mark.parametrize(
    "prompt, image_data, success, re_content",
    [