mtmd : DeepSeek-OCR image processing fixes, img_tool::resize padding refactor (#23345)

* mtmd : deepseek-ocr fixes, improvements and refactoring
  - image processing changes to achieve full parity with Pillow (reference impl)
  - SAM mask casting only when flash-attn is on
  - SAM refactor (build_sam() extracted so deepseek-ocr-2 can reuse it)
  - llama-chat changes to fix server/WebUI issue (new media_markers_first())
  - adapted test-chat-template and added test cases for deepseek-ocr
  - changed regression test for deepseek-ocr to use CER+chrF scores for ground-truth comparison; removed embedding-model
  - ty.toml ignore unresolved-import for tools/mtmd/tests/**
* image-text reordering fix removed
* refactor bool add_padding + pad_rounding enum into a single pad_style enum
2026-06-09 07:16:44 +02:00 · 2026-05-20 17:37:10 +02:00
parent acd604fb27
commit a8681a0ed2
11 changed files with 443 additions and 482 deletions
@@ -35,6 +35,16 @@ enum resize_algo {
    // RESIZE_ALGO_LANCZOS, // TODO
 };
 // Padding style for img_tool::resize
 //   PAD_NONE    - no padding; direct resize to target dimensions
 //   PAD_CEIL    - aspect-preserving pad (default); the scaled width/height are
 //                 rounded up with std::ceil (clamped to the target size) and the
 //                 centering offsets use truncating integer division
 //   PAD_NEAREST - aspect-preserving pad with nearest-integer rounding (Pillow byte-parity);
 //                 both the scaled width/height and the centering offsets use std::round
 enum pad_style {
    PAD_NONE,     // stretch/shrink straight to the target resolution
    PAD_CEIL,     // fit inside the target, ceil-rounded size, fill the rest with the pad color
    PAD_NEAREST,  // fit inside the target, round-to-nearest size and offsets, fill the rest with the pad color
 };
 struct clip_hparams {
    int32_t image_size = 0;
    int32_t patch_size = 0;
@@ -52,7 +62,7 @@ struct clip_hparams {
    int32_t image_min_pixels = -1;
    int32_t image_max_pixels = -1;
    resize_algo image_resize_algo = RESIZE_ALGO_BICUBIC;
-    bool image_resize_pad = true; // if false, center-crop will be applied when resizing
+    pad_style image_resize_pad = PAD_CEIL; // padding style when resizing
    std::array<uint8_t, 3> image_pad_color = {0, 0, 0};
    // (preprocessor) for llava-uhd style models
@@ -61,8 +71,8 @@ struct clip_hparams {
    int32_t preproc_max_tiles = 0;
    resize_algo image_resize_algo_rf = RESIZE_ALGO_BICUBIC;
    resize_algo image_resize_algo_ov = RESIZE_ALGO_BILINEAR;
-    bool image_pad_rf = true;  // if true, refined image will be padded (e.g. llava-1.6)
+    pad_style image_pad_rf = PAD_CEIL;  // padding style for the refined image (e.g. llava-1.6)
-    bool image_pad_ov = false; // if true, overview image will be padded (e.g. llava-1.6)
+    pad_style image_pad_ov = PAD_NONE;  // padding style for the overview image (e.g. llava-1.6)
    std::array<uint8_t, 3> image_pad_color_rf = {0, 0, 0}; // padding color for refined image
    std::array<uint8_t, 3> image_pad_color_ov = {0, 0, 0}; // padding color for overview image
@@ -1233,12 +1233,12 @@ struct clip_model_loader {
                        hparams.has_llava_projector = model.proj_type != PROJECTOR_TYPE_COGVLM;
                        hparams.image_pad_color     = {122, 116, 104};
                        if (!hparams.image_res_candidates.empty()) {
-                            hparams.image_resize_pad  = true;
+                            hparams.image_resize_pad  = PAD_CEIL;
                            hparams.image_resize_algo = RESIZE_ALGO_BILINEAR;
                        } else {
                            // llava-1.6 default params
-                            hparams.image_pad_ov         = false;
+                            hparams.image_pad_ov         = PAD_NONE;
-                            hparams.image_pad_rf         = true;
+                            hparams.image_pad_rf         = PAD_CEIL;
                            hparams.image_pad_color_rf   = {122, 116, 104};
                            hparams.image_resize_algo_rf = RESIZE_ALGO_BICUBIC;
                            hparams.image_resize_algo_ov = RESIZE_ALGO_BILINEAR;
@@ -1246,7 +1246,7 @@ struct clip_model_loader {
                    } break;
                case PROJECTOR_TYPE_GLM_EDGE:
                    {
-                        hparams.image_resize_pad  = true;
+                        hparams.image_resize_pad  = PAD_CEIL;
                        hparams.image_resize_algo = RESIZE_ALGO_BILINEAR;
                    } break;
                case PROJECTOR_TYPE_MINICPMV:
@@ -1441,7 +1441,7 @@ struct clip_model_loader {
                    {
                        hparams.n_merge = 2;
                        hparams.image_resize_algo = RESIZE_ALGO_BILINEAR;
-                        hparams.image_resize_pad  = false;
+                        hparams.image_resize_pad  = PAD_NONE;
                        get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
                        get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true);
                        std::vector<int> wa_layer_indexes_vec;
@@ -1461,7 +1461,7 @@ struct clip_model_loader {
                        // reka model performs better when using resize_bicubic, which stretches
                        // the image to fit fixed square size
-                        hparams.image_resize_pad = false;
+                        hparams.image_resize_pad = PAD_NONE;
                    } break;
                case PROJECTOR_TYPE_GLM4V:
                    {
@@ -1516,9 +1516,7 @@ struct clip_model_loader {
                        hparams.image_size = 1024;
                        hparams.warmup_image_size = 1024;
                        hparams.image_resize_algo = RESIZE_ALGO_BICUBIC_PILLOW;
-                        hparams.image_pad_color[0] = hparams.image_mean[0];
+                        hparams.image_pad_color = {127, 127, 127};
                        hparams.image_pad_color[1] = hparams.image_mean[1];
                        hparams.image_pad_color[2] = hparams.image_mean[2];
                        get_u32(KEY_SAM_N_BLOCK, hparams.sam_n_layer, true);
                        get_u32(KEY_SAM_N_HEAD, hparams.sam_n_head, true);
@@ -1537,7 +1535,7 @@ struct clip_model_loader {
                    {
                        hparams.n_merge = 2;
                        hparams.image_resize_algo = RESIZE_ALGO_BICUBIC_PILLOW;
-                        hparams.image_resize_pad = false;
+                        hparams.image_resize_pad = PAD_NONE;
                        hparams.ffn_op = FFN_GELU;
                        get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
                        hparams.set_limit_image_tokens(256, 16384);
@@ -88,164 +88,168 @@ static ggml_tensor * get_rel_pos(ggml_context * ctx0,
    return cur;  // [C, k_size, q_size]
 }
 // Builds the SAM encoder part of the DeepSeek-OCR vision graph.
 //
 // Stages, in order:
 //   1. patch-embedding convolution + bias on `inp_raw`
 //   2. positional embedding (bicubically resampled when its size differs from the patch grid)
 //   3. `sam_n_layer` transformer blocks; layers that are not global-attention layers run on
 //      window partitions, and every layer adds a relative-position term through the attention mask
 //   4. neck (conv -> norm -> conv -> norm) followed by two more convolutions (net_2, net_3)
 //
 // inp_raw: input image tensor (as produced by build_inp_raw() in the caller).
 // Returns the final tensor, which is also tagged "sam_output" and expanded into `gf`.
 ggml_tensor * clip_graph_deepseekocr::build_sam(ggml_tensor * inp_raw) {
    // Building SAM
    const int n_embd  = hparams.sam_n_embd;
    const int n_layer = hparams.sam_n_layer;
    const int n_heads = hparams.sam_n_head;
    const int d_heads = n_embd / n_heads;  // per-head embedding width
    const int window  = hparams.attn_window_size;
    // patch embedding: convolution, then add the bias reshaped to [1, 1, n_embd] so it broadcasts
    ggml_tensor * inpL;
    inpL = ggml_conv_2d_sk_p0(ctx0, model.patch_embed_proj_w, inp_raw);
    inpL = ggml_add(ctx0, inpL, ggml_reshape_3d(ctx0, model.patch_embed_proj_b, 1, 1, n_embd));
    // move the channel dimension to ne[0]; the spatial dims end up in ne[1]/ne[2]
    inpL = ggml_cont(ctx0, ggml_permute(ctx0, inpL, 1, 2, 0, 3));
    // Index tensors used by get_rel_pos() to gather relative-position embeddings.
    // They are declared as named graph inputs here; their contents are not set in this
    // function (presumably filled by the caller before compute - TODO confirm).
    ggml_tensor * rel_pos_indices_local;
    ggml_tensor * rel_pos_indices_global;
    rel_pos_indices_local  = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, window, window);
    rel_pos_indices_global = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, inpL->ne[1], inpL->ne[2]);
    ggml_set_name(rel_pos_indices_local, "rel_pos_indices_local");
    ggml_set_name(rel_pos_indices_global, "rel_pos_indices_global");
    ggml_set_input(rel_pos_indices_local);
    ggml_set_input(rel_pos_indices_global);
    // add the absolute positional embedding, resampling it when the stored grid size
    // (str_size) does not match the current patch grid size (tgt_size)
    ggml_tensor * cur;
    const auto    tgt_size = inpL->ne[1];
    const auto    str_size = model.pos_embed->ne[1];
    if (str_size != tgt_size) {
        ggml_tensor * old_pos_embed = nullptr;
        // permute so the spatial dims lead for interpolation, then permute back afterwards
        old_pos_embed               = ggml_cont(ctx0, ggml_permute(ctx0, model.pos_embed, 2, 0, 1, 3));
        ggml_tensor * new_pos_embed =
            ggml_interpolate(ctx0, old_pos_embed, tgt_size, tgt_size, n_embd, 1, GGML_SCALE_MODE_BICUBIC);
        new_pos_embed = ggml_cont(ctx0, ggml_permute(ctx0, new_pos_embed, 1, 2, 0, 3));
        cur           = ggml_add(ctx0, inpL, new_pos_embed);
    } else {
        cur = ggml_add(ctx0, inpL, model.pos_embed);
    }
    // loop over layers
    for (int il = 0; il < n_layer; il++) {
        auto &        layer    = model.sam_layers[il];
        ggml_tensor * shortcut = cur;  // kept for the first residual connection
        // layernorm1
        // NOTE(review): the per-layer norms use `eps` while the neck norms below use
        // `hparams.eps`; presumably the same value - confirm.
        cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il);
        // spatial size before any window partition; needed to undo the partition later
        const int64_t w0 = cur->ne[1];
        const int64_t h0 = cur->ne[2];
        ggml_tensor * indices;
        if (hparams.is_global_attn(il)) {
            indices = rel_pos_indices_global;
        } else {
            // local attention layer - apply window partition
            cur     = window_partition(ctx0, cur, window);
            indices = rel_pos_indices_local;
        }
        // spatial size the attention actually runs on (per window for local layers)
        const int64_t W = cur->ne[1];
        const int64_t H = cur->ne[2];
        // self-attention
        {
            // outermost dimension; for local layers presumably the number of windows - TODO confirm
            const int B = cur->ne[3];
            // fused QKV projection, then split the result into 3 slices along ne[1]
            cur = ggml_mul_mat(ctx0, layer.qkv_w, cur);
            cur = ggml_add(ctx0, cur, layer.qkv_b);
            cur = ggml_cont(ctx0, cur); // Ensure tensor is contiguous before reshape
            cur = ggml_reshape_4d(ctx0, cur, n_embd, 3, W * H, B);
            ggml_tensor * Q;
            ggml_tensor * K;
            ggml_tensor * V;
            // slices 0/1/2 along ne[1] are Q/K/V; each is made contiguous and split into heads
            Q = ggml_view_3d(ctx0, cur, n_embd, W * H, B, cur->nb[2], cur->nb[3], 0 * cur->nb[1]);
            Q = ggml_reshape_4d(ctx0, ggml_cont(ctx0, Q), d_heads, n_heads, W * H, B);
            K = ggml_view_3d(ctx0, cur, n_embd, W * H, B, cur->nb[2], cur->nb[3], 1 * cur->nb[1]);
            K = ggml_reshape_4d(ctx0, ggml_cont(ctx0, K), d_heads, n_heads, W * H, B);
            V = ggml_view_3d(ctx0, cur, n_embd, W * H, B, cur->nb[2], cur->nb[3], 2 * cur->nb[1]);
            V = ggml_reshape_4d(ctx0, ggml_cont(ctx0, V), d_heads, n_heads, W * H, B);
            // Relative-position term: the query is multiplied with the width-wise (rw) and
            // height-wise (rh) relative-position embeddings, the two results are summed and
            // handed to build_attn() as the attention mask.
            ggml_tensor * mask;
            ggml_tensor * rw;
            ggml_tensor * rh;
            ggml_tensor * qr;
            rw = get_rel_pos(ctx0, layer.rel_pos_w, indices, W, W); // [W, W, C]
            rh = get_rel_pos(ctx0, layer.rel_pos_h, indices, H, H); // [H, H, C]
            qr = ggml_permute(ctx0, Q, 0, 2, 1, 3);
            qr = ggml_reshape_4d(ctx0, ggml_cont(ctx0, qr), d_heads, W, H, B * n_heads);
            rw = ggml_mul_mat(ctx0, rw,
                              ggml_cont(ctx0, ggml_permute(ctx0, qr, 0, 2, 1, 3))); // [B*n_heads, W, H, W]
            rw   = ggml_cont(ctx0, ggml_permute(ctx0, rw, 0, 2, 1, 3)); // [B*n_heads, H, W, W]
            // repeat rw along a new H axis so it can be added to rh (which broadcasts along ne[0])
            rw   = ggml_reshape_4d(ctx0, rw, W, 1, W * H, n_heads * B);
            rw   = ggml_repeat_4d(ctx0, rw, W, H, W * H, n_heads * B);
            rh   = ggml_mul_mat(ctx0, rh, qr); // [B*n_heads, H, W, H]
            rh   = ggml_reshape_4d(ctx0, rh, 1, H, W * H, n_heads * B);
            mask = ggml_add(ctx0, rw, rh); // [B*n_heads, H*W, H, W]
            mask = ggml_reshape_4d(ctx0, mask, W * H, W * H, n_heads, B);
            // casting mask to F16 only required when flash-attn is enabled
            if (flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
                mask = ggml_cast(ctx0, mask, GGML_TYPE_F16);
            }
            // attention scale: 1 / sqrt(head dimension)
            const float scale = 1.0f / sqrtf(static_cast<float>(d_heads));
            cur = build_attn(layer.o_w, layer.o_b, Q, K, V, mask, scale,
                             il); // [B, H*W, n_embd]
            // restore the spatial layout [n_embd, W, H, B]
            cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur), n_embd, W, H, B);
        }
        if (hparams.is_global_attn(il) == false) {
            // local attention layer - reverse window partition
            cur = window_unpartition(ctx0, cur, w0, h0, window);
        }
        // re-add the layer input, e.g., residual
        cur = ggml_add(ctx0, cur, shortcut);
        ggml_tensor * inpFF = cur;  // kept for the second residual connection
        // layernorm2
        cur = build_norm(inpFF, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il);
        // ffn
        cur = build_ffn(cur, layer.ff_up_w, layer.ff_up_b, nullptr, nullptr, layer.ff_down_w, layer.ff_down_b,
                        hparams.ffn_op, il);
        // residual 2
        cur = ggml_add(ctx0, cur, inpFF);
        cb(cur, "sam_layer_out", il);
    }
    // neck: conv (neck_0) -> norm (neck_1) -> conv (neck_2) -> norm (neck_3).
    // The tensor is permuted before each conv and permuted back before each norm.
    cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3));
    cur = ggml_conv_2d(ctx0, model.neck_0_w, cur, 1, 1, 0, 0, 1, 1);
    cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3));
    cur = build_norm(cur, model.neck_1_w, model.neck_1_b, NORM_TYPE_NORMAL, hparams.eps, -1);
    cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3));
    cur = ggml_conv_2d(ctx0, model.neck_2_w, cur, 1, 1, 1, 1, 1, 1);
    cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3));
    cur = build_norm(cur, model.neck_3_w, model.neck_3_b, NORM_TYPE_NORMAL, hparams.eps, -1);
    cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3));
    // two further convolutions (net_2, net_3), both called with 2, 2, 1, 1, 1, 1
    // (stride 2 / padding 1 / dilation 1 in ggml_conv_2d's argument order)
    cur = ggml_conv_2d(ctx0, model.net_2, cur, 2, 2, 1, 1, 1, 1);
    cur = ggml_conv_2d(ctx0, model.net_3, cur, 2, 2, 1, 1, 1, 1);
    cb(cur, "sam_output", -1);
    // make sure the SAM sub-graph is part of the forward graph before returning its output
    ggml_build_forward_expand(gf, cur);
    return cur;
 }
 ggml_cgraph * clip_graph_deepseekocr::build() {
    // patch embedding
    ggml_tensor * inp_raw = build_inp_raw();
-
+    ggml_tensor * sam_out = build_sam(inp_raw);
    ggml_tensor * sam_out;
    // Building SAM
    {
        const int n_embd  = hparams.sam_n_embd;
        const int n_layer = hparams.sam_n_layer;
        const int n_heads = hparams.sam_n_head;
        const int d_heads = n_embd / n_heads;
        const int window  = hparams.attn_window_size;
        ggml_tensor * inpL;
        inpL = ggml_conv_2d_sk_p0(ctx0, model.patch_embed_proj_w, inp_raw);
        inpL = ggml_add(ctx0, inpL, ggml_reshape_3d(ctx0, model.patch_embed_proj_b, 1, 1, n_embd));
        inpL = ggml_cont(ctx0, ggml_permute(ctx0, inpL, 1, 2, 0, 3));
        ggml_tensor * rel_pos_indices_local;
        ggml_tensor * rel_pos_indices_global;
        rel_pos_indices_local  = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, window, window);
        rel_pos_indices_global = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, inpL->ne[1], inpL->ne[2]);
        ggml_set_name(rel_pos_indices_local, "rel_pos_indices_local");
        ggml_set_name(rel_pos_indices_global, "rel_pos_indices_global");
        ggml_set_input(rel_pos_indices_local);
        ggml_set_input(rel_pos_indices_global);
        ggml_tensor * cur;
        const auto    tgt_size = inpL->ne[1];
        const auto    str_size = model.pos_embed->ne[1];
        if (str_size != tgt_size) {
            ggml_tensor * old_pos_embed = nullptr;
            old_pos_embed               = ggml_cont(ctx0, ggml_permute(ctx0, model.pos_embed, 2, 0, 1, 3));
            ggml_tensor * new_pos_embed =
                ggml_interpolate(ctx0, old_pos_embed, tgt_size, tgt_size, n_embd, 1, GGML_SCALE_MODE_BICUBIC);
            new_pos_embed = ggml_cont(ctx0, ggml_permute(ctx0, new_pos_embed, 1, 2, 0, 3));
            cur           = ggml_add(ctx0, inpL, new_pos_embed);
        } else {
            cur = ggml_add(ctx0, inpL, model.pos_embed);
        }
        // loop over layers
        for (int il = 0; il < n_layer; il++) {
            auto &        layer    = model.sam_layers[il];
            ggml_tensor * shortcut = cur;
            // layernorm1
            cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il);
            const int64_t w0 = cur->ne[1];
            const int64_t h0 = cur->ne[2];
            ggml_tensor * indices;
            if (hparams.is_global_attn(il)) {
                indices = rel_pos_indices_global;
            } else {
                // local attention layer - apply window partition
                cur     = window_partition(ctx0, cur, window);
                indices = rel_pos_indices_local;
            }
            const int64_t W = cur->ne[1];
            const int64_t H = cur->ne[2];
            // self-attention
            {
                const int B = cur->ne[3];
                cur = ggml_mul_mat(ctx0, layer.qkv_w, cur);
                cur = ggml_add(ctx0, cur, layer.qkv_b);
                cur = ggml_cont(ctx0, cur);  // Ensure tensor is contiguous before reshape
                cur = ggml_reshape_4d(ctx0, cur, n_embd, 3, W * H, B);
                ggml_tensor * Q;
                ggml_tensor * K;
                ggml_tensor * V;
                Q = ggml_view_3d(ctx0, cur, n_embd, W * H, B, cur->nb[2], cur->nb[3], 0 * cur->nb[1]);
                Q = ggml_reshape_4d(ctx0, ggml_cont(ctx0, Q), d_heads, n_heads, W * H, B);
                K = ggml_view_3d(ctx0, cur, n_embd, W * H, B, cur->nb[2], cur->nb[3], 1 * cur->nb[1]);
                K = ggml_reshape_4d(ctx0, ggml_cont(ctx0, K), d_heads, n_heads, W * H, B);
                V = ggml_view_3d(ctx0, cur, n_embd, W * H, B, cur->nb[2], cur->nb[3], 2 * cur->nb[1]);
                V = ggml_reshape_4d(ctx0, ggml_cont(ctx0, V), d_heads, n_heads, W * H, B);
                ggml_tensor * mask;
                ggml_tensor * rw;
                ggml_tensor * rh;
                ggml_tensor * qr;
                rw = get_rel_pos(ctx0, layer.rel_pos_w, indices, W, W);  // [W, W, C]
                rh = get_rel_pos(ctx0, layer.rel_pos_h, indices, H, H);  // [H, H, C]
                qr = ggml_permute(ctx0, Q, 0, 2, 1, 3);
                qr = ggml_reshape_4d(ctx0, ggml_cont(ctx0, qr), d_heads, W, H, B * n_heads);
                rw   = ggml_mul_mat(ctx0, rw,
                                    ggml_cont(ctx0, ggml_permute(ctx0, qr, 0, 2, 1, 3)));  // [B*n_heads, W, H, W]
                rw   = ggml_cont(ctx0, ggml_permute(ctx0, rw, 0, 2, 1, 3));                // [B*n_heads, H, W, W]
                rw   = ggml_reshape_4d(ctx0, rw, W, 1, W * H, n_heads * B);
                rw   = ggml_repeat_4d(ctx0, rw, W, H, W * H, n_heads * B);
                rh   = ggml_mul_mat(ctx0, rh, qr);  // [B*n_heads, H, W, H]
                rh   = ggml_reshape_4d(ctx0, rh, 1, H, W * H, n_heads * B);
                mask = ggml_add(ctx0, rw, rh);      // [B*n_heads, H*W, H, W]
                mask = ggml_reshape_4d(ctx0, mask, W * H, W * H, n_heads, B);
                mask = ggml_cast(ctx0, mask, GGML_TYPE_F16);
                const float scale = 1.0f / sqrtf(static_cast<float>(d_heads));
                cur = build_attn(layer.o_w, layer.o_b, Q, K, V, mask, scale,
                                 il);  // [B, H*W, n_embd]
                cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur), n_embd, W, H, B);
            }
            if (hparams.is_global_attn(il) == false) {
                // local attention layer - reverse window partition
                cur = window_unpartition(ctx0, cur, w0, h0, window);
            }
            // re-add the layer input, e.g., residual
            cur = ggml_add(ctx0, cur, shortcut);
            ggml_tensor * inpFF = cur;
            // layernorm2
            cur = build_norm(inpFF, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il);
            // ffn
            cur = build_ffn(cur, layer.ff_up_w, layer.ff_up_b, nullptr, nullptr, layer.ff_down_w, layer.ff_down_b,
                            hparams.ffn_op, il);
            // residual 2
            cur = ggml_add(ctx0, cur, inpFF);
            cb(cur, "sam_layer_out", il);
        }
        cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3));
        cur = ggml_conv_2d(ctx0, model.neck_0_w, cur, 1, 1, 0, 0, 1, 1);
        cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3));
        cur = build_norm(cur, model.neck_1_w, model.neck_1_b, NORM_TYPE_NORMAL, hparams.eps, -1);
        cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3));
        cur = ggml_conv_2d(ctx0, model.neck_2_w, cur, 1, 1, 1, 1, 1, 1);
        cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3));
        cur = build_norm(cur, model.neck_3_w, model.neck_3_b, NORM_TYPE_NORMAL, hparams.eps, -1);
        cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3));
        cur = ggml_conv_2d(ctx0, model.net_2, cur, 2, 2, 1, 1, 1, 1);
        cur = ggml_conv_2d(ctx0, model.net_3, cur, 2, 2, 1, 1, 1, 1);
        cb(cur, "sam_output", -1);
        ggml_build_forward_expand(gf, cur);
        sam_out = cur;
    }
    ggml_tensor * clip_out;
    // Building DS-OCR CLIP
@@ -118,6 +118,7 @@ struct clip_graph_whisper_enc : clip_graph {
 // Graph builder for the DeepSeek-OCR vision encoder.
 struct clip_graph_deepseekocr : clip_graph {
    // forwards the context and the preprocessed image to the clip_graph base
    clip_graph_deepseekocr(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
    // builds the full compute graph (calls build_sam() for the SAM part)
    ggml_cgraph * build() override;
    // builds the SAM model sub-graph from the raw input tensor `inp` and returns its output tensor;
    // kept as a separate method so it can be reused (per the commit notes, by deepseek-ocr-2)
    ggml_tensor * build_sam(ggml_tensor * inp); // build the SAM model
 };
 struct clip_graph_conformer : clip_graph {
@@ -38,7 +38,7 @@ struct img_tool {
            clip_image_u8 & dst,
            const clip_image_size & target_resolution,
            resize_algo algo,
-            bool add_padding = true, // TODO: define the behavior for add_padding = false
+            pad_style padding = PAD_CEIL,
            std::array<uint8_t, 3> pad_color = {0, 0, 0}) {
        dst.nx = target_resolution.width;
        dst.ny = target_resolution.height;
@@ -50,7 +50,7 @@ struct img_tool {
            return;
        }
-        if (!add_padding) {
+        if (padding == PAD_NONE) {
            // direct resize
            switch (algo) {
                case RESIZE_ALGO_BILINEAR:
@@ -71,8 +71,15 @@ struct img_tool {
            float scale_w = static_cast<float>(target_resolution.width) / src.nx;
            float scale_h = static_cast<float>(target_resolution.height) / src.ny;
            float scale = std::min(scale_w, scale_h);
-            int new_width  = std::min(static_cast<int>(std::ceil(src.nx * scale)), target_resolution.width);
+
-            int new_height = std::min(static_cast<int>(std::ceil(src.ny * scale)), target_resolution.height);
+            int new_width, new_height;
            if (padding == PAD_NEAREST) {
                new_width  = std::min(static_cast<int>(std::round(src.nx * scale)), target_resolution.width);
                new_height = std::min(static_cast<int>(std::round(src.ny * scale)), target_resolution.height);
            } else {
                new_width  = std::min(static_cast<int>(std::ceil(src.nx * scale)), target_resolution.width);
                new_height = std::min(static_cast<int>(std::ceil(src.ny * scale)), target_resolution.height);
            }
            switch (algo) {
                case RESIZE_ALGO_BILINEAR:
@@ -91,9 +98,14 @@ struct img_tool {
            // fill dst with pad_color
            fill(dst, pad_color);
-            int offset_x = (target_resolution.width  - new_width)  / 2;
+            int offset_x, offset_y;
-            int offset_y = (target_resolution.height - new_height) / 2;
+            if (padding == PAD_NEAREST) {
-
+                offset_x = static_cast<int>(std::round((target_resolution.width  - new_width)  / 2.0f));
                offset_y = static_cast<int>(std::round((target_resolution.height - new_height) / 2.0f));
            } else {
                offset_x = (target_resolution.width  - new_width)  / 2;
                offset_y = (target_resolution.height - new_height) / 2;
            }
            composite(dst, resized_image, offset_x, offset_y);
        }
    }
@@ -356,10 +368,10 @@ private:
            GGML_ASSERT(inSize > 0 && outSize > 0);
            double support, scale, filterscale;
            double center, ww, ss;
-            int xx, x, ksize, xmin, xmax, xcnt;
+            int xx, x, ksize, xmin, xmax;
            // Calculate scaling factor: ratio of input range to output size
-            filterscale = scale = (double)inSize / outSize;
+            filterscale = scale = static_cast<double>(inSize) / outSize;
            // For upsampling (scale < 1), keep filterscale = 1 to maintain filter sharpness
            // For downsampling (scale > 1), widen filter to prevent aliasing
            if (filterscale < 1.0) {
@@ -373,6 +385,7 @@ private:
            std::vector<double> pre_weights(outSize * ksize);  // Temporary weights
            bounds.resize(outSize * 2);
            // For each output pixel, compute its filter coefficients
            for (xx = 0; xx < outSize; xx++) {
                // Calculate the center position in input space (pixel-center convention: +0.5)
@@ -391,10 +404,10 @@ private:
                    xmax = inSize;
                }
-                xcnt = xmax - xmin;
+                xmax -= xmin;
                // Compute filter weights for each contributing input pixel
-                for (x = 0; x < xcnt; x++) {
+                for (x = 0; x < xmax; x++) {
                    // Distance from input pixel center to output pixel center in input space
                    double w = bicubic_filter((x + xmin - center + 0.5) * ss);
                    pre_weights[xx * ksize + x] = w;
@@ -402,7 +415,7 @@ private:
                }
                // Normalize weights to sum to 1.0 (preserves brightness)
-                for (x = 0; x < xcnt; x++) {
+                for (x = 0; x < xmax; x++) {
                    if (ww != 0.0) {
                        pre_weights[xx * ksize + x] /= ww;
                    }
@@ -415,18 +428,27 @@ private:
                // Store input pixel range for this output pixel
                bounds[xx * 2 + 0] = xmin;
-                bounds[xx * 2 + 1] = xcnt;
+                bounds[xx * 2 + 1] = xmax;
            }
            // Convert floating-point coefficients to fixed-point integers
            // Formula: int32 = round(float * 2^PRECISION_BITS)
            weights.resize(outSize * ksize);
            const double fxp_scale = std::ldexp(1.0, PRECISION_BITS); // 1.0 * 2^PRECISION_BITS
            for (int i = 0; i < outSize * ksize; i++) {
                double tmp_val = pre_weights[i] * fxp_scale;
                if (pre_weights[i] < 0) {
-                    weights[i] = static_cast<int32_t>(-0.5 + pre_weights[i] * (1 << PRECISION_BITS));
+                    tmp_val -= 0.5;
                } else {
-                    weights[i] = static_cast<int32_t>(0.5 + pre_weights[i] * (1 << PRECISION_BITS));
+                    tmp_val += 0.5;
                }
                tmp_val = std::round(tmp_val);
                tmp_val = std::clamp(tmp_val,
                                     static_cast<double>(std::numeric_limits<int32_t>::min()),
                                     static_cast<double>(std::numeric_limits<int32_t>::max()));
                weights[i] = static_cast<int32_t>(tmp_val);
            }
            return ksize;
@@ -1083,35 +1105,31 @@ bool mtmd_image_preprocessor_internvl::preprocess(const clip_image_u8 & img, cli
 //
 bool mtmd_image_preprocessor_deepseekocr::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
-    const std::vector native_resolutions = {
+    static constexpr int native_resolutions[] = { 1024 /* base */, 1280 /* large */ };
-        /*512 tiny , 640 small, */ 1024 /* base */, 1280 /* large */
+    // TODO: support 512 (tiny) and 640 (small) once we have eval data for them
    };
    // original image size
    const clip_image_size original_size{img.nx, img.ny};
    const int orig_w = original_size.width;
    const int orig_h = original_size.height;
    const int orig_area = orig_h * orig_w;
-    size_t mode_i = 0;
+    const int64_t orig_area = static_cast<int64_t>(img.nx) * img.ny;
    int min_diff = orig_area;
-    for (size_t i = 0; i < native_resolutions.size(); i++) {
+    size_t  mode_i   = 0;
-        int r = native_resolutions[i];
+    int64_t min_diff = std::numeric_limits<int64_t>::max();
-        if (std::abs(orig_area - r * r) < min_diff) {
+    for (size_t i = 0; i < std::size(native_resolutions); i++) {
-            mode_i = i;
+        const int64_t r    = native_resolutions[i];
-            min_diff = std::abs(orig_area - r * r);
+        const int64_t diff = std::abs(orig_area - r * r);
        if (diff < min_diff) {
            mode_i   = i;
            min_diff = diff;
        }
    }
    /* Native Resolution (Base/Large) */
    const int image_size = native_resolutions[mode_i];
-    // scaled and padded image
+    // Aspect-preserving fit-and-pad. Pillow bicubic + PAD_NEAREST for
-    clip_image_u8_ptr scaled_img(clip_image_u8_init());
+    // byte-parity with the upstream deepseek-ai/DeepSeek-OCR HF preprocessor.
-    img_tool::resize(img, *scaled_img, clip_image_size{image_size, image_size}, hparams.image_resize_algo);
+    clip_image_u8 padded;
    img_tool::resize(img, padded, {image_size, image_size}, RESIZE_ALGO_BICUBIC_PILLOW,
                     PAD_NEAREST, hparams.image_pad_color);
    clip_image_f32_ptr res(clip_image_f32_init());
-    img_u8_to_f32(*scaled_img, *res, hparams.image_mean, hparams.image_std);
+    img_u8_to_f32(padded, *res, hparams.image_mean, hparams.image_std);
    output.entries.push_back(std::move(res));
    output.grid_x = 1;
@@ -1246,7 +1264,7 @@ clip_image_u8 mtmd_image_preprocessor_step3vl::prepare_image(const clip_image_u8
            std::max(1, static_cast<int>(std::floor(resized.ny * scale))),
        };
        clip_image_u8 scaled;
-        img_tool::resize(resized, scaled, new_size, RESIZE_ALGO_BILINEAR, false);
+        img_tool::resize(resized, scaled, new_size, RESIZE_ALGO_BILINEAR, PAD_NONE);
        resized = std::move(scaled);
    }
@@ -1347,7 +1365,7 @@ bool mtmd_image_preprocessor_step3vl::preprocess(const clip_image_u8 & img, clip
    clip_image_u8 img_for_crop = prepared;
    if (instructions.refined_size.width != prepared.nx || instructions.refined_size.height != prepared.ny) {
        clip_image_u8 refined;
-        img_tool::resize(prepared, refined, instructions.refined_size, RESIZE_ALGO_BILINEAR, false);
+        img_tool::resize(prepared, refined, instructions.refined_size, RESIZE_ALGO_BILINEAR, PAD_NONE);
        img_for_crop = std::move(refined);
    }
@@ -1,85 +0,0 @@
 <|ref|>title<|/ref|><|det|>[[61, 255, 907, 533]]<|/det|>
 # MEN WALK ON MOON
 ASTRONAUTS LAND ON PLAIN;
 COLLECT ROCKS, PLANT FLAG
 <|ref|>text<|/ref|><|det|>[[56, 559, 268, 629]]<|/det|>
 Voice From Moon:
 Eagle Has Landed'
 <|ref|>text<|/ref|><|det|>[[74, 645, 262, 675]]<|/det|>
 EAGLE (the lunar surface, Houston, Truesquily)
 Base here, The Eagle has landed.
 <|ref|>text<|/ref|><|det|>[[74, 675, 262, 720]]<|/det|>
 BOOTHROOM: Lounge, Truesquily, we enjoy you on the ground. You've got a bunch of guys about to toss bikes. We're breaking again. Thanks a lot.
 <|ref|>text<|/ref|><|det|>[[74, 720, 262, 750]]<|/det|>
 TRAVELLING MADE: Time you. BOOTHROOM: You're looking good here.
 <|ref|>text<|/ref|><|det|>[[74, 750, 262, 780]]<|/det|>
 TRAVELLING MADE: A very smooth touchdown. BEDROOM: Eagle, you are very far. I'll. (The first sign in the lunar appearance) (Over.)
 <|ref|>text<|/ref|><|det|>[[74, 780, 262, 810]]<|/det|>
 TRAVELLING MADE: Eagle, stay for I'll. BOOTHROOM: Bumper and we are you waiting the cue.
 <|ref|>text<|/ref|><|det|>[[74, 810, 262, 830]]<|/det|>
 TRAVELLING MADE: Eagle, and service mobility.
 <|ref|>text<|/ref|><|det|>[[74, 830, 262, 850]]<|/det|>
 How do you read me?
 <|ref|>text<|/ref|><|det|>[[74, 850, 262, 880]]<|/det|>
 TRAVELLING COLUMBIA, he has landed Truesquily. Base, Eagle is at Truesquily. I read you first by. Over.
 <|ref|>text<|/ref|><|det|>[[74, 880, 262, 900]]<|/det|>
 COLUMBIA: Yes, I heard the whole thing.
 <|ref|>text<|/ref|><|det|>[[74, 900, 262, 920]]<|/det|>
 BOOTHROOM: Well, it's a good show.
 <|ref|>text<|/ref|><|det|>[[74, 920, 262, 940]]<|/det|>
 COLUMBIA: Fantastic.
 <|ref|>text<|/ref|><|det|>[[74, 940, 262, 960]]<|/det|>
 TRAVELLING MADE: I'll read that.
 <|ref|>text<|/ref|><|det|>[[74, 960, 262, 980]]<|/det|>
 APOLLO CONTROL: The most major sky to sky will be for the 23 event, that is at 21 minutes 26 sec-
 <|ref|>text<|/ref|><|det|>[[74, 980, 262, 990]]<|/det|>
 tion of lunar descent.
 <|ref|>image<|/ref|><|det|>[[270, 545, 697, 990]]<|/det|>
 <|ref|>text<|/ref|><|det|>[[715, 559, 911, 629]]<|/det|>
 A Powdery Surface
 Is Closely Explored
 <|ref|>text<|/ref|><|det|>[[733, 645, 851, 665]]<|/det|>
 BY JOHN NOBLE WILFORD
 <|ref|>text<|/ref|><|det|>[[715, 669, 911, 700]]<|/det|>
 HOUSTON, Monday, July 21—New hires landed and walked on the moon.
 <|ref|>text<|/ref|><|det|>[[715, 700, 911, 750]]<|/det|>
 Two Americans, astronauts of Apollo 11, steered their Eagle-shaped lunar module safely and smoothly to the lunar landing yesterday at 4:17:40 P.M., Eastern day-light time.
 <|ref|>text<|/ref|><|det|>[[715, 750, 911, 780]]<|/det|>
 Neil A. Armstrong, the 38-year-old civilian commander, radioed to earth and the landing team here.
 <|ref|>text<|/ref|><|det|>[[715, 780, 911, 830]]<|/det|>
 "Boom, Truesquily! Base here. The Eagle has landed," the first man to reach the moon—Neil Armstrong and his engineer, Capt. Charles E. Alder, of the Jet Propulsion Laboratory, the space agency's rocket and space program manager.
 <|ref|>text<|/ref|><|det|>[[715, 830, 911, 880]]<|/det|>
 About six and a half hours later, Mr. Armstrong opened the landing craft's hatch, stepped slowly down the ladder and descended as he pointed his first landing footguard on the lunar crater.
 <|ref|>text<|/ref|><|det|>[[715, 880, 911, 920]]<|/det|>
 "That's one small step for man, one giant leap for mankind."
 <|ref|>text<|/ref|><|det|>[[715, 920, 911, 960]]<|/det|>
 His first step on the moon came on 10:56:29 P.M., as a television camera recorded the craft's transmitted his every word to an aerial and excited audiences of hundreds of millions of people on earth.
 <|ref|>text<|/ref|><|det|>[[749, 960, 861, 974]]<|/det|>
 Testable Slope Test Soil
@@ -1,42 +0,0 @@
 MEN WALK ON MOON
 ASTRONAUTS LAND ON PLAIN;
 COLLECT ROCKS, PLANT FLAG
 Voice From Moon:
 'Eagle Has Landed'
 A Powder Surface
 Is Closely Explored
 By JOHN NOBLE WILFORD
 NOVEMBER, Monday, July 21—New York Herald and
 wished on the moon.
 Two American astronauts of Apollo 11, steered their
 frigate Eagle toward the moon's surface and smoothly to
 the lunar landing yesterday at 4:17:40 P.M., Eastern day-
 light time.
 Neil A. Armstrong, the 38-year-old civilian commander,
 landed on the soft sand of the moon's surface here.
 "Beautiful, Triumph!" he said. "The Eagle has landed."
 The first man to reach the moon—Neil Armstrong and
 his co-pilot, Charles E. "Pete" Conrad, 26, of the Pentagon,
 brought their ship to rest on a level, rock-strewn plain near
 the moon's surface. The two men and two of the three
 astronauts on board, Armstrong, Conrad and Edwin E.
 Aldrin, 38, of Houston, stepped slowly down the ladder
 and descended as he pointed his first full-flaming footpad
 at the lunar crater.
 "That's one small step for man, one giant leap for
 mankind."
 His first step on the moon came at 10:56:20 P.M., as
 a television camera rolled the earth's thousandth line every
 second to an aerial and studied audiences of hundreds of
 millions of people on earth.
 Textile Slope Test Soil
@@ -0,0 +1,24 @@
 A Powdery Surface
  Is Closely Explored
 By JOHN NOBLE WILFORD
 Special to The New York Times
 HOUSTON, Monday, July 21—Men have landed and walked on the moon.
 Two Americans, astronauts of Apollo 11, steered their fragile four-legged lunar module safely and smoothly to the historic landing yesterday at 4:17:40 P.M., Eastern daylight time.
 Neil A. Armstrong, the 38-year-old civilian commander, radioed to earth and the mission control room here:
 "Houston, Tranquility Base here. The Eagle has landed."
 The first men to reach the moon—Mr. Armstrong and his co-pilot, Col. Edwin E. Aldrin Jr. of the Air Force—brought their ship to rest on a level, rock-strewn plain near the southwestern shore of the arid Sea of Tranquility.
 About six and a half hours later, Mr. Armstrong opened the landing craft's hatch, stepped slowly down the ladder and declared as he planted the first human footprint on the lunar crust:
 "That's one small step for man, one giant leap for mankind."
 His first step on the moon came at 10:56:20 P.M., as a television camera outside the craft transmitted his every move to an awed and excited audience of hundreds of millions of people on earth.
 Tentative Steps Test Soil
@@ -1,186 +1,220 @@
 #!/usr/bin/env python3
 """
-Test script to compare llama.cpp mtmd-cli output with HuggingFace reference implementation
+Evaluates llama.cpp's DeepSeek-OCR by comparing its output for a test
-for DeepSeek-OCR model using embedding similarity.
+image to the actual text in part of that image.
 Runs the test image through mtmd-cli, calculates CER and chrF for
 its output, and holds them against the HF model's scores.
 """
 import argparse
 import logging
 import subprocess
 import sys
 import unicodedata
 from pathlib import Path
-from sentence_transformers import SentenceTransformer
+logger = logging.getLogger("deepseek-ocr-test")
-from sentence_transformers import util
+
 DEFAULT_IMAGE = "test-1.jpeg"
 DEFAULT_EXPECTED_TEXT = "test-1-ground-truth.txt"
 RUN_TIMEOUT = 300
 # DeepSeek-OCR reference scores on the test image.
 # This is the baseline the implementation should keep up with.
 HF_REFERENCE_CER = 0.3030
 HF_REFERENCE_CHRF = 67.52
 CER_TOLERANCE = 0.02
 CHRF_TOLERANCE = 2.0
 CER_MAX = HF_REFERENCE_CER + CER_TOLERANCE
 CHRF_MIN = HF_REFERENCE_CHRF - CHRF_TOLERANCE
-def run_mtmd_deepseek_ocr(
+def verdict(ok: bool) -> str:
-        model_path: str,
+    return "PASS" if ok else "FAIL"
-        mmproj_path: str,
+
-        image_path: str,
+
-        bin_path: str,
+def normalize_text(text: str) -> str:
-        prompt: str = "Free OCR."
+    """NFC-normalize and collapse whitespace, so line-wrap and spacing
-) -> str:
+    don't count as CER errors."""
    return " ".join(unicodedata.normalize("NFC", text).split())
 def locally_align(expected: str, ocr_out: str) -> str:
    """Return the span of `ocr_out` that best matches `expected`.
    The ground truth covers part of the article body.
    But the test image includes half of the newspaper's front page.
    Fuzzy partial-ratio matching picks out
    the body so the unrelated text doesn't disturb CER / chrF.
    """
-    Run inference using llama.cpp mtmd-cli.
+    from rapidfuzz import fuzz
    alignment = fuzz.partial_ratio_alignment(expected, ocr_out)
    if alignment is None or alignment.dest_end <= alignment.dest_start:
        return ocr_out
    return ocr_out[alignment.dest_start:alignment.dest_end]
 def compute_cer(expected: str, ocr_out: str) -> float:
     """Return the Character Error Rate (CER) of `ocr_out` against `expected`.
     CER is the fraction of characters that would have to be inserted, deleted
     or substituted to fix the output. Lower is better; 0 means a perfect match.
     """
     from jiwer import cer as character_error_rate
     # jiwer takes the reference text first and the text under test second.
     error_rate = character_error_rate(expected, ocr_out)
     return error_rate
 def compute_chrf(expected: str, ocr_out: str) -> float:
     """Return the sentence-level chrF score of `ocr_out` against `expected`.
     chrF is an F-score over shared character n-grams on a 0-100 scale, where
     higher is better. It is more forgiving of small word/spacing drift than CER.
     """
     from sacrebleu.metrics import CHRF
     scorer = CHRF()
     # sacrebleu takes the hypothesis first, then a list of reference texts.
     result = scorer.sentence_score(ocr_out, [expected])
     return result.score
 def run_mtmd_cli(model_path, mmproj_path, image_path, bin_path) -> str:
    """Run mtmd-cli on the image and return its output."""
    cmd = [
-        bin_path,
+        str(bin_path),
-        "-m", model_path,
+        "-m", str(model_path),
-        "--mmproj", mmproj_path,
+        "--mmproj", str(mmproj_path),
-        "--image", image_path,
+        "--image", str(image_path),
-        # "-p", "<|grounding|>Convert the document to markdown.",
+        "-p", "Free OCR. ",
        "-p", prompt,
        "--chat-template", "deepseek-ocr",
        "--temp", "0",
-        "-n", "1024",
+        "--flash-attn", "off",  # match the HF "eager" attention reference
-        # "--verbose"
+        "--no-warmup",
    ]
    logger.debug(f"  command: {' '.join(cmd)}")
-    print(f"Running llama.cpp command: {' '.join(cmd)}")
+    try:
-
+        result = subprocess.run(cmd, capture_output=True, text=False, timeout=RUN_TIMEOUT)
-    result = subprocess.run(
+    except subprocess.TimeoutExpired as e:
-        cmd,
+        if e.stderr:
-        capture_output=True,
+            logger.error("llama.cpp stderr:\n%s", e.stderr.decode("utf-8", errors="replace"))
-        text=False,
+        raise RuntimeError(f"llama-mtmd-cli timed out after {RUN_TIMEOUT}s")
        timeout=300
    )
    if result.returncode != 0:
-        stderr = result.stderr.decode('utf-8', errors='replace')
+        logger.error("llama.cpp stderr:\n%s", result.stderr.decode("utf-8", errors="replace"))
        print(f"llama.cpp stderr: {stderr}")
        raise RuntimeError(f"llama-mtmd-cli failed with code {result.returncode}")
-    output = result.stdout.decode('utf-8', errors='replace').strip()
+    output = result.stdout.decode("utf-8", errors="replace").strip()
-    print(f"llama.cpp output length: {len(output)} chars")
+    if not output:
        raise RuntimeError("llama-mtmd-cli produced no output on stdout")
    logger.info(f"  output: {len(output)} chars")
    return output
-def compute_embedding_similarity(text1: str, text2: str, model_name: str) -> float:
+def read_expected_text(file_path: Path) -> str:
-    """
+    with open(file_path, "r", encoding="utf-8") as f:
    Compute cosine similarity between two texts using embedding model.
    """
    print(f"Loading embedding model: {model_name}")
    # Use sentence-transformers for easier embedding extraction
    embed_model = SentenceTransformer(model_name)
    print("Computing embeddings...")
    embeddings = embed_model.encode([text1, text2], convert_to_numpy=True)
    similarity = util.similarity.cos_sim([embeddings[0]], [embeddings[1]])[0][0]
    return float(similarity)
 def read_expected_output(file_path: str) -> str:
    """
    Read expected OCR output from file.
    """
    cur_path = Path(__file__).parent
    expected_path = str(cur_path / file_path)
    with open(expected_path, "r", encoding="utf-8") as f:
        return f.read().strip()
-def main():
+def evaluate(expected: str, ocr_out: str) -> bool:
-    ap = argparse.ArgumentParser(description="Compare llama.cpp and HuggingFace DeepSeek-OCR outputs")
+    expected = normalize_text(expected)
-    ap.add_argument("--llama-model", default="gguf_models/deepseek-ai/deepseek-ocr-f16.gguf",
+    ocr_out = normalize_text(ocr_out)
-                    help="Path to llama.cpp GGUF model")
+    aligned = locally_align(expected, ocr_out)
-    ap.add_argument("--mmproj", default="gguf_models/deepseek-ai/mmproj-deepseek-ocr-f16.gguf",
+
-                    help="Path to mmproj GGUF file")
+    logger.debug(f"\n--- expected (normalized) ---\n{expected}")
-    ap.add_argument("--image", default="test-1.jpeg",
+    logger.debug(f"\n--- OCR output (normalized) ---\n{ocr_out}")
-                    help="Path to test image")
+    logger.debug(f"\n--- aligned span ---\n{aligned}")
    cer = compute_cer(expected, aligned)
    chrf = compute_chrf(expected, aligned)
    cer_pass = cer <= CER_MAX
    chrf_pass = chrf >= CHRF_MIN
    passed = cer_pass and chrf_pass
    logger.info("")
    logger.info("=" * 60)
    logger.info("Free OCR evaluation:")
    logger.info("=" * 60)
    logger.info(f"  CER               {cer:>7.4f}    (<= {CER_MAX:>7.4f}  -> {verdict(cer_pass)})")
    logger.info(f"  chrF (0-100)      {chrf:>7.2f}    (>= {CHRF_MIN:>7.2f}  -> {verdict(chrf_pass)})")
    logger.info(f"  Expected chars    {len(expected):>7}")
    logger.info(f"  Aligned chars     {len(aligned):>7} (of {len(ocr_out)} OCR chars)")
    logger.info("")
    logger.info(f"  Result: {verdict(passed)}")
    logger.info("=" * 60)
    return passed
 def argument_parser() -> argparse.ArgumentParser:
    ap = argparse.ArgumentParser(description="Compare llama.cpp DeepSeek-OCR output with a ground-truth transcript")
    ap.add_argument("--llama-model", default="gguf_models/deepseek-ai/deepseek-ocr-bf16.gguf",
                    help="Path to llama.cpp GGUF model (relative to repo root or absolute)")
    ap.add_argument("--mmproj", default="gguf_models/deepseek-ai/mmproj-deepseek-ocr-bf16.gguf",
                    help="Path to mmproj GGUF file (relative to repo root or absolute)")
    ap.add_argument("--llama-bin", default="build/bin/llama-mtmd-cli",
-                    help="Path to llama-mtmd-cli binary")
+                    help="Path to llama-mtmd-cli binary (relative to repo root or absolute)")
-    ap.add_argument("--embedding-model", default="Qwen/Qwen3-Embedding-0.6B",
+    ap.add_argument("--verbose", action="store_true",
-                    help="Embedding model for similarity computation")
+                    help="Also log the expected, OCR, and aligned text")
-    ap.add_argument("--threshold", type=float, default=0.7,
+    return ap
                    help="Minimum similarity threshold for pass")
    args = ap.parse_args()
    # Validate paths
    # script directory + image
    mtmd_dir = Path(__file__).parent.parent
    args.image = str(mtmd_dir / args.image)
    # project directory + llama model
    args.llama_model = str(mtmd_dir.parent.parent / args.llama_model)
    # project directory + mmproj
    args.mmproj = str(mtmd_dir.parent.parent / args.mmproj)
    args.llama_bin = str(mtmd_dir.parent.parent / args.llama_bin)
    if not Path(args.image).exists():
        print(f"Error: Image not found: {args.image}")
        sys.exit(1)
    if not Path(args.llama_model).exists():
        print(f"Error: Model not found: {args.llama_model}")
        sys.exit(1)
    if not Path(args.mmproj).exists():
        print(f"Error: mmproj not found: {args.mmproj}")
        sys.exit(1)
-    print("=" * 60)
+def configure_logging(verbose: bool) -> None:
-    print("DeepSeek-OCR: llama.cpp vs HuggingFace Comparison")
+    logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO,
-    print("=" * 60)
+                        format="%(message)s")
    # Default paths based on your command
-    # Run llama.cpp inference
+def resolve_path(path: str, base: Path) -> Path:
-    print("\n[2/3] Running llama.cpp implementation...")
+    p = Path(path)
-    llama_free_ocr = run_mtmd_deepseek_ocr(
+    return p if p.is_absolute() else base / p
        args.llama_model,
        args.mmproj,
        args.image,
        args.llama_bin
    )
    llama_md_ocr = run_mtmd_deepseek_ocr(
        args.llama_model,
        args.mmproj,
        args.image,
        args.llama_bin,
        prompt="<|grounding|>Convert the document to markdown."
    )
-    expected_free_ocr = read_expected_output("test-1-extracted.txt")
+def main() -> int:
-    expected_md_ocr = read_expected_output("test-1-extracted.md")
+    args = argument_parser().parse_args()
    configure_logging(args.verbose)
-    # Compute similarity
+    tests_dir = Path(__file__).parent  # tools/mtmd/tests
-    print("\n[3/3] Computing embedding similarity...")
+    mtmd_dir = tests_dir.parent  # tools/mtmd
-    free_ocr_similarity = compute_embedding_similarity(
+    repo_root = mtmd_dir.parent.parent  # repo root
        expected_free_ocr,
        llama_free_ocr,
        args.embedding_model
    )
-    md_ocr_similarity = compute_embedding_similarity(
+    inputs = [
-        expected_md_ocr,
+        ("image", resolve_path(DEFAULT_IMAGE, mtmd_dir)),
-        llama_md_ocr,
+        ("expected-text", resolve_path(DEFAULT_EXPECTED_TEXT, tests_dir)),
-        args.embedding_model
+        ("model", resolve_path(args.llama_model, repo_root)),
-    )
+        ("mmproj", resolve_path(args.mmproj, repo_root)),
        ("binary", resolve_path(args.llama_bin, repo_root)),
    ]
    for label, path in inputs:
        if not path.exists():
            logger.error(f"Error: {label} not found: {path}")
            return 1
    paths = dict(inputs)
-    # Results
+    logger.info("=" * 60)
-    print("\n" + "=" * 60)
+    logger.info("DeepSeek-OCR: llama.cpp vs ground-truth comparison")
-    print("RESULTS")
+    logger.info("=" * 60)
-    print("=" * 60)
+    logger.info(f"HF baselines: CER {HF_REFERENCE_CER:.4f}, chrF {HF_REFERENCE_CHRF:.2f}")
-    print(f"\nReference Model output:\n{'-' * 40}")
+    logger.info(f"Test thresholds: CER <= {CER_MAX:.4f}, chrF >= {CHRF_MIN:.2f}")
    print(expected_free_ocr)
    print(f"\nDeepSeek-OCR output:\n{'-' * 40}")
    print(llama_free_ocr)
    print(f"\n{'=' * 60}")
    print(f"Cosine Similarity: {free_ocr_similarity:.4f}")
    print(f"Threshold: {args.threshold}")
    print(f"Result: {'PASS' if free_ocr_similarity >= args.threshold else 'FAIL'}")
    print("=" * 60)
-    # Markdown OCR results
+    logger.debug("")
-    print(f"\nReference Model Markdown output:\n{'-' * 40}")
+    logger.debug("Resolved test inputs:")
-    print(expected_md_ocr)
+    for label, path in inputs:
-    print(f"\nDeepSeek-OCR Markdown output:\n{'-' * 40}")
+        logger.debug(f"  {label:<14} {path}")
-    print(llama_md_ocr)
+
-    print(f"\n{'=' * 60}")
+    logger.info("")
-    print(f"Cosine Similarity (Markdown): {md_ocr_similarity:.4f}")
+    logger.info("[1/3] Running llama.cpp 'Free OCR'")
-    print(f"Threshold: {args.threshold}")
+    try:
-    print(f"Result: {'PASS' if md_ocr_similarity >= args.threshold else 'FAIL'}")
+        ocr_out = run_mtmd_cli(paths["model"], paths["mmproj"],
-    print("=" * 60)
+                               paths["image"], paths["binary"])
    except RuntimeError as e:
        logger.error(f"Error: {e}")
        return 1
    logger.info("")
    logger.info("[2/3] Reading expected output")
    expected = read_expected_text(paths["expected-text"])
    logger.info(f"  expected: {len(expected)} chars")
    logger.info("")
    logger.info("[3/3] Computing OCR metrics")
    ok = evaluate(expected, ocr_out)
    return 0 if ok else 1
 if __name__ == "__main__":
-    main()
+    sys.exit(main())
@@ -1,5 +1,3 @@
-sentence-transformers
+jiwer
-transformers
+sacrebleu
-tokenizers
+rapidfuzz
 torch
 torchvision
@@ -14,6 +14,7 @@ exclude = [
 include = [
    "./tools/server/tests/**",
    "./scripts/snapdragon/qdc/**",
    "./tools/mtmd/tests/**",
 ]
 [overrides.rules]