mtmd : DeepSeek-OCR image processing fixes, img_tool::resize padding refactor (#23345)

* mtmd : deepseek-ocr fixes, improvements and refactoring

- image processing changes to achieve full parity with Pillow (reference impl)
- SAM mask casting only when flash-attn is on
- SAM refactor (build_sam() extracted so deepseek-ocr-2 can reuse it)
- llama-chat changes to fix server/WebUI issue (new media_markers_first())
- adapted test-chat-template and added test cases for deepseek-ocr
- changed regression test for deepseek-ocr to use CER+chrF scores for ground-truth comparison; removed embedding-model
- ty.toml ignore unresolved-import for tools/mtmd/tests/**

* image-text reordering fix removed

* refactor bool add_padding + pad_rounding enum into a single pad_style enum
This commit is contained in:
Saba Fallah
2026-05-20 17:37:10 +02:00
committed by GitHub
parent acd604fb27
commit a8681a0ed2
11 changed files with 443 additions and 482 deletions
+13 -3
View File
@@ -35,6 +35,16 @@ enum resize_algo {
// RESIZE_ALGO_LANCZOS, // TODO
};
// Padding style for img_tool::resize
// PAD_NONE - no padding; direct resize to target dimensions
// PAD_CEIL - aspect-preserving pad (default)
// PAD_NEAREST - aspect-preserving pad with nearest-integer rounding (Pillow byte-parity)
enum pad_style {
PAD_NONE,
PAD_CEIL,
PAD_NEAREST,
};
struct clip_hparams {
int32_t image_size = 0;
int32_t patch_size = 0;
@@ -52,7 +62,7 @@ struct clip_hparams {
int32_t image_min_pixels = -1;
int32_t image_max_pixels = -1;
resize_algo image_resize_algo = RESIZE_ALGO_BICUBIC;
bool image_resize_pad = true; // if false, center-crop will be applied when resizing
pad_style image_resize_pad = PAD_CEIL; // padding style when resizing
std::array<uint8_t, 3> image_pad_color = {0, 0, 0};
// (preprocessor) for llava-uhd style models
@@ -61,8 +71,8 @@ struct clip_hparams {
int32_t preproc_max_tiles = 0;
resize_algo image_resize_algo_rf = RESIZE_ALGO_BICUBIC;
resize_algo image_resize_algo_ov = RESIZE_ALGO_BILINEAR;
bool image_pad_rf = true; // if true, refined image will be padded (e.g. llava-1.6)
bool image_pad_ov = false; // if true, overview image will be padded (e.g. llava-1.6)
pad_style image_pad_rf = PAD_CEIL; // padding style for the refined image (e.g. llava-1.6)
pad_style image_pad_ov = PAD_NONE; // padding style for the overview image (e.g. llava-1.6)
std::array<uint8_t, 3> image_pad_color_rf = {0, 0, 0}; // padding color for refined image
std::array<uint8_t, 3> image_pad_color_ov = {0, 0, 0}; // padding color for overview image
+8 -10
View File
@@ -1233,12 +1233,12 @@ struct clip_model_loader {
hparams.has_llava_projector = model.proj_type != PROJECTOR_TYPE_COGVLM;
hparams.image_pad_color = {122, 116, 104};
if (!hparams.image_res_candidates.empty()) {
hparams.image_resize_pad = true;
hparams.image_resize_pad = PAD_CEIL;
hparams.image_resize_algo = RESIZE_ALGO_BILINEAR;
} else {
// llava-1.6 default params
hparams.image_pad_ov = false;
hparams.image_pad_rf = true;
hparams.image_pad_ov = PAD_NONE;
hparams.image_pad_rf = PAD_CEIL;
hparams.image_pad_color_rf = {122, 116, 104};
hparams.image_resize_algo_rf = RESIZE_ALGO_BICUBIC;
hparams.image_resize_algo_ov = RESIZE_ALGO_BILINEAR;
@@ -1246,7 +1246,7 @@ struct clip_model_loader {
} break;
case PROJECTOR_TYPE_GLM_EDGE:
{
hparams.image_resize_pad = true;
hparams.image_resize_pad = PAD_CEIL;
hparams.image_resize_algo = RESIZE_ALGO_BILINEAR;
} break;
case PROJECTOR_TYPE_MINICPMV:
@@ -1441,7 +1441,7 @@ struct clip_model_loader {
{
hparams.n_merge = 2;
hparams.image_resize_algo = RESIZE_ALGO_BILINEAR;
hparams.image_resize_pad = false;
hparams.image_resize_pad = PAD_NONE;
get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true);
std::vector<int> wa_layer_indexes_vec;
@@ -1461,7 +1461,7 @@ struct clip_model_loader {
// reka model performs better when using resize_bicubic, which stretches
// the image to fit fixed square size
hparams.image_resize_pad = false;
hparams.image_resize_pad = PAD_NONE;
} break;
case PROJECTOR_TYPE_GLM4V:
{
@@ -1516,9 +1516,7 @@ struct clip_model_loader {
hparams.image_size = 1024;
hparams.warmup_image_size = 1024;
hparams.image_resize_algo = RESIZE_ALGO_BICUBIC_PILLOW;
hparams.image_pad_color[0] = hparams.image_mean[0];
hparams.image_pad_color[1] = hparams.image_mean[1];
hparams.image_pad_color[2] = hparams.image_mean[2];
hparams.image_pad_color = {127, 127, 127};
get_u32(KEY_SAM_N_BLOCK, hparams.sam_n_layer, true);
get_u32(KEY_SAM_N_HEAD, hparams.sam_n_head, true);
@@ -1537,7 +1535,7 @@ struct clip_model_loader {
{
hparams.n_merge = 2;
hparams.image_resize_algo = RESIZE_ALGO_BICUBIC_PILLOW;
hparams.image_resize_pad = false;
hparams.image_resize_pad = PAD_NONE;
hparams.ffn_op = FFN_GELU;
get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
hparams.set_limit_image_tokens(256, 16384);
+159 -155
View File
@@ -88,164 +88,168 @@ static ggml_tensor * get_rel_pos(ggml_context * ctx0,
return cur; // [C, k_size, q_size]
}
ggml_tensor * clip_graph_deepseekocr::build_sam(ggml_tensor * inp_raw) {
// Building SAM
const int n_embd = hparams.sam_n_embd;
const int n_layer = hparams.sam_n_layer;
const int n_heads = hparams.sam_n_head;
const int d_heads = n_embd / n_heads;
const int window = hparams.attn_window_size;
ggml_tensor * inpL;
inpL = ggml_conv_2d_sk_p0(ctx0, model.patch_embed_proj_w, inp_raw);
inpL = ggml_add(ctx0, inpL, ggml_reshape_3d(ctx0, model.patch_embed_proj_b, 1, 1, n_embd));
inpL = ggml_cont(ctx0, ggml_permute(ctx0, inpL, 1, 2, 0, 3));
ggml_tensor * rel_pos_indices_local;
ggml_tensor * rel_pos_indices_global;
rel_pos_indices_local = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, window, window);
rel_pos_indices_global = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, inpL->ne[1], inpL->ne[2]);
ggml_set_name(rel_pos_indices_local, "rel_pos_indices_local");
ggml_set_name(rel_pos_indices_global, "rel_pos_indices_global");
ggml_set_input(rel_pos_indices_local);
ggml_set_input(rel_pos_indices_global);
ggml_tensor * cur;
const auto tgt_size = inpL->ne[1];
const auto str_size = model.pos_embed->ne[1];
if (str_size != tgt_size) {
ggml_tensor * old_pos_embed = nullptr;
old_pos_embed = ggml_cont(ctx0, ggml_permute(ctx0, model.pos_embed, 2, 0, 1, 3));
ggml_tensor * new_pos_embed =
ggml_interpolate(ctx0, old_pos_embed, tgt_size, tgt_size, n_embd, 1, GGML_SCALE_MODE_BICUBIC);
new_pos_embed = ggml_cont(ctx0, ggml_permute(ctx0, new_pos_embed, 1, 2, 0, 3));
cur = ggml_add(ctx0, inpL, new_pos_embed);
} else {
cur = ggml_add(ctx0, inpL, model.pos_embed);
}
// loop over layers
for (int il = 0; il < n_layer; il++) {
auto & layer = model.sam_layers[il];
ggml_tensor * shortcut = cur;
// layernorm1
cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il);
const int64_t w0 = cur->ne[1];
const int64_t h0 = cur->ne[2];
ggml_tensor * indices;
if (hparams.is_global_attn(il)) {
indices = rel_pos_indices_global;
} else {
// local attention layer - apply window partition
cur = window_partition(ctx0, cur, window);
indices = rel_pos_indices_local;
}
const int64_t W = cur->ne[1];
const int64_t H = cur->ne[2];
// self-attention
{
const int B = cur->ne[3];
cur = ggml_mul_mat(ctx0, layer.qkv_w, cur);
cur = ggml_add(ctx0, cur, layer.qkv_b);
cur = ggml_cont(ctx0, cur); // Ensure tensor is contiguous before reshape
cur = ggml_reshape_4d(ctx0, cur, n_embd, 3, W * H, B);
ggml_tensor * Q;
ggml_tensor * K;
ggml_tensor * V;
Q = ggml_view_3d(ctx0, cur, n_embd, W * H, B, cur->nb[2], cur->nb[3], 0 * cur->nb[1]);
Q = ggml_reshape_4d(ctx0, ggml_cont(ctx0, Q), d_heads, n_heads, W * H, B);
K = ggml_view_3d(ctx0, cur, n_embd, W * H, B, cur->nb[2], cur->nb[3], 1 * cur->nb[1]);
K = ggml_reshape_4d(ctx0, ggml_cont(ctx0, K), d_heads, n_heads, W * H, B);
V = ggml_view_3d(ctx0, cur, n_embd, W * H, B, cur->nb[2], cur->nb[3], 2 * cur->nb[1]);
V = ggml_reshape_4d(ctx0, ggml_cont(ctx0, V), d_heads, n_heads, W * H, B);
ggml_tensor * mask;
ggml_tensor * rw;
ggml_tensor * rh;
ggml_tensor * qr;
rw = get_rel_pos(ctx0, layer.rel_pos_w, indices, W, W); // [W, W, C]
rh = get_rel_pos(ctx0, layer.rel_pos_h, indices, H, H); // [H, H, C]
qr = ggml_permute(ctx0, Q, 0, 2, 1, 3);
qr = ggml_reshape_4d(ctx0, ggml_cont(ctx0, qr), d_heads, W, H, B * n_heads);
rw = ggml_mul_mat(ctx0, rw,
ggml_cont(ctx0, ggml_permute(ctx0, qr, 0, 2, 1, 3))); // [B*n_heads, W, H, W]
rw = ggml_cont(ctx0, ggml_permute(ctx0, rw, 0, 2, 1, 3)); // [B*n_heads, H, W, W]
rw = ggml_reshape_4d(ctx0, rw, W, 1, W * H, n_heads * B);
rw = ggml_repeat_4d(ctx0, rw, W, H, W * H, n_heads * B);
rh = ggml_mul_mat(ctx0, rh, qr); // [B*n_heads, H, W, H]
rh = ggml_reshape_4d(ctx0, rh, 1, H, W * H, n_heads * B);
mask = ggml_add(ctx0, rw, rh); // [B*n_heads, H*W, H, W]
mask = ggml_reshape_4d(ctx0, mask, W * H, W * H, n_heads, B);
// casting mask to F16 only required when flash-attn is enabled
if (flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
mask = ggml_cast(ctx0, mask, GGML_TYPE_F16);
}
const float scale = 1.0f / sqrtf(static_cast<float>(d_heads));
cur = build_attn(layer.o_w, layer.o_b, Q, K, V, mask, scale,
il); // [B, H*W, n_embd]
cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur), n_embd, W, H, B);
}
if (hparams.is_global_attn(il) == false) {
// local attention layer - reverse window partition
cur = window_unpartition(ctx0, cur, w0, h0, window);
}
// re-add the layer input, e.g., residual
cur = ggml_add(ctx0, cur, shortcut);
ggml_tensor * inpFF = cur;
// layernorm2
cur = build_norm(inpFF, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il);
// ffn
cur = build_ffn(cur, layer.ff_up_w, layer.ff_up_b, nullptr, nullptr, layer.ff_down_w, layer.ff_down_b,
hparams.ffn_op, il);
// residual 2
cur = ggml_add(ctx0, cur, inpFF);
cb(cur, "sam_layer_out", il);
}
cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3));
cur = ggml_conv_2d(ctx0, model.neck_0_w, cur, 1, 1, 0, 0, 1, 1);
cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3));
cur = build_norm(cur, model.neck_1_w, model.neck_1_b, NORM_TYPE_NORMAL, hparams.eps, -1);
cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3));
cur = ggml_conv_2d(ctx0, model.neck_2_w, cur, 1, 1, 1, 1, 1, 1);
cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3));
cur = build_norm(cur, model.neck_3_w, model.neck_3_b, NORM_TYPE_NORMAL, hparams.eps, -1);
cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3));
cur = ggml_conv_2d(ctx0, model.net_2, cur, 2, 2, 1, 1, 1, 1);
cur = ggml_conv_2d(ctx0, model.net_3, cur, 2, 2, 1, 1, 1, 1);
cb(cur, "sam_output", -1);
ggml_build_forward_expand(gf, cur);
return cur;
}
ggml_cgraph * clip_graph_deepseekocr::build() {
// patch embedding
ggml_tensor * inp_raw = build_inp_raw();
ggml_tensor * sam_out;
// Building SAM
{
const int n_embd = hparams.sam_n_embd;
const int n_layer = hparams.sam_n_layer;
const int n_heads = hparams.sam_n_head;
const int d_heads = n_embd / n_heads;
const int window = hparams.attn_window_size;
ggml_tensor * inpL;
inpL = ggml_conv_2d_sk_p0(ctx0, model.patch_embed_proj_w, inp_raw);
inpL = ggml_add(ctx0, inpL, ggml_reshape_3d(ctx0, model.patch_embed_proj_b, 1, 1, n_embd));
inpL = ggml_cont(ctx0, ggml_permute(ctx0, inpL, 1, 2, 0, 3));
ggml_tensor * rel_pos_indices_local;
ggml_tensor * rel_pos_indices_global;
rel_pos_indices_local = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, window, window);
rel_pos_indices_global = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, inpL->ne[1], inpL->ne[2]);
ggml_set_name(rel_pos_indices_local, "rel_pos_indices_local");
ggml_set_name(rel_pos_indices_global, "rel_pos_indices_global");
ggml_set_input(rel_pos_indices_local);
ggml_set_input(rel_pos_indices_global);
ggml_tensor * cur;
const auto tgt_size = inpL->ne[1];
const auto str_size = model.pos_embed->ne[1];
if (str_size != tgt_size) {
ggml_tensor * old_pos_embed = nullptr;
old_pos_embed = ggml_cont(ctx0, ggml_permute(ctx0, model.pos_embed, 2, 0, 1, 3));
ggml_tensor * new_pos_embed =
ggml_interpolate(ctx0, old_pos_embed, tgt_size, tgt_size, n_embd, 1, GGML_SCALE_MODE_BICUBIC);
new_pos_embed = ggml_cont(ctx0, ggml_permute(ctx0, new_pos_embed, 1, 2, 0, 3));
cur = ggml_add(ctx0, inpL, new_pos_embed);
} else {
cur = ggml_add(ctx0, inpL, model.pos_embed);
}
// loop over layers
for (int il = 0; il < n_layer; il++) {
auto & layer = model.sam_layers[il];
ggml_tensor * shortcut = cur;
// layernorm1
cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il);
const int64_t w0 = cur->ne[1];
const int64_t h0 = cur->ne[2];
ggml_tensor * indices;
if (hparams.is_global_attn(il)) {
indices = rel_pos_indices_global;
} else {
// local attention layer - apply window partition
cur = window_partition(ctx0, cur, window);
indices = rel_pos_indices_local;
}
const int64_t W = cur->ne[1];
const int64_t H = cur->ne[2];
// self-attention
{
const int B = cur->ne[3];
cur = ggml_mul_mat(ctx0, layer.qkv_w, cur);
cur = ggml_add(ctx0, cur, layer.qkv_b);
cur = ggml_cont(ctx0, cur); // Ensure tensor is contiguous before reshape
cur = ggml_reshape_4d(ctx0, cur, n_embd, 3, W * H, B);
ggml_tensor * Q;
ggml_tensor * K;
ggml_tensor * V;
Q = ggml_view_3d(ctx0, cur, n_embd, W * H, B, cur->nb[2], cur->nb[3], 0 * cur->nb[1]);
Q = ggml_reshape_4d(ctx0, ggml_cont(ctx0, Q), d_heads, n_heads, W * H, B);
K = ggml_view_3d(ctx0, cur, n_embd, W * H, B, cur->nb[2], cur->nb[3], 1 * cur->nb[1]);
K = ggml_reshape_4d(ctx0, ggml_cont(ctx0, K), d_heads, n_heads, W * H, B);
V = ggml_view_3d(ctx0, cur, n_embd, W * H, B, cur->nb[2], cur->nb[3], 2 * cur->nb[1]);
V = ggml_reshape_4d(ctx0, ggml_cont(ctx0, V), d_heads, n_heads, W * H, B);
ggml_tensor * mask;
ggml_tensor * rw;
ggml_tensor * rh;
ggml_tensor * qr;
rw = get_rel_pos(ctx0, layer.rel_pos_w, indices, W, W); // [W, W, C]
rh = get_rel_pos(ctx0, layer.rel_pos_h, indices, H, H); // [H, H, C]
qr = ggml_permute(ctx0, Q, 0, 2, 1, 3);
qr = ggml_reshape_4d(ctx0, ggml_cont(ctx0, qr), d_heads, W, H, B * n_heads);
rw = ggml_mul_mat(ctx0, rw,
ggml_cont(ctx0, ggml_permute(ctx0, qr, 0, 2, 1, 3))); // [B*n_heads, W, H, W]
rw = ggml_cont(ctx0, ggml_permute(ctx0, rw, 0, 2, 1, 3)); // [B*n_heads, H, W, W]
rw = ggml_reshape_4d(ctx0, rw, W, 1, W * H, n_heads * B);
rw = ggml_repeat_4d(ctx0, rw, W, H, W * H, n_heads * B);
rh = ggml_mul_mat(ctx0, rh, qr); // [B*n_heads, H, W, H]
rh = ggml_reshape_4d(ctx0, rh, 1, H, W * H, n_heads * B);
mask = ggml_add(ctx0, rw, rh); // [B*n_heads, H*W, H, W]
mask = ggml_reshape_4d(ctx0, mask, W * H, W * H, n_heads, B);
mask = ggml_cast(ctx0, mask, GGML_TYPE_F16);
const float scale = 1.0f / sqrtf(static_cast<float>(d_heads));
cur = build_attn(layer.o_w, layer.o_b, Q, K, V, mask, scale,
il); // [B, H*W, n_embd]
cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur), n_embd, W, H, B);
}
if (hparams.is_global_attn(il) == false) {
// local attention layer - reverse window partition
cur = window_unpartition(ctx0, cur, w0, h0, window);
}
// re-add the layer input, e.g., residual
cur = ggml_add(ctx0, cur, shortcut);
ggml_tensor * inpFF = cur;
// layernorm2
cur = build_norm(inpFF, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il);
// ffn
cur = build_ffn(cur, layer.ff_up_w, layer.ff_up_b, nullptr, nullptr, layer.ff_down_w, layer.ff_down_b,
hparams.ffn_op, il);
// residual 2
cur = ggml_add(ctx0, cur, inpFF);
cb(cur, "sam_layer_out", il);
}
cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3));
cur = ggml_conv_2d(ctx0, model.neck_0_w, cur, 1, 1, 0, 0, 1, 1);
cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3));
cur = build_norm(cur, model.neck_1_w, model.neck_1_b, NORM_TYPE_NORMAL, hparams.eps, -1);
cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3));
cur = ggml_conv_2d(ctx0, model.neck_2_w, cur, 1, 1, 1, 1, 1, 1);
cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3));
cur = build_norm(cur, model.neck_3_w, model.neck_3_b, NORM_TYPE_NORMAL, hparams.eps, -1);
cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3));
cur = ggml_conv_2d(ctx0, model.net_2, cur, 2, 2, 1, 1, 1, 1);
cur = ggml_conv_2d(ctx0, model.net_3, cur, 2, 2, 1, 1, 1, 1);
cb(cur, "sam_output", -1);
ggml_build_forward_expand(gf, cur);
sam_out = cur;
}
ggml_tensor * sam_out = build_sam(inp_raw);
ggml_tensor * clip_out;
// Building DS-OCR CLIP
+1
View File
@@ -118,6 +118,7 @@ struct clip_graph_whisper_enc : clip_graph {
struct clip_graph_deepseekocr : clip_graph {
clip_graph_deepseekocr(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
ggml_tensor * build_sam(ggml_tensor * inp); // build the SAM model
};
struct clip_graph_conformer : clip_graph {
+56 -38
View File
@@ -38,7 +38,7 @@ struct img_tool {
clip_image_u8 & dst,
const clip_image_size & target_resolution,
resize_algo algo,
bool add_padding = true, // TODO: define the behavior for add_padding = false
pad_style padding = PAD_CEIL,
std::array<uint8_t, 3> pad_color = {0, 0, 0}) {
dst.nx = target_resolution.width;
dst.ny = target_resolution.height;
@@ -50,7 +50,7 @@ struct img_tool {
return;
}
if (!add_padding) {
if (padding == PAD_NONE) {
// direct resize
switch (algo) {
case RESIZE_ALGO_BILINEAR:
@@ -71,8 +71,15 @@ struct img_tool {
float scale_w = static_cast<float>(target_resolution.width) / src.nx;
float scale_h = static_cast<float>(target_resolution.height) / src.ny;
float scale = std::min(scale_w, scale_h);
int new_width = std::min(static_cast<int>(std::ceil(src.nx * scale)), target_resolution.width);
int new_height = std::min(static_cast<int>(std::ceil(src.ny * scale)), target_resolution.height);
int new_width, new_height;
if (padding == PAD_NEAREST) {
new_width = std::min(static_cast<int>(std::round(src.nx * scale)), target_resolution.width);
new_height = std::min(static_cast<int>(std::round(src.ny * scale)), target_resolution.height);
} else {
new_width = std::min(static_cast<int>(std::ceil(src.nx * scale)), target_resolution.width);
new_height = std::min(static_cast<int>(std::ceil(src.ny * scale)), target_resolution.height);
}
switch (algo) {
case RESIZE_ALGO_BILINEAR:
@@ -91,9 +98,14 @@ struct img_tool {
// fill dst with pad_color
fill(dst, pad_color);
int offset_x = (target_resolution.width - new_width) / 2;
int offset_y = (target_resolution.height - new_height) / 2;
int offset_x, offset_y;
if (padding == PAD_NEAREST) {
offset_x = static_cast<int>(std::round((target_resolution.width - new_width) / 2.0f));
offset_y = static_cast<int>(std::round((target_resolution.height - new_height) / 2.0f));
} else {
offset_x = (target_resolution.width - new_width) / 2;
offset_y = (target_resolution.height - new_height) / 2;
}
composite(dst, resized_image, offset_x, offset_y);
}
}
@@ -356,10 +368,10 @@ private:
GGML_ASSERT(inSize > 0 && outSize > 0);
double support, scale, filterscale;
double center, ww, ss;
int xx, x, ksize, xmin, xmax, xcnt;
int xx, x, ksize, xmin, xmax;
// Calculate scaling factor: ratio of input range to output size
filterscale = scale = (double)inSize / outSize;
filterscale = scale = static_cast<double>(inSize) / outSize;
// For upsampling (scale < 1), keep filterscale = 1 to maintain filter sharpness
// For downsampling (scale > 1), widen filter to prevent aliasing
if (filterscale < 1.0) {
@@ -373,6 +385,7 @@ private:
std::vector<double> pre_weights(outSize * ksize); // Temporary weights
bounds.resize(outSize * 2);
// For each output pixel, compute its filter coefficients
for (xx = 0; xx < outSize; xx++) {
// Calculate the center position in input space (pixel-center convention: +0.5)
@@ -391,10 +404,10 @@ private:
xmax = inSize;
}
xcnt = xmax - xmin;
xmax -= xmin;
// Compute filter weights for each contributing input pixel
for (x = 0; x < xcnt; x++) {
for (x = 0; x < xmax; x++) {
// Distance from input pixel center to output pixel center in input space
double w = bicubic_filter((x + xmin - center + 0.5) * ss);
pre_weights[xx * ksize + x] = w;
@@ -402,7 +415,7 @@ private:
}
// Normalize weights to sum to 1.0 (preserves brightness)
for (x = 0; x < xcnt; x++) {
for (x = 0; x < xmax; x++) {
if (ww != 0.0) {
pre_weights[xx * ksize + x] /= ww;
}
@@ -415,18 +428,27 @@ private:
// Store input pixel range for this output pixel
bounds[xx * 2 + 0] = xmin;
bounds[xx * 2 + 1] = xcnt;
bounds[xx * 2 + 1] = xmax;
}
// Convert floating-point coefficients to fixed-point integers
// Formula: int32 = round(float * 2^PRECISION_BITS)
weights.resize(outSize * ksize);
const double fxp_scale = std::ldexp(1.0, PRECISION_BITS); // 1.0 * 2^PRECISION_BITS
for (int i = 0; i < outSize * ksize; i++) {
double tmp_val = pre_weights[i] * fxp_scale;
if (pre_weights[i] < 0) {
weights[i] = static_cast<int32_t>(-0.5 + pre_weights[i] * (1 << PRECISION_BITS));
tmp_val -= 0.5;
} else {
weights[i] = static_cast<int32_t>(0.5 + pre_weights[i] * (1 << PRECISION_BITS));
tmp_val += 0.5;
}
tmp_val = std::round(tmp_val);
tmp_val = std::clamp(tmp_val,
static_cast<double>(std::numeric_limits<int32_t>::min()),
static_cast<double>(std::numeric_limits<int32_t>::max()));
weights[i] = static_cast<int32_t>(tmp_val);
}
return ksize;
@@ -1083,35 +1105,31 @@ bool mtmd_image_preprocessor_internvl::preprocess(const clip_image_u8 & img, cli
//
bool mtmd_image_preprocessor_deepseekocr::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
const std::vector native_resolutions = {
/*512 tiny , 640 small, */ 1024 /* base */, 1280 /* large */
};
// original image size
const clip_image_size original_size{img.nx, img.ny};
const int orig_w = original_size.width;
const int orig_h = original_size.height;
const int orig_area = orig_h * orig_w;
static constexpr int native_resolutions[] = { 1024 /* base */, 1280 /* large */ };
// TODO: support 512 (tiny) and 640 (small) once we have eval data for them
size_t mode_i = 0;
int min_diff = orig_area;
const int64_t orig_area = static_cast<int64_t>(img.nx) * img.ny;
for (size_t i = 0; i < native_resolutions.size(); i++) {
int r = native_resolutions[i];
if (std::abs(orig_area - r * r) < min_diff) {
mode_i = i;
min_diff = std::abs(orig_area - r * r);
size_t mode_i = 0;
int64_t min_diff = std::numeric_limits<int64_t>::max();
for (size_t i = 0; i < std::size(native_resolutions); i++) {
const int64_t r = native_resolutions[i];
const int64_t diff = std::abs(orig_area - r * r);
if (diff < min_diff) {
mode_i = i;
min_diff = diff;
}
}
/* Native Resolution (Base/Large) */
const int image_size = native_resolutions[mode_i];
// scaled and padded image
clip_image_u8_ptr scaled_img(clip_image_u8_init());
img_tool::resize(img, *scaled_img, clip_image_size{image_size, image_size}, hparams.image_resize_algo);
// Aspect-preserving fit-and-pad. Pillow bicubic + PAD_NEAREST for
// byte-parity with the upstream deepseek-ai/DeepSeek-OCR HF preprocessor.
clip_image_u8 padded;
img_tool::resize(img, padded, {image_size, image_size}, RESIZE_ALGO_BICUBIC_PILLOW,
PAD_NEAREST, hparams.image_pad_color);
clip_image_f32_ptr res(clip_image_f32_init());
img_u8_to_f32(*scaled_img, *res, hparams.image_mean, hparams.image_std);
img_u8_to_f32(padded, *res, hparams.image_mean, hparams.image_std);
output.entries.push_back(std::move(res));
output.grid_x = 1;
@@ -1246,7 +1264,7 @@ clip_image_u8 mtmd_image_preprocessor_step3vl::prepare_image(const clip_image_u8
std::max(1, static_cast<int>(std::floor(resized.ny * scale))),
};
clip_image_u8 scaled;
img_tool::resize(resized, scaled, new_size, RESIZE_ALGO_BILINEAR, false);
img_tool::resize(resized, scaled, new_size, RESIZE_ALGO_BILINEAR, PAD_NONE);
resized = std::move(scaled);
}
@@ -1347,7 +1365,7 @@ bool mtmd_image_preprocessor_step3vl::preprocess(const clip_image_u8 & img, clip
clip_image_u8 img_for_crop = prepared;
if (instructions.refined_size.width != prepared.nx || instructions.refined_size.height != prepared.ny) {
clip_image_u8 refined;
img_tool::resize(prepared, refined, instructions.refined_size, RESIZE_ALGO_BILINEAR, false);
img_tool::resize(prepared, refined, instructions.refined_size, RESIZE_ALGO_BILINEAR, PAD_NONE);
img_for_crop = std::move(refined);
}
-85
View File
@@ -1,85 +0,0 @@
<|ref|>title<|/ref|><|det|>[[61, 255, 907, 533]]<|/det|>
# MEN WALK ON MOON
ASTRONAUTS LAND ON PLAIN;
COLLECT ROCKS, PLANT FLAG
<|ref|>text<|/ref|><|det|>[[56, 559, 268, 629]]<|/det|>
Voice From Moon:
Eagle Has Landed'
<|ref|>text<|/ref|><|det|>[[74, 645, 262, 675]]<|/det|>
EAGLE (the lunar surface, Houston, Truesquily)
Base here, The Eagle has landed.
<|ref|>text<|/ref|><|det|>[[74, 675, 262, 720]]<|/det|>
BOOTHROOM: Lounge, Truesquily, we enjoy you on the ground. You've got a bunch of guys about to toss bikes. We're breaking again. Thanks a lot.
<|ref|>text<|/ref|><|det|>[[74, 720, 262, 750]]<|/det|>
TRAVELLING MADE: Time you. BOOTHROOM: You're looking good here.
<|ref|>text<|/ref|><|det|>[[74, 750, 262, 780]]<|/det|>
TRAVELLING MADE: A very smooth touchdown. BEDROOM: Eagle, you are very far. I'll. (The first sign in the lunar appearance) (Over.)
<|ref|>text<|/ref|><|det|>[[74, 780, 262, 810]]<|/det|>
TRAVELLING MADE: Eagle, stay for I'll. BOOTHROOM: Bumper and we are you waiting the cue.
<|ref|>text<|/ref|><|det|>[[74, 810, 262, 830]]<|/det|>
TRAVELLING MADE: Eagle, and service mobility.
<|ref|>text<|/ref|><|det|>[[74, 830, 262, 850]]<|/det|>
How do you read me?
<|ref|>text<|/ref|><|det|>[[74, 850, 262, 880]]<|/det|>
TRAVELLING COLUMBIA, he has landed Truesquily. Base, Eagle is at Truesquily. I read you first by. Over.
<|ref|>text<|/ref|><|det|>[[74, 880, 262, 900]]<|/det|>
COLUMBIA: Yes, I heard the whole thing.
<|ref|>text<|/ref|><|det|>[[74, 900, 262, 920]]<|/det|>
BOOTHROOM: Well, it's a good show.
<|ref|>text<|/ref|><|det|>[[74, 920, 262, 940]]<|/det|>
COLUMBIA: Fantastic.
<|ref|>text<|/ref|><|det|>[[74, 940, 262, 960]]<|/det|>
TRAVELLING MADE: I'll read that.
<|ref|>text<|/ref|><|det|>[[74, 960, 262, 980]]<|/det|>
APOLLO CONTROL: The most major sky to sky will be for the 23 event, that is at 21 minutes 26 sec-
<|ref|>text<|/ref|><|det|>[[74, 980, 262, 990]]<|/det|>
tion of lunar descent.
<|ref|>image<|/ref|><|det|>[[270, 545, 697, 990]]<|/det|>
<|ref|>text<|/ref|><|det|>[[715, 559, 911, 629]]<|/det|>
A Powdery Surface
Is Closely Explored
<|ref|>text<|/ref|><|det|>[[733, 645, 851, 665]]<|/det|>
BY JOHN NOBLE WILFORD
<|ref|>text<|/ref|><|det|>[[715, 669, 911, 700]]<|/det|>
HOUSTON, Monday, July 21—New hires landed and walked on the moon.
<|ref|>text<|/ref|><|det|>[[715, 700, 911, 750]]<|/det|>
Two Americans, astronauts of Apollo 11, steered their Eagle-shaped lunar module safely and smoothly to the lunar landing yesterday at 4:17:40 P.M., Eastern day-light time.
<|ref|>text<|/ref|><|det|>[[715, 750, 911, 780]]<|/det|>
Neil A. Armstrong, the 38-year-old civilian commander, radioed to earth and the landing team here.
<|ref|>text<|/ref|><|det|>[[715, 780, 911, 830]]<|/det|>
"Boom, Truesquily! Base here. The Eagle has landed," the first man to reach the moon—Neil Armstrong and his engineer, Capt. Charles E. Alder, of the Jet Propulsion Laboratory, the space agency's rocket and space program manager.
<|ref|>text<|/ref|><|det|>[[715, 830, 911, 880]]<|/det|>
About six and a half hours later, Mr. Armstrong opened the landing craft's hatch, stepped slowly down the ladder and descended as he pointed his first landing footguard on the lunar crater.
<|ref|>text<|/ref|><|det|>[[715, 880, 911, 920]]<|/det|>
"That's one small step for man, one giant leap for mankind."
<|ref|>text<|/ref|><|det|>[[715, 920, 911, 960]]<|/det|>
His first step on the moon came on 10:56:29 P.M., as a television camera recorded the craft's transmitted his every word to an aerial and excited audiences of hundreds of millions of people on earth.
<|ref|>text<|/ref|><|det|>[[749, 960, 861, 974]]<|/det|>
Testable Slope Test Soil
-42
View File
@@ -1,42 +0,0 @@
MEN WALK ON MOON
ASTRONAUTS LAND ON PLAIN;
COLLECT ROCKS, PLANT FLAG
Voice From Moon:
'Eagle Has Landed'
A Powder Surface
Is Closely Explored
By JOHN NOBLE WILFORD
NOVEMBER, Monday, July 21—New York Herald and
wished on the moon.
Two American astronauts of Apollo 11, steered their
frigate Eagle toward the moon's surface and smoothly to
the lunar landing yesterday at 4:17:40 P.M., Eastern day-
light time.
Neil A. Armstrong, the 38-year-old civilian commander,
landed on the soft sand of the moon's surface here.
"Beautiful, Triumph!" he said. "The Eagle has landed."
The first man to reach the moon—Neil Armstrong and
his co-pilot, Charles E. "Pete" Conrad, 26, of the Pentagon,
brought their ship to rest on a level, rock-strewn plain near
the moon's surface. The two men and two of the three
astronauts on board, Armstrong, Conrad and Edwin E.
Aldrin, 38, of Houston, stepped slowly down the ladder
and descended as he pointed his first full-flaming footpad
at the lunar crater.
"That's one small step for man, one giant leap for
mankind."
His first step on the moon came at 10:56:20 P.M., as
a television camera rolled the earth's thousandth line every
second to an aerial and studied audiences of hundreds of
millions of people on earth.
Textile Slope Test Soil
+24
View File
@@ -0,0 +1,24 @@
A Powdery Surface
Is Closely Explored
By JOHN NOBLE WILFORD
Special to The New York Times
HOUSTON, Monday, July 21—Men have landed and walked on the moon.
Two Americans, astronauts of Apollo 11, steered their fragile four-legged lunar module safely and smoothly to the historic landing yesterday at 4:17:40 P.M., Eastern daylight time.
Neil A. Armstrong, the 38-year-old civilian commander, radioed to earth and the mission control room here:
"Houston, Tranquility Base here. The Eagle has landed."
The first men to reach the moon—Mr. Armstrong and his co-pilot, Col. Edwin E. Aldrin Jr. of the Air Force—brought their ship to rest on a level, rock-strewn plain near the southwestern shore of the arid Sea of Tranquility.
About six and a half hours later, Mr. Armstrong opened the landing craft's hatch, stepped slowly down the ladder and declared as he planted the first human footprint on the lunar crust:
"That's one small step for man, one giant leap for mankind."
His first step on the moon came at 10:56:20 P.M., as a television camera outside the craft transmitted his every move to an awed and excited audience of hundreds of millions of people on earth.
Tentative Steps Test Soil
+178 -144
View File
@@ -1,186 +1,220 @@
#!/usr/bin/env python3
"""
Test script to compare llama.cpp mtmd-cli output with HuggingFace reference implementation
for DeepSeek-OCR model using embedding similarity.
Evaluates llama.cpp's DeepSeek-OCR by comparing its output for a test
image to the actual text in part of that image.
Runs the test image through mtmd-cli, calculates CER and chrF for
its output, and holds them against the HF model's scores.
"""
import argparse
import logging
import subprocess
import sys
import unicodedata
from pathlib import Path
from sentence_transformers import SentenceTransformer
from sentence_transformers import util
logger = logging.getLogger("deepseek-ocr-test")
DEFAULT_IMAGE = "test-1.jpeg"
DEFAULT_EXPECTED_TEXT = "test-1-ground-truth.txt"
RUN_TIMEOUT = 300
# DeepSeek-OCR reference scores on the test image.
# This is the baseline the implementation should keep up with.
HF_REFERENCE_CER = 0.3030
HF_REFERENCE_CHRF = 67.52
CER_TOLERANCE = 0.02
CHRF_TOLERANCE = 2.0
CER_MAX = HF_REFERENCE_CER + CER_TOLERANCE
CHRF_MIN = HF_REFERENCE_CHRF - CHRF_TOLERANCE
def run_mtmd_deepseek_ocr(
model_path: str,
mmproj_path: str,
image_path: str,
bin_path: str,
prompt: str = "Free OCR."
) -> str:
def verdict(ok: bool) -> str:
return "PASS" if ok else "FAIL"
def normalize_text(text: str) -> str:
"""NFC-normalize and collapse whitespace, so line-wrap and spacing
don't count as CER errors."""
return " ".join(unicodedata.normalize("NFC", text).split())
def locally_align(expected: str, ocr_out: str) -> str:
"""Return the span of `ocr_out` that best matches `expected`.
The ground truth covers part of the article body.
But the test image includes half of the newspaper's front page.
Fuzzy partial-ratio matching picks out
the body so the unrelated text doesn't disturb CER / chrF.
"""
Run inference using llama.cpp mtmd-cli.
from rapidfuzz import fuzz
alignment = fuzz.partial_ratio_alignment(expected, ocr_out)
if alignment is None or alignment.dest_end <= alignment.dest_start:
return ocr_out
return ocr_out[alignment.dest_start:alignment.dest_end]
def compute_cer(expected: str, ocr_out: str) -> float:
"""Character Error Rate. Lower is better.
CER: fraction of characters you'd insert/delete/substitute to fix the output; 0 = perfect."""
import jiwer
return jiwer.cer(expected, ocr_out)
def compute_chrf(expected: str, ocr_out: str) -> float:
"""chrF score on 0-100. Higher is better.
chrF: F-score over shared character n-grams; more forgiving of small word/spacing drift than CER.
"""
from sacrebleu.metrics import CHRF
return CHRF().sentence_score(ocr_out, [expected]).score
def run_mtmd_cli(model_path, mmproj_path, image_path, bin_path) -> str:
"""Run mtmd-cli on the image and return its output."""
cmd = [
bin_path,
"-m", model_path,
"--mmproj", mmproj_path,
"--image", image_path,
# "-p", "<|grounding|>Convert the document to markdown.",
"-p", prompt,
str(bin_path),
"-m", str(model_path),
"--mmproj", str(mmproj_path),
"--image", str(image_path),
"-p", "Free OCR. ",
"--chat-template", "deepseek-ocr",
"--temp", "0",
"-n", "1024",
# "--verbose"
"--flash-attn", "off", # match the HF "eager" attention reference
"--no-warmup",
]
logger.debug(f" command: {' '.join(cmd)}")
print(f"Running llama.cpp command: {' '.join(cmd)}")
result = subprocess.run(
cmd,
capture_output=True,
text=False,
timeout=300
)
try:
result = subprocess.run(cmd, capture_output=True, text=False, timeout=RUN_TIMEOUT)
except subprocess.TimeoutExpired as e:
if e.stderr:
logger.error("llama.cpp stderr:\n%s", e.stderr.decode("utf-8", errors="replace"))
raise RuntimeError(f"llama-mtmd-cli timed out after {RUN_TIMEOUT}s")
if result.returncode != 0:
stderr = result.stderr.decode('utf-8', errors='replace')
print(f"llama.cpp stderr: {stderr}")
logger.error("llama.cpp stderr:\n%s", result.stderr.decode("utf-8", errors="replace"))
raise RuntimeError(f"llama-mtmd-cli failed with code {result.returncode}")
output = result.stdout.decode('utf-8', errors='replace').strip()
print(f"llama.cpp output length: {len(output)} chars")
output = result.stdout.decode("utf-8", errors="replace").strip()
if not output:
raise RuntimeError("llama-mtmd-cli produced no output on stdout")
logger.info(f" output: {len(output)} chars")
return output
def compute_embedding_similarity(text1: str, text2: str, model_name: str) -> float:
"""
Compute cosine similarity between two texts using embedding model.
"""
print(f"Loading embedding model: {model_name}")
# Use sentence-transformers for easier embedding extraction
embed_model = SentenceTransformer(model_name)
print("Computing embeddings...")
embeddings = embed_model.encode([text1, text2], convert_to_numpy=True)
similarity = util.similarity.cos_sim([embeddings[0]], [embeddings[1]])[0][0]
return float(similarity)
def read_expected_output(file_path: str) -> str:
"""
Read expected OCR output from file.
"""
cur_path = Path(__file__).parent
expected_path = str(cur_path / file_path)
with open(expected_path, "r", encoding="utf-8") as f:
def read_expected_text(file_path: Path) -> str:
with open(file_path, "r", encoding="utf-8") as f:
return f.read().strip()
def main():
ap = argparse.ArgumentParser(description="Compare llama.cpp and HuggingFace DeepSeek-OCR outputs")
ap.add_argument("--llama-model", default="gguf_models/deepseek-ai/deepseek-ocr-f16.gguf",
help="Path to llama.cpp GGUF model")
ap.add_argument("--mmproj", default="gguf_models/deepseek-ai/mmproj-deepseek-ocr-f16.gguf",
help="Path to mmproj GGUF file")
ap.add_argument("--image", default="test-1.jpeg",
help="Path to test image")
def evaluate(expected: str, ocr_out: str) -> bool:
expected = normalize_text(expected)
ocr_out = normalize_text(ocr_out)
aligned = locally_align(expected, ocr_out)
logger.debug(f"\n--- expected (normalized) ---\n{expected}")
logger.debug(f"\n--- OCR output (normalized) ---\n{ocr_out}")
logger.debug(f"\n--- aligned span ---\n{aligned}")
cer = compute_cer(expected, aligned)
chrf = compute_chrf(expected, aligned)
cer_pass = cer <= CER_MAX
chrf_pass = chrf >= CHRF_MIN
passed = cer_pass and chrf_pass
logger.info("")
logger.info("=" * 60)
logger.info("Free OCR evaluation:")
logger.info("=" * 60)
logger.info(f" CER {cer:>7.4f} (<= {CER_MAX:>7.4f} -> {verdict(cer_pass)})")
logger.info(f" chrF (0-100) {chrf:>7.2f} (>= {CHRF_MIN:>7.2f} -> {verdict(chrf_pass)})")
logger.info(f" Expected chars {len(expected):>7}")
logger.info(f" Aligned chars {len(aligned):>7} (of {len(ocr_out)} OCR chars)")
logger.info("")
logger.info(f" Result: {verdict(passed)}")
logger.info("=" * 60)
return passed
def argument_parser() -> argparse.ArgumentParser:
ap = argparse.ArgumentParser(description="Compare llama.cpp DeepSeek-OCR output with a ground-truth transcript")
ap.add_argument("--llama-model", default="gguf_models/deepseek-ai/deepseek-ocr-bf16.gguf",
help="Path to llama.cpp GGUF model (relative to repo root or absolute)")
ap.add_argument("--mmproj", default="gguf_models/deepseek-ai/mmproj-deepseek-ocr-bf16.gguf",
help="Path to mmproj GGUF file (relative to repo root or absolute)")
ap.add_argument("--llama-bin", default="build/bin/llama-mtmd-cli",
help="Path to llama-mtmd-cli binary")
ap.add_argument("--embedding-model", default="Qwen/Qwen3-Embedding-0.6B",
help="Embedding model for similarity computation")
ap.add_argument("--threshold", type=float, default=0.7,
help="Minimum similarity threshold for pass")
args = ap.parse_args()
help="Path to llama-mtmd-cli binary (relative to repo root or absolute)")
ap.add_argument("--verbose", action="store_true",
help="Also log the expected, OCR, and aligned text")
return ap
# Validate paths
# script directory + image
mtmd_dir = Path(__file__).parent.parent
args.image = str(mtmd_dir / args.image)
# project directory + llama model
args.llama_model = str(mtmd_dir.parent.parent / args.llama_model)
# project directory + mmproj
args.mmproj = str(mtmd_dir.parent.parent / args.mmproj)
args.llama_bin = str(mtmd_dir.parent.parent / args.llama_bin)
if not Path(args.image).exists():
print(f"Error: Image not found: {args.image}")
sys.exit(1)
if not Path(args.llama_model).exists():
print(f"Error: Model not found: {args.llama_model}")
sys.exit(1)
if not Path(args.mmproj).exists():
print(f"Error: mmproj not found: {args.mmproj}")
sys.exit(1)
print("=" * 60)
print("DeepSeek-OCR: llama.cpp vs HuggingFace Comparison")
print("=" * 60)
def configure_logging(verbose: bool) -> None:
logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO,
format="%(message)s")
# Default paths based on your command
# Run llama.cpp inference
print("\n[2/3] Running llama.cpp implementation...")
llama_free_ocr = run_mtmd_deepseek_ocr(
args.llama_model,
args.mmproj,
args.image,
args.llama_bin
)
def resolve_path(path: str, base: Path) -> Path:
p = Path(path)
return p if p.is_absolute() else base / p
llama_md_ocr = run_mtmd_deepseek_ocr(
args.llama_model,
args.mmproj,
args.image,
args.llama_bin,
prompt="<|grounding|>Convert the document to markdown."
)
expected_free_ocr = read_expected_output("test-1-extracted.txt")
expected_md_ocr = read_expected_output("test-1-extracted.md")
def main() -> int:
args = argument_parser().parse_args()
configure_logging(args.verbose)
# Compute similarity
print("\n[3/3] Computing embedding similarity...")
free_ocr_similarity = compute_embedding_similarity(
expected_free_ocr,
llama_free_ocr,
args.embedding_model
)
tests_dir = Path(__file__).parent # tools/mtmd/tests
mtmd_dir = tests_dir.parent # tools/mtmd
repo_root = mtmd_dir.parent.parent # repo root
md_ocr_similarity = compute_embedding_similarity(
expected_md_ocr,
llama_md_ocr,
args.embedding_model
)
inputs = [
("image", resolve_path(DEFAULT_IMAGE, mtmd_dir)),
("expected-text", resolve_path(DEFAULT_EXPECTED_TEXT, tests_dir)),
("model", resolve_path(args.llama_model, repo_root)),
("mmproj", resolve_path(args.mmproj, repo_root)),
("binary", resolve_path(args.llama_bin, repo_root)),
]
for label, path in inputs:
if not path.exists():
logger.error(f"Error: {label} not found: {path}")
return 1
paths = dict(inputs)
# Results
print("\n" + "=" * 60)
print("RESULTS")
print("=" * 60)
print(f"\nReference Model output:\n{'-' * 40}")
print(expected_free_ocr)
print(f"\nDeepSeek-OCR output:\n{'-' * 40}")
print(llama_free_ocr)
print(f"\n{'=' * 60}")
print(f"Cosine Similarity: {free_ocr_similarity:.4f}")
print(f"Threshold: {args.threshold}")
print(f"Result: {'PASS' if free_ocr_similarity >= args.threshold else 'FAIL'}")
print("=" * 60)
logger.info("=" * 60)
logger.info("DeepSeek-OCR: llama.cpp vs ground-truth comparison")
logger.info("=" * 60)
logger.info(f"HF baselines: CER {HF_REFERENCE_CER:.4f}, chrF {HF_REFERENCE_CHRF:.2f}")
logger.info(f"Test thresholds: CER <= {CER_MAX:.4f}, chrF >= {CHRF_MIN:.2f}")
# Markdown OCR results
print(f"\nReference Model Markdown output:\n{'-' * 40}")
print(expected_md_ocr)
print(f"\nDeepSeek-OCR Markdown output:\n{'-' * 40}")
print(llama_md_ocr)
print(f"\n{'=' * 60}")
print(f"Cosine Similarity (Markdown): {md_ocr_similarity:.4f}")
print(f"Threshold: {args.threshold}")
print(f"Result: {'PASS' if md_ocr_similarity >= args.threshold else 'FAIL'}")
print("=" * 60)
logger.debug("")
logger.debug("Resolved test inputs:")
for label, path in inputs:
logger.debug(f" {label:<14} {path}")
logger.info("")
logger.info("[1/3] Running llama.cpp 'Free OCR'")
try:
ocr_out = run_mtmd_cli(paths["model"], paths["mmproj"],
paths["image"], paths["binary"])
except RuntimeError as e:
logger.error(f"Error: {e}")
return 1
logger.info("")
logger.info("[2/3] Reading expected output")
expected = read_expected_text(paths["expected-text"])
logger.info(f" expected: {len(expected)} chars")
logger.info("")
logger.info("[3/3] Computing OCR metrics")
ok = evaluate(expected, ocr_out)
return 0 if ok else 1
if __name__ == "__main__":
main()
sys.exit(main())
+3 -5
View File
@@ -1,5 +1,3 @@
sentence-transformers
transformers
tokenizers
torch
torchvision
jiwer
sacrebleu
rapidfuzz
+1
View File
@@ -14,6 +14,7 @@ exclude = [
include = [
"./tools/server/tests/**",
"./scripts/snapdragon/qdc/**",
"./tools/mtmd/tests/**",
]
[overrides.rules]