mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-09 07:16:44 +02:00
mtmd : DeepSeek-OCR image processing fixes, img_tool::resize padding refactor (#23345)
* mtmd : deepseek-ocr fixes, improvements and refactoring
  - image processing changes to achieve full parity with Pillow (reference impl)
  - SAM mask casting only when flash-attn is on
  - SAM refactor (build_sam() extracted so deepseek-ocr-2 can reuse it)
  - llama-chat changes to fix server/WebUI issue (new media_markers_first())
  - adapted test-chat-template and added test cases for deepseek-ocr
  - changed regression test for deepseek-ocr to use CER+chrF scores for ground-truth comparison; removed embedding-model
  - ty.toml ignore unresolved-import for tools/mtmd/tests/**
* image-text reordering fix removed
* refactor bool add_padding + pad_rounding enum into a single pad_style enum
This commit is contained in:
+13
-3
@@ -35,6 +35,16 @@ enum resize_algo {
|
|||||||
// RESIZE_ALGO_LANCZOS, // TODO
|
// RESIZE_ALGO_LANCZOS, // TODO
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Padding style for img_tool::resize
|
||||||
|
// PAD_NONE - no padding; direct resize to target dimensions
|
||||||
|
// PAD_CEIL - aspect-preserving pad (default)
|
||||||
|
// PAD_NEAREST - aspect-preserving pad with nearest-integer rounding (Pillow byte-parity)
|
||||||
|
enum pad_style {
|
||||||
|
PAD_NONE,
|
||||||
|
PAD_CEIL,
|
||||||
|
PAD_NEAREST,
|
||||||
|
};
|
||||||
|
|
||||||
struct clip_hparams {
|
struct clip_hparams {
|
||||||
int32_t image_size = 0;
|
int32_t image_size = 0;
|
||||||
int32_t patch_size = 0;
|
int32_t patch_size = 0;
|
||||||
@@ -52,7 +62,7 @@ struct clip_hparams {
|
|||||||
int32_t image_min_pixels = -1;
|
int32_t image_min_pixels = -1;
|
||||||
int32_t image_max_pixels = -1;
|
int32_t image_max_pixels = -1;
|
||||||
resize_algo image_resize_algo = RESIZE_ALGO_BICUBIC;
|
resize_algo image_resize_algo = RESIZE_ALGO_BICUBIC;
|
||||||
bool image_resize_pad = true; // if false, center-crop will be applied when resizing
|
pad_style image_resize_pad = PAD_CEIL; // padding style when resizing
|
||||||
std::array<uint8_t, 3> image_pad_color = {0, 0, 0};
|
std::array<uint8_t, 3> image_pad_color = {0, 0, 0};
|
||||||
|
|
||||||
// (preprocessor) for llava-uhd style models
|
// (preprocessor) for llava-uhd style models
|
||||||
@@ -61,8 +71,8 @@ struct clip_hparams {
|
|||||||
int32_t preproc_max_tiles = 0;
|
int32_t preproc_max_tiles = 0;
|
||||||
resize_algo image_resize_algo_rf = RESIZE_ALGO_BICUBIC;
|
resize_algo image_resize_algo_rf = RESIZE_ALGO_BICUBIC;
|
||||||
resize_algo image_resize_algo_ov = RESIZE_ALGO_BILINEAR;
|
resize_algo image_resize_algo_ov = RESIZE_ALGO_BILINEAR;
|
||||||
bool image_pad_rf = true; // if true, refined image will be padded (e.g. llava-1.6)
|
pad_style image_pad_rf = PAD_CEIL; // padding style for the refined image (e.g. llava-1.6)
|
||||||
bool image_pad_ov = false; // if true, overview image will be padded (e.g. llava-1.6)
|
pad_style image_pad_ov = PAD_NONE; // padding style for the overview image (e.g. llava-1.6)
|
||||||
std::array<uint8_t, 3> image_pad_color_rf = {0, 0, 0}; // padding color for refined image
|
std::array<uint8_t, 3> image_pad_color_rf = {0, 0, 0}; // padding color for refined image
|
||||||
std::array<uint8_t, 3> image_pad_color_ov = {0, 0, 0}; // padding color for overview image
|
std::array<uint8_t, 3> image_pad_color_ov = {0, 0, 0}; // padding color for overview image
|
||||||
|
|
||||||
|
|||||||
+8
-10
@@ -1233,12 +1233,12 @@ struct clip_model_loader {
|
|||||||
hparams.has_llava_projector = model.proj_type != PROJECTOR_TYPE_COGVLM;
|
hparams.has_llava_projector = model.proj_type != PROJECTOR_TYPE_COGVLM;
|
||||||
hparams.image_pad_color = {122, 116, 104};
|
hparams.image_pad_color = {122, 116, 104};
|
||||||
if (!hparams.image_res_candidates.empty()) {
|
if (!hparams.image_res_candidates.empty()) {
|
||||||
hparams.image_resize_pad = true;
|
hparams.image_resize_pad = PAD_CEIL;
|
||||||
hparams.image_resize_algo = RESIZE_ALGO_BILINEAR;
|
hparams.image_resize_algo = RESIZE_ALGO_BILINEAR;
|
||||||
} else {
|
} else {
|
||||||
// llava-1.6 default params
|
// llava-1.6 default params
|
||||||
hparams.image_pad_ov = false;
|
hparams.image_pad_ov = PAD_NONE;
|
||||||
hparams.image_pad_rf = true;
|
hparams.image_pad_rf = PAD_CEIL;
|
||||||
hparams.image_pad_color_rf = {122, 116, 104};
|
hparams.image_pad_color_rf = {122, 116, 104};
|
||||||
hparams.image_resize_algo_rf = RESIZE_ALGO_BICUBIC;
|
hparams.image_resize_algo_rf = RESIZE_ALGO_BICUBIC;
|
||||||
hparams.image_resize_algo_ov = RESIZE_ALGO_BILINEAR;
|
hparams.image_resize_algo_ov = RESIZE_ALGO_BILINEAR;
|
||||||
@@ -1246,7 +1246,7 @@ struct clip_model_loader {
|
|||||||
} break;
|
} break;
|
||||||
case PROJECTOR_TYPE_GLM_EDGE:
|
case PROJECTOR_TYPE_GLM_EDGE:
|
||||||
{
|
{
|
||||||
hparams.image_resize_pad = true;
|
hparams.image_resize_pad = PAD_CEIL;
|
||||||
hparams.image_resize_algo = RESIZE_ALGO_BILINEAR;
|
hparams.image_resize_algo = RESIZE_ALGO_BILINEAR;
|
||||||
} break;
|
} break;
|
||||||
case PROJECTOR_TYPE_MINICPMV:
|
case PROJECTOR_TYPE_MINICPMV:
|
||||||
@@ -1441,7 +1441,7 @@ struct clip_model_loader {
|
|||||||
{
|
{
|
||||||
hparams.n_merge = 2;
|
hparams.n_merge = 2;
|
||||||
hparams.image_resize_algo = RESIZE_ALGO_BILINEAR;
|
hparams.image_resize_algo = RESIZE_ALGO_BILINEAR;
|
||||||
hparams.image_resize_pad = false;
|
hparams.image_resize_pad = PAD_NONE;
|
||||||
get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
|
get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
|
||||||
get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true);
|
get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true);
|
||||||
std::vector<int> wa_layer_indexes_vec;
|
std::vector<int> wa_layer_indexes_vec;
|
||||||
@@ -1461,7 +1461,7 @@ struct clip_model_loader {
|
|||||||
|
|
||||||
// reka model performs better when using resize_bicubic, which stretches
|
// reka model performs better when using resize_bicubic, which stretches
|
||||||
// the image to fit fixed square size
|
// the image to fit fixed square size
|
||||||
hparams.image_resize_pad = false;
|
hparams.image_resize_pad = PAD_NONE;
|
||||||
} break;
|
} break;
|
||||||
case PROJECTOR_TYPE_GLM4V:
|
case PROJECTOR_TYPE_GLM4V:
|
||||||
{
|
{
|
||||||
@@ -1516,9 +1516,7 @@ struct clip_model_loader {
|
|||||||
hparams.image_size = 1024;
|
hparams.image_size = 1024;
|
||||||
hparams.warmup_image_size = 1024;
|
hparams.warmup_image_size = 1024;
|
||||||
hparams.image_resize_algo = RESIZE_ALGO_BICUBIC_PILLOW;
|
hparams.image_resize_algo = RESIZE_ALGO_BICUBIC_PILLOW;
|
||||||
hparams.image_pad_color[0] = hparams.image_mean[0];
|
hparams.image_pad_color = {127, 127, 127};
|
||||||
hparams.image_pad_color[1] = hparams.image_mean[1];
|
|
||||||
hparams.image_pad_color[2] = hparams.image_mean[2];
|
|
||||||
|
|
||||||
get_u32(KEY_SAM_N_BLOCK, hparams.sam_n_layer, true);
|
get_u32(KEY_SAM_N_BLOCK, hparams.sam_n_layer, true);
|
||||||
get_u32(KEY_SAM_N_HEAD, hparams.sam_n_head, true);
|
get_u32(KEY_SAM_N_HEAD, hparams.sam_n_head, true);
|
||||||
@@ -1537,7 +1535,7 @@ struct clip_model_loader {
|
|||||||
{
|
{
|
||||||
hparams.n_merge = 2;
|
hparams.n_merge = 2;
|
||||||
hparams.image_resize_algo = RESIZE_ALGO_BICUBIC_PILLOW;
|
hparams.image_resize_algo = RESIZE_ALGO_BICUBIC_PILLOW;
|
||||||
hparams.image_resize_pad = false;
|
hparams.image_resize_pad = PAD_NONE;
|
||||||
hparams.ffn_op = FFN_GELU;
|
hparams.ffn_op = FFN_GELU;
|
||||||
get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
|
get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
|
||||||
hparams.set_limit_image_tokens(256, 16384);
|
hparams.set_limit_image_tokens(256, 16384);
|
||||||
|
|||||||
+159
-155
@@ -88,164 +88,168 @@ static ggml_tensor * get_rel_pos(ggml_context * ctx0,
|
|||||||
return cur; // [C, k_size, q_size]
|
return cur; // [C, k_size, q_size]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
ggml_tensor * clip_graph_deepseekocr::build_sam(ggml_tensor * inp_raw) {
|
||||||
|
// Building SAM
|
||||||
|
const int n_embd = hparams.sam_n_embd;
|
||||||
|
const int n_layer = hparams.sam_n_layer;
|
||||||
|
const int n_heads = hparams.sam_n_head;
|
||||||
|
const int d_heads = n_embd / n_heads;
|
||||||
|
const int window = hparams.attn_window_size;
|
||||||
|
|
||||||
|
ggml_tensor * inpL;
|
||||||
|
|
||||||
|
inpL = ggml_conv_2d_sk_p0(ctx0, model.patch_embed_proj_w, inp_raw);
|
||||||
|
inpL = ggml_add(ctx0, inpL, ggml_reshape_3d(ctx0, model.patch_embed_proj_b, 1, 1, n_embd));
|
||||||
|
inpL = ggml_cont(ctx0, ggml_permute(ctx0, inpL, 1, 2, 0, 3));
|
||||||
|
|
||||||
|
ggml_tensor * rel_pos_indices_local;
|
||||||
|
ggml_tensor * rel_pos_indices_global;
|
||||||
|
|
||||||
|
rel_pos_indices_local = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, window, window);
|
||||||
|
rel_pos_indices_global = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, inpL->ne[1], inpL->ne[2]);
|
||||||
|
ggml_set_name(rel_pos_indices_local, "rel_pos_indices_local");
|
||||||
|
ggml_set_name(rel_pos_indices_global, "rel_pos_indices_global");
|
||||||
|
ggml_set_input(rel_pos_indices_local);
|
||||||
|
ggml_set_input(rel_pos_indices_global);
|
||||||
|
|
||||||
|
ggml_tensor * cur;
|
||||||
|
const auto tgt_size = inpL->ne[1];
|
||||||
|
const auto str_size = model.pos_embed->ne[1];
|
||||||
|
|
||||||
|
if (str_size != tgt_size) {
|
||||||
|
ggml_tensor * old_pos_embed = nullptr;
|
||||||
|
old_pos_embed = ggml_cont(ctx0, ggml_permute(ctx0, model.pos_embed, 2, 0, 1, 3));
|
||||||
|
ggml_tensor * new_pos_embed =
|
||||||
|
ggml_interpolate(ctx0, old_pos_embed, tgt_size, tgt_size, n_embd, 1, GGML_SCALE_MODE_BICUBIC);
|
||||||
|
new_pos_embed = ggml_cont(ctx0, ggml_permute(ctx0, new_pos_embed, 1, 2, 0, 3));
|
||||||
|
cur = ggml_add(ctx0, inpL, new_pos_embed);
|
||||||
|
} else {
|
||||||
|
cur = ggml_add(ctx0, inpL, model.pos_embed);
|
||||||
|
}
|
||||||
|
|
||||||
|
// loop over layers
|
||||||
|
for (int il = 0; il < n_layer; il++) {
|
||||||
|
auto & layer = model.sam_layers[il];
|
||||||
|
ggml_tensor * shortcut = cur;
|
||||||
|
|
||||||
|
// layernorm1
|
||||||
|
cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il);
|
||||||
|
|
||||||
|
const int64_t w0 = cur->ne[1];
|
||||||
|
const int64_t h0 = cur->ne[2];
|
||||||
|
|
||||||
|
ggml_tensor * indices;
|
||||||
|
|
||||||
|
if (hparams.is_global_attn(il)) {
|
||||||
|
indices = rel_pos_indices_global;
|
||||||
|
} else {
|
||||||
|
// local attention layer - apply window partition
|
||||||
|
cur = window_partition(ctx0, cur, window);
|
||||||
|
indices = rel_pos_indices_local;
|
||||||
|
}
|
||||||
|
|
||||||
|
const int64_t W = cur->ne[1];
|
||||||
|
const int64_t H = cur->ne[2];
|
||||||
|
// self-attention
|
||||||
|
{
|
||||||
|
const int B = cur->ne[3];
|
||||||
|
|
||||||
|
cur = ggml_mul_mat(ctx0, layer.qkv_w, cur);
|
||||||
|
cur = ggml_add(ctx0, cur, layer.qkv_b);
|
||||||
|
cur = ggml_cont(ctx0, cur); // Ensure tensor is contiguous before reshape
|
||||||
|
cur = ggml_reshape_4d(ctx0, cur, n_embd, 3, W * H, B);
|
||||||
|
|
||||||
|
ggml_tensor * Q;
|
||||||
|
ggml_tensor * K;
|
||||||
|
ggml_tensor * V;
|
||||||
|
|
||||||
|
Q = ggml_view_3d(ctx0, cur, n_embd, W * H, B, cur->nb[2], cur->nb[3], 0 * cur->nb[1]);
|
||||||
|
Q = ggml_reshape_4d(ctx0, ggml_cont(ctx0, Q), d_heads, n_heads, W * H, B);
|
||||||
|
|
||||||
|
K = ggml_view_3d(ctx0, cur, n_embd, W * H, B, cur->nb[2], cur->nb[3], 1 * cur->nb[1]);
|
||||||
|
K = ggml_reshape_4d(ctx0, ggml_cont(ctx0, K), d_heads, n_heads, W * H, B);
|
||||||
|
|
||||||
|
V = ggml_view_3d(ctx0, cur, n_embd, W * H, B, cur->nb[2], cur->nb[3], 2 * cur->nb[1]);
|
||||||
|
V = ggml_reshape_4d(ctx0, ggml_cont(ctx0, V), d_heads, n_heads, W * H, B);
|
||||||
|
|
||||||
|
ggml_tensor * mask;
|
||||||
|
ggml_tensor * rw;
|
||||||
|
ggml_tensor * rh;
|
||||||
|
ggml_tensor * qr;
|
||||||
|
|
||||||
|
rw = get_rel_pos(ctx0, layer.rel_pos_w, indices, W, W); // [W, W, C]
|
||||||
|
rh = get_rel_pos(ctx0, layer.rel_pos_h, indices, H, H); // [H, H, C]
|
||||||
|
qr = ggml_permute(ctx0, Q, 0, 2, 1, 3);
|
||||||
|
qr = ggml_reshape_4d(ctx0, ggml_cont(ctx0, qr), d_heads, W, H, B * n_heads);
|
||||||
|
|
||||||
|
rw = ggml_mul_mat(ctx0, rw,
|
||||||
|
ggml_cont(ctx0, ggml_permute(ctx0, qr, 0, 2, 1, 3))); // [B*n_heads, W, H, W]
|
||||||
|
rw = ggml_cont(ctx0, ggml_permute(ctx0, rw, 0, 2, 1, 3)); // [B*n_heads, H, W, W]
|
||||||
|
rw = ggml_reshape_4d(ctx0, rw, W, 1, W * H, n_heads * B);
|
||||||
|
rw = ggml_repeat_4d(ctx0, rw, W, H, W * H, n_heads * B);
|
||||||
|
rh = ggml_mul_mat(ctx0, rh, qr); // [B*n_heads, H, W, H]
|
||||||
|
rh = ggml_reshape_4d(ctx0, rh, 1, H, W * H, n_heads * B);
|
||||||
|
mask = ggml_add(ctx0, rw, rh); // [B*n_heads, H*W, H, W]
|
||||||
|
mask = ggml_reshape_4d(ctx0, mask, W * H, W * H, n_heads, B);
|
||||||
|
// casting mask to F16 only required when flash-attn is enabled
|
||||||
|
if (flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
|
||||||
|
mask = ggml_cast(ctx0, mask, GGML_TYPE_F16);
|
||||||
|
}
|
||||||
|
|
||||||
|
const float scale = 1.0f / sqrtf(static_cast<float>(d_heads));
|
||||||
|
|
||||||
|
cur = build_attn(layer.o_w, layer.o_b, Q, K, V, mask, scale,
|
||||||
|
il); // [B, H*W, n_embd]
|
||||||
|
cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur), n_embd, W, H, B);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (hparams.is_global_attn(il) == false) {
|
||||||
|
// local attention layer - reverse window partition
|
||||||
|
cur = window_unpartition(ctx0, cur, w0, h0, window);
|
||||||
|
}
|
||||||
|
|
||||||
|
// re-add the layer input, e.g., residual
|
||||||
|
cur = ggml_add(ctx0, cur, shortcut);
|
||||||
|
|
||||||
|
ggml_tensor * inpFF = cur;
|
||||||
|
|
||||||
|
// layernorm2
|
||||||
|
cur = build_norm(inpFF, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il);
|
||||||
|
|
||||||
|
// ffn
|
||||||
|
cur = build_ffn(cur, layer.ff_up_w, layer.ff_up_b, nullptr, nullptr, layer.ff_down_w, layer.ff_down_b,
|
||||||
|
hparams.ffn_op, il);
|
||||||
|
|
||||||
|
// residual 2
|
||||||
|
cur = ggml_add(ctx0, cur, inpFF);
|
||||||
|
cb(cur, "sam_layer_out", il);
|
||||||
|
}
|
||||||
|
|
||||||
|
cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3));
|
||||||
|
|
||||||
|
cur = ggml_conv_2d(ctx0, model.neck_0_w, cur, 1, 1, 0, 0, 1, 1);
|
||||||
|
cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3));
|
||||||
|
cur = build_norm(cur, model.neck_1_w, model.neck_1_b, NORM_TYPE_NORMAL, hparams.eps, -1);
|
||||||
|
cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3));
|
||||||
|
|
||||||
|
cur = ggml_conv_2d(ctx0, model.neck_2_w, cur, 1, 1, 1, 1, 1, 1);
|
||||||
|
cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3));
|
||||||
|
cur = build_norm(cur, model.neck_3_w, model.neck_3_b, NORM_TYPE_NORMAL, hparams.eps, -1);
|
||||||
|
cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3));
|
||||||
|
|
||||||
|
cur = ggml_conv_2d(ctx0, model.net_2, cur, 2, 2, 1, 1, 1, 1);
|
||||||
|
cur = ggml_conv_2d(ctx0, model.net_3, cur, 2, 2, 1, 1, 1, 1);
|
||||||
|
cb(cur, "sam_output", -1);
|
||||||
|
|
||||||
|
ggml_build_forward_expand(gf, cur);
|
||||||
|
return cur;
|
||||||
|
}
|
||||||
|
|
||||||
ggml_cgraph * clip_graph_deepseekocr::build() {
|
ggml_cgraph * clip_graph_deepseekocr::build() {
|
||||||
// patch embedding
|
// patch embedding
|
||||||
ggml_tensor * inp_raw = build_inp_raw();
|
ggml_tensor * inp_raw = build_inp_raw();
|
||||||
|
ggml_tensor * sam_out = build_sam(inp_raw);
|
||||||
ggml_tensor * sam_out;
|
|
||||||
// Building SAM
|
|
||||||
{
|
|
||||||
const int n_embd = hparams.sam_n_embd;
|
|
||||||
const int n_layer = hparams.sam_n_layer;
|
|
||||||
const int n_heads = hparams.sam_n_head;
|
|
||||||
const int d_heads = n_embd / n_heads;
|
|
||||||
const int window = hparams.attn_window_size;
|
|
||||||
|
|
||||||
ggml_tensor * inpL;
|
|
||||||
|
|
||||||
inpL = ggml_conv_2d_sk_p0(ctx0, model.patch_embed_proj_w, inp_raw);
|
|
||||||
inpL = ggml_add(ctx0, inpL, ggml_reshape_3d(ctx0, model.patch_embed_proj_b, 1, 1, n_embd));
|
|
||||||
inpL = ggml_cont(ctx0, ggml_permute(ctx0, inpL, 1, 2, 0, 3));
|
|
||||||
|
|
||||||
ggml_tensor * rel_pos_indices_local;
|
|
||||||
ggml_tensor * rel_pos_indices_global;
|
|
||||||
|
|
||||||
rel_pos_indices_local = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, window, window);
|
|
||||||
rel_pos_indices_global = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, inpL->ne[1], inpL->ne[2]);
|
|
||||||
ggml_set_name(rel_pos_indices_local, "rel_pos_indices_local");
|
|
||||||
ggml_set_name(rel_pos_indices_global, "rel_pos_indices_global");
|
|
||||||
ggml_set_input(rel_pos_indices_local);
|
|
||||||
ggml_set_input(rel_pos_indices_global);
|
|
||||||
|
|
||||||
ggml_tensor * cur;
|
|
||||||
const auto tgt_size = inpL->ne[1];
|
|
||||||
const auto str_size = model.pos_embed->ne[1];
|
|
||||||
|
|
||||||
if (str_size != tgt_size) {
|
|
||||||
ggml_tensor * old_pos_embed = nullptr;
|
|
||||||
old_pos_embed = ggml_cont(ctx0, ggml_permute(ctx0, model.pos_embed, 2, 0, 1, 3));
|
|
||||||
ggml_tensor * new_pos_embed =
|
|
||||||
ggml_interpolate(ctx0, old_pos_embed, tgt_size, tgt_size, n_embd, 1, GGML_SCALE_MODE_BICUBIC);
|
|
||||||
new_pos_embed = ggml_cont(ctx0, ggml_permute(ctx0, new_pos_embed, 1, 2, 0, 3));
|
|
||||||
cur = ggml_add(ctx0, inpL, new_pos_embed);
|
|
||||||
} else {
|
|
||||||
cur = ggml_add(ctx0, inpL, model.pos_embed);
|
|
||||||
}
|
|
||||||
|
|
||||||
// loop over layers
|
|
||||||
for (int il = 0; il < n_layer; il++) {
|
|
||||||
auto & layer = model.sam_layers[il];
|
|
||||||
ggml_tensor * shortcut = cur;
|
|
||||||
|
|
||||||
// layernorm1
|
|
||||||
cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il);
|
|
||||||
|
|
||||||
const int64_t w0 = cur->ne[1];
|
|
||||||
const int64_t h0 = cur->ne[2];
|
|
||||||
|
|
||||||
ggml_tensor * indices;
|
|
||||||
|
|
||||||
if (hparams.is_global_attn(il)) {
|
|
||||||
indices = rel_pos_indices_global;
|
|
||||||
} else {
|
|
||||||
// local attention layer - apply window partition
|
|
||||||
cur = window_partition(ctx0, cur, window);
|
|
||||||
indices = rel_pos_indices_local;
|
|
||||||
}
|
|
||||||
|
|
||||||
const int64_t W = cur->ne[1];
|
|
||||||
const int64_t H = cur->ne[2];
|
|
||||||
// self-attention
|
|
||||||
{
|
|
||||||
const int B = cur->ne[3];
|
|
||||||
|
|
||||||
cur = ggml_mul_mat(ctx0, layer.qkv_w, cur);
|
|
||||||
cur = ggml_add(ctx0, cur, layer.qkv_b);
|
|
||||||
cur = ggml_cont(ctx0, cur); // Ensure tensor is contiguous before reshape
|
|
||||||
cur = ggml_reshape_4d(ctx0, cur, n_embd, 3, W * H, B);
|
|
||||||
|
|
||||||
ggml_tensor * Q;
|
|
||||||
ggml_tensor * K;
|
|
||||||
ggml_tensor * V;
|
|
||||||
|
|
||||||
Q = ggml_view_3d(ctx0, cur, n_embd, W * H, B, cur->nb[2], cur->nb[3], 0 * cur->nb[1]);
|
|
||||||
Q = ggml_reshape_4d(ctx0, ggml_cont(ctx0, Q), d_heads, n_heads, W * H, B);
|
|
||||||
|
|
||||||
K = ggml_view_3d(ctx0, cur, n_embd, W * H, B, cur->nb[2], cur->nb[3], 1 * cur->nb[1]);
|
|
||||||
K = ggml_reshape_4d(ctx0, ggml_cont(ctx0, K), d_heads, n_heads, W * H, B);
|
|
||||||
|
|
||||||
V = ggml_view_3d(ctx0, cur, n_embd, W * H, B, cur->nb[2], cur->nb[3], 2 * cur->nb[1]);
|
|
||||||
V = ggml_reshape_4d(ctx0, ggml_cont(ctx0, V), d_heads, n_heads, W * H, B);
|
|
||||||
|
|
||||||
ggml_tensor * mask;
|
|
||||||
ggml_tensor * rw;
|
|
||||||
ggml_tensor * rh;
|
|
||||||
ggml_tensor * qr;
|
|
||||||
|
|
||||||
rw = get_rel_pos(ctx0, layer.rel_pos_w, indices, W, W); // [W, W, C]
|
|
||||||
rh = get_rel_pos(ctx0, layer.rel_pos_h, indices, H, H); // [H, H, C]
|
|
||||||
qr = ggml_permute(ctx0, Q, 0, 2, 1, 3);
|
|
||||||
qr = ggml_reshape_4d(ctx0, ggml_cont(ctx0, qr), d_heads, W, H, B * n_heads);
|
|
||||||
|
|
||||||
rw = ggml_mul_mat(ctx0, rw,
|
|
||||||
ggml_cont(ctx0, ggml_permute(ctx0, qr, 0, 2, 1, 3))); // [B*n_heads, W, H, W]
|
|
||||||
rw = ggml_cont(ctx0, ggml_permute(ctx0, rw, 0, 2, 1, 3)); // [B*n_heads, H, W, W]
|
|
||||||
rw = ggml_reshape_4d(ctx0, rw, W, 1, W * H, n_heads * B);
|
|
||||||
rw = ggml_repeat_4d(ctx0, rw, W, H, W * H, n_heads * B);
|
|
||||||
rh = ggml_mul_mat(ctx0, rh, qr); // [B*n_heads, H, W, H]
|
|
||||||
rh = ggml_reshape_4d(ctx0, rh, 1, H, W * H, n_heads * B);
|
|
||||||
mask = ggml_add(ctx0, rw, rh); // [B*n_heads, H*W, H, W]
|
|
||||||
mask = ggml_reshape_4d(ctx0, mask, W * H, W * H, n_heads, B);
|
|
||||||
mask = ggml_cast(ctx0, mask, GGML_TYPE_F16);
|
|
||||||
|
|
||||||
const float scale = 1.0f / sqrtf(static_cast<float>(d_heads));
|
|
||||||
|
|
||||||
cur = build_attn(layer.o_w, layer.o_b, Q, K, V, mask, scale,
|
|
||||||
il); // [B, H*W, n_embd]
|
|
||||||
cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur), n_embd, W, H, B);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (hparams.is_global_attn(il) == false) {
|
|
||||||
// local attention layer - reverse window partition
|
|
||||||
cur = window_unpartition(ctx0, cur, w0, h0, window);
|
|
||||||
}
|
|
||||||
|
|
||||||
// re-add the layer input, e.g., residual
|
|
||||||
cur = ggml_add(ctx0, cur, shortcut);
|
|
||||||
|
|
||||||
ggml_tensor * inpFF = cur;
|
|
||||||
|
|
||||||
// layernorm2
|
|
||||||
cur = build_norm(inpFF, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il);
|
|
||||||
|
|
||||||
// ffn
|
|
||||||
cur = build_ffn(cur, layer.ff_up_w, layer.ff_up_b, nullptr, nullptr, layer.ff_down_w, layer.ff_down_b,
|
|
||||||
hparams.ffn_op, il);
|
|
||||||
|
|
||||||
// residual 2
|
|
||||||
cur = ggml_add(ctx0, cur, inpFF);
|
|
||||||
cb(cur, "sam_layer_out", il);
|
|
||||||
}
|
|
||||||
|
|
||||||
cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3));
|
|
||||||
|
|
||||||
cur = ggml_conv_2d(ctx0, model.neck_0_w, cur, 1, 1, 0, 0, 1, 1);
|
|
||||||
cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3));
|
|
||||||
cur = build_norm(cur, model.neck_1_w, model.neck_1_b, NORM_TYPE_NORMAL, hparams.eps, -1);
|
|
||||||
cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3));
|
|
||||||
|
|
||||||
cur = ggml_conv_2d(ctx0, model.neck_2_w, cur, 1, 1, 1, 1, 1, 1);
|
|
||||||
cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3));
|
|
||||||
cur = build_norm(cur, model.neck_3_w, model.neck_3_b, NORM_TYPE_NORMAL, hparams.eps, -1);
|
|
||||||
cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3));
|
|
||||||
|
|
||||||
cur = ggml_conv_2d(ctx0, model.net_2, cur, 2, 2, 1, 1, 1, 1);
|
|
||||||
cur = ggml_conv_2d(ctx0, model.net_3, cur, 2, 2, 1, 1, 1, 1);
|
|
||||||
cb(cur, "sam_output", -1);
|
|
||||||
|
|
||||||
ggml_build_forward_expand(gf, cur);
|
|
||||||
sam_out = cur;
|
|
||||||
}
|
|
||||||
|
|
||||||
ggml_tensor * clip_out;
|
ggml_tensor * clip_out;
|
||||||
// Building DS-OCR CLIP
|
// Building DS-OCR CLIP
|
||||||
|
|||||||
@@ -118,6 +118,7 @@ struct clip_graph_whisper_enc : clip_graph {
|
|||||||
struct clip_graph_deepseekocr : clip_graph {
|
struct clip_graph_deepseekocr : clip_graph {
|
||||||
clip_graph_deepseekocr(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
clip_graph_deepseekocr(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||||
ggml_cgraph * build() override;
|
ggml_cgraph * build() override;
|
||||||
|
ggml_tensor * build_sam(ggml_tensor * inp); // build the SAM model
|
||||||
};
|
};
|
||||||
|
|
||||||
struct clip_graph_conformer : clip_graph {
|
struct clip_graph_conformer : clip_graph {
|
||||||
|
|||||||
+56
-38
@@ -38,7 +38,7 @@ struct img_tool {
|
|||||||
clip_image_u8 & dst,
|
clip_image_u8 & dst,
|
||||||
const clip_image_size & target_resolution,
|
const clip_image_size & target_resolution,
|
||||||
resize_algo algo,
|
resize_algo algo,
|
||||||
bool add_padding = true, // TODO: define the behavior for add_padding = false
|
pad_style padding = PAD_CEIL,
|
||||||
std::array<uint8_t, 3> pad_color = {0, 0, 0}) {
|
std::array<uint8_t, 3> pad_color = {0, 0, 0}) {
|
||||||
dst.nx = target_resolution.width;
|
dst.nx = target_resolution.width;
|
||||||
dst.ny = target_resolution.height;
|
dst.ny = target_resolution.height;
|
||||||
@@ -50,7 +50,7 @@ struct img_tool {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!add_padding) {
|
if (padding == PAD_NONE) {
|
||||||
// direct resize
|
// direct resize
|
||||||
switch (algo) {
|
switch (algo) {
|
||||||
case RESIZE_ALGO_BILINEAR:
|
case RESIZE_ALGO_BILINEAR:
|
||||||
@@ -71,8 +71,15 @@ struct img_tool {
|
|||||||
float scale_w = static_cast<float>(target_resolution.width) / src.nx;
|
float scale_w = static_cast<float>(target_resolution.width) / src.nx;
|
||||||
float scale_h = static_cast<float>(target_resolution.height) / src.ny;
|
float scale_h = static_cast<float>(target_resolution.height) / src.ny;
|
||||||
float scale = std::min(scale_w, scale_h);
|
float scale = std::min(scale_w, scale_h);
|
||||||
int new_width = std::min(static_cast<int>(std::ceil(src.nx * scale)), target_resolution.width);
|
|
||||||
int new_height = std::min(static_cast<int>(std::ceil(src.ny * scale)), target_resolution.height);
|
int new_width, new_height;
|
||||||
|
if (padding == PAD_NEAREST) {
|
||||||
|
new_width = std::min(static_cast<int>(std::round(src.nx * scale)), target_resolution.width);
|
||||||
|
new_height = std::min(static_cast<int>(std::round(src.ny * scale)), target_resolution.height);
|
||||||
|
} else {
|
||||||
|
new_width = std::min(static_cast<int>(std::ceil(src.nx * scale)), target_resolution.width);
|
||||||
|
new_height = std::min(static_cast<int>(std::ceil(src.ny * scale)), target_resolution.height);
|
||||||
|
}
|
||||||
|
|
||||||
switch (algo) {
|
switch (algo) {
|
||||||
case RESIZE_ALGO_BILINEAR:
|
case RESIZE_ALGO_BILINEAR:
|
||||||
@@ -91,9 +98,14 @@ struct img_tool {
|
|||||||
// fill dst with pad_color
|
// fill dst with pad_color
|
||||||
fill(dst, pad_color);
|
fill(dst, pad_color);
|
||||||
|
|
||||||
int offset_x = (target_resolution.width - new_width) / 2;
|
int offset_x, offset_y;
|
||||||
int offset_y = (target_resolution.height - new_height) / 2;
|
if (padding == PAD_NEAREST) {
|
||||||
|
offset_x = static_cast<int>(std::round((target_resolution.width - new_width) / 2.0f));
|
||||||
|
offset_y = static_cast<int>(std::round((target_resolution.height - new_height) / 2.0f));
|
||||||
|
} else {
|
||||||
|
offset_x = (target_resolution.width - new_width) / 2;
|
||||||
|
offset_y = (target_resolution.height - new_height) / 2;
|
||||||
|
}
|
||||||
composite(dst, resized_image, offset_x, offset_y);
|
composite(dst, resized_image, offset_x, offset_y);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -356,10 +368,10 @@ private:
|
|||||||
GGML_ASSERT(inSize > 0 && outSize > 0);
|
GGML_ASSERT(inSize > 0 && outSize > 0);
|
||||||
double support, scale, filterscale;
|
double support, scale, filterscale;
|
||||||
double center, ww, ss;
|
double center, ww, ss;
|
||||||
int xx, x, ksize, xmin, xmax, xcnt;
|
int xx, x, ksize, xmin, xmax;
|
||||||
|
|
||||||
// Calculate scaling factor: ratio of input range to output size
|
// Calculate scaling factor: ratio of input range to output size
|
||||||
filterscale = scale = (double)inSize / outSize;
|
filterscale = scale = static_cast<double>(inSize) / outSize;
|
||||||
// For upsampling (scale < 1), keep filterscale = 1 to maintain filter sharpness
|
// For upsampling (scale < 1), keep filterscale = 1 to maintain filter sharpness
|
||||||
// For downsampling (scale > 1), widen filter to prevent aliasing
|
// For downsampling (scale > 1), widen filter to prevent aliasing
|
||||||
if (filterscale < 1.0) {
|
if (filterscale < 1.0) {
|
||||||
@@ -373,6 +385,7 @@ private:
|
|||||||
std::vector<double> pre_weights(outSize * ksize); // Temporary weights
|
std::vector<double> pre_weights(outSize * ksize); // Temporary weights
|
||||||
bounds.resize(outSize * 2);
|
bounds.resize(outSize * 2);
|
||||||
|
|
||||||
|
|
||||||
// For each output pixel, compute its filter coefficients
|
// For each output pixel, compute its filter coefficients
|
||||||
for (xx = 0; xx < outSize; xx++) {
|
for (xx = 0; xx < outSize; xx++) {
|
||||||
// Calculate the center position in input space (pixel-center convention: +0.5)
|
// Calculate the center position in input space (pixel-center convention: +0.5)
|
||||||
@@ -391,10 +404,10 @@ private:
|
|||||||
xmax = inSize;
|
xmax = inSize;
|
||||||
}
|
}
|
||||||
|
|
||||||
xcnt = xmax - xmin;
|
xmax -= xmin;
|
||||||
|
|
||||||
// Compute filter weights for each contributing input pixel
|
// Compute filter weights for each contributing input pixel
|
||||||
for (x = 0; x < xcnt; x++) {
|
for (x = 0; x < xmax; x++) {
|
||||||
// Distance from input pixel center to output pixel center in input space
|
// Distance from input pixel center to output pixel center in input space
|
||||||
double w = bicubic_filter((x + xmin - center + 0.5) * ss);
|
double w = bicubic_filter((x + xmin - center + 0.5) * ss);
|
||||||
pre_weights[xx * ksize + x] = w;
|
pre_weights[xx * ksize + x] = w;
|
||||||
@@ -402,7 +415,7 @@ private:
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Normalize weights to sum to 1.0 (preserves brightness)
|
// Normalize weights to sum to 1.0 (preserves brightness)
|
||||||
for (x = 0; x < xcnt; x++) {
|
for (x = 0; x < xmax; x++) {
|
||||||
if (ww != 0.0) {
|
if (ww != 0.0) {
|
||||||
pre_weights[xx * ksize + x] /= ww;
|
pre_weights[xx * ksize + x] /= ww;
|
||||||
}
|
}
|
||||||
@@ -415,18 +428,27 @@ private:
|
|||||||
|
|
||||||
// Store input pixel range for this output pixel
|
// Store input pixel range for this output pixel
|
||||||
bounds[xx * 2 + 0] = xmin;
|
bounds[xx * 2 + 0] = xmin;
|
||||||
bounds[xx * 2 + 1] = xcnt;
|
bounds[xx * 2 + 1] = xmax;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Convert floating-point coefficients to fixed-point integers
|
// Convert floating-point coefficients to fixed-point integers
|
||||||
// Formula: int32 = round(float * 2^PRECISION_BITS)
|
// Formula: int32 = round(float * 2^PRECISION_BITS)
|
||||||
weights.resize(outSize * ksize);
|
weights.resize(outSize * ksize);
|
||||||
|
|
||||||
|
const double fxp_scale = std::ldexp(1.0, PRECISION_BITS); // 1.0 * 2^PRECISION_BITS
|
||||||
|
|
||||||
for (int i = 0; i < outSize * ksize; i++) {
|
for (int i = 0; i < outSize * ksize; i++) {
|
||||||
|
double tmp_val = pre_weights[i] * fxp_scale;
|
||||||
if (pre_weights[i] < 0) {
|
if (pre_weights[i] < 0) {
|
||||||
weights[i] = static_cast<int32_t>(-0.5 + pre_weights[i] * (1 << PRECISION_BITS));
|
tmp_val -= 0.5;
|
||||||
} else {
|
} else {
|
||||||
weights[i] = static_cast<int32_t>(0.5 + pre_weights[i] * (1 << PRECISION_BITS));
|
tmp_val += 0.5;
|
||||||
}
|
}
|
||||||
|
tmp_val = std::round(tmp_val);
|
||||||
|
tmp_val = std::clamp(tmp_val,
|
||||||
|
static_cast<double>(std::numeric_limits<int32_t>::min()),
|
||||||
|
static_cast<double>(std::numeric_limits<int32_t>::max()));
|
||||||
|
weights[i] = static_cast<int32_t>(tmp_val);
|
||||||
}
|
}
|
||||||
|
|
||||||
return ksize;
|
return ksize;
|
||||||
@@ -1083,35 +1105,31 @@ bool mtmd_image_preprocessor_internvl::preprocess(const clip_image_u8 & img, cli
|
|||||||
//
|
//
|
||||||
|
|
||||||
bool mtmd_image_preprocessor_deepseekocr::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
|
bool mtmd_image_preprocessor_deepseekocr::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
|
||||||
const std::vector native_resolutions = {
|
static constexpr int native_resolutions[] = { 1024 /* base */, 1280 /* large */ };
|
||||||
/*512 tiny , 640 small, */ 1024 /* base */, 1280 /* large */
|
// TODO: support 512 (tiny) and 640 (small) once we have eval data for them
|
||||||
};
|
|
||||||
// original image size
|
|
||||||
const clip_image_size original_size{img.nx, img.ny};
|
|
||||||
const int orig_w = original_size.width;
|
|
||||||
const int orig_h = original_size.height;
|
|
||||||
const int orig_area = orig_h * orig_w;
|
|
||||||
|
|
||||||
size_t mode_i = 0;
|
const int64_t orig_area = static_cast<int64_t>(img.nx) * img.ny;
|
||||||
int min_diff = orig_area;
|
|
||||||
|
|
||||||
for (size_t i = 0; i < native_resolutions.size(); i++) {
|
size_t mode_i = 0;
|
||||||
int r = native_resolutions[i];
|
int64_t min_diff = std::numeric_limits<int64_t>::max();
|
||||||
if (std::abs(orig_area - r * r) < min_diff) {
|
for (size_t i = 0; i < std::size(native_resolutions); i++) {
|
||||||
mode_i = i;
|
const int64_t r = native_resolutions[i];
|
||||||
min_diff = std::abs(orig_area - r * r);
|
const int64_t diff = std::abs(orig_area - r * r);
|
||||||
|
if (diff < min_diff) {
|
||||||
|
mode_i = i;
|
||||||
|
min_diff = diff;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Native Resolution (Base/Large) */
|
|
||||||
const int image_size = native_resolutions[mode_i];
|
const int image_size = native_resolutions[mode_i];
|
||||||
|
|
||||||
// scaled and padded image
|
// Aspect-preserving fit-and-pad. Pillow bicubic + PAD_NEAREST for
|
||||||
clip_image_u8_ptr scaled_img(clip_image_u8_init());
|
// byte-parity with the upstream deepseek-ai/DeepSeek-OCR HF preprocessor.
|
||||||
img_tool::resize(img, *scaled_img, clip_image_size{image_size, image_size}, hparams.image_resize_algo);
|
clip_image_u8 padded;
|
||||||
|
img_tool::resize(img, padded, {image_size, image_size}, RESIZE_ALGO_BICUBIC_PILLOW,
|
||||||
|
PAD_NEAREST, hparams.image_pad_color);
|
||||||
|
|
||||||
clip_image_f32_ptr res(clip_image_f32_init());
|
clip_image_f32_ptr res(clip_image_f32_init());
|
||||||
img_u8_to_f32(*scaled_img, *res, hparams.image_mean, hparams.image_std);
|
img_u8_to_f32(padded, *res, hparams.image_mean, hparams.image_std);
|
||||||
output.entries.push_back(std::move(res));
|
output.entries.push_back(std::move(res));
|
||||||
|
|
||||||
output.grid_x = 1;
|
output.grid_x = 1;
|
||||||
@@ -1246,7 +1264,7 @@ clip_image_u8 mtmd_image_preprocessor_step3vl::prepare_image(const clip_image_u8
|
|||||||
std::max(1, static_cast<int>(std::floor(resized.ny * scale))),
|
std::max(1, static_cast<int>(std::floor(resized.ny * scale))),
|
||||||
};
|
};
|
||||||
clip_image_u8 scaled;
|
clip_image_u8 scaled;
|
||||||
img_tool::resize(resized, scaled, new_size, RESIZE_ALGO_BILINEAR, false);
|
img_tool::resize(resized, scaled, new_size, RESIZE_ALGO_BILINEAR, PAD_NONE);
|
||||||
resized = std::move(scaled);
|
resized = std::move(scaled);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1347,7 +1365,7 @@ bool mtmd_image_preprocessor_step3vl::preprocess(const clip_image_u8 & img, clip
|
|||||||
clip_image_u8 img_for_crop = prepared;
|
clip_image_u8 img_for_crop = prepared;
|
||||||
if (instructions.refined_size.width != prepared.nx || instructions.refined_size.height != prepared.ny) {
|
if (instructions.refined_size.width != prepared.nx || instructions.refined_size.height != prepared.ny) {
|
||||||
clip_image_u8 refined;
|
clip_image_u8 refined;
|
||||||
img_tool::resize(prepared, refined, instructions.refined_size, RESIZE_ALGO_BILINEAR, false);
|
img_tool::resize(prepared, refined, instructions.refined_size, RESIZE_ALGO_BILINEAR, PAD_NONE);
|
||||||
img_for_crop = std::move(refined);
|
img_for_crop = std::move(refined);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1,85 +0,0 @@
|
|||||||
<|ref|>title<|/ref|><|det|>[[61, 255, 907, 533]]<|/det|>
|
|
||||||
# MEN WALK ON MOON
|
|
||||||
ASTRONAUTS LAND ON PLAIN;
|
|
||||||
COLLECT ROCKS, PLANT FLAG
|
|
||||||
|
|
||||||
<|ref|>text<|/ref|><|det|>[[56, 559, 268, 629]]<|/det|>
|
|
||||||
Voice From Moon:
|
|
||||||
Eagle Has Landed'
|
|
||||||
|
|
||||||
<|ref|>text<|/ref|><|det|>[[74, 645, 262, 675]]<|/det|>
|
|
||||||
EAGLE (the lunar surface, Houston, Truesquily)
|
|
||||||
Base here, The Eagle has landed.
|
|
||||||
|
|
||||||
<|ref|>text<|/ref|><|det|>[[74, 675, 262, 720]]<|/det|>
|
|
||||||
BOOTHROOM: Lounge, Truesquily, we enjoy you on the ground. You've got a bunch of guys about to toss bikes. We're breaking again. Thanks a lot.
|
|
||||||
|
|
||||||
<|ref|>text<|/ref|><|det|>[[74, 720, 262, 750]]<|/det|>
|
|
||||||
TRAVELLING MADE: Time you. BOOTHROOM: You're looking good here.
|
|
||||||
|
|
||||||
<|ref|>text<|/ref|><|det|>[[74, 750, 262, 780]]<|/det|>
|
|
||||||
TRAVELLING MADE: A very smooth touchdown. BEDROOM: Eagle, you are very far. I'll. (The first sign in the lunar appearance) (Over.)
|
|
||||||
|
|
||||||
<|ref|>text<|/ref|><|det|>[[74, 780, 262, 810]]<|/det|>
|
|
||||||
TRAVELLING MADE: Eagle, stay for I'll. BOOTHROOM: Bumper and we are you waiting the cue.
|
|
||||||
|
|
||||||
<|ref|>text<|/ref|><|det|>[[74, 810, 262, 830]]<|/det|>
|
|
||||||
TRAVELLING MADE: Eagle, and service mobility.
|
|
||||||
|
|
||||||
<|ref|>text<|/ref|><|det|>[[74, 830, 262, 850]]<|/det|>
|
|
||||||
How do you read me?
|
|
||||||
|
|
||||||
<|ref|>text<|/ref|><|det|>[[74, 850, 262, 880]]<|/det|>
|
|
||||||
TRAVELLING COLUMBIA, he has landed Truesquily. Base, Eagle is at Truesquily. I read you first by. Over.
|
|
||||||
|
|
||||||
<|ref|>text<|/ref|><|det|>[[74, 880, 262, 900]]<|/det|>
|
|
||||||
COLUMBIA: Yes, I heard the whole thing.
|
|
||||||
|
|
||||||
<|ref|>text<|/ref|><|det|>[[74, 900, 262, 920]]<|/det|>
|
|
||||||
BOOTHROOM: Well, it's a good show.
|
|
||||||
|
|
||||||
<|ref|>text<|/ref|><|det|>[[74, 920, 262, 940]]<|/det|>
|
|
||||||
COLUMBIA: Fantastic.
|
|
||||||
|
|
||||||
<|ref|>text<|/ref|><|det|>[[74, 940, 262, 960]]<|/det|>
|
|
||||||
TRAVELLING MADE: I'll read that.
|
|
||||||
|
|
||||||
<|ref|>text<|/ref|><|det|>[[74, 960, 262, 980]]<|/det|>
|
|
||||||
APOLLO CONTROL: The most major sky to sky will be for the 23 event, that is at 21 minutes 26 sec-
|
|
||||||
|
|
||||||
<|ref|>text<|/ref|><|det|>[[74, 980, 262, 990]]<|/det|>
|
|
||||||
tion of lunar descent.
|
|
||||||
|
|
||||||
<|ref|>image<|/ref|><|det|>[[270, 545, 697, 990]]<|/det|>
|
|
||||||
|
|
||||||
|
|
||||||
<|ref|>text<|/ref|><|det|>[[715, 559, 911, 629]]<|/det|>
|
|
||||||
A Powdery Surface
|
|
||||||
Is Closely Explored
|
|
||||||
|
|
||||||
<|ref|>text<|/ref|><|det|>[[733, 645, 851, 665]]<|/det|>
|
|
||||||
BY JOHN NOBLE WILFORD
|
|
||||||
|
|
||||||
<|ref|>text<|/ref|><|det|>[[715, 669, 911, 700]]<|/det|>
|
|
||||||
HOUSTON, Monday, July 21—New hires landed and walked on the moon.
|
|
||||||
|
|
||||||
<|ref|>text<|/ref|><|det|>[[715, 700, 911, 750]]<|/det|>
|
|
||||||
Two Americans, astronauts of Apollo 11, steered their Eagle-shaped lunar module safely and smoothly to the lunar landing yesterday at 4:17:40 P.M., Eastern day-light time.
|
|
||||||
|
|
||||||
<|ref|>text<|/ref|><|det|>[[715, 750, 911, 780]]<|/det|>
|
|
||||||
Neil A. Armstrong, the 38-year-old civilian commander, radioed to earth and the landing team here.
|
|
||||||
|
|
||||||
<|ref|>text<|/ref|><|det|>[[715, 780, 911, 830]]<|/det|>
|
|
||||||
"Boom, Truesquily! Base here. The Eagle has landed," the first man to reach the moon—Neil Armstrong and his engineer, Capt. Charles E. Alder, of the Jet Propulsion Laboratory, the space agency's rocket and space program manager.
|
|
||||||
|
|
||||||
<|ref|>text<|/ref|><|det|>[[715, 830, 911, 880]]<|/det|>
|
|
||||||
About six and a half hours later, Mr. Armstrong opened the landing craft's hatch, stepped slowly down the ladder and descended as he pointed his first landing footguard on the lunar crater.
|
|
||||||
|
|
||||||
<|ref|>text<|/ref|><|det|>[[715, 880, 911, 920]]<|/det|>
|
|
||||||
"That's one small step for man, one giant leap for mankind."
|
|
||||||
|
|
||||||
<|ref|>text<|/ref|><|det|>[[715, 920, 911, 960]]<|/det|>
|
|
||||||
His first step on the moon came on 10:56:29 P.M., as a television camera recorded the craft's transmitted his every word to an aerial and excited audiences of hundreds of millions of people on earth.
|
|
||||||
|
|
||||||
<|ref|>text<|/ref|><|det|>[[749, 960, 861, 974]]<|/det|>
|
|
||||||
Testable Slope Test Soil
|
|
||||||
@@ -1,42 +0,0 @@
|
|||||||
MEN WALK ON MOON
|
|
||||||
ASTRONAUTS LAND ON PLAIN;
|
|
||||||
COLLECT ROCKS, PLANT FLAG
|
|
||||||
|
|
||||||
Voice From Moon:
|
|
||||||
'Eagle Has Landed'
|
|
||||||
|
|
||||||
A Powder Surface
|
|
||||||
Is Closely Explored
|
|
||||||
|
|
||||||
By JOHN NOBLE WILFORD
|
|
||||||
NOVEMBER, Monday, July 21—New York Herald and
|
|
||||||
wished on the moon.
|
|
||||||
|
|
||||||
Two American astronauts of Apollo 11, steered their
|
|
||||||
frigate Eagle toward the moon's surface and smoothly to
|
|
||||||
the lunar landing yesterday at 4:17:40 P.M., Eastern day-
|
|
||||||
light time.
|
|
||||||
|
|
||||||
Neil A. Armstrong, the 38-year-old civilian commander,
|
|
||||||
landed on the soft sand of the moon's surface here.
|
|
||||||
|
|
||||||
"Beautiful, Triumph!" he said. "The Eagle has landed."
|
|
||||||
|
|
||||||
The first man to reach the moon—Neil Armstrong and
|
|
||||||
his co-pilot, Charles E. "Pete" Conrad, 26, of the Pentagon,
|
|
||||||
brought their ship to rest on a level, rock-strewn plain near
|
|
||||||
the moon's surface. The two men and two of the three
|
|
||||||
astronauts on board, Armstrong, Conrad and Edwin E.
|
|
||||||
Aldrin, 38, of Houston, stepped slowly down the ladder
|
|
||||||
and descended as he pointed his first full-flaming footpad
|
|
||||||
at the lunar crater.
|
|
||||||
|
|
||||||
"That's one small step for man, one giant leap for
|
|
||||||
mankind."
|
|
||||||
|
|
||||||
His first step on the moon came at 10:56:20 P.M., as
|
|
||||||
a television camera rolled the earth's thousandth line every
|
|
||||||
second to an aerial and studied audiences of hundreds of
|
|
||||||
millions of people on earth.
|
|
||||||
|
|
||||||
Textile Slope Test Soil
|
|
||||||
@@ -0,0 +1,24 @@
|
|||||||
|
|
||||||
|
A Powdery Surface
|
||||||
|
Is Closely Explored
|
||||||
|
|
||||||
|
By JOHN NOBLE WILFORD
|
||||||
|
Special to The New York Times
|
||||||
|
|
||||||
|
HOUSTON, Monday, July 21—Men have landed and walked on the moon.
|
||||||
|
|
||||||
|
Two Americans, astronauts of Apollo 11, steered their fragile four-legged lunar module safely and smoothly to the historic landing yesterday at 4:17:40 P.M., Eastern daylight time.
|
||||||
|
|
||||||
|
Neil A. Armstrong, the 38-year-old civilian commander, radioed to earth and the mission control room here:
|
||||||
|
|
||||||
|
"Houston, Tranquility Base here. The Eagle has landed."
|
||||||
|
|
||||||
|
The first men to reach the moon—Mr. Armstrong and his co-pilot, Col. Edwin E. Aldrin Jr. of the Air Force—brought their ship to rest on a level, rock-strewn plain near the southwestern shore of the arid Sea of Tranquility.
|
||||||
|
|
||||||
|
About six and a half hours later, Mr. Armstrong opened the landing craft's hatch, stepped slowly down the ladder and declared as he planted the first human footprint on the lunar crust:
|
||||||
|
|
||||||
|
"That's one small step for man, one giant leap for mankind."
|
||||||
|
|
||||||
|
His first step on the moon came at 10:56:20 P.M., as a television camera outside the craft transmitted his every move to an awed and excited audience of hundreds of millions of people on earth.
|
||||||
|
|
||||||
|
Tentative Steps Test Soil
|
||||||
@@ -1,186 +1,220 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
"""
|
"""
|
||||||
Test script to compare llama.cpp mtmd-cli output with HuggingFace reference implementation
|
Evaluates llama.cpp's DeepSeek-OCR by comparing its output for a test
|
||||||
for DeepSeek-OCR model using embedding similarity.
|
image to the actual text in part of that image.
|
||||||
|
|
||||||
|
Runs the test image through mtmd-cli, calculates CER and chrF for
|
||||||
|
its output, and holds them against the HF model's scores.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
|
import logging
|
||||||
import subprocess
|
import subprocess
|
||||||
import sys
|
import sys
|
||||||
|
import unicodedata
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from sentence_transformers import SentenceTransformer
|
logger = logging.getLogger("deepseek-ocr-test")
|
||||||
from sentence_transformers import util
|
|
||||||
|
DEFAULT_IMAGE = "test-1.jpeg"
|
||||||
|
DEFAULT_EXPECTED_TEXT = "test-1-ground-truth.txt"
|
||||||
|
RUN_TIMEOUT = 300
|
||||||
|
|
||||||
|
# DeepSeek-OCR reference scores on the test image.
|
||||||
|
# This is the baseline the implementation should keep up with.
|
||||||
|
HF_REFERENCE_CER = 0.3030
|
||||||
|
HF_REFERENCE_CHRF = 67.52
|
||||||
|
|
||||||
|
CER_TOLERANCE = 0.02
|
||||||
|
CHRF_TOLERANCE = 2.0
|
||||||
|
|
||||||
|
CER_MAX = HF_REFERENCE_CER + CER_TOLERANCE
|
||||||
|
CHRF_MIN = HF_REFERENCE_CHRF - CHRF_TOLERANCE
|
||||||
|
|
||||||
|
|
||||||
def run_mtmd_deepseek_ocr(
|
def verdict(ok: bool) -> str:
|
||||||
model_path: str,
|
return "PASS" if ok else "FAIL"
|
||||||
mmproj_path: str,
|
|
||||||
image_path: str,
|
|
||||||
bin_path: str,
|
def normalize_text(text: str) -> str:
|
||||||
prompt: str = "Free OCR."
|
"""NFC-normalize and collapse whitespace, so line-wrap and spacing
|
||||||
) -> str:
|
don't count as CER errors."""
|
||||||
|
return " ".join(unicodedata.normalize("NFC", text).split())
|
||||||
|
|
||||||
|
|
||||||
|
def locally_align(expected: str, ocr_out: str) -> str:
|
||||||
|
"""Return the span of `ocr_out` that best matches `expected`.
|
||||||
|
|
||||||
|
The ground truth covers part of the article body.
|
||||||
|
But the test image includes half of the newspaper's front page.
|
||||||
|
Fuzzy partial-ratio matching picks out
|
||||||
|
the body so the unrelated text doesn't disturb CER / chrF.
|
||||||
"""
|
"""
|
||||||
Run inference using llama.cpp mtmd-cli.
|
from rapidfuzz import fuzz
|
||||||
|
alignment = fuzz.partial_ratio_alignment(expected, ocr_out)
|
||||||
|
if alignment is None or alignment.dest_end <= alignment.dest_start:
|
||||||
|
return ocr_out
|
||||||
|
return ocr_out[alignment.dest_start:alignment.dest_end]
|
||||||
|
|
||||||
|
|
||||||
|
def compute_cer(expected: str, ocr_out: str) -> float:
|
||||||
|
"""Character Error Rate. Lower is better.
|
||||||
|
CER: fraction of characters you'd insert/delete/substitute to fix the output; 0 = perfect."""
|
||||||
|
import jiwer
|
||||||
|
return jiwer.cer(expected, ocr_out)
|
||||||
|
|
||||||
|
|
||||||
|
def compute_chrf(expected: str, ocr_out: str) -> float:
|
||||||
|
"""chrF score on 0-100. Higher is better.
|
||||||
|
chrF: F-score over shared character n-grams; more forgiving of small word/spacing drift than CER.
|
||||||
"""
|
"""
|
||||||
|
from sacrebleu.metrics import CHRF
|
||||||
|
return CHRF().sentence_score(ocr_out, [expected]).score
|
||||||
|
|
||||||
|
|
||||||
|
def run_mtmd_cli(model_path, mmproj_path, image_path, bin_path) -> str:
|
||||||
|
"""Run mtmd-cli on the image and return its output."""
|
||||||
cmd = [
|
cmd = [
|
||||||
bin_path,
|
str(bin_path),
|
||||||
"-m", model_path,
|
"-m", str(model_path),
|
||||||
"--mmproj", mmproj_path,
|
"--mmproj", str(mmproj_path),
|
||||||
"--image", image_path,
|
"--image", str(image_path),
|
||||||
# "-p", "<|grounding|>Convert the document to markdown.",
|
"-p", "Free OCR. ",
|
||||||
"-p", prompt,
|
|
||||||
"--chat-template", "deepseek-ocr",
|
"--chat-template", "deepseek-ocr",
|
||||||
"--temp", "0",
|
"--temp", "0",
|
||||||
"-n", "1024",
|
"--flash-attn", "off", # match the HF "eager" attention reference
|
||||||
# "--verbose"
|
"--no-warmup",
|
||||||
]
|
]
|
||||||
|
logger.debug(f" command: {' '.join(cmd)}")
|
||||||
|
|
||||||
print(f"Running llama.cpp command: {' '.join(cmd)}")
|
try:
|
||||||
|
result = subprocess.run(cmd, capture_output=True, text=False, timeout=RUN_TIMEOUT)
|
||||||
result = subprocess.run(
|
except subprocess.TimeoutExpired as e:
|
||||||
cmd,
|
if e.stderr:
|
||||||
capture_output=True,
|
logger.error("llama.cpp stderr:\n%s", e.stderr.decode("utf-8", errors="replace"))
|
||||||
text=False,
|
raise RuntimeError(f"llama-mtmd-cli timed out after {RUN_TIMEOUT}s")
|
||||||
timeout=300
|
|
||||||
)
|
|
||||||
|
|
||||||
if result.returncode != 0:
|
if result.returncode != 0:
|
||||||
stderr = result.stderr.decode('utf-8', errors='replace')
|
logger.error("llama.cpp stderr:\n%s", result.stderr.decode("utf-8", errors="replace"))
|
||||||
print(f"llama.cpp stderr: {stderr}")
|
|
||||||
raise RuntimeError(f"llama-mtmd-cli failed with code {result.returncode}")
|
raise RuntimeError(f"llama-mtmd-cli failed with code {result.returncode}")
|
||||||
|
|
||||||
output = result.stdout.decode('utf-8', errors='replace').strip()
|
output = result.stdout.decode("utf-8", errors="replace").strip()
|
||||||
print(f"llama.cpp output length: {len(output)} chars")
|
if not output:
|
||||||
|
raise RuntimeError("llama-mtmd-cli produced no output on stdout")
|
||||||
|
logger.info(f" output: {len(output)} chars")
|
||||||
return output
|
return output
|
||||||
|
|
||||||
|
|
||||||
def compute_embedding_similarity(text1: str, text2: str, model_name: str) -> float:
|
def read_expected_text(file_path: Path) -> str:
|
||||||
"""
|
with open(file_path, "r", encoding="utf-8") as f:
|
||||||
Compute cosine similarity between two texts using embedding model.
|
|
||||||
"""
|
|
||||||
print(f"Loading embedding model: {model_name}")
|
|
||||||
|
|
||||||
# Use sentence-transformers for easier embedding extraction
|
|
||||||
embed_model = SentenceTransformer(model_name)
|
|
||||||
|
|
||||||
print("Computing embeddings...")
|
|
||||||
embeddings = embed_model.encode([text1, text2], convert_to_numpy=True)
|
|
||||||
|
|
||||||
similarity = util.similarity.cos_sim([embeddings[0]], [embeddings[1]])[0][0]
|
|
||||||
return float(similarity)
|
|
||||||
|
|
||||||
|
|
||||||
def read_expected_output(file_path: str) -> str:
|
|
||||||
"""
|
|
||||||
Read expected OCR output from file.
|
|
||||||
"""
|
|
||||||
cur_path = Path(__file__).parent
|
|
||||||
expected_path = str(cur_path / file_path)
|
|
||||||
with open(expected_path, "r", encoding="utf-8") as f:
|
|
||||||
return f.read().strip()
|
return f.read().strip()
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def evaluate(expected: str, ocr_out: str) -> bool:
|
||||||
ap = argparse.ArgumentParser(description="Compare llama.cpp and HuggingFace DeepSeek-OCR outputs")
|
expected = normalize_text(expected)
|
||||||
ap.add_argument("--llama-model", default="gguf_models/deepseek-ai/deepseek-ocr-f16.gguf",
|
ocr_out = normalize_text(ocr_out)
|
||||||
help="Path to llama.cpp GGUF model")
|
aligned = locally_align(expected, ocr_out)
|
||||||
ap.add_argument("--mmproj", default="gguf_models/deepseek-ai/mmproj-deepseek-ocr-f16.gguf",
|
|
||||||
help="Path to mmproj GGUF file")
|
logger.debug(f"\n--- expected (normalized) ---\n{expected}")
|
||||||
ap.add_argument("--image", default="test-1.jpeg",
|
logger.debug(f"\n--- OCR output (normalized) ---\n{ocr_out}")
|
||||||
help="Path to test image")
|
logger.debug(f"\n--- aligned span ---\n{aligned}")
|
||||||
|
|
||||||
|
cer = compute_cer(expected, aligned)
|
||||||
|
chrf = compute_chrf(expected, aligned)
|
||||||
|
|
||||||
|
cer_pass = cer <= CER_MAX
|
||||||
|
chrf_pass = chrf >= CHRF_MIN
|
||||||
|
passed = cer_pass and chrf_pass
|
||||||
|
|
||||||
|
logger.info("")
|
||||||
|
logger.info("=" * 60)
|
||||||
|
logger.info("Free OCR evaluation:")
|
||||||
|
logger.info("=" * 60)
|
||||||
|
logger.info(f" CER {cer:>7.4f} (<= {CER_MAX:>7.4f} -> {verdict(cer_pass)})")
|
||||||
|
logger.info(f" chrF (0-100) {chrf:>7.2f} (>= {CHRF_MIN:>7.2f} -> {verdict(chrf_pass)})")
|
||||||
|
logger.info(f" Expected chars {len(expected):>7}")
|
||||||
|
logger.info(f" Aligned chars {len(aligned):>7} (of {len(ocr_out)} OCR chars)")
|
||||||
|
logger.info("")
|
||||||
|
logger.info(f" Result: {verdict(passed)}")
|
||||||
|
logger.info("=" * 60)
|
||||||
|
return passed
|
||||||
|
|
||||||
|
|
||||||
|
def argument_parser() -> argparse.ArgumentParser:
|
||||||
|
ap = argparse.ArgumentParser(description="Compare llama.cpp DeepSeek-OCR output with a ground-truth transcript")
|
||||||
|
ap.add_argument("--llama-model", default="gguf_models/deepseek-ai/deepseek-ocr-bf16.gguf",
|
||||||
|
help="Path to llama.cpp GGUF model (relative to repo root or absolute)")
|
||||||
|
ap.add_argument("--mmproj", default="gguf_models/deepseek-ai/mmproj-deepseek-ocr-bf16.gguf",
|
||||||
|
help="Path to mmproj GGUF file (relative to repo root or absolute)")
|
||||||
ap.add_argument("--llama-bin", default="build/bin/llama-mtmd-cli",
|
ap.add_argument("--llama-bin", default="build/bin/llama-mtmd-cli",
|
||||||
help="Path to llama-mtmd-cli binary")
|
help="Path to llama-mtmd-cli binary (relative to repo root or absolute)")
|
||||||
ap.add_argument("--embedding-model", default="Qwen/Qwen3-Embedding-0.6B",
|
ap.add_argument("--verbose", action="store_true",
|
||||||
help="Embedding model for similarity computation")
|
help="Also log the expected, OCR, and aligned text")
|
||||||
ap.add_argument("--threshold", type=float, default=0.7,
|
return ap
|
||||||
help="Minimum similarity threshold for pass")
|
|
||||||
args = ap.parse_args()
|
|
||||||
|
|
||||||
# Validate paths
|
|
||||||
# script directory + image
|
|
||||||
mtmd_dir = Path(__file__).parent.parent
|
|
||||||
args.image = str(mtmd_dir / args.image)
|
|
||||||
# project directory + llama model
|
|
||||||
args.llama_model = str(mtmd_dir.parent.parent / args.llama_model)
|
|
||||||
# project directory + mmproj
|
|
||||||
args.mmproj = str(mtmd_dir.parent.parent / args.mmproj)
|
|
||||||
args.llama_bin = str(mtmd_dir.parent.parent / args.llama_bin)
|
|
||||||
if not Path(args.image).exists():
|
|
||||||
print(f"Error: Image not found: {args.image}")
|
|
||||||
sys.exit(1)
|
|
||||||
if not Path(args.llama_model).exists():
|
|
||||||
print(f"Error: Model not found: {args.llama_model}")
|
|
||||||
sys.exit(1)
|
|
||||||
if not Path(args.mmproj).exists():
|
|
||||||
print(f"Error: mmproj not found: {args.mmproj}")
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
print("=" * 60)
|
def configure_logging(verbose: bool) -> None:
|
||||||
print("DeepSeek-OCR: llama.cpp vs HuggingFace Comparison")
|
logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO,
|
||||||
print("=" * 60)
|
format="%(message)s")
|
||||||
|
|
||||||
# Default paths based on your command
|
|
||||||
|
|
||||||
# Run llama.cpp inference
|
def resolve_path(path: str, base: Path) -> Path:
|
||||||
print("\n[2/3] Running llama.cpp implementation...")
|
p = Path(path)
|
||||||
llama_free_ocr = run_mtmd_deepseek_ocr(
|
return p if p.is_absolute() else base / p
|
||||||
args.llama_model,
|
|
||||||
args.mmproj,
|
|
||||||
args.image,
|
|
||||||
args.llama_bin
|
|
||||||
)
|
|
||||||
|
|
||||||
llama_md_ocr = run_mtmd_deepseek_ocr(
|
|
||||||
args.llama_model,
|
|
||||||
args.mmproj,
|
|
||||||
args.image,
|
|
||||||
args.llama_bin,
|
|
||||||
prompt="<|grounding|>Convert the document to markdown."
|
|
||||||
)
|
|
||||||
|
|
||||||
expected_free_ocr = read_expected_output("test-1-extracted.txt")
|
def main() -> int:
|
||||||
expected_md_ocr = read_expected_output("test-1-extracted.md")
|
args = argument_parser().parse_args()
|
||||||
|
configure_logging(args.verbose)
|
||||||
|
|
||||||
# Compute similarity
|
tests_dir = Path(__file__).parent # tools/mtmd/tests
|
||||||
print("\n[3/3] Computing embedding similarity...")
|
mtmd_dir = tests_dir.parent # tools/mtmd
|
||||||
free_ocr_similarity = compute_embedding_similarity(
|
repo_root = mtmd_dir.parent.parent # repo root
|
||||||
expected_free_ocr,
|
|
||||||
llama_free_ocr,
|
|
||||||
args.embedding_model
|
|
||||||
)
|
|
||||||
|
|
||||||
md_ocr_similarity = compute_embedding_similarity(
|
inputs = [
|
||||||
expected_md_ocr,
|
("image", resolve_path(DEFAULT_IMAGE, mtmd_dir)),
|
||||||
llama_md_ocr,
|
("expected-text", resolve_path(DEFAULT_EXPECTED_TEXT, tests_dir)),
|
||||||
args.embedding_model
|
("model", resolve_path(args.llama_model, repo_root)),
|
||||||
)
|
("mmproj", resolve_path(args.mmproj, repo_root)),
|
||||||
|
("binary", resolve_path(args.llama_bin, repo_root)),
|
||||||
|
]
|
||||||
|
for label, path in inputs:
|
||||||
|
if not path.exists():
|
||||||
|
logger.error(f"Error: {label} not found: {path}")
|
||||||
|
return 1
|
||||||
|
paths = dict(inputs)
|
||||||
|
|
||||||
# Results
|
logger.info("=" * 60)
|
||||||
print("\n" + "=" * 60)
|
logger.info("DeepSeek-OCR: llama.cpp vs ground-truth comparison")
|
||||||
print("RESULTS")
|
logger.info("=" * 60)
|
||||||
print("=" * 60)
|
logger.info(f"HF baselines: CER {HF_REFERENCE_CER:.4f}, chrF {HF_REFERENCE_CHRF:.2f}")
|
||||||
print(f"\nReference Model output:\n{'-' * 40}")
|
logger.info(f"Test thresholds: CER <= {CER_MAX:.4f}, chrF >= {CHRF_MIN:.2f}")
|
||||||
print(expected_free_ocr)
|
|
||||||
print(f"\nDeepSeek-OCR output:\n{'-' * 40}")
|
|
||||||
print(llama_free_ocr)
|
|
||||||
print(f"\n{'=' * 60}")
|
|
||||||
print(f"Cosine Similarity: {free_ocr_similarity:.4f}")
|
|
||||||
print(f"Threshold: {args.threshold}")
|
|
||||||
print(f"Result: {'PASS' if free_ocr_similarity >= args.threshold else 'FAIL'}")
|
|
||||||
print("=" * 60)
|
|
||||||
|
|
||||||
# Markdown OCR results
|
logger.debug("")
|
||||||
print(f"\nReference Model Markdown output:\n{'-' * 40}")
|
logger.debug("Resolved test inputs:")
|
||||||
print(expected_md_ocr)
|
for label, path in inputs:
|
||||||
print(f"\nDeepSeek-OCR Markdown output:\n{'-' * 40}")
|
logger.debug(f" {label:<14} {path}")
|
||||||
print(llama_md_ocr)
|
|
||||||
print(f"\n{'=' * 60}")
|
logger.info("")
|
||||||
print(f"Cosine Similarity (Markdown): {md_ocr_similarity:.4f}")
|
logger.info("[1/3] Running llama.cpp 'Free OCR'")
|
||||||
print(f"Threshold: {args.threshold}")
|
try:
|
||||||
print(f"Result: {'PASS' if md_ocr_similarity >= args.threshold else 'FAIL'}")
|
ocr_out = run_mtmd_cli(paths["model"], paths["mmproj"],
|
||||||
print("=" * 60)
|
paths["image"], paths["binary"])
|
||||||
|
except RuntimeError as e:
|
||||||
|
logger.error(f"Error: {e}")
|
||||||
|
return 1
|
||||||
|
|
||||||
|
logger.info("")
|
||||||
|
logger.info("[2/3] Reading expected output")
|
||||||
|
expected = read_expected_text(paths["expected-text"])
|
||||||
|
logger.info(f" expected: {len(expected)} chars")
|
||||||
|
|
||||||
|
logger.info("")
|
||||||
|
logger.info("[3/3] Computing OCR metrics")
|
||||||
|
ok = evaluate(expected, ocr_out)
|
||||||
|
|
||||||
|
return 0 if ok else 1
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
sys.exit(main())
|
||||||
|
|||||||
@@ -1,5 +1,3 @@
|
|||||||
sentence-transformers
|
jiwer
|
||||||
transformers
|
sacrebleu
|
||||||
tokenizers
|
rapidfuzz
|
||||||
torch
|
|
||||||
torchvision
|
|
||||||
|
|||||||
Reference in New Issue
Block a user