Compare commits

...

1 Commits

Author SHA1 Message Date
Xuan-Son Nguyen f3e1828164 mtmd: llava_uhd should no longer use batch dim (#24732) 2026-06-17 22:40:50 +02:00
3 changed files with 33 additions and 25 deletions
+21
View File
@@ -1105,6 +1105,8 @@ bool mtmd_image_preprocessor_internvl::preprocess(const clip_image_u8 & img, cli
img_u8_to_f32(*imgs[i], *res, hparams.image_mean, hparams.image_std);
output.entries.push_back(std::move(res));
}
output.grid_x = inst.grid_size.width;
output.grid_y = inst.grid_size.height;
return true;
}
@@ -1558,3 +1560,22 @@ bool mtmd_image_preprocessor_youtuvl::preprocess(const clip_image_u8 & img, clip
output.entries.push_back(std::move(img_f32));
return true;
}
// Granite preprocessing: run the llava-uhd preprocessor, then mark which of
// the resulting entries should have a newline row appended.
//
// Returns false if the base preprocessor fails. Otherwise returns true after
// setting add_newline on every entry in output.entries:
//   - exactly one entry (overview only): that entry gets add_newline = true
//   - more than one entry: entries[0] (overview) gets add_newline = false,
//     every following entry (grid tile) gets add_newline = true
// An empty batch is left untouched rather than indexing entries[0].
bool mtmd_image_preprocessor_granite::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
    // call super class preprocessor
    if (!mtmd_image_preprocessor_llava_uhd::preprocess(img, output)) {
        return false;
    }

    const size_t n_entries   = output.entries.size();
    const bool   single_tile = n_entries == 1;
    for (size_t i = 0; i < n_entries; ++i) {
        // the first entry is the overview: it only carries a newline when it
        // is the sole entry; every later entry is a grid tile and always does
        output.entries[i]->add_newline = single_tile || i > 0;
    }
    return true;
}
+6
View File
@@ -197,3 +197,9 @@ struct mtmd_image_preprocessor_youtuvl : mtmd_image_preprocessor {
mtmd_image_preprocessor_youtuvl(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
};
// Granite image preprocessor.
// Reuses the llava_uhd preprocessing and overrides preprocess() so that the
// add_newline flag is also set on the produced entries.
struct mtmd_image_preprocessor_granite : mtmd_image_preprocessor_llava_uhd {
    // forwards the clip context to the llava_uhd base
    mtmd_image_preprocessor_granite(const clip_ctx * ctx)
        : mtmd_image_preprocessor_llava_uhd(ctx) {}

    // img: input image; output: batch that receives the preprocessed entries
    bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
};
+6 -25
View File
@@ -639,7 +639,7 @@ struct mtmd_context {
{
img_beg = "<image>";
img_end = "";
image_preproc = std::make_unique<mtmd_image_preprocessor_llava_uhd>(ctx_v);
image_preproc = std::make_unique<mtmd_image_preprocessor_granite>(ctx_v);
} break;
default:
throw std::runtime_error(string_format("%s: unexpected vision projector type %d\n", __func__, proj));
@@ -1033,7 +1033,10 @@ struct mtmd_tokenizer {
int32_t add_media(std::vector<const mtmd_bitmap *> & bitmaps) {
GGML_ASSERT(!bitmaps.empty());
if (!bitmaps[0]->is_audio) {
// note: only one type of media is supported per call, caller should enforce this
const bool is_vision = !bitmaps[0]->is_audio;
if (is_vision) {
// handle image
if (!ctx->ctx_v) {
@@ -1085,31 +1088,9 @@ struct mtmd_tokenizer {
batch_f32.grid_y = tmp_batch.grid_y;
}
// Annotate llava-next style tiles so clip_n_output_tokens accounts
// for per-tile newline injection.
if (ctx->proj_type_v() == PROJECTOR_TYPE_GRANITE4_VISION) {
if (batch_f32.entries.size() == 1) {
// Single-tile (overview only): append one newline row.
batch_f32.entries[0]->add_newline = true;
} else {
// Multi-tile: overview gets no newline, grid tiles get one.
batch_f32.entries[0]->add_newline = false;
for (size_t i = 1; i < batch_f32.entries.size(); ++i) {
batch_f32.entries[i]->add_newline = true;
}
}
}
// handle llava-uhd style preprocessing
const bool has_tiling_grid = batch_f32.grid_x > 0 && batch_f32.grid_y > 0;
if (
ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_LLAMA4
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_IDEFICS3
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_STEP3VL
|| (ctx->slice_tmpl == MTMD_SLICE_TMPL_LFM2 && has_tiling_grid)
) {
if (has_tiling_grid) {
// [QWEN_VIDEO] we do not support "frame merging" for llava-uhd style, so no batching for now
GGML_ASSERT(bitmaps.size() == 1);