mtmd, server: add "placeholder bitmap" for counting tokens , add */input_tokens API (#23913)

* mtmd: add "placeholder bitmap" for counting tokens w/o preprocessing

* fast path skip preproc for placeholder

* fix build

* correct the api

* add server endpoint + tests

* add object name

* update docs

* add proxy handling

* fix build

* fix audio input path

* use is_placeholder in process_mtmd_prompt()

* nits

* nits (2)

* docs: clarify chat/completions/input_tokens is not official

* fix merge problem
This commit is contained in:
Xuan-Son Nguyen
2026-06-06 11:06:51 +02:00
committed by GitHub
parent 5a69c97439
commit f5c6ae1827
26 changed files with 732 additions and 422 deletions
+139 -8
View File
@@ -4,6 +4,7 @@
#include "gguf.h"
#include "clip.h"
#include <array>
#include <climits>
#include <cstdarg>
#include <cinttypes>
@@ -429,10 +430,68 @@ static projector_type clip_projector_type_from_string(const std::string & str) {
// RGB uint8 image
struct clip_image_u8 {
    // Returns the logical image dimensions as { width = nx, height = ny }.
    clip_image_size get_size() const {
        return { nx, ny };
    }

    // Sets the logical dimensions and adjusts the pixel storage:
    //   is_placeholder == true : the pixel buffer is dropped, only the dimensions are kept
    //   is_placeholder == false: the buffer is resized to nx * ny * 3 bytes (RGB, 1 byte per channel)
    // note: is_placeholder() is derived from the buffer being empty, so a
    //       non-placeholder image with a zero dimension is also reported as a placeholder
    void set_size(clip_image_size size, bool is_placeholder) {
        nx = size.width;
        ny = size.height;
        if (is_placeholder) {
            buf.clear();
        } else {
            buf.resize((size_t) nx * (size_t) ny * 3);
        }
    }

    // Replaces the pixel buffer with a copy of new_buf.
    // The size of new_buf is not checked against the current dimensions;
    // callers are expected to pass n_elements() bytes.
    void cpy_buf(const std::vector<uint8_t> & new_buf) {
        buf = new_buf;
    }

    // Read-only access to the raw RGB bytes.
    // Throws std::runtime_error if this image is a placeholder (no pixel data).
    const std::vector<uint8_t> & get_ro_buf() const {
        if (is_placeholder()) {
            throw std::runtime_error("this clip_image_u8 is a placeholder");
        }
        return buf;
    }

    // note to contributors: NEVER add a get_rw_buf(), it is a DANGEROUS pattern. always use get_pixel / set_pixel for buffer manipulation

    // A placeholder carries dimensions but no pixel data.
    bool is_placeholder() const {
        return buf.empty();
    }

    // Returns the RGB value at (x, y). The coordinates are not bounds-checked.
    std::array<uint8_t, 3> get_pixel(int x, int y) const {
        if (is_placeholder()) {
            // return a dummy value, so that legacy code can still process image without errors
            return { 0, 0, 0 };
        }
        const size_t idx = pixel_offset(x, y);
        return { buf[idx], buf[idx + 1], buf[idx + 2] };
    }

    // Writes the RGB value at (x, y). The coordinates are not bounds-checked.
    void set_pixel(int x, int y, const std::array<uint8_t, 3> & rgb) {
        if (is_placeholder()) {
            return; // no-op
        }
        const size_t idx = pixel_offset(x, y);
        buf[idx]     = rgb[0];
        buf[idx + 1] = rgb[1];
        buf[idx + 2] = rgb[2];
    }

    // Number of pixels implied by the dimensions (also valid for placeholders).
    size_t n_pixels() const {
        return (size_t) nx * (size_t) ny;
    }

    // Number of bytes a fully populated RGB buffer of this size holds.
    size_t n_elements() const {
        return n_pixels() * 3;
    }

private:
    // Byte offset of the first channel of pixel (x, y).
    // Computed in size_t: (y * nx + x) * 3 can exceed INT_MAX for large images.
    size_t pixel_offset(int x, int y) const {
        return ((size_t) y * (size_t) nx + (size_t) x) * 3;
    }

    std::vector<uint8_t> buf;
    int nx = 0;
    int ny = 0;
};
// For images, buf.size() == nx*ny*3
@@ -440,15 +499,87 @@ struct clip_image_u8 {
// For audio, only one channel is used, buf.size() == nx*ny
// nx will be n_frames and ny will be n_mel
struct clip_image_f32 {
    // marks the global view in e.g., DeepSeek-OCR Models
    bool add_viewsep = false;

    // whether a learned newline (or EOI) token should be appended after the image (eg Granite4 Vision)
    bool add_newline = false;

    // Returns the logical dimensions as { width = nx, height = ny }.
    clip_image_size get_size() const {
        return { nx_, ny_ };
    }

    int nx() const { return nx_; }
    int ny() const { return ny_; }

    // Sets the logical dimensions and adjusts the storage:
    //   is_placeholder == true : the buffer is dropped, only the dimensions are kept
    //   is_placeholder == false: the buffer is resized to nx * ny floats for audio,
    //                            or nx * ny * 3 floats for (RGB) images
    // The is_audio flag is remembered so that n_elements() matches the buffer layout.
    void set_size(clip_image_size size, bool is_placeholder, bool is_audio) {
        nx_       = size.width;
        ny_       = size.height;
        is_audio_ = is_audio;
        if (is_placeholder) {
            buf.clear();
        } else {
            buf.resize(n_elements());
        }
    }

    // Replaces the buffer with a copy of new_buf.
    // The size of new_buf is not checked against the current dimensions.
    void cpy_buf(const std::vector<float> & new_buf) {
        buf = new_buf;
    }

    // Copies the dimensions of img and converts its bytes to floats in [0, 1].
    // A placeholder input produces a placeholder output (dimensions only).
    void from_u8(const clip_image_u8 & img) {
        const auto size = img.get_size();
        nx_       = size.width;
        ny_       = size.height;
        is_audio_ = false; // clip_image_u8 is always an RGB image
        if (img.is_placeholder()) {
            buf.clear();
            return; // no-op
        }
        const size_t n = img.n_elements();
        buf.resize(n);
        const auto & u8_buf = img.get_ro_buf();
        for (size_t i = 0; i < n; ++i) {
            buf[i] = (float) u8_buf[i] / 255.0f;
        }
    }

    // Number of pixels (images) or frames * mel bins (audio) implied by the dimensions.
    size_t n_pixels() const {
        return (size_t) nx_ * (size_t) ny_;
    }

    // Number of floats a fully populated buffer of this size holds:
    // 3 per pixel for images, 1 per cell for audio.
    size_t n_elements() const {
        return n_pixels() * (is_audio_ ? 1 : 3);
    }

    // Applies (v - mean[c]) / std[c] to every pixel, per RGB channel c.
    // No-op for placeholders. Only meaningful for 3-channel image buffers;
    // throws std::runtime_error when the buffer was sized as audio, since the
    // per-pixel stride of 3 would otherwise read past the end of the buffer.
    void normalize(const float mean[3], const float std[3]) {
        if (is_placeholder()) {
            return; // no-op
        }
        if (is_audio_) {
            throw std::runtime_error("clip_image_f32::normalize() is only valid for image buffers");
        }
        for (size_t i = 0; i < n_pixels(); ++i) {
            buf[i * 3 + 0] = (buf[i * 3 + 0] - mean[0]) / std[0];
            buf[i * 3 + 1] = (buf[i * 3 + 1] - mean[1]) / std[1];
            buf[i * 3 + 2] = (buf[i * 3 + 2] - mean[2]) / std[2];
        }
    }

    // Read-only access to the raw float data.
    // Throws std::runtime_error if this image is a placeholder (no data).
    const std::vector<float> & get_ro_buf() const {
        if (is_placeholder()) {
            throw std::runtime_error("this clip_image_f32 is a placeholder");
        }
        return buf;
    }

    // note to contributors: NEVER add a get_rw_buf(), it is a DANGEROUS pattern

    // A placeholder carries dimensions but no data.
    bool is_placeholder() const {
        return buf.empty();
    }

private:
    std::vector<float> buf;
    int  nx_       = 0;
    int  ny_       = 0;
    bool is_audio_ = false; // set by set_size(); selects 1 vs 3 floats per cell
};
//
+85 -130
View File
@@ -39,12 +39,14 @@ static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::s
}
// PPM header: P6 format, width, height, and max color value
file << "P6\n" << img.nx << " " << img.ny << "\n255\n";
const auto ppm_size = img.get_size();
file << "P6\n" << ppm_size.width << " " << ppm_size.height << "\n255\n";
// Write pixel data
for (size_t i = 0; i < img.buf.size(); i += 3) {
const auto & ppm_buf = img.get_ro_buf();
for (size_t i = 0; i < ppm_buf.size(); i += 3) {
// PPM expects binary data in RGB format, which matches our image buffer
file.write(reinterpret_cast<const char*>(&img.buf[i]), 3);
file.write(reinterpret_cast<const char*>(&ppm_buf[i]), 3);
}
file.close();
@@ -57,9 +59,10 @@ static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string&
return;
}
int fileSize = 54 + 3 * img.nx * img.ny; // File header + info header + pixel data
const auto bmp_size = img.get_size();
int fileSize = 54 + 3 * bmp_size.width * bmp_size.height; // File header + info header + pixel data
int bytesPerPixel = 3;
int widthInBytes = img.nx * bytesPerPixel;
int widthInBytes = bmp_size.width * bytesPerPixel;
int paddingAmount = (4 - (widthInBytes % 4)) % 4;
int stride = widthInBytes + paddingAmount;
@@ -72,7 +75,7 @@ static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string&
};
// Total file size
fileSize = 54 + (stride * img.ny);
fileSize = 54 + (stride * bmp_size.height);
fileHeader[2] = (unsigned char)(fileSize);
fileHeader[3] = (unsigned char)(fileSize >> 8);
fileHeader[4] = (unsigned char)(fileSize >> 16);
@@ -94,14 +97,14 @@ static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string&
};
// Width and height in the information header
infoHeader[4] = (unsigned char)(img.nx);
infoHeader[5] = (unsigned char)(img.nx >> 8);
infoHeader[6] = (unsigned char)(img.nx >> 16);
infoHeader[7] = (unsigned char)(img.nx >> 24);
infoHeader[8] = (unsigned char)(img.ny);
infoHeader[9] = (unsigned char)(img.ny >> 8);
infoHeader[10] = (unsigned char)(img.ny >> 16);
infoHeader[11] = (unsigned char)(img.ny >> 24);
infoHeader[4] = (unsigned char)(bmp_size.width);
infoHeader[5] = (unsigned char)(bmp_size.width >> 8);
infoHeader[6] = (unsigned char)(bmp_size.width >> 16);
infoHeader[7] = (unsigned char)(bmp_size.width >> 24);
infoHeader[8] = (unsigned char)(bmp_size.height);
infoHeader[9] = (unsigned char)(bmp_size.height >> 8);
infoHeader[10] = (unsigned char)(bmp_size.height >> 16);
infoHeader[11] = (unsigned char)(bmp_size.height >> 24);
// Write file headers
file.write(reinterpret_cast<char*>(fileHeader), sizeof(fileHeader));
@@ -109,14 +112,14 @@ static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string&
// Pixel data
std::vector<unsigned char> padding(3, 0); // Max padding size to be added to each row
for (int y = img.ny - 1; y >= 0; --y) { // BMP files are stored bottom-to-top
for (int x = 0; x < img.nx; ++x) {
for (int y = bmp_size.height - 1; y >= 0; --y) { // BMP files are stored bottom-to-top
for (int x = 0; x < bmp_size.width; ++x) {
// Each pixel
size_t pixelIndex = (y * img.nx + x) * 3;
const auto px = img.get_pixel(x, y);
unsigned char pixel[3] = {
img.buf[pixelIndex + 2], // BMP stores pixels in BGR format
img.buf[pixelIndex + 1],
img.buf[pixelIndex]
px[2], // BMP stores pixels in BGR format
px[1],
px[0]
};
file.write(reinterpret_cast<char*>(pixel), 3);
}
@@ -129,12 +132,13 @@ static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string&
// debug function to convert f32 to u8
static void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst) {
    // dst gets the same dimensions as src and a real (non-placeholder) buffer
    dst.set_size(src.get_size(), false);

    // throws if src is a placeholder (see clip_image_f32::get_ro_buf)
    const auto & src_buf = src.get_ro_buf();

    // size and loop bound come from the actual source buffer, so no element is read out of range
    std::vector<uint8_t> dst_buf(src_buf.size());
    for (size_t i = 0; i < src_buf.size(); ++i) {
        // clamp while still in float: converting an out-of-range float (or NaN) to int is undefined.
        // std::max(0.0f, v) yields 0 for NaN; the final cast truncates like the previous int() conversion
        const float scaled = std::min(255.0f, std::max(0.0f, src_buf[i] * 255.0f));
        dst_buf[i] = static_cast<uint8_t>(static_cast<int>(scaled));
    }
    dst.cpy_buf(dst_buf);
}
#endif
@@ -241,8 +245,8 @@ clip_graph::clip_graph(clip_ctx * ctx, const clip_image_f32 & img) :
proj_type(ctx->proj_type()),
img(img),
patch_size(hparams.patch_size),
n_patches_x(img.nx / patch_size),
n_patches_y(img.ny / patch_size),
n_patches_x(img.nx() / patch_size),
n_patches_y(img.ny() / patch_size),
n_patches(n_patches_x * n_patches_y),
n_embd(hparams.n_embd),
n_head(hparams.n_head),
@@ -278,8 +282,8 @@ void clip_graph::cb(ggml_tensor * cur, const char * name, int il) const {
// siglip2 naflex
ggml_tensor * clip_graph::resize_position_embeddings(uint32_t interpolation_mode) {
ggml_tensor * pos_embd = model.position_embeddings;
const int height = img.ny / patch_size;
const int width = img.nx / patch_size;
const int height = img.ny() / patch_size;
const int width = img.nx() / patch_size;
const uint32_t mode = interpolation_mode;
const int n_per_side = (int)std::sqrt(pos_embd->ne[1]);
@@ -523,7 +527,7 @@ ggml_tensor * clip_graph::build_inp() {
}
ggml_tensor * clip_graph::build_inp_raw(int channels) {
ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, img.nx, img.ny, channels);
ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, img.nx(), img.ny(), channels);
ggml_set_name(inp_raw, "inp_raw");
ggml_set_input(inp_raw);
return inp_raw;
@@ -816,8 +820,8 @@ ggml_tensor * clip_graph::build_patch_merge_permute(ggml_tensor * cur, int scale
GGML_ASSERT(scale_factor > 1);
const int n_embd = cur->ne[0];
int width = img.nx / patch_size;
int height = img.ny / patch_size;
int width = img.nx() / patch_size;
int height = img.ny() / patch_size;
// pad width and height to factor
const int64_t pad_width = CLIP_ALIGN(width, scale_factor) - width;
@@ -2805,13 +2809,12 @@ struct clip_model_loader {
clip_image_f32_batch batch;
clip_image_f32_ptr img(clip_image_f32_init());
if (ctx_clip.model.modality == CLIP_MODALITY_VISION) {
img->nx = hparams.warmup_image_size;
img->ny = hparams.warmup_image_size;
LOG_INF("%s: warmup with image size = %d x %d\n", __func__, img->nx, img->ny);
const int sz = hparams.warmup_image_size;
img->set_size({sz, sz}, false, false);
LOG_INF("%s: warmup with image size = %d x %d\n", __func__, sz, sz);
} else {
img->nx = hparams.warmup_audio_size;
img->ny = hparams.n_mel_bins;
LOG_INF("%s: warmup with audio size = %d\n", __func__, img->nx);
img->set_size({hparams.warmup_audio_size, hparams.n_mel_bins}, false, false);
LOG_INF("%s: warmup with audio size = %d\n", __func__, hparams.warmup_audio_size);
}
batch.entries.push_back(std::move(img));
warmup(ctx_clip, batch);
@@ -3108,12 +3111,6 @@ struct clip_image_f32_batch * clip_image_f32_batch_init() {
return new clip_image_f32_batch();
}
// Returns a mutable pointer to the raw pixel buffer of img.
// When nx / ny are non-null, img->nx / img->ny are written through them.
// NOTE(review): reads img->nx, img->ny and img->buf directly, so it requires those
// members to be publicly accessible - confirm against the current clip_image_u8 definition
unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny) {
if (nx) *nx = img->nx;
if (ny) *ny = img->ny;
return img->buf.data();
}
void clip_image_size_free(struct clip_image_size * load_image_size) {
if (load_image_size == nullptr) {
return;
@@ -3134,7 +3131,7 @@ size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int id
LOG_ERR("%s: invalid index %d\n", __func__, idx);
return 0;
}
return batch->entries[idx]->nx;
return batch->entries[idx]->nx();
}
size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx) {
@@ -3142,7 +3139,7 @@ size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int id
LOG_ERR("%s: invalid index %d\n", __func__, idx);
return 0;
}
return batch->entries[idx]->ny;
return batch->entries[idx]->ny();
}
clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx) {
@@ -3153,13 +3150,6 @@ clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batc
return batch->entries[idx].get();
}
// Fills img from a caller-provided pixel array: stores nx / ny, resizes the buffer
// to 3 * nx * ny bytes and copies that many bytes from rgb_pixels.
// NOTE(review): 3 * nx * ny is evaluated in int before the resize; it can overflow
// for very large dimensions - verify callers
void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, clip_image_u8 * img) {
img->nx = nx;
img->ny = ny;
img->buf.resize(3 * nx * ny);
memcpy(img->buf.data(), rgb_pixels, img->buf.size());
}
void clip_free(clip_ctx * ctx) {
if (ctx == nullptr) {
return;
@@ -3167,20 +3157,6 @@ void clip_free(clip_ctx * ctx) {
delete ctx;
}
// deprecated
// Convenience wrapper: forwards to clip_embd_nbytes_by_img() with both the width
// and the height set to hparams.image_size.
size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
const int32_t nx = ctx->model.hparams.image_size;
const int32_t ny = ctx->model.hparams.image_size;
return clip_embd_nbytes_by_img(ctx, nx, ny);
}
// Returns clip_n_output_tokens() for an image of img_w x img_h, multiplied by
// clip_n_mmproj_embd(ctx) and sizeof(float).
// Only the dimensions of the temporary image are set; no pixel data is filled in.
size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h) {
clip_image_f32 img;
img.nx = img_w;
img.ny = img_h;
return clip_n_output_tokens(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float);
}
// Returns the image_size hyper-parameter of the model held by ctx.
int32_t clip_get_image_size(const struct clip_ctx * ctx) {
    const auto & hparams = ctx->model.hparams;
    return hparams.image_size;
}
@@ -3211,9 +3187,9 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 *
case PROJECTOR_TYPE_PADDLEOCR:
case PROJECTOR_TYPE_HUNYUANVL:
case PROJECTOR_TYPE_YOUTUVL:
return (img->nx / params.patch_size) / 2;
return (img->nx() / params.patch_size) / 2;
case PROJECTOR_TYPE_STEP3VL:
return img->nx / (params.patch_size * params.n_merge);
return img->nx() / (params.patch_size * params.n_merge);
default:
break;
}
@@ -3233,9 +3209,9 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 *
case PROJECTOR_TYPE_PADDLEOCR:
case PROJECTOR_TYPE_HUNYUANVL:
case PROJECTOR_TYPE_YOUTUVL:
return (img->ny / params.patch_size) / 2;
return (img->ny() / params.patch_size) / 2;
case PROJECTOR_TYPE_STEP3VL:
return img->ny / (params.patch_size * params.n_merge);
return img->ny() / (params.patch_size * params.n_merge);
default:
break;
}
@@ -3247,7 +3223,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
// for models with fixed size image, the input image is already pre-processed and resized to square
int patch_size = params.patch_size;
int n_patches = (img->nx / patch_size) * (img->ny / patch_size);
int n_patches = (img->nx() / patch_size) * (img->ny() / patch_size);
projector_type proj = ctx->proj_type();
@@ -3313,14 +3289,14 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
case PROJECTOR_TYPE_YOUTUVL:
{
// dynamic size (2 conv, so double patch size)
int x_patch = img->nx / (params.patch_size * 2);
int y_patch = img->ny / (params.patch_size * 2);
int x_patch = img->nx() / (params.patch_size * 2);
int y_patch = img->ny() / (params.patch_size * 2);
n_patches = x_patch * y_patch;
} break;
case PROJECTOR_TYPE_STEP3VL:
{
int x_patch = img->nx / (params.patch_size * params.n_merge);
int y_patch = img->ny / (params.patch_size * params.n_merge);
int x_patch = img->nx() / (params.patch_size * params.n_merge);
int y_patch = img->ny() / (params.patch_size * params.n_merge);
n_patches = x_patch * y_patch;
} break;
case PROJECTOR_TYPE_GEMMA3:
@@ -3347,8 +3323,8 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
{
// dynamic size
int out_patch_size = params.patch_size * ctx->model.hparams.n_merge;
int x_patch = CLIP_ALIGN(img->nx, out_patch_size) / out_patch_size;
int y_patch = CLIP_ALIGN(img->ny, out_patch_size) / out_patch_size;
int x_patch = CLIP_ALIGN(img->nx(), out_patch_size) / out_patch_size;
int y_patch = CLIP_ALIGN(img->ny(), out_patch_size) / out_patch_size;
n_patches = x_patch * y_patch;
} break;
case PROJECTOR_TYPE_PADDLEOCR:
@@ -3364,8 +3340,8 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
{
// dynamic size
int n_merge = ctx->model.hparams.n_merge;
int n_patches_x = img->nx / patch_size / (n_merge > 0 ? n_merge : 1);
int n_patches_y = img->ny / patch_size / (n_merge > 0 ? n_merge : 1);
int n_patches_x = img->nx() / patch_size / (n_merge > 0 ? n_merge : 1);
int n_patches_y = img->ny() / patch_size / (n_merge > 0 ? n_merge : 1);
if (ctx->model.token_embd_img_break) {
n_patches = n_patches_y * n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row
} else {
@@ -3378,7 +3354,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
case PROJECTOR_TYPE_MERALION:
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
{
n_patches = img->nx;
n_patches = img->nx();
const int proj_stack_factor = ctx->model.hparams.proj_stack_factor;
if (ctx->model.audio_has_stack_frames()) {
@@ -3400,11 +3376,11 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
// chunk_size=100 frames --> 3x stride-2 conv2d --> 13 tokens per chunk
const int chunk_size = 100;
const int tokens_per_chunk = 13;
n_patches = (img->nx / chunk_size) * tokens_per_chunk;
n_patches = (img->nx() / chunk_size) * tokens_per_chunk;
} break;
case PROJECTOR_TYPE_GLMA:
{
n_patches = img->nx;
n_patches = img->nx();
// whisper downscales input token by half after conv1d
n_patches /= 2;
// reshape by merge_factor
@@ -3431,8 +3407,8 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
case PROJECTOR_TYPE_HUNYUANVL:
{
int merge = ctx->model.hparams.n_merge;
int ow = (img->nx / patch_size) / merge;
int oh = (img->ny / patch_size) / merge;
int ow = (img->nx() / patch_size) / merge;
int oh = (img->ny() / patch_size) / merge;
n_patches = (ow + 1) * oh + 2;
} break;
case PROJECTOR_TYPE_DEEPSEEKOCR2:
@@ -3446,13 +3422,13 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
} break;
case PROJECTOR_TYPE_LFM2A:
{
n_patches = ((((img->nx + 1) / 2) + 1) / 2 + 1) / 2;
n_patches = ((((img->nx() + 1) / 2) + 1) / 2 + 1) / 2;
} break;
case PROJECTOR_TYPE_GEMMA4A:
{
// Two Conv2D stride-2: O = floor((I + 2p - k) / s) + 1, p=1, k=3, s=2
// O = floor((I - 1) / 2) + 1
int n = img->nx;
int n = img->nx();
for (int i = 0; i < 2; i++) {
n = (n - 1) / 2 + 1;
}
@@ -3460,13 +3436,13 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
} break;
case PROJECTOR_TYPE_GEMMA4UA:
{
n_patches = img->nx; // no downsampling: one token per raw waveform frame
n_patches = img->nx(); // no downsampling: one token per raw waveform frame
} break;
case PROJECTOR_TYPE_GRANITE_SPEECH:
{
const int ws = ctx->model.hparams.audio_proj_window_size;
const int ds = ctx->model.hparams.audio_proj_downsample_rate;
n_patches = ((img->nx + ws - 1) / ws) * (ws / ds);
n_patches = ((img->nx() + ws - 1) / ws) * (ws / ds);
} break;
case PROJECTOR_TYPE_GRANITE4_VISION:
{
@@ -3475,7 +3451,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
// For 384×384 input: n = 24/8 = 3, query_side = 4 → 144.
const int window_side = ctx->model.hparams.downsample_window_side;
const int query_side = ctx->model.hparams.downsample_query_side;
const int side = img->nx / params.patch_size;
const int side = img->nx() / params.patch_size;
const int n = side / window_side;
n_patches = (query_side * n) * (query_side * n);
if (img->add_newline) {
@@ -3525,8 +3501,8 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
const auto & model = ctx->model;
const auto & hparams = model.hparams;
const int image_size_width = imgs.entries[0]->nx;
const int image_size_height = imgs.entries[0]->ny;
const int image_size_width = imgs.entries[0]->nx();
const int image_size_height = imgs.entries[0]->ny();
const int patch_size = hparams.patch_size;
const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
@@ -3546,7 +3522,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
return inp;
};
auto set_input_f32 = [&get_inp_tensor](const char * name, std::vector<float> & values) {
auto set_input_f32 = [&get_inp_tensor](const char * name, const std::vector<float> & values) {
ggml_tensor * cur = get_inp_tensor(name);
GGML_ASSERT(cur->type == GGML_TYPE_F32);
GGML_ASSERT(ggml_nelements(cur) == (int64_t)values.size());
@@ -3564,7 +3540,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
if (!imgs.is_audio) {
size_t nelem = 0;
for (const auto & img : imgs.entries) {
nelem += img->nx * img->ny * 3;
nelem += img->nx() * img->ny() * 3;
}
std::vector<float> inp_raw(nelem);
@@ -3580,19 +3556,20 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
// ──────┘ x B
for (size_t i = 0; i < imgs.entries.size(); i++) {
const int nx = imgs.entries[i]->nx;
const int ny = imgs.entries[i]->ny;
const int nx = imgs.entries[i]->nx();
const int ny = imgs.entries[i]->ny();
const int n = nx * ny;
for (int b = 0; b < batch_size; b++) {
const auto & buf = imgs.entries[b]->get_ro_buf();
float * batch_entry = inp_raw.data() + b * (3*n);
for (int y = 0; y < ny; y++) {
for (int x = 0; x < nx; x++) {
size_t base_src = 3*(y * nx + x); // idx of the first channel
size_t base_dst = y * nx + x; // idx of the first channel
batch_entry[ base_dst] = imgs.entries[b]->buf[base_src ];
batch_entry[1*n + base_dst] = imgs.entries[b]->buf[base_src + 1];
batch_entry[2*n + base_dst] = imgs.entries[b]->buf[base_src + 2];
batch_entry[ base_dst] = buf[base_src ];
batch_entry[1*n + base_dst] = buf[base_src + 1];
batch_entry[2*n + base_dst] = buf[base_src + 2];
}
}
}
@@ -3602,12 +3579,14 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
} else {
// audio input
GGML_ASSERT(imgs.entries.size() == 1);
const auto & mel_inp = imgs.entries[0];
const int n_step = mel_inp->nx;
const int n_mel = mel_inp->ny;
std::vector<float> inp_raw(n_step * n_mel);
std::memcpy(inp_raw.data(), mel_inp->buf.data(), n_step * n_mel * sizeof(float));
set_input_f32("inp_raw", inp_raw);
const auto & buf = mel_inp->get_ro_buf();
const int n_step = mel_inp->nx();
const int n_mel = mel_inp->ny();
GGML_ASSERT((size_t)n_step * n_mel == buf.size());
set_input_f32("inp_raw", buf);
}
// set input per projector
@@ -4218,7 +4197,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
GGML_ASSERT(imgs.entries.size() == 1);
const auto & img0 = imgs.entries.front();
// Compute n_pos matching SSCP output: two stride-2 convs
int n_pos = img0->nx;
int n_pos = img0->nx();
for (int i = 0; i < 2; i++) { n_pos = (n_pos - 1) / 2 + 1; }
// Chunked local attention: blocked causal mask and RPE
@@ -4324,7 +4303,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
// reshapes as ggml_get_rows gathers. The names are set
// by g4v_gather() in models/granite4-vision.cpp.
const int patch_size = model.hparams.patch_size;
const int image_side = imgs.entries.front()->nx / patch_size;
const int image_side = imgs.entries.front()->nx() / patch_size;
const int window_side = hparams.downsample_window_side;
const int query_side = hparams.downsample_query_side;
const int n = image_side / window_side;
@@ -4570,19 +4549,6 @@ bool clip_has_audio_encoder(const struct clip_ctx * ctx) {
return ctx->model.modality == CLIP_MODALITY_AUDIO;
}
// Copies h * w * 3 floats from img into a temporary clip_image_f32 (nx = w, ny = h)
// and passes it to clip_image_encode(), which writes its output to vec.
// NOTE(review): the result of clip_image_encode() is discarded and this function
// always returns true, so an encode failure is not reported to the caller
bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
clip_image_f32 clip_img;
clip_img.buf.resize(h * w * 3);
for (int i = 0; i < h*w*3; i++)
{
clip_img.buf[i] = img[i];
}
clip_img.nx = w;
clip_img.ny = h;
clip_image_encode(ctx, n_threads, &clip_img, vec);
return true;
}
//
// API used internally with mtmd
//
@@ -4591,17 +4557,6 @@ projector_type clip_get_projector_type(const struct clip_ctx * ctx) {
return ctx->proj_type();
}
// Appends one audio entry to batch: allocates a clip_image_f32 with nx = n_frames
// and ny = n_mel, copies n_frames * n_mel floats from mel into it, hands ownership
// to batch->entries and marks the batch as audio.
// NOTE(review): n_frames * n_mel is evaluated in int - verify it cannot overflow for long inputs
void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel, int n_frames, float * mel) {
clip_image_f32 * audio = new clip_image_f32;
audio->nx = n_frames;
audio->ny = n_mel;
audio->buf.resize(n_frames * n_mel);
std::memcpy(audio->buf.data(), mel, n_frames * n_mel * sizeof(float));
batch->entries.push_back(clip_image_f32_ptr(audio));
batch->is_audio = true;
}
// Returns a pointer to the hyper-parameters stored inside ctx (not a copy).
const clip_hparams * clip_get_hparams(const struct clip_ctx * ctx) {
    const auto & hparams = ctx->model.hparams;
    return &hparams;
}
+3 -17
View File
@@ -17,6 +17,9 @@ struct clip_ctx;
// Width / height pair describing an image (or, for audio, frames / mel bins).
struct clip_image_size {
    int width;
    int height;

    // Two sizes are equal when both width and height match.
    bool operator==(const clip_image_size & other) const {
        return width == other.width && height == other.height;
    }

    // Counterpart of operator==; C++17 does not synthesize it automatically.
    bool operator!=(const clip_image_size & other) const {
        return !(*this == other);
    }
};
struct clip_image_f32;
@@ -54,9 +57,6 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
void clip_free(struct clip_ctx * ctx);
size_t clip_embd_nbytes(const struct clip_ctx * ctx);
size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h);
int32_t clip_get_image_size (const struct clip_ctx * ctx);
int32_t clip_get_patch_size (const struct clip_ctx * ctx);
int32_t clip_get_hidden_size(const struct clip_ctx * ctx);
@@ -79,9 +79,6 @@ struct clip_image_u8 * clip_image_u8_init (void);
struct clip_image_f32 * clip_image_f32_init(void);
struct clip_image_f32_batch * clip_image_f32_batch_init(void); // only used by libllava
// nx, ny are the output image dimensions
unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny);
void clip_image_size_free (struct clip_image_size * img_size);
void clip_image_u8_free (struct clip_image_u8 * img);
void clip_image_f32_free(struct clip_image_f32 * img);
@@ -94,12 +91,6 @@ size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int id
size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->ny
struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->data
/**
* Build image from pixels decoded by other libraries instead of stb_image.h for better performance.
* The memory layout is RGBRGBRGB..., input buffer length must be 3*nx*ny bytes
*/
void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img);
bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);
@@ -107,11 +98,6 @@ bool clip_is_llava(const struct clip_ctx * ctx);
// note for contributor: this clip_is_(model) pattern is deprecated
// do NOT add new functions like this
bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);
// use by audio input
void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel, int n_frames, float * mel);
bool clip_has_vision_encoder(const struct clip_ctx * ctx);
bool clip_has_audio_encoder(const struct clip_ctx * ctx);
+1 -1
View File
@@ -1,7 +1,7 @@
#include "models.h"
ggml_cgraph * clip_graph_conformer::build() {
const int n_frames = img.nx;
const int n_frames = img.nx();
const int n_pos = n_frames / 2;
const int n_pos_embd = (((((n_frames + 1) / 2) + 1) / 2 + 1) / 2) * 2 - 1;
GGML_ASSERT(model.position_embeddings->ne[1] >= n_pos);
+2 -2
View File
@@ -22,8 +22,8 @@ ggml_cgraph * clip_graph_exaone4_5::build() {
ggml_tensor * inp_raw = build_inp_raw();
ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
GGML_ASSERT(img.nx % (patch_size * 2) == 0);
GGML_ASSERT(img.ny % (patch_size * 2) == 0);
GGML_ASSERT(img.nx() % (patch_size * 2) == 0);
GGML_ASSERT(img.ny() % (patch_size * 2) == 0);
{
ggml_tensor * inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
+2 -2
View File
@@ -16,8 +16,8 @@ ggml_cgraph * clip_graph_glm4v::build() {
ggml_set_name(positions, "positions");
ggml_set_input(positions);
GGML_ASSERT(img.nx % (patch_size * 2) == 0);
GGML_ASSERT(img.ny % (patch_size * 2) == 0);
GGML_ASSERT(img.nx() % (patch_size * 2) == 0);
GGML_ASSERT(img.ny() % (patch_size * 2) == 0);
// second conv dimension
{
+1 -1
View File
@@ -1,7 +1,7 @@
#include "models.h"
ggml_cgraph * clip_graph_granite_speech::build() {
const int n_frames = img.nx;
const int n_frames = img.nx();
const int context_size = hparams.audio_chunk_size;
const int ctc_layer = n_layer / 2;
const int conv_kernel = hparams.audio_conv_kernel_size;
+2 -2
View File
@@ -7,8 +7,8 @@
// with a w*h? Also the permute is a bit different at (2, 1, 0, 3) instead of (2, 0, 1, 3).
ggml_tensor * clip_graph_kimik25::resize_position_embeddings_3d(uint32_t interpolation_mode) {
ggml_tensor * pos_embd = model.position_embeddings;
const int height = img.ny / patch_size;
const int width = img.nx / patch_size;
const int height = img.ny() / patch_size;
const int width = img.nx() / patch_size;
const uint32_t mode = interpolation_mode;
GGML_ASSERT(pos_embd);
+2 -2
View File
@@ -56,8 +56,8 @@ ggml_cgraph * clip_graph_mimovl::build() {
patch_size, patch_size, 0, 0, 1, 1);
inp = ggml_add(ctx0, inp, inp_1);
GGML_ASSERT(img.nx % (patch_size * 2) == 0);
GGML_ASSERT(img.ny % (patch_size * 2) == 0);
GGML_ASSERT(img.nx() % (patch_size * 2) == 0);
GGML_ASSERT(img.ny() % (patch_size * 2) == 0);
inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w,h,c,b] -> [c,w,h,b]
inp = ggml_cont_4d(ctx0, inp, n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
+2 -2
View File
@@ -19,8 +19,8 @@ ggml_cgraph * clip_graph_qwen2vl::build() {
ggml_tensor * inp_raw = build_inp_raw();
ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
GGML_ASSERT(img.nx % (patch_size * 2) == 0);
GGML_ASSERT(img.ny % (patch_size * 2) == 0);
GGML_ASSERT(img.nx() % (patch_size * 2) == 0);
GGML_ASSERT(img.ny() % (patch_size * 2) == 0);
// second conv dimension
{
+2 -2
View File
@@ -16,8 +16,8 @@ ggml_cgraph * clip_graph_qwen3vl::build() {
ggml_tensor * inp_raw = build_inp_raw();
ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
GGML_ASSERT(img.nx % (patch_size * 2) == 0);
GGML_ASSERT(img.ny % (patch_size * 2) == 0);
GGML_ASSERT(img.nx() % (patch_size * 2) == 0);
GGML_ASSERT(img.ny() % (patch_size * 2) == 0);
// second conv dimension
{
+1 -1
View File
@@ -1,7 +1,7 @@
#include "models.h"
ggml_cgraph * clip_graph_whisper_enc::build() {
const int n_frames = img.nx;
const int n_frames = img.nx();
const int n_pos = n_frames / 2;
GGML_ASSERT(model.position_embeddings->ne[1] >= n_pos);
+1 -1
View File
@@ -166,7 +166,7 @@ struct mtmd_cli_context {
}
bool load_media(const std::string & fname) {
mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(ctx_vision.get(), fname.c_str()));
mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(ctx_vision.get(), fname.c_str(), false));
if (!bmp.ptr) {
return false;
}
+6 -5
View File
@@ -478,7 +478,7 @@ static bool decode_audio_from_buf(const unsigned char * buf_in, size_t len, int
} // namespace audio_helpers
mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len) {
mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len, bool placeholder) {
if (audio_helpers::is_audio_file((const char *)buf, len)) {
std::vector<float> pcmf32;
const int sample_rate = mtmd_get_audio_sample_rate(ctx);
@@ -490,7 +490,7 @@ mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigne
LOG_ERR("Unable to read WAV audio file from buffer\n");
return nullptr;
}
return mtmd_bitmap_init_from_audio(pcmf32.size(), pcmf32.data());
return mtmd_bitmap_init_from_audio(pcmf32.size(), placeholder ? nullptr : pcmf32.data());
}
// otherwise, we assume it's an image
@@ -502,13 +502,13 @@ mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigne
LOG_ERR("%s: failed to decode image bytes\n", __func__);
return nullptr;
}
result = mtmd_bitmap_init(nx, ny, data);
result = mtmd_bitmap_init(nx, ny, placeholder ? nullptr : data);
stbi_image_free(data);
}
return result;
}
mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname) {
mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname, bool placeholder) {
std::vector<unsigned char> buf;
FILE * f = fopen(fname, "rb");
if (!f) {
@@ -533,5 +533,6 @@ mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char *
return nullptr;
}
return mtmd_helper_bitmap_init_from_buf(ctx, buf.data(), buf.size());
return mtmd_helper_bitmap_init_from_buf(ctx, buf.data(), buf.size(), placeholder);
}
+2 -2
View File
@@ -29,7 +29,7 @@ MTMD_API void mtmd_helper_log_set(ggml_log_callback log_callback, void * user_da
// it calls mtmd_helper_bitmap_init_from_buf() internally
// returns nullptr on failure
// this function is thread-safe
MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname);
MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname, bool placeholder);
// helper function to construct a mtmd_bitmap from a buffer containing a file
// supported formats:
@@ -38,7 +38,7 @@ MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, con
// note: audio files will be auto-detected based on magic bytes
// returns nullptr on failure
// this function is thread-safe
MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len);
MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len, bool placeholder);
// helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache
MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks);
+184 -170
View File
@@ -9,25 +9,12 @@
//
void mtmd_image_preprocessor::img_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst, const float mean[3], const float std[3]) {
dst.nx = src.nx;
dst.ny = src.ny;
dst.buf.resize(src.buf.size());
// TODO @ngxson : seems like this could be done more efficiently on cgraph
for (size_t i = 0; i < src.buf.size(); ++i) {
int c = i % 3; // rgb
dst.buf[i] = (static_cast<float>(src.buf[i]) / 255.0f - mean[c]) / std[c];
}
dst.from_u8(src);
dst.normalize(mean, std);
}
void mtmd_image_preprocessor::img_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst) {
dst.nx = src.nx;
dst.ny = src.ny;
dst.buf.resize(src.buf.size());
for (size_t i = 0; i < src.buf.size(); ++i) {
dst.buf[i] = static_cast<float>(src.buf[i]);
}
dst.from_u8(src);
}
// set of tools to manipulate images
@@ -40,13 +27,16 @@ struct img_tool {
resize_algo algo,
pad_style padding = PAD_CEIL,
std::array<uint8_t, 3> pad_color = {0, 0, 0}) {
dst.nx = target_resolution.width;
dst.ny = target_resolution.height;
dst.buf.resize(3 * dst.nx * dst.ny);
dst.set_size(target_resolution, src.is_placeholder());
if (dst.nx == src.nx && dst.ny == src.ny) {
if (src.is_placeholder()) {
// no-op for placeholder image, just set the size and return
return;
}
if (dst.get_size() == src.get_size()) {
// no resize needed, simple copy
dst.buf = src.buf;
dst.cpy_buf(src.get_ro_buf());
return;
}
@@ -68,17 +58,17 @@ struct img_tool {
} else {
// resize with padding
clip_image_u8 resized_image;
float scale_w = static_cast<float>(target_resolution.width) / src.nx;
float scale_h = static_cast<float>(target_resolution.height) / src.ny;
float scale_w = static_cast<float>(target_resolution.width) / src.get_size().width;
float scale_h = static_cast<float>(target_resolution.height) / src.get_size().height;
float scale = std::min(scale_w, scale_h);
int new_width, new_height;
if (padding == PAD_NEAREST) {
new_width = std::min(static_cast<int>(std::round(src.nx * scale)), target_resolution.width);
new_height = std::min(static_cast<int>(std::round(src.ny * scale)), target_resolution.height);
new_width = std::min(static_cast<int>(std::round(src.get_size().width * scale)), target_resolution.width);
new_height = std::min(static_cast<int>(std::round(src.get_size().height * scale)), target_resolution.height);
} else {
new_width = std::min(static_cast<int>(std::ceil(src.nx * scale)), target_resolution.width);
new_height = std::min(static_cast<int>(std::ceil(src.ny * scale)), target_resolution.height);
new_width = std::min(static_cast<int>(std::ceil(src.get_size().width * scale)), target_resolution.width);
new_height = std::min(static_cast<int>(std::ceil(src.get_size().height * scale)), target_resolution.height);
}
switch (algo) {
@@ -112,18 +102,17 @@ struct img_tool {
static void crop(const clip_image_u8 & image, clip_image_u8 & dst, int x, int y, int w, int h) {
GGML_ASSERT(x >= 0 && y >= 0 && w > 0 && h > 0);
GGML_ASSERT(x + w <= image.nx && y + h <= image.ny);
dst.nx = w;
dst.ny = h;
dst.buf.resize(3 * w * h);
GGML_ASSERT(x + w <= image.get_size().width && y + h <= image.get_size().height);
dst.set_size({w, h}, image.is_placeholder());
if (image.is_placeholder()) {
// no-op for placeholder image, just set the size and return
return;
}
for (int i = 0; i < h; ++i) {
for (int j = 0; j < w; ++j) {
int src_idx = 3 * ((y + i)*image.nx + (x + j));
int dst_idx = 3 * (i*w + j);
dst.buf[dst_idx] = image.buf[src_idx];
dst.buf[dst_idx + 1] = image.buf[src_idx + 1];
dst.buf[dst_idx + 2] = image.buf[src_idx + 2];
dst.set_pixel(j, i, image.get_pixel(x + j, y + i));
}
}
}
@@ -181,81 +170,101 @@ struct img_tool {
// draw src image into dst image at offset (offset_x, offset_y)
static void composite(clip_image_u8 & dst, const clip_image_u8 & src, int offset_x, int offset_y) {
for (int y = 0; y < src.ny; ++y) {
for (int x = 0; x < src.nx; ++x) {
if (src.is_placeholder()) {
// no-op for placeholder image
return;
}
const auto src_size = src.get_size();
const auto dst_size = dst.get_size();
for (int y = 0; y < src_size.height; ++y) {
for (int x = 0; x < src_size.width; ++x) {
int dx = x + offset_x;
int dy = y + offset_y;
// skip pixels that would be out of bounds in the destination
if (dx < 0 || dy < 0 || dx >= dst.nx || dy >= dst.ny) {
if (dx < 0 || dy < 0 || dx >= dst_size.width || dy >= dst_size.height) {
continue;
}
size_t dst_idx = 3 * (static_cast<size_t>(dy) * dst.nx + static_cast<size_t>(dx));
size_t src_idx = 3 * (static_cast<size_t>(y) * src.nx + static_cast<size_t>(x));
dst.buf[dst_idx + 0] = src.buf[src_idx + 0];
dst.buf[dst_idx + 1] = src.buf[src_idx + 1];
dst.buf[dst_idx + 2] = src.buf[src_idx + 2];
dst.set_pixel(dx, dy, src.get_pixel(x, y));
}
}
}
// fill the image with a solid color
static void fill(clip_image_u8 & img, const std::array<uint8_t, 3> & color) {
for (size_t i = 0; i < img.buf.size(); i += 3) {
img.buf[i] = color[0];
img.buf[i + 1] = color[1];
img.buf[i + 2] = color[2];
if (img.is_placeholder()) {
// no-op for placeholder image
return;
}
const auto size = img.get_size();
for (int y = 0; y < size.height; ++y) {
for (int x = 0; x < size.width; ++x) {
img.set_pixel(x, y, color);
}
}
}
private:
// Bilinear resize function
static void resize_bilinear(const clip_image_u8 & src, clip_image_u8 & dst, int target_width, int target_height) {
if (src.nx == 0 || src.ny == 0) { dst.nx = dst.ny = 0; dst.buf.clear(); return; }
const auto src_size = src.get_size();
if (src_size.width == 0 || src_size.height == 0) { dst.set_size({0, 0}, false); return; }
if (target_width <= 0) target_width = 1;
if (target_height <= 0) target_height = 1;
dst.nx = target_width;
dst.ny = target_height;
dst.buf.resize(3 * target_width * target_height);
dst.set_size({target_width, target_height}, false);
float x_ratio = target_width > 1 ? static_cast<float>(src.nx - 1) / (target_width - 1) : 0.0f;
float y_ratio = target_height > 1 ? static_cast<float>(src.ny - 1) / (target_height - 1) : 0.0f;
if (src.is_placeholder()) {
// no-op for placeholder image, just set the size and return
return;
}
float x_ratio = target_width > 1 ? static_cast<float>(src_size.width - 1) / (target_width - 1) : 0.0f;
float y_ratio = target_height > 1 ? static_cast<float>(src_size.height - 1) / (target_height - 1) : 0.0f;
for (int y = 0; y < target_height; ++y) {
for (int x = 0; x < target_width; ++x) {
float px = x * x_ratio;
float py = y * y_ratio;
int x0 = std::min(static_cast<int>(px), src.nx - 1);
int y0 = std::min(static_cast<int>(py), src.ny - 1);
int x1 = std::min(x0 + 1, src.nx - 1);
int y1 = std::min(y0 + 1, src.ny - 1);
int x0 = std::min(static_cast<int>(px), src_size.width - 1);
int y0 = std::min(static_cast<int>(py), src_size.height - 1);
int x1 = std::min(x0 + 1, src_size.width - 1);
int y1 = std::min(y0 + 1, src_size.height - 1);
float xf = px - x0;
float yf = py - y0;
const auto p00 = src.get_pixel(x0, y0);
const auto p10 = src.get_pixel(x1, y0);
const auto p01 = src.get_pixel(x0, y1);
const auto p11 = src.get_pixel(x1, y1);
std::array<uint8_t, 3> pixel;
for (int c = 0; c < 3; ++c) {
float top = lerp(static_cast<float>(src.buf[3 * (y0 * src.nx + x0) + c]),
static_cast<float>(src.buf[3 * (y0 * src.nx + x1) + c]),
xf);
float bottom = lerp(static_cast<float>(src.buf[3 * (y1 * src.nx + x0) + c]),
static_cast<float>(src.buf[3 * (y1 * src.nx + x1) + c]),
xf);
dst.buf[3 * (y * target_width + x) + c] = static_cast<uint8_t>(lerp(top, bottom, yf));
float top = lerp(static_cast<float>(p00[c]), static_cast<float>(p10[c]), xf);
float bottom = lerp(static_cast<float>(p01[c]), static_cast<float>(p11[c]), xf);
pixel[c] = static_cast<uint8_t>(lerp(top, bottom, yf));
}
dst.set_pixel(x, y, pixel);
}
}
}
// Bicubic resize function
// part of image will be cropped if the aspect ratio is different
static bool resize_bicubic(const clip_image_u8 & img, clip_image_u8 & dst, int target_width, int target_height) {
const int nx = img.nx;
const int ny = img.ny;
static void resize_bicubic(const clip_image_u8 & img, clip_image_u8 & dst, int target_width, int target_height) {
const auto img_size = img.get_size();
const int nx = img_size.width;
const int ny = img_size.height;
dst.nx = target_width;
dst.ny = target_height;
dst.buf.resize(3 * target_width * target_height);
dst.set_size({target_width, target_height}, false);
if (img.is_placeholder()) {
// no-op for placeholder image, just set the size and return
return;
}
float Cc;
float C[5] = {};
@@ -280,12 +289,13 @@ private:
dx = tx * j - x;
dy = ty * i - y;
std::array<uint8_t, 3> pixel;
for (k = 0; k < 3; k++) {
for (jj = 0; jj <= 3; jj++) {
d0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x - 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
d2 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
d3 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 2, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
a0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
d0 = img.get_pixel(clip(x - 1, 0, nx - 1), clip(y - 1 + jj, 0, ny - 1))[k] - img.get_pixel(clip(x, 0, nx - 1), clip(y - 1 + jj, 0, ny - 1))[k];
d2 = img.get_pixel(clip(x + 1, 0, nx - 1), clip(y - 1 + jj, 0, ny - 1))[k] - img.get_pixel(clip(x, 0, nx - 1), clip(y - 1 + jj, 0, ny - 1))[k];
d3 = img.get_pixel(clip(x + 2, 0, nx - 1), clip(y - 1 + jj, 0, ny - 1))[k] - img.get_pixel(clip(x, 0, nx - 1), clip(y - 1 + jj, 0, ny - 1))[k];
a0 = img.get_pixel(clip(x, 0, nx - 1), clip(y - 1 + jj, 0, ny - 1))[k];
a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3;
a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2;
@@ -303,13 +313,12 @@ private:
Cc = a0 + a1 * dy + a2 * dy * dy + a3 * dy * dy * dy;
const uint8_t Cc2 = std::min(std::max(std::round(Cc), 0.0f), 255.0f);
dst.buf[(i * target_width + j) * 3 + k] = float(Cc2);
pixel[k] = Cc2;
}
}
dst.set_pixel(j, i, pixel);
}
}
return true;
}
// Bicubic resize function using Pillow's ImagingResample algorithm
@@ -455,16 +464,17 @@ private:
};
// Horizontal resampling pass
// Resizes width from imIn.nx to imOut.nx, preserving height
// Resizes width from imIn to out_nx, preserving height
auto resample_horizontal = [&](const clip_image_u8 & imIn, clip_image_u8 & imOut,
int out_nx,
int ksize, const std::vector<int> & bounds, const std::vector<int32_t> & weights) {
imOut.ny = imIn.ny;
imOut.buf.resize(3 * imOut.nx * imOut.ny);
const int in_ny = imIn.get_size().height;
imOut.set_size({out_nx, in_ny}, false);
// Process each row independently
for (int yy = 0; yy < imOut.ny; yy++) {
for (int yy = 0; yy < in_ny; yy++) {
// For each output pixel in this row
for (int xx = 0; xx < imOut.nx; xx++) {
for (int xx = 0; xx < out_nx; xx++) {
// Get the range of input pixels and filter coefficients
int xmin = bounds[xx * 2 + 0]; // First input pixel index
int xcnt = bounds[xx * 2 + 1]; // Number of input pixels
@@ -476,36 +486,36 @@ private:
// Convolve: sum weighted input pixels
for (int x = 0; x < xcnt; x++) {
int src_idx = ((yy * imIn.nx) + (x + xmin)) * 3;
ss0 += static_cast<uint8_t>(imIn.buf[src_idx + 0]) * weights[xx * ksize + x]; // R channel
ss1 += static_cast<uint8_t>(imIn.buf[src_idx + 1]) * weights[xx * ksize + x]; // G channel
ss2 += static_cast<uint8_t>(imIn.buf[src_idx + 2]) * weights[xx * ksize + x]; // B channel
const auto src_px = imIn.get_pixel(x + xmin, yy);
ss0 += src_px[0] * weights[xx * ksize + x]; // R channel
ss1 += src_px[1] * weights[xx * ksize + x]; // G channel
ss2 += src_px[2] * weights[xx * ksize + x]; // B channel
}
// Convert back from fixed-point (divide by 2^PRECISION_BITS) and clamp to [0,255]
int dst_idx = (yy * imOut.nx + xx) * 3;
imOut.buf[dst_idx + 0] = clip8(ss0 >> PRECISION_BITS);
imOut.buf[dst_idx + 1] = clip8(ss1 >> PRECISION_BITS);
imOut.buf[dst_idx + 2] = clip8(ss2 >> PRECISION_BITS);
imOut.set_pixel(xx, yy, {clip8(ss0 >> PRECISION_BITS),
clip8(ss1 >> PRECISION_BITS),
clip8(ss2 >> PRECISION_BITS)});
}
}
};
// Vertical resampling pass
// Resizes height from imIn.ny to imOut.ny, preserving width
// Resizes height from imIn to out_ny, preserving width
auto resample_vertical = [&](const clip_image_u8 & imIn, clip_image_u8 & imOut,
int out_ny,
int ksize, const std::vector<int> & bounds, const std::vector<int32_t> & weight) {
imOut.nx = imIn.nx;
imOut.buf.resize(3 * imOut.nx * imOut.ny);
const int in_nx = imIn.get_size().width;
imOut.set_size({in_nx, out_ny}, false);
// For each output row
for (int yy = 0; yy < imOut.ny; yy++) {
for (int yy = 0; yy < out_ny; yy++) {
// Get the range of input rows and filter coefficients
int ymin = bounds[yy * 2 + 0]; // First input row index
int ycnt = bounds[yy * 2 + 1]; // Number of input rows
// Process each column in this output row
for (int xx = 0; xx < imOut.nx; xx++) {
for (int xx = 0; xx < in_nx; xx++) {
// Initialize accumulators for RGB channels with rounding bias
int32_t ss0 = 1 << (PRECISION_BITS - 1);
int32_t ss1 = 1 << (PRECISION_BITS - 1);
@@ -513,27 +523,23 @@ private:
// Convolve: sum weighted input pixels vertically
for (int y = 0; y < ycnt; y++) {
int src_idx = ((y + ymin) * imIn.nx + xx) * 3;
ss0 += static_cast<uint8_t>(imIn.buf[src_idx + 0]) * weight[yy * ksize + y]; // R channel
ss1 += static_cast<uint8_t>(imIn.buf[src_idx + 1]) * weight[yy * ksize + y]; // G channel
ss2 += static_cast<uint8_t>(imIn.buf[src_idx + 2]) * weight[yy * ksize + y]; // B channel
const auto src_px = imIn.get_pixel(xx, y + ymin);
ss0 += src_px[0] * weight[yy * ksize + y]; // R channel
ss1 += src_px[1] * weight[yy * ksize + y]; // G channel
ss2 += src_px[2] * weight[yy * ksize + y]; // B channel
}
// Convert back from fixed-point and clamp to [0,255]
int dst_idx = (yy * imOut.nx + xx) * 3;
imOut.buf[dst_idx + 0] = clip8(ss0 >> PRECISION_BITS);
imOut.buf[dst_idx + 1] = clip8(ss1 >> PRECISION_BITS);
imOut.buf[dst_idx + 2] = clip8(ss2 >> PRECISION_BITS);
imOut.set_pixel(xx, yy, {clip8(ss0 >> PRECISION_BITS),
clip8(ss1 >> PRECISION_BITS),
clip8(ss2 >> PRECISION_BITS)});
}
}
};
// Main resampling logic using separable two-pass approach
const int src_width = img.nx;
const int src_height = img.ny;
dst.nx = target_width;
dst.ny = target_height;
const int src_width = img.get_size().width;
const int src_height = img.get_size().height;
bool need_horizontal = (target_width != src_width);
bool need_vertical = (target_height != src_height);
@@ -555,18 +561,20 @@ private:
if (need_horizontal && need_vertical) {
// Both horizontal and vertical
clip_image_u8 temp;
temp.nx = target_width;
resample_horizontal(img, temp, ksize_horiz, bounds_horiz, weights_horiz);
resample_vertical(temp, dst, ksize_vert, bounds_vert, weights_vert);
resample_horizontal(img, temp, target_width, ksize_horiz, bounds_horiz, weights_horiz);
resample_vertical(temp, dst, target_height, ksize_vert, bounds_vert, weights_vert);
} else if (need_horizontal) {
// Only horizontal
resample_horizontal(img, dst, ksize_horiz, bounds_horiz, weights_horiz);
resample_horizontal(img, dst, target_width, ksize_horiz, bounds_horiz, weights_horiz);
} else if (need_vertical) {
// Only vertical
resample_vertical(img, dst, ksize_vert, bounds_vert, weights_vert);
resample_vertical(img, dst, target_height, ksize_vert, bounds_vert, weights_vert);
} else {
// No resizing needed - direct copy
dst.buf = img.buf;
dst.set_size(img.get_size(), img.is_placeholder());
if (!img.is_placeholder()) {
dst.cpy_buf(img.get_ro_buf());
}
}
return true;
@@ -588,7 +596,7 @@ private:
//
bool mtmd_image_preprocessor_llava_uhd::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
const clip_image_size original_size{img.nx, img.ny};
const clip_image_size original_size = img.get_size();
auto const inst = get_slice_instructions(original_size);
std::vector<clip_image_u8_ptr> imgs = slice_image(img, inst);
@@ -883,7 +891,7 @@ bool mtmd_image_preprocessor_fixed_size::preprocess(const clip_image_u8 & img, c
bool mtmd_image_preprocessor_dyn_size::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
GGML_ASSERT(hparams.image_min_pixels > 0 && hparams.image_max_pixels > 0);
clip_image_u8 resized_image;
const clip_image_size original_size{img.nx, img.ny};
const clip_image_size original_size = img.get_size();
// the original pixtral model doesn't have n_merge
const int cur_merge = hparams.n_merge == 0 ? 1 : hparams.n_merge;
const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
@@ -908,7 +916,7 @@ bool mtmd_image_preprocessor_dyn_size::preprocess(const clip_image_u8 & img, cli
bool mtmd_image_preprocessor_longest_edge::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
GGML_ASSERT(hparams.image_longest_edge > 0);
clip_image_u8 resized_image;
const clip_image_size original_size{img.nx, img.ny};
const clip_image_size original_size = img.get_size();
// the original pixtral model doesn't have n_merge
const int cur_merge = hparams.n_merge == 0 ? 1 : hparams.n_merge;
const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
@@ -1040,7 +1048,7 @@ bool mtmd_image_preprocessor_idefics3::preprocess(const clip_image_u8 & img, cli
// multiples of image_size (always rounding up)
//
// CITE: https://github.com/huggingface/transformers/blob/main/src/transformers/models/idefics3/image_processing_idefics3.py#L737
const clip_image_size original_size{img.nx, img.ny};
const clip_image_size original_size = img.get_size();
const clip_image_size refined_size = img_tool::calc_size_preserved_ratio(
original_size, hparams.image_size, hparams.image_longest_edge);
// LOG_INF("%s: original size: %d x %d, refined size: %d x %d\n",
@@ -1088,7 +1096,7 @@ bool mtmd_image_preprocessor_idefics3::preprocess(const clip_image_u8 & img, cli
bool mtmd_image_preprocessor_internvl::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
GGML_ASSERT(!hparams.image_res_candidates.empty());
const clip_image_size original_size{img.nx, img.ny};
const clip_image_size original_size = img.get_size();
auto const inst = get_slice_instructions(original_size);
std::vector<clip_image_u8_ptr> imgs = slice_image(img, inst, false);
@@ -1108,7 +1116,7 @@ bool mtmd_image_preprocessor_deepseekocr::preprocess(const clip_image_u8 & img,
static constexpr int native_resolutions[] = { 1024 /* base */, 1280 /* large */ };
// TODO: support 512 (tiny) and 640 (small) once we have eval data for them
const int64_t orig_area = static_cast<int64_t>(img.nx) * img.ny;
const int64_t orig_area = static_cast<int64_t>(img.n_pixels());
size_t mode_i = 0;
int64_t min_diff = std::numeric_limits<int64_t>::max();
@@ -1201,10 +1209,11 @@ bool mtmd_image_preprocessor_deepseekocr2::preprocess(const clip_image_u8 & img,
// emit 768x768 local tiles when the image is larger than a tile in either
// dimension, then always a 1024x1024 global view. order: [tiles..., global].
if (img.nx > tile_size || img.ny > tile_size) {
const float aspect_ratio = static_cast<float>(img.nx) / img.ny;
const auto img_size = img.get_size();
if (img_size.width > tile_size || img_size.height > tile_size) {
const float aspect_ratio = static_cast<float>(img_size.width) / img_size.height;
const auto target_ratios = get_target_ratios();
const clip_image_size grid = find_closest_aspect_ratio(aspect_ratio, target_ratios, img.nx, img.ny);
const clip_image_size grid = find_closest_aspect_ratio(aspect_ratio, target_ratios, img_size.width, img_size.height);
// stretch onto the grid (no aspect preserve), then crop tiles row-major.
clip_image_u8 refined;
@@ -1247,50 +1256,57 @@ void mtmd_image_preprocessor_step3vl::img_u8_resize_bilinear_to_f32(
int target_height,
const float mean[3],
const float std[3]) {
if (src.nx == target_width && src.ny == target_height) {
const auto src_size = src.get_size();
if (src_size.width == target_width && src_size.height == target_height) {
img_u8_to_f32(src, dst, mean, std);
return;
}
dst.nx = target_width;
dst.ny = target_height;
dst.buf.resize(3 * target_width * target_height);
dst.set_size({target_width, target_height}, false, false);
const float scale_x = static_cast<float>(src.nx) / target_width;
const float scale_y = static_cast<float>(src.ny) / target_height;
if (src.is_placeholder()) {
// no-op for placeholder image, just set the size and return
return;
}
const float scale_x = static_cast<float>(src_size.width) / target_width;
const float scale_y = static_cast<float>(src_size.height) / target_height;
std::vector<float> local_buf(3 * target_width * target_height);
for (int y = 0; y < target_height; ++y) {
const float src_y = (static_cast<float>(y) + 0.5f) * scale_y - 0.5f;
const int y0_floor = static_cast<int>(std::floor(src_y));
const int y0 = std::max(0, std::min(y0_floor, src.ny - 1));
const int y1 = std::max(0, std::min(y0_floor + 1, src.ny - 1));
const int y0 = std::max(0, std::min(y0_floor, src_size.height - 1));
const int y1 = std::max(0, std::min(y0_floor + 1, src_size.height - 1));
const float ly = src_y - y0_floor;
for (int x = 0; x < target_width; ++x) {
const float src_x = (static_cast<float>(x) + 0.5f) * scale_x - 0.5f;
const int x0_floor = static_cast<int>(std::floor(src_x));
const int x0 = std::max(0, std::min(x0_floor, src.nx - 1));
const int x1 = std::max(0, std::min(x0_floor + 1, src.nx - 1));
const int x0 = std::max(0, std::min(x0_floor, src_size.width - 1));
const int x1 = std::max(0, std::min(x0_floor + 1, src_size.width - 1));
const float lx = src_x - x0_floor;
const size_t idx00 = 3 * (y0 * src.nx + x0);
const size_t idx01 = 3 * (y0 * src.nx + x1);
const size_t idx10 = 3 * (y1 * src.nx + x0);
const size_t idx11 = 3 * (y1 * src.nx + x1);
const size_t idx_dst = 3 * (y * target_width + x);
const auto p00 = src.get_pixel(x0, y0);
const auto p01 = src.get_pixel(x1, y0);
const auto p10 = src.get_pixel(x0, y1);
const auto p11 = src.get_pixel(x1, y1);
const size_t idx_dst = 3 * (y * target_width + x);
for (int c = 0; c < 3; ++c) {
const float v00 = (static_cast<float>(src.buf[idx00 + c]) / 255.0f - mean[c]) / std[c];
const float v01 = (static_cast<float>(src.buf[idx01 + c]) / 255.0f - mean[c]) / std[c];
const float v10 = (static_cast<float>(src.buf[idx10 + c]) / 255.0f - mean[c]) / std[c];
const float v11 = (static_cast<float>(src.buf[idx11 + c]) / 255.0f - mean[c]) / std[c];
const float v00 = (static_cast<float>(p00[c]) / 255.0f - mean[c]) / std[c];
const float v01 = (static_cast<float>(p01[c]) / 255.0f - mean[c]) / std[c];
const float v10 = (static_cast<float>(p10[c]) / 255.0f - mean[c]) / std[c];
const float v11 = (static_cast<float>(p11[c]) / 255.0f - mean[c]) / std[c];
const float top = v00 + (v01 - v00) * lx;
const float bot = v10 + (v11 - v10) * lx;
dst.buf[idx_dst + c] = top + (bot - top) * ly;
local_buf[idx_dst + c] = top + (bot - top) * ly;
}
}
}
dst.cpy_buf(local_buf);
}
int mtmd_image_preprocessor_step3vl::get_image_longest_edge(const clip_hparams & params) {
@@ -1341,26 +1357,26 @@ std::vector<int> mtmd_image_preprocessor_step3vl::calc_grid(int length, int wind
clip_image_u8 mtmd_image_preprocessor_step3vl::prepare_image(const clip_image_u8 & img, const clip_hparams & params) {
clip_image_u8 resized = img;
const float aspect_ratio = img.ny > 0 ? static_cast<float>(img.nx) / img.ny : 1.0f;
if (std::min(img.nx, img.ny) < 32 &&
const auto img_size = img.get_size();
const float aspect_ratio = img_size.height > 0 ? static_cast<float>(img_size.width) / img_size.height : 1.0f;
if (std::min(img_size.width, img_size.height) < 32 &&
(aspect_ratio > wide_aspect_ratio_limit ||
aspect_ratio < 1.0f / wide_aspect_ratio_limit)) {
const int square_size = std::max(img.nx, img.ny);
const int square_size = std::max(img_size.width, img_size.height);
clip_image_u8 padded;
padded.nx = square_size;
padded.ny = square_size;
padded.buf.resize(3 * square_size * square_size);
padded.set_size({square_size, square_size}, false);
img_tool::fill(padded, {0, 0, 0});
img_tool::composite(padded, img, 0, 0);
resized = std::move(padded);
}
const int max_image_size = get_image_longest_edge(params);
if (std::max(resized.nx, resized.ny) > max_image_size) {
const float scale = static_cast<float>(max_image_size) / std::max(resized.nx, resized.ny);
const auto resized_size = resized.get_size();
if (std::max(resized_size.width, resized_size.height) > max_image_size) {
const float scale = static_cast<float>(max_image_size) / std::max(resized_size.width, resized_size.height);
const clip_image_size new_size = {
std::max(1, static_cast<int>(std::floor(resized.nx * scale))),
std::max(1, static_cast<int>(std::floor(resized.ny * scale))),
std::max(1, static_cast<int>(std::floor(resized_size.width * scale))),
std::max(1, static_cast<int>(std::floor(resized_size.height * scale))),
};
clip_image_u8 scaled;
img_tool::resize(resized, scaled, new_size, RESIZE_ALGO_BILINEAR, PAD_NONE);
@@ -1372,14 +1388,14 @@ clip_image_u8 mtmd_image_preprocessor_step3vl::prepare_image(const clip_image_u8
clip_image_u8 mtmd_image_preprocessor_step3vl::crop_with_black_padding(const clip_image_u8 & image, int x, int y, int w, int h) {
clip_image_u8 dst;
dst.nx = w;
dst.ny = h;
dst.buf.resize(3 * w * h, 0);
dst.set_size({w, h}, false);
img_tool::fill(dst, {0, 0, 0});
const auto img_size = image.get_size();
const int src_x0 = std::max(0, x);
const int src_y0 = std::max(0, y);
const int src_x1 = std::min(image.nx, x + w);
const int src_y1 = std::min(image.ny, y + h);
const int src_x1 = std::min(img_size.width, x + w);
const int src_y1 = std::min(img_size.height, y + h);
if (src_x0 >= src_x1 || src_y0 >= src_y1) {
return dst;
@@ -1390,11 +1406,7 @@ clip_image_u8 mtmd_image_preprocessor_step3vl::crop_with_black_padding(const cli
for (int yy = 0; yy < src_y1 - src_y0; ++yy) {
for (int xx = 0; xx < src_x1 - src_x0; ++xx) {
const int src_idx = 3 * ((src_y0 + yy) * image.nx + (src_x0 + xx));
const int dst_idx = 3 * ((dst_y0 + yy) * w + (dst_x0 + xx));
dst.buf[dst_idx + 0] = image.buf[src_idx + 0];
dst.buf[dst_idx + 1] = image.buf[src_idx + 1];
dst.buf[dst_idx + 2] = image.buf[src_idx + 2];
dst.set_pixel(dst_x0 + xx, dst_y0 + yy, image.get_pixel(src_x0 + xx, src_y0 + yy));
}
}
@@ -1443,7 +1455,7 @@ mtmd_image_preprocessor_step3vl::slice_instructions mtmd_image_preprocessor_step
bool mtmd_image_preprocessor_step3vl::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
clip_image_u8 prepared = prepare_image(img, hparams);
const auto instructions = build_slice_instructions(hparams, {prepared.nx, prepared.ny});
const auto instructions = build_slice_instructions(hparams, prepared.get_size());
clip_image_f32_ptr overview_f32(clip_image_f32_init());
img_u8_resize_bilinear_to_f32(
@@ -1462,7 +1474,8 @@ bool mtmd_image_preprocessor_step3vl::preprocess(const clip_image_u8 & img, clip
}
clip_image_u8 img_for_crop = prepared;
if (instructions.refined_size.width != prepared.nx || instructions.refined_size.height != prepared.ny) {
const auto prepared_size = prepared.get_size();
if (instructions.refined_size.width != prepared_size.width || instructions.refined_size.height != prepared_size.height) {
clip_image_u8 refined;
img_tool::resize(prepared, refined, instructions.refined_size, RESIZE_ALGO_BILINEAR, PAD_NONE);
img_for_crop = std::move(refined);
@@ -1503,9 +1516,10 @@ bool mtmd_image_preprocessor_youtuvl::preprocess(const clip_image_u8 & img, clip
hparams.image_max_pixels / (patch_size * patch_size) : 256;
// Linear search for optimal scale to fit within max_num_patches
const auto img_size = img.get_size();
float scale = 1.0f;
int target_height = img.ny;
int target_width = img.nx;
int target_height = img_size.height;
int target_width = img_size.width;
auto get_scaled_image_size = [align_size](float scale, int size) -> int {
float scaled_size = size * scale;
@@ -1517,8 +1531,8 @@ bool mtmd_image_preprocessor_youtuvl::preprocess(const clip_image_u8 & img, clip
// Linear search with 0.02 step size
while (scale > 0.0f) {
target_height = get_scaled_image_size(scale, img.ny);
target_width = get_scaled_image_size(scale, img.nx);
target_height = get_scaled_image_size(scale, img_size.height);
target_width = get_scaled_image_size(scale, img_size.width);
int num_patches_h = target_height / patch_size;
int num_patches_w = target_width / patch_size;
+152 -56
View File
@@ -26,12 +26,46 @@
// represents raw image data, layout is RGBRGBRGB...
// length of data must be nx * ny * 3
// for audio bitmap: nx = sample count, ny = 1, layout is F32 F32 F32 ...
// length of data must be nx * sizeof(float)
struct mtmd_bitmap {
uint32_t nx;
uint32_t ny;
std::vector<unsigned char> data;
uint32_t nx = 0;
uint32_t ny = 0;
std::string id; // optional user-defined id, for ex: can be set to image hash, useful for KV cache tracking
bool is_audio = false; // true if the bitmap is audio
mtmd_bitmap(const unsigned char * data, uint32_t nx, uint32_t ny)
: nx(nx), ny(ny) {
if (data) {
size_t data_size = (size_t)nx * ny * 3;
this->data.resize(data_size);
std::memcpy(this->data.data(), data, data_size);
}
}
mtmd_bitmap(const unsigned char * data, uint32_t n_samples)
: nx(n_samples), ny(1), is_audio(true) {
if (data) {
size_t data_size = (size_t)nx * sizeof(float);
this->data.resize(data_size);
std::memcpy(this->data.data(), data, data_size);
}
}
const std::vector<unsigned char> & get_ro_buf() const {
return data;
}
bool is_placeholder() const {
return data.empty();
}
size_t n_bytes() const {
return data.size();
}
private:
std::vector<unsigned char> data;
};
// position indexing for decoder model
@@ -42,8 +76,8 @@ enum mtmd_pos_type {
};
struct mtmd_image_tokens {
uint32_t nx; // number of tokens in x direction
uint32_t ny; // number of tokens in y direction
uint32_t nx = 0; // number of tokens in x direction
uint32_t ny = 0; // number of tokens in y direction
mtmd_pos_type pos = MTMD_POS_TYPE_NORMAL;
uint32_t image_idx = 0; // 0-based position of this image among image chunks in the prompt(used by pos == MTMD_POS_TYPE_HUNYUANVL)
uint32_t n_tokens() const {
@@ -56,6 +90,16 @@ struct mtmd_image_tokens {
clip_image_f32_batch batch_f32; // preprocessed image patches
std::string id; // optional user-defined ID, useful for KV cache tracking
// true if one of entries in batch_f32 is a placeholder
bool is_placeholder() const {
for (const auto & entry : batch_f32.entries) {
if (entry->is_placeholder()) {
return true;
}
}
return false;
}
mtmd_image_tokens clone() {
return mtmd_image_tokens{
nx,
@@ -70,10 +114,20 @@ struct mtmd_image_tokens {
using mtmd_image_tokens_ptr = std::unique_ptr<mtmd_image_tokens>;
struct mtmd_audio_tokens {
uint32_t n_tokens; // number of tokens
uint32_t n_tokens = 0; // number of tokens
clip_image_f32_batch batch_f32; // preprocessed image patches
std::string id; // optional user-defined ID, useful for KV cache tracking
// true if one of entries in batch_f32 is a placeholder
bool is_placeholder() const {
    // a single placeholder entry is enough to mark the whole batch as placeholder
    const auto & items = batch_f32.entries;
    for (size_t i = 0; i < items.size(); i++) {
        if (items[i]->is_placeholder()) {
            return true;
        }
    }
    return false;
}
mtmd_audio_tokens clone() {
return mtmd_audio_tokens{
n_tokens,
@@ -795,16 +849,19 @@ struct mtmd_tokenizer {
}
// sanity check
GGML_ASSERT(bitmap->nx > 0 && bitmap->ny > 0);
GGML_ASSERT(bitmap->data.size() == (size_t)bitmap->nx * bitmap->ny * 3);
if (bitmap->nx <= 0 || bitmap->ny <= 0) {
LOG_ERR("%s: error: invalid bitmap dimensions: nx = %d, ny = %d\n",
__func__, bitmap->nx, bitmap->ny);
return 2;
}
GGML_ASSERT(ctx->image_preproc != nullptr);
// convert mtmd_bitmap to clip_image_u8
clip_image_u8_ptr img_u8(clip_image_u8_init());
img_u8->nx = bitmap->nx;
img_u8->ny = bitmap->ny;
img_u8->buf.resize(bitmap->data.size());
std::memcpy(img_u8->buf.data(), bitmap->data.data(), img_u8->nx * img_u8->ny * 3);
img_u8->set_size(
{(int)bitmap->nx, (int)bitmap->ny},
bitmap->is_placeholder());
img_u8->cpy_buf(bitmap->get_ro_buf());
// preprocess image
clip_image_f32_batch batch_f32;
@@ -949,7 +1006,7 @@ struct mtmd_tokenizer {
return 2;
}
if (bitmap->data.size() == 0) {
if (bitmap->nx == 0) {
LOG_ERR("%s: error: empty audio data\n", __func__);
return 2;
}
@@ -960,26 +1017,46 @@ struct mtmd_tokenizer {
// sanity check
GGML_ASSERT(ctx->audio_preproc != nullptr);
GGML_ASSERT(bitmap->data.size() > sizeof(float));
GGML_ASSERT(bitmap->data.size() % sizeof(float) == 0);
// preprocess audio
std::vector<mtmd_audio_mel> mel_spec_chunks;
const float * samples = (const float *)bitmap->data.data();
size_t n_samples = bitmap->data.size() / sizeof(float);
bool ok = ctx->audio_preproc->preprocess(samples, n_samples, mel_spec_chunks);
if (!ok) {
LOG_ERR("Unable to preprocess audio\n");
return 2;
{
std::vector<float> dummy;
const float * samples = nullptr;
size_t n_samples = 0;
if (bitmap->is_placeholder()) {
// TODO @ngxson : skip the underlying processing if bitmap is placeholder
GGML_ASSERT(bitmap->ny == 1);
dummy.resize(bitmap->nx);
samples = dummy.data();
n_samples = dummy.size();
} else {
const auto & buf = bitmap->get_ro_buf();
GGML_ASSERT(buf.size() > sizeof(float));
GGML_ASSERT(buf.size() % sizeof(float) == 0);
samples = (const float *)buf.data();
n_samples = buf.size() / sizeof(float);
}
bool ok = ctx->audio_preproc->preprocess(samples, n_samples, mel_spec_chunks);
if (!ok) {
LOG_ERR("Unable to preprocess audio\n");
return 2;
}
}
// consider each mel_spec as a separate audio chunk
// TODO: maybe support batching, but this may come with memory cost
for (auto & mel_spec : mel_spec_chunks) {
const bool is_placeholder = mel_spec.data.empty();
clip_image_f32_ptr mel_f32(clip_image_f32_init());
mel_f32->nx = mel_spec.n_len;
mel_f32->ny = mel_spec.n_mel;
mel_f32->buf = std::move(mel_spec.data);
mel_f32->set_size(
{mel_spec.n_len, mel_spec.n_mel},
is_placeholder, /* is_audio */ true);
mel_f32->cpy_buf(mel_spec.data);
size_t n_tokens = clip_n_output_tokens(ctx->ctx_a, mel_f32.get());
clip_image_f32_batch batch_f32;
@@ -1098,12 +1175,28 @@ int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) {
LOG_ERR("%s: model does not support vision input\n", __func__);
return 1;
}
if (chunk->tokens_image == nullptr) {
LOG_ERR("%s: image tokens are null\n", __func__);
return 1;
}
if (chunk->tokens_image->is_placeholder()) {
LOG_ERR("%s: image tokens batch is placeholder\n", __func__);
return 1;
}
return mtmd_encode(ctx, chunk->tokens_image.get());
} else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
if (!ctx->ctx_a) {
LOG_ERR("%s: model does not support audio input\n", __func__);
return 1;
}
if (chunk->tokens_audio == nullptr) {
LOG_ERR("%s: audio tokens are null\n", __func__);
return 1;
}
if (chunk->tokens_audio->is_placeholder()) {
LOG_ERR("%s: audio tokens batch is placeholder\n", __func__);
return 1;
}
int n_mmproj_embd = ctx->n_embd_text;
ctx->image_embd_v.resize(chunk->tokens_audio->n_tokens * n_mmproj_embd);
bool ok = clip_image_batch_encode(
@@ -1141,6 +1234,10 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens)
// e.g., DeepSeek-OCR-2: 144 per tile views, 257 for the global view
size_t offset = 0;
for (size_t i = 0; i < entries.size(); i++) {
if (entries[i]->is_placeholder()) {
LOG_ERR("%s: image tokens batch entry %zu is placeholder\n", __func__, i);
return 1;
}
int n_tokens_per_image = clip_n_output_tokens(ctx_clip, entries[i].get());
ok = clip_image_encode(
ctx_clip,
@@ -1150,6 +1247,10 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens)
offset += static_cast<size_t>(n_mmproj_embd) * n_tokens_per_image;
}
} else {
if (image_tokens->is_placeholder()) {
LOG_ERR("%s: image tokens batch is placeholder\n", __func__);
return 1;
}
ok = clip_image_batch_encode(
ctx_clip,
ctx->n_threads,
@@ -1207,24 +1308,17 @@ int mtmd_get_audio_sample_rate(const mtmd_context * ctx) {
mtmd_bitmap * mtmd_bitmap_init(uint32_t nx,
uint32_t ny,
const unsigned char * data) {
mtmd_bitmap * bitmap = new mtmd_bitmap;
bitmap->nx = nx;
bitmap->ny = ny;
size_t data_size = (size_t)nx * ny * 3;
bitmap->data.resize(data_size);
std::memcpy(bitmap->data.data(), data, data_size);
mtmd_bitmap * bitmap = new mtmd_bitmap(data, nx, ny);
return bitmap;
}
mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples,
const float * data) {
mtmd_bitmap * bitmap = new mtmd_bitmap;
bitmap->nx = n_samples;
bitmap->ny = 1;
bitmap->is_audio = true;
size_t data_size = n_samples * sizeof(float);
bitmap->data.resize(data_size);
std::memcpy(bitmap->data.data(), data, data_size);
mtmd_bitmap * bitmap = new mtmd_bitmap((const unsigned char *)data, n_samples);
GGML_ASSERT(bitmap->is_audio);
if (!bitmap->is_placeholder()) {
GGML_ASSERT(bitmap->get_ro_buf().size() == n_samples * sizeof(float));
}
return bitmap;
}
@@ -1237,11 +1331,11 @@ uint32_t mtmd_bitmap_get_ny(const mtmd_bitmap * bitmap) {
}
const unsigned char * mtmd_bitmap_get_data(const mtmd_bitmap * bitmap) {
return bitmap->data.data();
return bitmap->get_ro_buf().data();
}
size_t mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap) {
return bitmap->data.size();
return bitmap->get_ro_buf().size();
}
bool mtmd_bitmap_is_audio(const mtmd_bitmap * bitmap) {
@@ -1535,14 +1629,16 @@ void mtmd_debug_encode_image(mtmd_context * ctx, const std::vector<std::vector<f
LOG_ERR("%s: model does not support vision input\n", __func__);
return;
}
clip_image_f32 inp_image;
inp_image.nx = image.size();
inp_image.ny = inp_image.nx;
inp_image.buf.reserve(inp_image.nx * inp_image.ny);
const int img_sz = (int)image.size();
std::vector<float> img_buf;
img_buf.reserve(img_sz * img_sz);
for (const auto & row : image) {
inp_image.buf.insert(inp_image.buf.end(), row.begin(), row.end());
img_buf.insert(img_buf.end(), row.begin(), row.end());
}
LOG_INF("%s: created input image with nx=%d, ny=%d\n", __func__, inp_image.nx, inp_image.ny);
clip_image_f32 inp_image;
inp_image.set_size({img_sz, img_sz}, false, false);
inp_image.cpy_buf(img_buf);
LOG_INF("%s: created input image with nx=%d, ny=%d\n", __func__, img_sz, img_sz);
mtmd_debug_encode_impl(ctx, ctx->ctx_v, inp_image);
}
@@ -1552,16 +1648,17 @@ void mtmd_debug_encode_audio(mtmd_context * ctx, const std::vector<float> & inpu
return;
}
int n_mel = clip_get_hparams(ctx->ctx_a)->n_mel_bins;
clip_image_f32 inp_audio;
inp_audio.nx = input.size();
inp_audio.ny = n_mel;
inp_audio.buf.resize(input.size() * n_mel);
for (size_t i = 0; i < input.size(); i++) {
const int audio_nx = (int)input.size();
std::vector<float> audio_buf(audio_nx * n_mel);
for (int i = 0; i < audio_nx; i++) {
for (int j = 0; j < n_mel; j++) {
inp_audio.buf[j * inp_audio.nx + i] = input[i];
audio_buf[j * audio_nx + i] = input[i];
}
}
LOG_INF("%s: created input audio with nx=%d, ny=%d\n", __func__, inp_audio.nx, inp_audio.ny);
clip_image_f32 inp_audio;
inp_audio.set_size({audio_nx, n_mel}, false, true);
inp_audio.cpy_buf(audio_buf);
LOG_INF("%s: created input audio with nx=%d, ny=%d\n", __func__, audio_nx, n_mel);
mtmd_debug_encode_impl(ctx, ctx->ctx_a, inp_audio);
}
@@ -1571,9 +1668,8 @@ void mtmd_debug_preprocess_image(mtmd_context * ctx, const std::vector<uint8_t>
return;
}
clip_image_u8 img_u8;
img_u8.nx = nx;
img_u8.ny = ny;
img_u8.buf = rgb_values;
img_u8.set_size({nx, ny}, false);
img_u8.cpy_buf(rgb_values);
clip_image_f32_batch batch_f32;
GGML_ASSERT(ctx->image_preproc != nullptr);
bool ok = ctx->image_preproc->preprocess(img_u8, batch_f32);
@@ -1583,7 +1679,7 @@ void mtmd_debug_preprocess_image(mtmd_context * ctx, const std::vector<uint8_t>
}
LOG_INF("%s: preprocessed image to batch_f32 with %d entries\n", __func__, (int)batch_f32.entries.size());
for (size_t i = 0; i < batch_f32.entries.size(); i++) {
LOG_INF("%s: entry %zu has nx=%d, ny=%d\n", __func__, i, batch_f32.entries[i]->nx, batch_f32.entries[i]->ny);
LOG_INF("%s: entry %zu has nx=%d, ny=%d\n", __func__, i, batch_f32.entries[i]->nx(), batch_f32.entries[i]->ny());
// TODO: better way to dump entry content?
}
}
+5
View File
@@ -136,6 +136,11 @@ MTMD_API int mtmd_get_audio_sample_rate(const mtmd_context * ctx);
// if bitmap is audio:
// length of data must be n_samples * sizeof(float)
// the data is in float format (PCM F32)
//
// if data == nullptr:
// the bitmap is considered "empty" and is treated as a placeholder for counting tokens
// nx / ny (or n_samples for audio) must still be set to the size of the real input
// pass the bitmap to mtmd_tokenize(), then call mtmd_*_get_n_tokens() to count the tokens
// note: chunks tokenized from a placeholder bitmap cannot be encoded; mtmd_encode_chunk() / mtmd_encode() will return an error
MTMD_API mtmd_bitmap * mtmd_bitmap_init (uint32_t nx, uint32_t ny, const unsigned char * data);
MTMD_API mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples, const float * data);
MTMD_API uint32_t mtmd_bitmap_get_nx (const mtmd_bitmap * bitmap);
+30
View File
@@ -1447,6 +1447,36 @@ See [OpenAI Embeddings API documentation](https://platform.openai.com/docs/api-r
}'
```
### POST `/v1/responses/input_tokens`: Token Counting
Similar to [Response input token counts API](https://developers.openai.com/api/reference/python/resources/responses/subresources/input_tokens/methods/count).
Example response:
```json
{
"object": "response.input_tokens",
"input_tokens": 11
}
```
### POST `/v1/chat/completions/input_tokens`: Token Counting
Similar to [Response input token counts API](https://developers.openai.com/api/reference/python/resources/responses/subresources/input_tokens/methods/count), but accepts a chat completion body as input.
Note: This is not an official OpenAI endpoint; it is provided for completeness and convenience.
Example response:
```json
{
"object": "response.input_tokens",
"input_tokens": 11
}
```
## Anthropic-compatible API Endpoints
### POST `/v1/messages`: Anthropic-compatible Messages API
Given a list of `messages`, returns the assistant's response. Streaming is supported via Server-Sent Events. While no strong claims of compatibility with the Anthropic API spec are made, in our experience it suffices to support many apps.
+2 -2
View File
@@ -713,10 +713,10 @@ static std::string fnv_hash(const uint8_t * data, size_t len) {
return std::to_string(hash);
}
server_tokens process_mtmd_prompt(mtmd_context * mctx, std::string prompt, std::vector<raw_buffer> files) {
server_tokens process_mtmd_prompt(mtmd_context * mctx, const std::string & prompt, const std::vector<raw_buffer> & files, bool is_placeholder) {
mtmd::bitmaps bitmaps;
for (auto & file : files) {
mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(mctx, file.data(), file.size()));
mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(mctx, file.data(), file.size(), is_placeholder));
if (!bmp.ptr) {
throw std::runtime_error("Failed to load image or audio file");
}
+2 -1
View File
@@ -258,7 +258,8 @@ llama_tokens tokenize_mixed(const llama_vocab * vocab, const json & json_prompt,
size_t validate_utf8(const std::string& text);
// process mtmd prompt, return the server_tokens containing both text tokens and media chunks
server_tokens process_mtmd_prompt(mtmd_context * mctx, std::string prompt, std::vector<raw_buffer> files);
// if is_placeholder is true, the media chunk will be treated as placeholder for counting tokens; the output tokens are not usable for actual inference (e.g. for submitting a task to server_queue)
server_tokens process_mtmd_prompt(mtmd_context * mctx, const std::string & prompt, const std::vector<raw_buffer> & files, bool is_placeholder = false);
/**
* break the input "prompt" object into multiple prompt if needed, then tokenize them
+60 -14
View File
@@ -4333,6 +4333,10 @@ void server_routes::init_routes() {
TASK_RESPONSE_TYPE_OAI_CHAT);
};
// Token-counting handler for a chat-completions request body.
// Delegates to handle_count_tokens() with the OAI chat response type.
this->post_chat_completions_tok = [this](const server_http_req & req) {
return handle_count_tokens(ctx_server.vocab, ctx_server.mctx, req, TASK_RESPONSE_TYPE_OAI_CHAT);
};
this->post_control = [this](const server_http_req & req) {
auto res = create_response();
const json body = json::parse(req.body);
@@ -4388,6 +4392,10 @@ void server_routes::init_routes() {
TASK_RESPONSE_TYPE_OAI_RESP);
};
// Token-counting handler for a Responses API request body.
// Delegates to handle_count_tokens() with the OAI responses response type.
this->post_responses_tok_oai = [this](const server_http_req & req) {
return handle_count_tokens(ctx_server.vocab, ctx_server.mctx, req, TASK_RESPONSE_TYPE_OAI_RESP);
};
this->post_transcriptions_oai = [this](const server_http_req & req) {
auto res = create_response();
@@ -4435,20 +4443,7 @@ void server_routes::init_routes() {
};
this->post_anthropic_count_tokens = [this](const server_http_req & req) {
auto res = create_response();
std::vector<raw_buffer> files;
json body = server_chat_convert_anthropic_to_oai(json::parse(req.body));
SRV_DBG("%s\n", "Request converted: Anthropic -> OpenAI Chat Completions");
SRV_DBG("converted request: %s\n", body.dump().c_str());
json body_parsed = oaicompat_chat_params_parse(
body,
meta->chat_params,
files);
json prompt = body_parsed.at("prompt");
llama_tokens tokens = tokenize_mixed(ctx_server.vocab, prompt, true, true);
res->ok({{"input_tokens", static_cast<int>(tokens.size())}});
return res;
return handle_count_tokens(ctx_server.vocab, ctx_server.mctx, req, TASK_RESPONSE_TYPE_ANTHROPIC);
};
// same with handle_chat_completions, but without inference part
@@ -4928,3 +4923,54 @@ std::unique_ptr<server_res_generator> server_routes::handle_embeddings_impl(cons
res->ok(root);
return res;
}
// Count the prompt tokens of a request without submitting any task.
//
// The request body is first normalized to a chat-completions body according to res_type
// (OAI chat: used as-is; OAI responses / Anthropic: converted), then passed through
// oaicompat_chat_params_parse() to obtain the final prompt.
//   vocab    : used by tokenize_mixed() when mctx is null
//   mctx     : when non-null, the prompt goes through process_mtmd_prompt() with is_placeholder = true
//   res_type : selects the input format; any other value yields an "invalid res_type" error response
// The response is {"input_tokens": N}; the two OAI variants additionally carry
// "object": "response.input_tokens".
// Throws std::runtime_error if mctx is set and the parsed prompt is not a string.
std::unique_ptr<server_res_generator> server_routes::handle_count_tokens(const llama_vocab * vocab, mtmd_context * mctx, const server_http_req & req, task_response_type res_type) {
    auto res = create_response();
    json body = json::parse(req.body);

    // bring the request into chat-completions form
    bool is_oai = true;
    if (res_type == TASK_RESPONSE_TYPE_OAI_CHAT) {
        // already a chat-completions body, nothing to convert
    } else if (res_type == TASK_RESPONSE_TYPE_OAI_RESP) {
        body = server_chat_convert_responses_to_chatcmpl(body);
    } else if (res_type == TASK_RESPONSE_TYPE_ANTHROPIC) {
        is_oai = false;
        body = server_chat_convert_anthropic_to_oai(body);
    } else {
        res->error(format_error_response("invalid res_type", ERROR_TYPE_INVALID_REQUEST));
        return res;
    }

    // handed to the parser and then to process_mtmd_prompt()
    // (presumably filled with the media attachments of the request - verify against the parser)
    std::vector<raw_buffer> files;
    json body_parsed = oaicompat_chat_params_parse(
        body,
        meta->chat_params,
        files);
    json prompt = body_parsed.at("prompt");

    // TODO @ngxson : refactor this code block, move this to server-common and reuse it in other places
    size_t n_tokens = 0;
    if (mctx == nullptr) {
        n_tokens = tokenize_mixed(vocab, prompt, true, true).size();
    } else {
        if (!prompt.is_string()) {
            throw std::runtime_error("for mtmd, input prompt must be a string.");
        }
        n_tokens = process_mtmd_prompt(mctx, prompt.get<std::string>(), files, true).size();
    }

    json response = {{"input_tokens", static_cast<int>(n_tokens)}};
    if (is_oai) {
        response["object"] = "response.input_tokens";
    }
    res->ok(response);
    return res;
}
+3
View File
@@ -110,8 +110,10 @@ struct server_routes {
server_http_context::handler_t post_completions;
server_http_context::handler_t post_completions_oai;
server_http_context::handler_t post_chat_completions;
server_http_context::handler_t post_chat_completions_tok;
server_http_context::handler_t post_control;
server_http_context::handler_t post_responses_oai;
server_http_context::handler_t post_responses_tok_oai;
server_http_context::handler_t post_transcriptions_oai;
server_http_context::handler_t post_anthropic_messages;
server_http_context::handler_t post_anthropic_count_tokens;
@@ -139,6 +141,7 @@ private:
std::unique_ptr<server_res_generator> handle_slots_restore(const server_http_req & req, int id_slot);
std::unique_ptr<server_res_generator> handle_slots_erase(const server_http_req &, int id_slot);
std::unique_ptr<server_res_generator> handle_embeddings_impl(const server_http_req & req, task_response_type res_type);
std::unique_ptr<server_res_generator> handle_count_tokens(const llama_vocab * vocab, mtmd_context * mctx, const server_http_req & req, task_response_type res_type);
// using unique_ptr to allow late initialization of const
std::unique_ptr<const server_context_meta> meta;
+8 -1
View File
@@ -161,6 +161,8 @@ int llama_server(int argc, char ** argv) {
routes.post_tokenize = models_routes->proxy_post;
routes.post_detokenize = models_routes->proxy_post;
routes.post_apply_template = models_routes->proxy_post;
routes.post_chat_completions_tok = models_routes->proxy_post;
routes.post_responses_tok_oai = models_routes->proxy_post;
routes.get_lora_adapters = models_routes->proxy_get;
routes.post_lora_adapters = models_routes->proxy_post;
routes.get_slots = models_routes->proxy_get;
@@ -192,7 +194,6 @@ int llama_server(int argc, char ** argv) {
ctx_http.post("/v1/audio/transcriptions", ex_wrapper(routes.post_transcriptions_oai));
ctx_http.post("/audio/transcriptions", ex_wrapper(routes.post_transcriptions_oai));
ctx_http.post("/v1/messages", ex_wrapper(routes.post_anthropic_messages)); // anthropic messages API
ctx_http.post("/v1/messages/count_tokens", ex_wrapper(routes.post_anthropic_count_tokens)); // anthropic token counting
ctx_http.post("/infill", ex_wrapper(routes.post_infill));
ctx_http.post("/embedding", ex_wrapper(routes.post_embeddings)); // legacy
ctx_http.post("/embeddings", ex_wrapper(routes.post_embeddings));
@@ -204,6 +205,12 @@ int llama_server(int argc, char ** argv) {
ctx_http.post("/tokenize", ex_wrapper(routes.post_tokenize));
ctx_http.post("/detokenize", ex_wrapper(routes.post_detokenize));
ctx_http.post("/apply-template", ex_wrapper(routes.post_apply_template));
// token counting
ctx_http.post("/chat/completions/input_tokens", ex_wrapper(routes.post_chat_completions_tok));
ctx_http.post("/v1/chat/completions/input_tokens", ex_wrapper(routes.post_chat_completions_tok));
ctx_http.post("/responses/input_tokens", ex_wrapper(routes.post_responses_tok_oai));
ctx_http.post("/v1/responses/input_tokens", ex_wrapper(routes.post_responses_tok_oai));
ctx_http.post("/v1/messages/count_tokens", ex_wrapper(routes.post_anthropic_count_tokens)); // anthropic token counting
// LoRA adapters hotswap
ctx_http.get ("/lora-adapters", ex_wrapper(routes.get_lora_adapters));
ctx_http.post("/lora-adapters", ex_wrapper(routes.post_lora_adapters));
@@ -573,3 +573,19 @@ def test_chat_completions_multiple_choices():
for choice in res.body["choices"]:
assert "assistant" == choice["message"]["role"]
assert choice["finish_reason"] == "length"
def test_chat_completions_token_count():
    """Token counting on a text-only chat body returns a stable, non-trivial count."""
    global server
    server.start()
    # send the same request twice: the reported count must not depend on
    # anything left behind by a previous request
    counts = []
    for _ in range(2):
        res = server.make_request("POST", "/chat/completions/input_tokens", data={
            "messages": [
                {"role": "system", "content": "Book"},
                {"role": "user", "content": "What is the best book"},
            ],
        })
        assert res.status_code == 200
        # OAI-style token counting responses carry an object name
        assert res.body["object"] == "response.input_tokens"
        assert res.body["input_tokens"] > 5
        counts.append(res.body["input_tokens"])
    assert counts[0] == counts[1]
@@ -98,6 +98,25 @@ def test_vision_chat_completion(prompt, image_url, success, re_content):
assert res.status_code != 200
def test_vision_chat_completion_token_count():
    """Token counting accepts a chat body that mixes a text part and an image part."""
    global server
    server.start()
    text_part = {"type": "text", "text": "What is this:"}
    image_part = {"type": "image_url", "image_url": {
        "url": get_img_url("IMG_URL_0"),
    }}
    payload = {
        "temperature": 0.0,
        "top_k": 1,
        "messages": [
            {"role": "user", "content": [text_part, image_part]},
        ],
    }
    res = server.make_request("POST", "/chat/completions/input_tokens", data=payload)
    assert res.status_code == 200
    assert res.body["input_tokens"] > 10
@pytest.mark.parametrize(
"prompt, image_data, success, re_content",
[