mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-09 07:16:44 +02:00
mtmd, server: add "placeholder bitmap" for counting tokens, add */input_tokens API (#23913)
* mtmd: add "placeholder bitmap" for counting tokens w/o preprocessing
* fast path skip preproc for placeholder
* fix build
* correct the api
* add server endpoint + tests
* add object name
* update docs
* add proxy handling
* fix build
* fix audio input path
* use is_placeholder in process_mtmd_prompt()
* nits
* nits (2)
* docs: clarify chat/completions/input_tokens is not official
* fix merge problem
This commit is contained in:
+139
-8
@@ -4,6 +4,7 @@
|
||||
#include "gguf.h"
|
||||
#include "clip.h"
|
||||
|
||||
#include <array>
|
||||
#include <climits>
|
||||
#include <cstdarg>
|
||||
#include <cinttypes>
|
||||
@@ -429,10 +430,68 @@ static projector_type clip_projector_type_from_string(const std::string & str) {
|
||||
|
||||
// RGB uint8 image
|
||||
struct clip_image_u8 {
|
||||
int nx;
|
||||
int ny;
|
||||
clip_image_size get_size() const {
|
||||
return { nx, ny };
|
||||
}
|
||||
|
||||
void set_size(clip_image_size size, bool is_placeholder) {
|
||||
nx = size.width;
|
||||
ny = size.height;
|
||||
if (is_placeholder) {
|
||||
buf.clear();
|
||||
} else {
|
||||
buf.resize((size_t) nx * (size_t) ny * 3);
|
||||
}
|
||||
}
|
||||
|
||||
void cpy_buf(const std::vector<uint8_t> & new_buf) {
|
||||
buf = new_buf;
|
||||
}
|
||||
|
||||
const std::vector<uint8_t> & get_ro_buf() const {
|
||||
if (is_placeholder()) {
|
||||
throw std::runtime_error("this clip_image_u8 is a placeholder");
|
||||
}
|
||||
return buf;
|
||||
}
|
||||
|
||||
// note to contributors: NEVER add a get_rw_buf(), it is a DANGEROUS pattern. always use get_pixel / set_pixel for buffer manipulation
|
||||
|
||||
bool is_placeholder() const {
|
||||
return buf.empty();
|
||||
}
|
||||
|
||||
std::array<uint8_t, 3> get_pixel(int x, int y) const {
|
||||
if (is_placeholder()) {
|
||||
// return a dummy value, so that legacy code can still process image without errors
|
||||
return { 0, 0, 0 };
|
||||
}
|
||||
int idx = (y * nx + x) * 3;
|
||||
return { buf[idx], buf[idx + 1], buf[idx + 2] };
|
||||
}
|
||||
|
||||
void set_pixel(int x, int y, const std::array<uint8_t, 3> & rgb) {
|
||||
if (is_placeholder()) {
|
||||
return; // no-op
|
||||
}
|
||||
int idx = (y * nx + x) * 3;
|
||||
buf[idx] = rgb[0];
|
||||
buf[idx + 1] = rgb[1];
|
||||
buf[idx + 2] = rgb[2];
|
||||
}
|
||||
|
||||
size_t n_pixels() const {
|
||||
return (size_t) nx * (size_t) ny;
|
||||
}
|
||||
|
||||
size_t n_elements() const {
|
||||
return n_pixels() * 3;
|
||||
}
|
||||
|
||||
private:
|
||||
std::vector<uint8_t> buf;
|
||||
int nx = 0;
|
||||
int ny = 0;
|
||||
};
|
||||
|
||||
// For images, buf.size() == nx*ny*3
|
||||
@@ -440,15 +499,87 @@ struct clip_image_u8 {
|
||||
// For audio, only one channel is used, buf.size() == nx*ny
|
||||
// nx will be n_frames and ny will be n_mel
|
||||
struct clip_image_f32 {
|
||||
int nx;
|
||||
int ny;
|
||||
|
||||
std::vector<float> buf;
|
||||
|
||||
// marks the global view in e.g., DeepSeek-OCR Models
|
||||
bool add_viewsep = false;
|
||||
// whether a learned newline token should be appended after the image (eg Granite4 Vision)
|
||||
// whether a learned newline (or EOI) token should be appended after the image (eg Granite4 Vision)
|
||||
bool add_newline = false;
|
||||
|
||||
clip_image_size get_size() const {
|
||||
return { nx_, ny_ };
|
||||
}
|
||||
|
||||
int nx() const { return nx_; }
|
||||
int ny() const { return ny_; }
|
||||
|
||||
void set_size(clip_image_size size, bool is_placeholder, bool is_audio) {
|
||||
nx_ = size.width;
|
||||
ny_ = size.height;
|
||||
if (is_placeholder) {
|
||||
buf.clear();
|
||||
} else {
|
||||
if (is_audio) {
|
||||
buf.resize((size_t) nx_ * (size_t) ny_);
|
||||
} else {
|
||||
buf.resize((size_t) nx_ * (size_t) ny_ * 3);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void cpy_buf(const std::vector<float> & new_buf) {
|
||||
buf = new_buf;
|
||||
}
|
||||
|
||||
void from_u8(const clip_image_u8 & img) {
|
||||
auto size = img.get_size();
|
||||
nx_ = size.width;
|
||||
ny_ = size.height;
|
||||
if (img.is_placeholder()) {
|
||||
buf.clear();
|
||||
return; // no-op
|
||||
}
|
||||
buf.resize(img.n_elements());
|
||||
const auto & u8_buf = img.get_ro_buf();
|
||||
for (size_t i = 0; i < img.n_elements(); ++i) {
|
||||
buf[i] = (float) u8_buf[i] / 255.0f;
|
||||
}
|
||||
}
|
||||
|
||||
size_t n_pixels() const {
|
||||
return (size_t) nx_ * (size_t) ny_;
|
||||
}
|
||||
|
||||
size_t n_elements() const {
|
||||
return n_pixels() * 3;
|
||||
}
|
||||
|
||||
void normalize(const float mean[3], const float std[3]) {
|
||||
if (is_placeholder()) {
|
||||
return; // no-op
|
||||
}
|
||||
for (size_t i = 0; i < n_pixels(); ++i) {
|
||||
buf[i * 3 + 0] = (buf[i * 3 + 0] - mean[0]) / std[0];
|
||||
buf[i * 3 + 1] = (buf[i * 3 + 1] - mean[1]) / std[1];
|
||||
buf[i * 3 + 2] = (buf[i * 3 + 2] - mean[2]) / std[2];
|
||||
}
|
||||
}
|
||||
|
||||
const std::vector<float> & get_ro_buf() const {
|
||||
if (is_placeholder()) {
|
||||
throw std::runtime_error("this clip_image_f32 is a placeholder");
|
||||
}
|
||||
return buf;
|
||||
}
|
||||
|
||||
// note to contributors: NEVER add a get_rw_buf(), it is a DANGEROUS pattern
|
||||
|
||||
bool is_placeholder() const {
|
||||
return buf.empty();
|
||||
}
|
||||
|
||||
private:
|
||||
std::vector<float> buf;
|
||||
int nx_ = 0;
|
||||
int ny_ = 0;
|
||||
};
|
||||
|
||||
//
|
||||
|
||||
+85
-130
@@ -39,12 +39,14 @@ static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::s
|
||||
}
|
||||
|
||||
// PPM header: P6 format, width, height, and max color value
|
||||
file << "P6\n" << img.nx << " " << img.ny << "\n255\n";
|
||||
const auto ppm_size = img.get_size();
|
||||
file << "P6\n" << ppm_size.width << " " << ppm_size.height << "\n255\n";
|
||||
|
||||
// Write pixel data
|
||||
for (size_t i = 0; i < img.buf.size(); i += 3) {
|
||||
const auto & ppm_buf = img.get_ro_buf();
|
||||
for (size_t i = 0; i < ppm_buf.size(); i += 3) {
|
||||
// PPM expects binary data in RGB format, which matches our image buffer
|
||||
file.write(reinterpret_cast<const char*>(&img.buf[i]), 3);
|
||||
file.write(reinterpret_cast<const char*>(&ppm_buf[i]), 3);
|
||||
}
|
||||
|
||||
file.close();
|
||||
@@ -57,9 +59,10 @@ static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string&
|
||||
return;
|
||||
}
|
||||
|
||||
int fileSize = 54 + 3 * img.nx * img.ny; // File header + info header + pixel data
|
||||
const auto bmp_size = img.get_size();
|
||||
int fileSize = 54 + 3 * bmp_size.width * bmp_size.height; // File header + info header + pixel data
|
||||
int bytesPerPixel = 3;
|
||||
int widthInBytes = img.nx * bytesPerPixel;
|
||||
int widthInBytes = bmp_size.width * bytesPerPixel;
|
||||
int paddingAmount = (4 - (widthInBytes % 4)) % 4;
|
||||
int stride = widthInBytes + paddingAmount;
|
||||
|
||||
@@ -72,7 +75,7 @@ static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string&
|
||||
};
|
||||
|
||||
// Total file size
|
||||
fileSize = 54 + (stride * img.ny);
|
||||
fileSize = 54 + (stride * bmp_size.height);
|
||||
fileHeader[2] = (unsigned char)(fileSize);
|
||||
fileHeader[3] = (unsigned char)(fileSize >> 8);
|
||||
fileHeader[4] = (unsigned char)(fileSize >> 16);
|
||||
@@ -94,14 +97,14 @@ static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string&
|
||||
};
|
||||
|
||||
// Width and height in the information header
|
||||
infoHeader[4] = (unsigned char)(img.nx);
|
||||
infoHeader[5] = (unsigned char)(img.nx >> 8);
|
||||
infoHeader[6] = (unsigned char)(img.nx >> 16);
|
||||
infoHeader[7] = (unsigned char)(img.nx >> 24);
|
||||
infoHeader[8] = (unsigned char)(img.ny);
|
||||
infoHeader[9] = (unsigned char)(img.ny >> 8);
|
||||
infoHeader[10] = (unsigned char)(img.ny >> 16);
|
||||
infoHeader[11] = (unsigned char)(img.ny >> 24);
|
||||
infoHeader[4] = (unsigned char)(bmp_size.width);
|
||||
infoHeader[5] = (unsigned char)(bmp_size.width >> 8);
|
||||
infoHeader[6] = (unsigned char)(bmp_size.width >> 16);
|
||||
infoHeader[7] = (unsigned char)(bmp_size.width >> 24);
|
||||
infoHeader[8] = (unsigned char)(bmp_size.height);
|
||||
infoHeader[9] = (unsigned char)(bmp_size.height >> 8);
|
||||
infoHeader[10] = (unsigned char)(bmp_size.height >> 16);
|
||||
infoHeader[11] = (unsigned char)(bmp_size.height >> 24);
|
||||
|
||||
// Write file headers
|
||||
file.write(reinterpret_cast<char*>(fileHeader), sizeof(fileHeader));
|
||||
@@ -109,14 +112,14 @@ static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string&
|
||||
|
||||
// Pixel data
|
||||
std::vector<unsigned char> padding(3, 0); // Max padding size to be added to each row
|
||||
for (int y = img.ny - 1; y >= 0; --y) { // BMP files are stored bottom-to-top
|
||||
for (int x = 0; x < img.nx; ++x) {
|
||||
for (int y = bmp_size.height - 1; y >= 0; --y) { // BMP files are stored bottom-to-top
|
||||
for (int x = 0; x < bmp_size.width; ++x) {
|
||||
// Each pixel
|
||||
size_t pixelIndex = (y * img.nx + x) * 3;
|
||||
const auto px = img.get_pixel(x, y);
|
||||
unsigned char pixel[3] = {
|
||||
img.buf[pixelIndex + 2], // BMP stores pixels in BGR format
|
||||
img.buf[pixelIndex + 1],
|
||||
img.buf[pixelIndex]
|
||||
px[2], // BMP stores pixels in BGR format
|
||||
px[1],
|
||||
px[0]
|
||||
};
|
||||
file.write(reinterpret_cast<char*>(pixel), 3);
|
||||
}
|
||||
@@ -129,12 +132,13 @@ static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string&
|
||||
|
||||
// debug function to convert f32 to u8
|
||||
static void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst) {
|
||||
dst.nx = src.nx;
|
||||
dst.ny = src.ny;
|
||||
dst.buf.resize(3 * src.nx * src.ny);
|
||||
for (size_t i = 0; i < src.buf.size(); ++i) {
|
||||
dst.buf[i] = static_cast<uint8_t>(std::min(std::max(int(src.buf[i] * 255.0f), 0), 255));
|
||||
dst.set_size(src.get_size(), false);
|
||||
const auto & src_buf = src.get_ro_buf();
|
||||
std::vector<uint8_t> dst_buf(src.n_elements());
|
||||
for (size_t i = 0; i < src.n_elements(); ++i) {
|
||||
dst_buf[i] = static_cast<uint8_t>(std::min(std::max(int(src_buf[i] * 255.0f), 0), 255));
|
||||
}
|
||||
dst.cpy_buf(dst_buf);
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -241,8 +245,8 @@ clip_graph::clip_graph(clip_ctx * ctx, const clip_image_f32 & img) :
|
||||
proj_type(ctx->proj_type()),
|
||||
img(img),
|
||||
patch_size(hparams.patch_size),
|
||||
n_patches_x(img.nx / patch_size),
|
||||
n_patches_y(img.ny / patch_size),
|
||||
n_patches_x(img.nx() / patch_size),
|
||||
n_patches_y(img.ny() / patch_size),
|
||||
n_patches(n_patches_x * n_patches_y),
|
||||
n_embd(hparams.n_embd),
|
||||
n_head(hparams.n_head),
|
||||
@@ -278,8 +282,8 @@ void clip_graph::cb(ggml_tensor * cur, const char * name, int il) const {
|
||||
// siglip2 naflex
|
||||
ggml_tensor * clip_graph::resize_position_embeddings(uint32_t interpolation_mode) {
|
||||
ggml_tensor * pos_embd = model.position_embeddings;
|
||||
const int height = img.ny / patch_size;
|
||||
const int width = img.nx / patch_size;
|
||||
const int height = img.ny() / patch_size;
|
||||
const int width = img.nx() / patch_size;
|
||||
const uint32_t mode = interpolation_mode;
|
||||
const int n_per_side = (int)std::sqrt(pos_embd->ne[1]);
|
||||
|
||||
@@ -523,7 +527,7 @@ ggml_tensor * clip_graph::build_inp() {
|
||||
}
|
||||
|
||||
ggml_tensor * clip_graph::build_inp_raw(int channels) {
|
||||
ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, img.nx, img.ny, channels);
|
||||
ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, img.nx(), img.ny(), channels);
|
||||
ggml_set_name(inp_raw, "inp_raw");
|
||||
ggml_set_input(inp_raw);
|
||||
return inp_raw;
|
||||
@@ -816,8 +820,8 @@ ggml_tensor * clip_graph::build_patch_merge_permute(ggml_tensor * cur, int scale
|
||||
GGML_ASSERT(scale_factor > 1);
|
||||
|
||||
const int n_embd = cur->ne[0];
|
||||
int width = img.nx / patch_size;
|
||||
int height = img.ny / patch_size;
|
||||
int width = img.nx() / patch_size;
|
||||
int height = img.ny() / patch_size;
|
||||
|
||||
// pad width and height to factor
|
||||
const int64_t pad_width = CLIP_ALIGN(width, scale_factor) - width;
|
||||
@@ -2805,13 +2809,12 @@ struct clip_model_loader {
|
||||
clip_image_f32_batch batch;
|
||||
clip_image_f32_ptr img(clip_image_f32_init());
|
||||
if (ctx_clip.model.modality == CLIP_MODALITY_VISION) {
|
||||
img->nx = hparams.warmup_image_size;
|
||||
img->ny = hparams.warmup_image_size;
|
||||
LOG_INF("%s: warmup with image size = %d x %d\n", __func__, img->nx, img->ny);
|
||||
const int sz = hparams.warmup_image_size;
|
||||
img->set_size({sz, sz}, false, false);
|
||||
LOG_INF("%s: warmup with image size = %d x %d\n", __func__, sz, sz);
|
||||
} else {
|
||||
img->nx = hparams.warmup_audio_size;
|
||||
img->ny = hparams.n_mel_bins;
|
||||
LOG_INF("%s: warmup with audio size = %d\n", __func__, img->nx);
|
||||
img->set_size({hparams.warmup_audio_size, hparams.n_mel_bins}, false, false);
|
||||
LOG_INF("%s: warmup with audio size = %d\n", __func__, hparams.warmup_audio_size);
|
||||
}
|
||||
batch.entries.push_back(std::move(img));
|
||||
warmup(ctx_clip, batch);
|
||||
@@ -3108,12 +3111,6 @@ struct clip_image_f32_batch * clip_image_f32_batch_init() {
|
||||
return new clip_image_f32_batch();
|
||||
}
|
||||
|
||||
// Returns a pointer to the first byte of img->buf. When nx / ny are non-null,
// the image width / height are written through them.
// NOTE(review): this reads img->nx, img->ny and img->buf directly; the line counts of the
// enclosing hunk header (-3108,12 +3111,6) suggest the function is on the removed side
// of this diff - confirm before relying on it.
unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny) {
|
||||
if (nx) *nx = img->nx;
|
||||
if (ny) *ny = img->ny;
|
||||
return img->buf.data();
|
||||
}
|
||||
|
||||
void clip_image_size_free(struct clip_image_size * load_image_size) {
|
||||
if (load_image_size == nullptr) {
|
||||
return;
|
||||
@@ -3134,7 +3131,7 @@ size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int id
|
||||
LOG_ERR("%s: invalid index %d\n", __func__, idx);
|
||||
return 0;
|
||||
}
|
||||
return batch->entries[idx]->nx;
|
||||
return batch->entries[idx]->nx();
|
||||
}
|
||||
|
||||
size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx) {
|
||||
@@ -3142,7 +3139,7 @@ size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int id
|
||||
LOG_ERR("%s: invalid index %d\n", __func__, idx);
|
||||
return 0;
|
||||
}
|
||||
return batch->entries[idx]->ny;
|
||||
return batch->entries[idx]->ny();
|
||||
}
|
||||
|
||||
clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx) {
|
||||
@@ -3153,13 +3150,6 @@ clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batc
|
||||
return batch->entries[idx].get();
|
||||
}
|
||||
|
||||
// Sets img to nx x ny, resizes its buffer to 3 * nx * ny bytes and copies that many
// bytes from rgb_pixels into it.
// NOTE(review): 3 * nx * ny is evaluated in int; presumably callers keep images small
// enough that this does not overflow - confirm. The hunk header line counts
// (-3153,13 +3150,6) suggest this function is on the removed side of this diff.
void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, clip_image_u8 * img) {
|
||||
img->nx = nx;
|
||||
img->ny = ny;
|
||||
img->buf.resize(3 * nx * ny);
|
||||
memcpy(img->buf.data(), rgb_pixels, img->buf.size());
|
||||
}
|
||||
|
||||
void clip_free(clip_ctx * ctx) {
|
||||
if (ctx == nullptr) {
|
||||
return;
|
||||
@@ -3167,20 +3157,6 @@ void clip_free(clip_ctx * ctx) {
|
||||
delete ctx;
|
||||
}
|
||||
|
||||
// deprecated
|
||||
// Embedding size in bytes for a square image whose width and height both equal
// hparams.image_size; forwards to clip_embd_nbytes_by_img().
size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
|
||||
const int32_t nx = ctx->model.hparams.image_size;
|
||||
const int32_t ny = ctx->model.hparams.image_size;
|
||||
return clip_embd_nbytes_by_img(ctx, nx, ny);
|
||||
}
|
||||
|
||||
// Embedding size in bytes for an image of img_w x img_h: builds a temporary
// clip_image_f32 that carries only those dimensions and returns
// clip_n_output_tokens() * clip_n_mmproj_embd() * sizeof(float).
// NOTE(review): assigns img.nx / img.ny as plain fields; the hunk header line counts
// (-3167,20 +3157,6) suggest this function is on the removed side of this diff.
size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h) {
|
||||
clip_image_f32 img;
|
||||
img.nx = img_w;
|
||||
img.ny = img_h;
|
||||
return clip_n_output_tokens(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float);
|
||||
}
|
||||
|
||||
// Returns the image_size hyper-parameter of the model held by ctx.
int32_t clip_get_image_size(const struct clip_ctx * ctx) {
|
||||
return ctx->model.hparams.image_size;
|
||||
}
|
||||
@@ -3211,9 +3187,9 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 *
|
||||
case PROJECTOR_TYPE_PADDLEOCR:
|
||||
case PROJECTOR_TYPE_HUNYUANVL:
|
||||
case PROJECTOR_TYPE_YOUTUVL:
|
||||
return (img->nx / params.patch_size) / 2;
|
||||
return (img->nx() / params.patch_size) / 2;
|
||||
case PROJECTOR_TYPE_STEP3VL:
|
||||
return img->nx / (params.patch_size * params.n_merge);
|
||||
return img->nx() / (params.patch_size * params.n_merge);
|
||||
default:
|
||||
break;
|
||||
}
|
||||
@@ -3233,9 +3209,9 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 *
|
||||
case PROJECTOR_TYPE_PADDLEOCR:
|
||||
case PROJECTOR_TYPE_HUNYUANVL:
|
||||
case PROJECTOR_TYPE_YOUTUVL:
|
||||
return (img->ny / params.patch_size) / 2;
|
||||
return (img->ny() / params.patch_size) / 2;
|
||||
case PROJECTOR_TYPE_STEP3VL:
|
||||
return img->ny / (params.patch_size * params.n_merge);
|
||||
return img->ny() / (params.patch_size * params.n_merge);
|
||||
default:
|
||||
break;
|
||||
}
|
||||
@@ -3247,7 +3223,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
|
||||
|
||||
// for models with fixed size image, the input image is already pre-processed and resized to square
|
||||
int patch_size = params.patch_size;
|
||||
int n_patches = (img->nx / patch_size) * (img->ny / patch_size);
|
||||
int n_patches = (img->nx() / patch_size) * (img->ny() / patch_size);
|
||||
|
||||
projector_type proj = ctx->proj_type();
|
||||
|
||||
@@ -3313,14 +3289,14 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
|
||||
case PROJECTOR_TYPE_YOUTUVL:
|
||||
{
|
||||
// dynamic size (2 conv, so double patch size)
|
||||
int x_patch = img->nx / (params.patch_size * 2);
|
||||
int y_patch = img->ny / (params.patch_size * 2);
|
||||
int x_patch = img->nx() / (params.patch_size * 2);
|
||||
int y_patch = img->ny() / (params.patch_size * 2);
|
||||
n_patches = x_patch * y_patch;
|
||||
} break;
|
||||
case PROJECTOR_TYPE_STEP3VL:
|
||||
{
|
||||
int x_patch = img->nx / (params.patch_size * params.n_merge);
|
||||
int y_patch = img->ny / (params.patch_size * params.n_merge);
|
||||
int x_patch = img->nx() / (params.patch_size * params.n_merge);
|
||||
int y_patch = img->ny() / (params.patch_size * params.n_merge);
|
||||
n_patches = x_patch * y_patch;
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GEMMA3:
|
||||
@@ -3347,8 +3323,8 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
|
||||
{
|
||||
// dynamic size
|
||||
int out_patch_size = params.patch_size * ctx->model.hparams.n_merge;
|
||||
int x_patch = CLIP_ALIGN(img->nx, out_patch_size) / out_patch_size;
|
||||
int y_patch = CLIP_ALIGN(img->ny, out_patch_size) / out_patch_size;
|
||||
int x_patch = CLIP_ALIGN(img->nx(), out_patch_size) / out_patch_size;
|
||||
int y_patch = CLIP_ALIGN(img->ny(), out_patch_size) / out_patch_size;
|
||||
n_patches = x_patch * y_patch;
|
||||
} break;
|
||||
case PROJECTOR_TYPE_PADDLEOCR:
|
||||
@@ -3364,8 +3340,8 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
|
||||
{
|
||||
// dynamic size
|
||||
int n_merge = ctx->model.hparams.n_merge;
|
||||
int n_patches_x = img->nx / patch_size / (n_merge > 0 ? n_merge : 1);
|
||||
int n_patches_y = img->ny / patch_size / (n_merge > 0 ? n_merge : 1);
|
||||
int n_patches_x = img->nx() / patch_size / (n_merge > 0 ? n_merge : 1);
|
||||
int n_patches_y = img->ny() / patch_size / (n_merge > 0 ? n_merge : 1);
|
||||
if (ctx->model.token_embd_img_break) {
|
||||
n_patches = n_patches_y * n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row
|
||||
} else {
|
||||
@@ -3378,7 +3354,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
|
||||
case PROJECTOR_TYPE_MERALION:
|
||||
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
|
||||
{
|
||||
n_patches = img->nx;
|
||||
n_patches = img->nx();
|
||||
|
||||
const int proj_stack_factor = ctx->model.hparams.proj_stack_factor;
|
||||
if (ctx->model.audio_has_stack_frames()) {
|
||||
@@ -3400,11 +3376,11 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
|
||||
// chunk_size=100 frames --> 3x stride-2 conv2d --> 13 tokens per chunk
|
||||
const int chunk_size = 100;
|
||||
const int tokens_per_chunk = 13;
|
||||
n_patches = (img->nx / chunk_size) * tokens_per_chunk;
|
||||
n_patches = (img->nx() / chunk_size) * tokens_per_chunk;
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GLMA:
|
||||
{
|
||||
n_patches = img->nx;
|
||||
n_patches = img->nx();
|
||||
// whisper downscales input token by half after conv1d
|
||||
n_patches /= 2;
|
||||
// reshape by merge_factor
|
||||
@@ -3431,8 +3407,8 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
|
||||
case PROJECTOR_TYPE_HUNYUANVL:
|
||||
{
|
||||
int merge = ctx->model.hparams.n_merge;
|
||||
int ow = (img->nx / patch_size) / merge;
|
||||
int oh = (img->ny / patch_size) / merge;
|
||||
int ow = (img->nx() / patch_size) / merge;
|
||||
int oh = (img->ny() / patch_size) / merge;
|
||||
n_patches = (ow + 1) * oh + 2;
|
||||
} break;
|
||||
case PROJECTOR_TYPE_DEEPSEEKOCR2:
|
||||
@@ -3446,13 +3422,13 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
|
||||
} break;
|
||||
case PROJECTOR_TYPE_LFM2A:
|
||||
{
|
||||
n_patches = ((((img->nx + 1) / 2) + 1) / 2 + 1) / 2;
|
||||
n_patches = ((((img->nx() + 1) / 2) + 1) / 2 + 1) / 2;
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GEMMA4A:
|
||||
{
|
||||
// Two Conv2D stride-2: O = floor((I + 2p - k) / s) + 1, p=1, k=3, s=2
|
||||
// O = floor((I - 1) / 2) + 1
|
||||
int n = img->nx;
|
||||
int n = img->nx();
|
||||
for (int i = 0; i < 2; i++) {
|
||||
n = (n - 1) / 2 + 1;
|
||||
}
|
||||
@@ -3460,13 +3436,13 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GEMMA4UA:
|
||||
{
|
||||
n_patches = img->nx; // no downsampling: one token per raw waveform frame
|
||||
n_patches = img->nx(); // no downsampling: one token per raw waveform frame
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GRANITE_SPEECH:
|
||||
{
|
||||
const int ws = ctx->model.hparams.audio_proj_window_size;
|
||||
const int ds = ctx->model.hparams.audio_proj_downsample_rate;
|
||||
n_patches = ((img->nx + ws - 1) / ws) * (ws / ds);
|
||||
n_patches = ((img->nx() + ws - 1) / ws) * (ws / ds);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GRANITE4_VISION:
|
||||
{
|
||||
@@ -3475,7 +3451,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
|
||||
// For 384×384 input: n = 24/8 = 3, query_side = 4 → 144.
|
||||
const int window_side = ctx->model.hparams.downsample_window_side;
|
||||
const int query_side = ctx->model.hparams.downsample_query_side;
|
||||
const int side = img->nx / params.patch_size;
|
||||
const int side = img->nx() / params.patch_size;
|
||||
const int n = side / window_side;
|
||||
n_patches = (query_side * n) * (query_side * n);
|
||||
if (img->add_newline) {
|
||||
@@ -3525,8 +3501,8 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
const auto & model = ctx->model;
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const int image_size_width = imgs.entries[0]->nx;
|
||||
const int image_size_height = imgs.entries[0]->ny;
|
||||
const int image_size_width = imgs.entries[0]->nx();
|
||||
const int image_size_height = imgs.entries[0]->ny();
|
||||
|
||||
const int patch_size = hparams.patch_size;
|
||||
const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
|
||||
@@ -3546,7 +3522,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
return inp;
|
||||
};
|
||||
|
||||
auto set_input_f32 = [&get_inp_tensor](const char * name, std::vector<float> & values) {
|
||||
auto set_input_f32 = [&get_inp_tensor](const char * name, const std::vector<float> & values) {
|
||||
ggml_tensor * cur = get_inp_tensor(name);
|
||||
GGML_ASSERT(cur->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(ggml_nelements(cur) == (int64_t)values.size());
|
||||
@@ -3564,7 +3540,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
if (!imgs.is_audio) {
|
||||
size_t nelem = 0;
|
||||
for (const auto & img : imgs.entries) {
|
||||
nelem += img->nx * img->ny * 3;
|
||||
nelem += img->nx() * img->ny() * 3;
|
||||
}
|
||||
std::vector<float> inp_raw(nelem);
|
||||
|
||||
@@ -3580,19 +3556,20 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
// ──────┘ x B
|
||||
|
||||
for (size_t i = 0; i < imgs.entries.size(); i++) {
|
||||
const int nx = imgs.entries[i]->nx;
|
||||
const int ny = imgs.entries[i]->ny;
|
||||
const int nx = imgs.entries[i]->nx();
|
||||
const int ny = imgs.entries[i]->ny();
|
||||
const int n = nx * ny;
|
||||
|
||||
for (int b = 0; b < batch_size; b++) {
|
||||
const auto & buf = imgs.entries[b]->get_ro_buf();
|
||||
float * batch_entry = inp_raw.data() + b * (3*n);
|
||||
for (int y = 0; y < ny; y++) {
|
||||
for (int x = 0; x < nx; x++) {
|
||||
size_t base_src = 3*(y * nx + x); // idx of the first channel
|
||||
size_t base_dst = y * nx + x; // idx of the first channel
|
||||
batch_entry[ base_dst] = imgs.entries[b]->buf[base_src ];
|
||||
batch_entry[1*n + base_dst] = imgs.entries[b]->buf[base_src + 1];
|
||||
batch_entry[2*n + base_dst] = imgs.entries[b]->buf[base_src + 2];
|
||||
batch_entry[ base_dst] = buf[base_src ];
|
||||
batch_entry[1*n + base_dst] = buf[base_src + 1];
|
||||
batch_entry[2*n + base_dst] = buf[base_src + 2];
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -3602,12 +3579,14 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
} else {
|
||||
// audio input
|
||||
GGML_ASSERT(imgs.entries.size() == 1);
|
||||
|
||||
const auto & mel_inp = imgs.entries[0];
|
||||
const int n_step = mel_inp->nx;
|
||||
const int n_mel = mel_inp->ny;
|
||||
std::vector<float> inp_raw(n_step * n_mel);
|
||||
std::memcpy(inp_raw.data(), mel_inp->buf.data(), n_step * n_mel * sizeof(float));
|
||||
set_input_f32("inp_raw", inp_raw);
|
||||
const auto & buf = mel_inp->get_ro_buf();
|
||||
const int n_step = mel_inp->nx();
|
||||
const int n_mel = mel_inp->ny();
|
||||
GGML_ASSERT((size_t)n_step * n_mel == buf.size());
|
||||
|
||||
set_input_f32("inp_raw", buf);
|
||||
}
|
||||
|
||||
// set input per projector
|
||||
@@ -4218,7 +4197,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
GGML_ASSERT(imgs.entries.size() == 1);
|
||||
const auto & img0 = imgs.entries.front();
|
||||
// Compute n_pos matching SSCP output: two stride-2 convs
|
||||
int n_pos = img0->nx;
|
||||
int n_pos = img0->nx();
|
||||
for (int i = 0; i < 2; i++) { n_pos = (n_pos - 1) / 2 + 1; }
|
||||
|
||||
// Chunked local attention: blocked causal mask and RPE
|
||||
@@ -4324,7 +4303,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
// reshapes as ggml_get_rows gathers. The names are set
|
||||
// by g4v_gather() in models/granite4-vision.cpp.
|
||||
const int patch_size = model.hparams.patch_size;
|
||||
const int image_side = imgs.entries.front()->nx / patch_size;
|
||||
const int image_side = imgs.entries.front()->nx() / patch_size;
|
||||
const int window_side = hparams.downsample_window_side;
|
||||
const int query_side = hparams.downsample_query_side;
|
||||
const int n = image_side / window_side;
|
||||
@@ -4570,19 +4549,6 @@ bool clip_has_audio_encoder(const struct clip_ctx * ctx) {
|
||||
return ctx->model.modality == CLIP_MODALITY_AUDIO;
|
||||
}
|
||||
|
||||
// Copies h * w * 3 floats from img into a temporary clip_image_f32 of size w x h and
// passes it to clip_image_encode(), which receives vec as its output argument.
// Always returns true: the result of clip_image_encode() is discarded.
// NOTE(review): accesses clip_img.buf / nx / ny as plain fields; the hunk header line
// counts (-4570,19 +4549,6) suggest this function is on the removed side of this diff.
bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
|
||||
clip_image_f32 clip_img;
|
||||
clip_img.buf.resize(h * w * 3);
|
||||
for (int i = 0; i < h*w*3; i++)
|
||||
{
|
||||
clip_img.buf[i] = img[i];
|
||||
}
|
||||
clip_img.nx = w;
|
||||
clip_img.ny = h;
|
||||
clip_image_encode(ctx, n_threads, &clip_img, vec);
|
||||
return true;
|
||||
}
|
||||
|
||||
//
|
||||
// API used internally with mtmd
|
||||
//
|
||||
@@ -4591,17 +4557,6 @@ projector_type clip_get_projector_type(const struct clip_ctx * ctx) {
|
||||
return ctx->proj_type();
|
||||
}
|
||||
|
||||
// Appends one audio entry to batch: allocates a clip_image_f32 with nx = n_frames and
// ny = n_mel, copies n_frames * n_mel floats from mel into it, wraps the raw pointer in
// a clip_image_f32_ptr stored in batch->entries, and sets batch->is_audio.
// NOTE(review): writes audio->nx / ny / buf as plain fields; the hunk header line counts
// (-4591,17 +4557,6) suggest this function is on the removed side of this diff.
void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel, int n_frames, float * mel) {
|
||||
clip_image_f32 * audio = new clip_image_f32;
|
||||
audio->nx = n_frames;
|
||||
|
||||
audio->ny = n_mel;
|
||||
audio->buf.resize(n_frames * n_mel);
|
||||
std::memcpy(audio->buf.data(), mel, n_frames * n_mel * sizeof(float));
|
||||
|
||||
batch->entries.push_back(clip_image_f32_ptr(audio));
|
||||
batch->is_audio = true;
|
||||
}
|
||||
|
||||
// Returns a pointer to the hyper-parameters stored inside ctx's model (not a copy).
const clip_hparams * clip_get_hparams(const struct clip_ctx * ctx) {
|
||||
return &ctx->model.hparams;
|
||||
}
|
||||
|
||||
+3
-17
@@ -17,6 +17,9 @@ struct clip_ctx;
|
||||
// Width / height pair, in pixels for images. Remains an aggregate, so it can still
// be brace-initialized as { width, height }.
struct clip_image_size {
    int width;
    int height;

    // Two sizes are equal when both dimensions match.
    bool operator==(const clip_image_size & other) const {
        return width == other.width && height == other.height;
    }

    // Counterpart of operator==; C++17 does not derive it automatically.
    bool operator!=(const clip_image_size & other) const {
        return !(*this == other);
    }
};
|
||||
|
||||
struct clip_image_f32;
|
||||
@@ -54,9 +57,6 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
|
||||
|
||||
void clip_free(struct clip_ctx * ctx);
|
||||
|
||||
size_t clip_embd_nbytes(const struct clip_ctx * ctx);
|
||||
size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h);
|
||||
|
||||
int32_t clip_get_image_size (const struct clip_ctx * ctx);
|
||||
int32_t clip_get_patch_size (const struct clip_ctx * ctx);
|
||||
int32_t clip_get_hidden_size(const struct clip_ctx * ctx);
|
||||
@@ -79,9 +79,6 @@ struct clip_image_u8 * clip_image_u8_init (void);
|
||||
struct clip_image_f32 * clip_image_f32_init(void);
|
||||
struct clip_image_f32_batch * clip_image_f32_batch_init(void); // only used by libllava
|
||||
|
||||
// nx, ny are the output image dimensions
|
||||
unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny);
|
||||
|
||||
void clip_image_size_free (struct clip_image_size * img_size);
|
||||
void clip_image_u8_free (struct clip_image_u8 * img);
|
||||
void clip_image_f32_free(struct clip_image_f32 * img);
|
||||
@@ -94,12 +91,6 @@ size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int id
|
||||
size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->ny
|
||||
struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->data
|
||||
|
||||
/**
|
||||
* Build image from pixels decoded by other libraries instead of stb_image.h for better performance.
|
||||
* The memory layout is RGBRGBRGB..., input buffer length must be 3*nx*ny bytes
|
||||
*/
|
||||
void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img);
|
||||
|
||||
bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
|
||||
bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);
|
||||
|
||||
@@ -107,11 +98,6 @@ bool clip_is_llava(const struct clip_ctx * ctx);
|
||||
// note for contributor: this clip_is_(model) pattern is deprecated
|
||||
// do NOT add new functions like this
|
||||
|
||||
bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);
|
||||
|
||||
// use by audio input
|
||||
void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel, int n_frames, float * mel);
|
||||
|
||||
bool clip_has_vision_encoder(const struct clip_ctx * ctx);
|
||||
bool clip_has_audio_encoder(const struct clip_ctx * ctx);
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
#include "models.h"
|
||||
|
||||
ggml_cgraph * clip_graph_conformer::build() {
|
||||
const int n_frames = img.nx;
|
||||
const int n_frames = img.nx();
|
||||
const int n_pos = n_frames / 2;
|
||||
const int n_pos_embd = (((((n_frames + 1) / 2) + 1) / 2 + 1) / 2) * 2 - 1;
|
||||
GGML_ASSERT(model.position_embeddings->ne[1] >= n_pos);
|
||||
|
||||
@@ -22,8 +22,8 @@ ggml_cgraph * clip_graph_exaone4_5::build() {
|
||||
ggml_tensor * inp_raw = build_inp_raw();
|
||||
ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
|
||||
|
||||
GGML_ASSERT(img.nx % (patch_size * 2) == 0);
|
||||
GGML_ASSERT(img.ny % (patch_size * 2) == 0);
|
||||
GGML_ASSERT(img.nx() % (patch_size * 2) == 0);
|
||||
GGML_ASSERT(img.ny() % (patch_size * 2) == 0);
|
||||
|
||||
{
|
||||
ggml_tensor * inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
|
||||
|
||||
@@ -16,8 +16,8 @@ ggml_cgraph * clip_graph_glm4v::build() {
|
||||
ggml_set_name(positions, "positions");
|
||||
ggml_set_input(positions);
|
||||
|
||||
GGML_ASSERT(img.nx % (patch_size * 2) == 0);
|
||||
GGML_ASSERT(img.ny % (patch_size * 2) == 0);
|
||||
GGML_ASSERT(img.nx() % (patch_size * 2) == 0);
|
||||
GGML_ASSERT(img.ny() % (patch_size * 2) == 0);
|
||||
|
||||
// second conv dimension
|
||||
{
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
#include "models.h"
|
||||
|
||||
ggml_cgraph * clip_graph_granite_speech::build() {
|
||||
const int n_frames = img.nx;
|
||||
const int n_frames = img.nx();
|
||||
const int context_size = hparams.audio_chunk_size;
|
||||
const int ctc_layer = n_layer / 2;
|
||||
const int conv_kernel = hparams.audio_conv_kernel_size;
|
||||
|
||||
@@ -7,8 +7,8 @@
|
||||
// with a w*h? Also the permute is a bit different at (2, 1, 0, 3) instead of (2, 0, 1, 3).
|
||||
ggml_tensor * clip_graph_kimik25::resize_position_embeddings_3d(uint32_t interpolation_mode) {
|
||||
ggml_tensor * pos_embd = model.position_embeddings;
|
||||
const int height = img.ny / patch_size;
|
||||
const int width = img.nx / patch_size;
|
||||
const int height = img.ny() / patch_size;
|
||||
const int width = img.nx() / patch_size;
|
||||
const uint32_t mode = interpolation_mode;
|
||||
|
||||
GGML_ASSERT(pos_embd);
|
||||
|
||||
@@ -56,8 +56,8 @@ ggml_cgraph * clip_graph_mimovl::build() {
|
||||
patch_size, patch_size, 0, 0, 1, 1);
|
||||
inp = ggml_add(ctx0, inp, inp_1);
|
||||
|
||||
GGML_ASSERT(img.nx % (patch_size * 2) == 0);
|
||||
GGML_ASSERT(img.ny % (patch_size * 2) == 0);
|
||||
GGML_ASSERT(img.nx() % (patch_size * 2) == 0);
|
||||
GGML_ASSERT(img.ny() % (patch_size * 2) == 0);
|
||||
|
||||
inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w,h,c,b] -> [c,w,h,b]
|
||||
inp = ggml_cont_4d(ctx0, inp, n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
|
||||
|
||||
@@ -19,8 +19,8 @@ ggml_cgraph * clip_graph_qwen2vl::build() {
|
||||
ggml_tensor * inp_raw = build_inp_raw();
|
||||
ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
|
||||
|
||||
GGML_ASSERT(img.nx % (patch_size * 2) == 0);
|
||||
GGML_ASSERT(img.ny % (patch_size * 2) == 0);
|
||||
GGML_ASSERT(img.nx() % (patch_size * 2) == 0);
|
||||
GGML_ASSERT(img.ny() % (patch_size * 2) == 0);
|
||||
|
||||
// second conv dimension
|
||||
{
|
||||
|
||||
@@ -16,8 +16,8 @@ ggml_cgraph * clip_graph_qwen3vl::build() {
|
||||
ggml_tensor * inp_raw = build_inp_raw();
|
||||
ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
|
||||
|
||||
GGML_ASSERT(img.nx % (patch_size * 2) == 0);
|
||||
GGML_ASSERT(img.ny % (patch_size * 2) == 0);
|
||||
GGML_ASSERT(img.nx() % (patch_size * 2) == 0);
|
||||
GGML_ASSERT(img.ny() % (patch_size * 2) == 0);
|
||||
|
||||
// second conv dimension
|
||||
{
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
#include "models.h"
|
||||
|
||||
ggml_cgraph * clip_graph_whisper_enc::build() {
|
||||
const int n_frames = img.nx;
|
||||
const int n_frames = img.nx();
|
||||
const int n_pos = n_frames / 2;
|
||||
GGML_ASSERT(model.position_embeddings->ne[1] >= n_pos);
|
||||
|
||||
|
||||
@@ -166,7 +166,7 @@ struct mtmd_cli_context {
|
||||
}
|
||||
|
||||
bool load_media(const std::string & fname) {
|
||||
mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(ctx_vision.get(), fname.c_str()));
|
||||
mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(ctx_vision.get(), fname.c_str(), false));
|
||||
if (!bmp.ptr) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -478,7 +478,7 @@ static bool decode_audio_from_buf(const unsigned char * buf_in, size_t len, int
|
||||
|
||||
} // namespace audio_helpers
|
||||
|
||||
mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len) {
|
||||
mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len, bool placeholder) {
|
||||
if (audio_helpers::is_audio_file((const char *)buf, len)) {
|
||||
std::vector<float> pcmf32;
|
||||
const int sample_rate = mtmd_get_audio_sample_rate(ctx);
|
||||
@@ -490,7 +490,7 @@ mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigne
|
||||
LOG_ERR("Unable to read WAV audio file from buffer\n");
|
||||
return nullptr;
|
||||
}
|
||||
return mtmd_bitmap_init_from_audio(pcmf32.size(), pcmf32.data());
|
||||
return mtmd_bitmap_init_from_audio(pcmf32.size(), placeholder ? nullptr : pcmf32.data());
|
||||
}
|
||||
|
||||
// otherwise, we assume it's an image
|
||||
@@ -502,13 +502,13 @@ mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigne
|
||||
LOG_ERR("%s: failed to decode image bytes\n", __func__);
|
||||
return nullptr;
|
||||
}
|
||||
result = mtmd_bitmap_init(nx, ny, data);
|
||||
result = mtmd_bitmap_init(nx, ny, placeholder ? nullptr : data);
|
||||
stbi_image_free(data);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname) {
|
||||
mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname, bool placeholder) {
|
||||
std::vector<unsigned char> buf;
|
||||
FILE * f = fopen(fname, "rb");
|
||||
if (!f) {
|
||||
@@ -533,5 +533,6 @@ mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char *
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
return mtmd_helper_bitmap_init_from_buf(ctx, buf.data(), buf.size());
|
||||
return mtmd_helper_bitmap_init_from_buf(ctx, buf.data(), buf.size(), placeholder);
|
||||
}
|
||||
|
||||
|
||||
@@ -29,7 +29,7 @@ MTMD_API void mtmd_helper_log_set(ggml_log_callback log_callback, void * user_da
|
||||
// it calls mtmd_helper_bitmap_init_from_buf() internally
|
||||
// returns nullptr on failure
|
||||
// this function is thread-safe
|
||||
MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname);
|
||||
MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname, bool placeholder);
|
||||
|
||||
// helper function to construct a mtmd_bitmap from a buffer containing a file
|
||||
// supported formats:
|
||||
@@ -38,7 +38,7 @@ MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, con
|
||||
// note: audio files will be auto-detected based on magic bytes
|
||||
// returns nullptr on failure
|
||||
// this function is thread-safe
|
||||
MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len);
|
||||
MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len, bool placeholder);
|
||||
|
||||
// helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache
|
||||
MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks);
|
||||
|
||||
+184
-170
@@ -9,25 +9,12 @@
|
||||
//
|
||||
|
||||
void mtmd_image_preprocessor::img_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst, const float mean[3], const float std[3]) {
|
||||
dst.nx = src.nx;
|
||||
dst.ny = src.ny;
|
||||
dst.buf.resize(src.buf.size());
|
||||
|
||||
// TODO @ngxson : seems like this could be done more efficiently on cgraph
|
||||
for (size_t i = 0; i < src.buf.size(); ++i) {
|
||||
int c = i % 3; // rgb
|
||||
dst.buf[i] = (static_cast<float>(src.buf[i]) / 255.0f - mean[c]) / std[c];
|
||||
}
|
||||
dst.from_u8(src);
|
||||
dst.normalize(mean, std);
|
||||
}
|
||||
|
||||
void mtmd_image_preprocessor::img_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst) {
|
||||
dst.nx = src.nx;
|
||||
dst.ny = src.ny;
|
||||
dst.buf.resize(src.buf.size());
|
||||
|
||||
for (size_t i = 0; i < src.buf.size(); ++i) {
|
||||
dst.buf[i] = static_cast<float>(src.buf[i]);
|
||||
}
|
||||
dst.from_u8(src);
|
||||
}
|
||||
|
||||
// set of tools to manipulate images
|
||||
@@ -40,13 +27,16 @@ struct img_tool {
|
||||
resize_algo algo,
|
||||
pad_style padding = PAD_CEIL,
|
||||
std::array<uint8_t, 3> pad_color = {0, 0, 0}) {
|
||||
dst.nx = target_resolution.width;
|
||||
dst.ny = target_resolution.height;
|
||||
dst.buf.resize(3 * dst.nx * dst.ny);
|
||||
dst.set_size(target_resolution, src.is_placeholder());
|
||||
|
||||
if (dst.nx == src.nx && dst.ny == src.ny) {
|
||||
if (src.is_placeholder()) {
|
||||
// no-op for placeholder image, just set the size and return
|
||||
return;
|
||||
}
|
||||
|
||||
if (dst.get_size() == src.get_size()) {
|
||||
// no resize needed, simple copy
|
||||
dst.buf = src.buf;
|
||||
dst.cpy_buf(src.get_ro_buf());
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -68,17 +58,17 @@ struct img_tool {
|
||||
} else {
|
||||
// resize with padding
|
||||
clip_image_u8 resized_image;
|
||||
float scale_w = static_cast<float>(target_resolution.width) / src.nx;
|
||||
float scale_h = static_cast<float>(target_resolution.height) / src.ny;
|
||||
float scale_w = static_cast<float>(target_resolution.width) / src.get_size().width;
|
||||
float scale_h = static_cast<float>(target_resolution.height) / src.get_size().height;
|
||||
float scale = std::min(scale_w, scale_h);
|
||||
|
||||
int new_width, new_height;
|
||||
if (padding == PAD_NEAREST) {
|
||||
new_width = std::min(static_cast<int>(std::round(src.nx * scale)), target_resolution.width);
|
||||
new_height = std::min(static_cast<int>(std::round(src.ny * scale)), target_resolution.height);
|
||||
new_width = std::min(static_cast<int>(std::round(src.get_size().width * scale)), target_resolution.width);
|
||||
new_height = std::min(static_cast<int>(std::round(src.get_size().height * scale)), target_resolution.height);
|
||||
} else {
|
||||
new_width = std::min(static_cast<int>(std::ceil(src.nx * scale)), target_resolution.width);
|
||||
new_height = std::min(static_cast<int>(std::ceil(src.ny * scale)), target_resolution.height);
|
||||
new_width = std::min(static_cast<int>(std::ceil(src.get_size().width * scale)), target_resolution.width);
|
||||
new_height = std::min(static_cast<int>(std::ceil(src.get_size().height * scale)), target_resolution.height);
|
||||
}
|
||||
|
||||
switch (algo) {
|
||||
@@ -112,18 +102,17 @@ struct img_tool {
|
||||
|
||||
static void crop(const clip_image_u8 & image, clip_image_u8 & dst, int x, int y, int w, int h) {
|
||||
GGML_ASSERT(x >= 0 && y >= 0 && w > 0 && h > 0);
|
||||
GGML_ASSERT(x + w <= image.nx && y + h <= image.ny);
|
||||
dst.nx = w;
|
||||
dst.ny = h;
|
||||
dst.buf.resize(3 * w * h);
|
||||
GGML_ASSERT(x + w <= image.get_size().width && y + h <= image.get_size().height);
|
||||
dst.set_size({w, h}, image.is_placeholder());
|
||||
|
||||
if (image.is_placeholder()) {
|
||||
// no-op for placeholder image, just set the size and return
|
||||
return;
|
||||
}
|
||||
|
||||
for (int i = 0; i < h; ++i) {
|
||||
for (int j = 0; j < w; ++j) {
|
||||
int src_idx = 3 * ((y + i)*image.nx + (x + j));
|
||||
int dst_idx = 3 * (i*w + j);
|
||||
dst.buf[dst_idx] = image.buf[src_idx];
|
||||
dst.buf[dst_idx + 1] = image.buf[src_idx + 1];
|
||||
dst.buf[dst_idx + 2] = image.buf[src_idx + 2];
|
||||
dst.set_pixel(j, i, image.get_pixel(x + j, y + i));
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -181,81 +170,101 @@ struct img_tool {
|
||||
|
||||
// draw src image into dst image at offset (offset_x, offset_y)
|
||||
static void composite(clip_image_u8 & dst, const clip_image_u8 & src, int offset_x, int offset_y) {
|
||||
for (int y = 0; y < src.ny; ++y) {
|
||||
for (int x = 0; x < src.nx; ++x) {
|
||||
if (src.is_placeholder()) {
|
||||
// no-op for placeholder image
|
||||
return;
|
||||
}
|
||||
|
||||
const auto src_size = src.get_size();
|
||||
const auto dst_size = dst.get_size();
|
||||
for (int y = 0; y < src_size.height; ++y) {
|
||||
for (int x = 0; x < src_size.width; ++x) {
|
||||
int dx = x + offset_x;
|
||||
int dy = y + offset_y;
|
||||
// skip pixels that would be out of bounds in the destination
|
||||
if (dx < 0 || dy < 0 || dx >= dst.nx || dy >= dst.ny) {
|
||||
if (dx < 0 || dy < 0 || dx >= dst_size.width || dy >= dst_size.height) {
|
||||
continue;
|
||||
}
|
||||
size_t dst_idx = 3 * (static_cast<size_t>(dy) * dst.nx + static_cast<size_t>(dx));
|
||||
size_t src_idx = 3 * (static_cast<size_t>(y) * src.nx + static_cast<size_t>(x));
|
||||
dst.buf[dst_idx + 0] = src.buf[src_idx + 0];
|
||||
dst.buf[dst_idx + 1] = src.buf[src_idx + 1];
|
||||
dst.buf[dst_idx + 2] = src.buf[src_idx + 2];
|
||||
dst.set_pixel(dx, dy, src.get_pixel(x, y));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// fill the image with a solid color
|
||||
static void fill(clip_image_u8 & img, const std::array<uint8_t, 3> & color) {
|
||||
for (size_t i = 0; i < img.buf.size(); i += 3) {
|
||||
img.buf[i] = color[0];
|
||||
img.buf[i + 1] = color[1];
|
||||
img.buf[i + 2] = color[2];
|
||||
if (img.is_placeholder()) {
|
||||
// no-op for placeholder image
|
||||
return;
|
||||
}
|
||||
|
||||
const auto size = img.get_size();
|
||||
for (int y = 0; y < size.height; ++y) {
|
||||
for (int x = 0; x < size.width; ++x) {
|
||||
img.set_pixel(x, y, color);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
// Bilinear resize function
|
||||
static void resize_bilinear(const clip_image_u8 & src, clip_image_u8 & dst, int target_width, int target_height) {
|
||||
if (src.nx == 0 || src.ny == 0) { dst.nx = dst.ny = 0; dst.buf.clear(); return; }
|
||||
const auto src_size = src.get_size();
|
||||
if (src_size.width == 0 || src_size.height == 0) { dst.set_size({0, 0}, false); return; }
|
||||
if (target_width <= 0) target_width = 1;
|
||||
if (target_height <= 0) target_height = 1;
|
||||
|
||||
dst.nx = target_width;
|
||||
dst.ny = target_height;
|
||||
dst.buf.resize(3 * target_width * target_height);
|
||||
dst.set_size({target_width, target_height}, false);
|
||||
|
||||
float x_ratio = target_width > 1 ? static_cast<float>(src.nx - 1) / (target_width - 1) : 0.0f;
|
||||
float y_ratio = target_height > 1 ? static_cast<float>(src.ny - 1) / (target_height - 1) : 0.0f;
|
||||
if (src.is_placeholder()) {
|
||||
// no-op for placeholder image, just set the size and return
|
||||
return;
|
||||
}
|
||||
|
||||
float x_ratio = target_width > 1 ? static_cast<float>(src_size.width - 1) / (target_width - 1) : 0.0f;
|
||||
float y_ratio = target_height > 1 ? static_cast<float>(src_size.height - 1) / (target_height - 1) : 0.0f;
|
||||
|
||||
for (int y = 0; y < target_height; ++y) {
|
||||
for (int x = 0; x < target_width; ++x) {
|
||||
float px = x * x_ratio;
|
||||
float py = y * y_ratio;
|
||||
|
||||
int x0 = std::min(static_cast<int>(px), src.nx - 1);
|
||||
int y0 = std::min(static_cast<int>(py), src.ny - 1);
|
||||
int x1 = std::min(x0 + 1, src.nx - 1);
|
||||
int y1 = std::min(y0 + 1, src.ny - 1);
|
||||
int x0 = std::min(static_cast<int>(px), src_size.width - 1);
|
||||
int y0 = std::min(static_cast<int>(py), src_size.height - 1);
|
||||
int x1 = std::min(x0 + 1, src_size.width - 1);
|
||||
int y1 = std::min(y0 + 1, src_size.height - 1);
|
||||
|
||||
float xf = px - x0;
|
||||
float yf = py - y0;
|
||||
|
||||
const auto p00 = src.get_pixel(x0, y0);
|
||||
const auto p10 = src.get_pixel(x1, y0);
|
||||
const auto p01 = src.get_pixel(x0, y1);
|
||||
const auto p11 = src.get_pixel(x1, y1);
|
||||
|
||||
std::array<uint8_t, 3> pixel;
|
||||
for (int c = 0; c < 3; ++c) {
|
||||
float top = lerp(static_cast<float>(src.buf[3 * (y0 * src.nx + x0) + c]),
|
||||
static_cast<float>(src.buf[3 * (y0 * src.nx + x1) + c]),
|
||||
xf);
|
||||
float bottom = lerp(static_cast<float>(src.buf[3 * (y1 * src.nx + x0) + c]),
|
||||
static_cast<float>(src.buf[3 * (y1 * src.nx + x1) + c]),
|
||||
xf);
|
||||
dst.buf[3 * (y * target_width + x) + c] = static_cast<uint8_t>(lerp(top, bottom, yf));
|
||||
float top = lerp(static_cast<float>(p00[c]), static_cast<float>(p10[c]), xf);
|
||||
float bottom = lerp(static_cast<float>(p01[c]), static_cast<float>(p11[c]), xf);
|
||||
pixel[c] = static_cast<uint8_t>(lerp(top, bottom, yf));
|
||||
}
|
||||
dst.set_pixel(x, y, pixel);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Bicubic resize function
|
||||
// part of image will be cropped if the aspect ratio is different
|
||||
static bool resize_bicubic(const clip_image_u8 & img, clip_image_u8 & dst, int target_width, int target_height) {
|
||||
const int nx = img.nx;
|
||||
const int ny = img.ny;
|
||||
static void resize_bicubic(const clip_image_u8 & img, clip_image_u8 & dst, int target_width, int target_height) {
|
||||
const auto img_size = img.get_size();
|
||||
const int nx = img_size.width;
|
||||
const int ny = img_size.height;
|
||||
|
||||
dst.nx = target_width;
|
||||
dst.ny = target_height;
|
||||
dst.buf.resize(3 * target_width * target_height);
|
||||
dst.set_size({target_width, target_height}, false);
|
||||
|
||||
if (img.is_placeholder()) {
|
||||
// no-op for placeholder image, just set the size and return
|
||||
return;
|
||||
}
|
||||
|
||||
float Cc;
|
||||
float C[5] = {};
|
||||
@@ -280,12 +289,13 @@ private:
|
||||
dx = tx * j - x;
|
||||
dy = ty * i - y;
|
||||
|
||||
std::array<uint8_t, 3> pixel;
|
||||
for (k = 0; k < 3; k++) {
|
||||
for (jj = 0; jj <= 3; jj++) {
|
||||
d0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x - 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
|
||||
d2 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
|
||||
d3 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 2, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
|
||||
a0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
|
||||
d0 = img.get_pixel(clip(x - 1, 0, nx - 1), clip(y - 1 + jj, 0, ny - 1))[k] - img.get_pixel(clip(x, 0, nx - 1), clip(y - 1 + jj, 0, ny - 1))[k];
|
||||
d2 = img.get_pixel(clip(x + 1, 0, nx - 1), clip(y - 1 + jj, 0, ny - 1))[k] - img.get_pixel(clip(x, 0, nx - 1), clip(y - 1 + jj, 0, ny - 1))[k];
|
||||
d3 = img.get_pixel(clip(x + 2, 0, nx - 1), clip(y - 1 + jj, 0, ny - 1))[k] - img.get_pixel(clip(x, 0, nx - 1), clip(y - 1 + jj, 0, ny - 1))[k];
|
||||
a0 = img.get_pixel(clip(x, 0, nx - 1), clip(y - 1 + jj, 0, ny - 1))[k];
|
||||
|
||||
a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3;
|
||||
a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2;
|
||||
@@ -303,13 +313,12 @@ private:
|
||||
Cc = a0 + a1 * dy + a2 * dy * dy + a3 * dy * dy * dy;
|
||||
|
||||
const uint8_t Cc2 = std::min(std::max(std::round(Cc), 0.0f), 255.0f);
|
||||
dst.buf[(i * target_width + j) * 3 + k] = float(Cc2);
|
||||
pixel[k] = Cc2;
|
||||
}
|
||||
}
|
||||
dst.set_pixel(j, i, pixel);
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// Bicubic resize function using Pillow's ImagingResample algorithm
|
||||
@@ -455,16 +464,17 @@ private:
|
||||
};
|
||||
|
||||
// Horizontal resampling pass
|
||||
// Resizes width from imIn.nx to imOut.nx, preserving height
|
||||
// Resizes width from imIn to out_nx, preserving height
|
||||
auto resample_horizontal = [&](const clip_image_u8 & imIn, clip_image_u8 & imOut,
|
||||
int out_nx,
|
||||
int ksize, const std::vector<int> & bounds, const std::vector<int32_t> & weights) {
|
||||
imOut.ny = imIn.ny;
|
||||
imOut.buf.resize(3 * imOut.nx * imOut.ny);
|
||||
const int in_ny = imIn.get_size().height;
|
||||
imOut.set_size({out_nx, in_ny}, false);
|
||||
|
||||
// Process each row independently
|
||||
for (int yy = 0; yy < imOut.ny; yy++) {
|
||||
for (int yy = 0; yy < in_ny; yy++) {
|
||||
// For each output pixel in this row
|
||||
for (int xx = 0; xx < imOut.nx; xx++) {
|
||||
for (int xx = 0; xx < out_nx; xx++) {
|
||||
// Get the range of input pixels and filter coefficients
|
||||
int xmin = bounds[xx * 2 + 0]; // First input pixel index
|
||||
int xcnt = bounds[xx * 2 + 1]; // Number of input pixels
|
||||
@@ -476,36 +486,36 @@ private:
|
||||
|
||||
// Convolve: sum weighted input pixels
|
||||
for (int x = 0; x < xcnt; x++) {
|
||||
int src_idx = ((yy * imIn.nx) + (x + xmin)) * 3;
|
||||
ss0 += static_cast<uint8_t>(imIn.buf[src_idx + 0]) * weights[xx * ksize + x]; // R channel
|
||||
ss1 += static_cast<uint8_t>(imIn.buf[src_idx + 1]) * weights[xx * ksize + x]; // G channel
|
||||
ss2 += static_cast<uint8_t>(imIn.buf[src_idx + 2]) * weights[xx * ksize + x]; // B channel
|
||||
const auto src_px = imIn.get_pixel(x + xmin, yy);
|
||||
ss0 += src_px[0] * weights[xx * ksize + x]; // R channel
|
||||
ss1 += src_px[1] * weights[xx * ksize + x]; // G channel
|
||||
ss2 += src_px[2] * weights[xx * ksize + x]; // B channel
|
||||
}
|
||||
|
||||
// Convert back from fixed-point (divide by 2^PRECISION_BITS) and clamp to [0,255]
|
||||
int dst_idx = (yy * imOut.nx + xx) * 3;
|
||||
imOut.buf[dst_idx + 0] = clip8(ss0 >> PRECISION_BITS);
|
||||
imOut.buf[dst_idx + 1] = clip8(ss1 >> PRECISION_BITS);
|
||||
imOut.buf[dst_idx + 2] = clip8(ss2 >> PRECISION_BITS);
|
||||
imOut.set_pixel(xx, yy, {clip8(ss0 >> PRECISION_BITS),
|
||||
clip8(ss1 >> PRECISION_BITS),
|
||||
clip8(ss2 >> PRECISION_BITS)});
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Vertical resampling pass
|
||||
// Resizes height from imIn.ny to imOut.ny, preserving width
|
||||
// Resizes height from imIn to out_ny, preserving width
|
||||
auto resample_vertical = [&](const clip_image_u8 & imIn, clip_image_u8 & imOut,
|
||||
int out_ny,
|
||||
int ksize, const std::vector<int> & bounds, const std::vector<int32_t> & weight) {
|
||||
imOut.nx = imIn.nx;
|
||||
imOut.buf.resize(3 * imOut.nx * imOut.ny);
|
||||
const int in_nx = imIn.get_size().width;
|
||||
imOut.set_size({in_nx, out_ny}, false);
|
||||
|
||||
// For each output row
|
||||
for (int yy = 0; yy < imOut.ny; yy++) {
|
||||
for (int yy = 0; yy < out_ny; yy++) {
|
||||
// Get the range of input rows and filter coefficients
|
||||
int ymin = bounds[yy * 2 + 0]; // First input row index
|
||||
int ycnt = bounds[yy * 2 + 1]; // Number of input rows
|
||||
|
||||
// Process each column in this output row
|
||||
for (int xx = 0; xx < imOut.nx; xx++) {
|
||||
for (int xx = 0; xx < in_nx; xx++) {
|
||||
// Initialize accumulators for RGB channels with rounding bias
|
||||
int32_t ss0 = 1 << (PRECISION_BITS - 1);
|
||||
int32_t ss1 = 1 << (PRECISION_BITS - 1);
|
||||
@@ -513,27 +523,23 @@ private:
|
||||
|
||||
// Convolve: sum weighted input pixels vertically
|
||||
for (int y = 0; y < ycnt; y++) {
|
||||
int src_idx = ((y + ymin) * imIn.nx + xx) * 3;
|
||||
ss0 += static_cast<uint8_t>(imIn.buf[src_idx + 0]) * weight[yy * ksize + y]; // R channel
|
||||
ss1 += static_cast<uint8_t>(imIn.buf[src_idx + 1]) * weight[yy * ksize + y]; // G channel
|
||||
ss2 += static_cast<uint8_t>(imIn.buf[src_idx + 2]) * weight[yy * ksize + y]; // B channel
|
||||
const auto src_px = imIn.get_pixel(xx, y + ymin);
|
||||
ss0 += src_px[0] * weight[yy * ksize + y]; // R channel
|
||||
ss1 += src_px[1] * weight[yy * ksize + y]; // G channel
|
||||
ss2 += src_px[2] * weight[yy * ksize + y]; // B channel
|
||||
}
|
||||
|
||||
// Convert back from fixed-point and clamp to [0,255]
|
||||
int dst_idx = (yy * imOut.nx + xx) * 3;
|
||||
imOut.buf[dst_idx + 0] = clip8(ss0 >> PRECISION_BITS);
|
||||
imOut.buf[dst_idx + 1] = clip8(ss1 >> PRECISION_BITS);
|
||||
imOut.buf[dst_idx + 2] = clip8(ss2 >> PRECISION_BITS);
|
||||
imOut.set_pixel(xx, yy, {clip8(ss0 >> PRECISION_BITS),
|
||||
clip8(ss1 >> PRECISION_BITS),
|
||||
clip8(ss2 >> PRECISION_BITS)});
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Main resampling logic using separable two-pass approach
|
||||
const int src_width = img.nx;
|
||||
const int src_height = img.ny;
|
||||
|
||||
dst.nx = target_width;
|
||||
dst.ny = target_height;
|
||||
const int src_width = img.get_size().width;
|
||||
const int src_height = img.get_size().height;
|
||||
|
||||
bool need_horizontal = (target_width != src_width);
|
||||
bool need_vertical = (target_height != src_height);
|
||||
@@ -555,18 +561,20 @@ private:
|
||||
if (need_horizontal && need_vertical) {
|
||||
// Both horizontal and vertical
|
||||
clip_image_u8 temp;
|
||||
temp.nx = target_width;
|
||||
resample_horizontal(img, temp, ksize_horiz, bounds_horiz, weights_horiz);
|
||||
resample_vertical(temp, dst, ksize_vert, bounds_vert, weights_vert);
|
||||
resample_horizontal(img, temp, target_width, ksize_horiz, bounds_horiz, weights_horiz);
|
||||
resample_vertical(temp, dst, target_height, ksize_vert, bounds_vert, weights_vert);
|
||||
} else if (need_horizontal) {
|
||||
// Only horizontal
|
||||
resample_horizontal(img, dst, ksize_horiz, bounds_horiz, weights_horiz);
|
||||
resample_horizontal(img, dst, target_width, ksize_horiz, bounds_horiz, weights_horiz);
|
||||
} else if (need_vertical) {
|
||||
// Only vertical
|
||||
resample_vertical(img, dst, ksize_vert, bounds_vert, weights_vert);
|
||||
resample_vertical(img, dst, target_height, ksize_vert, bounds_vert, weights_vert);
|
||||
} else {
|
||||
// No resizing needed - direct copy
|
||||
dst.buf = img.buf;
|
||||
dst.set_size(img.get_size(), img.is_placeholder());
|
||||
if (!img.is_placeholder()) {
|
||||
dst.cpy_buf(img.get_ro_buf());
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
@@ -588,7 +596,7 @@ private:
|
||||
//
|
||||
|
||||
bool mtmd_image_preprocessor_llava_uhd::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
|
||||
const clip_image_size original_size{img.nx, img.ny};
|
||||
const clip_image_size original_size = img.get_size();
|
||||
auto const inst = get_slice_instructions(original_size);
|
||||
std::vector<clip_image_u8_ptr> imgs = slice_image(img, inst);
|
||||
|
||||
@@ -883,7 +891,7 @@ bool mtmd_image_preprocessor_fixed_size::preprocess(const clip_image_u8 & img, c
|
||||
bool mtmd_image_preprocessor_dyn_size::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
|
||||
GGML_ASSERT(hparams.image_min_pixels > 0 && hparams.image_max_pixels > 0);
|
||||
clip_image_u8 resized_image;
|
||||
const clip_image_size original_size{img.nx, img.ny};
|
||||
const clip_image_size original_size = img.get_size();
|
||||
// the original pixtral model doesn't have n_merge
|
||||
const int cur_merge = hparams.n_merge == 0 ? 1 : hparams.n_merge;
|
||||
const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
|
||||
@@ -908,7 +916,7 @@ bool mtmd_image_preprocessor_dyn_size::preprocess(const clip_image_u8 & img, cli
|
||||
bool mtmd_image_preprocessor_longest_edge::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
|
||||
GGML_ASSERT(hparams.image_longest_edge > 0);
|
||||
clip_image_u8 resized_image;
|
||||
const clip_image_size original_size{img.nx, img.ny};
|
||||
const clip_image_size original_size = img.get_size();
|
||||
// the original pixtral model doesn't have n_merge
|
||||
const int cur_merge = hparams.n_merge == 0 ? 1 : hparams.n_merge;
|
||||
const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
|
||||
@@ -1040,7 +1048,7 @@ bool mtmd_image_preprocessor_idefics3::preprocess(const clip_image_u8 & img, cli
|
||||
// multiples of image_size (always rounding up)
|
||||
//
|
||||
// CITE: https://github.com/huggingface/transformers/blob/main/src/transformers/models/idefics3/image_processing_idefics3.py#L737
|
||||
const clip_image_size original_size{img.nx, img.ny};
|
||||
const clip_image_size original_size = img.get_size();
|
||||
const clip_image_size refined_size = img_tool::calc_size_preserved_ratio(
|
||||
original_size, hparams.image_size, hparams.image_longest_edge);
|
||||
// LOG_INF("%s: original size: %d x %d, refined size: %d x %d\n",
|
||||
@@ -1088,7 +1096,7 @@ bool mtmd_image_preprocessor_idefics3::preprocess(const clip_image_u8 & img, cli
|
||||
|
||||
bool mtmd_image_preprocessor_internvl::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
|
||||
GGML_ASSERT(!hparams.image_res_candidates.empty());
|
||||
const clip_image_size original_size{img.nx, img.ny};
|
||||
const clip_image_size original_size = img.get_size();
|
||||
auto const inst = get_slice_instructions(original_size);
|
||||
std::vector<clip_image_u8_ptr> imgs = slice_image(img, inst, false);
|
||||
|
||||
@@ -1108,7 +1116,7 @@ bool mtmd_image_preprocessor_deepseekocr::preprocess(const clip_image_u8 & img,
|
||||
static constexpr int native_resolutions[] = { 1024 /* base */, 1280 /* large */ };
|
||||
// TODO: support 512 (tiny) and 640 (small) once we have eval data for them
|
||||
|
||||
const int64_t orig_area = static_cast<int64_t>(img.nx) * img.ny;
|
||||
const int64_t orig_area = static_cast<int64_t>(img.n_pixels());
|
||||
|
||||
size_t mode_i = 0;
|
||||
int64_t min_diff = std::numeric_limits<int64_t>::max();
|
||||
@@ -1201,10 +1209,11 @@ bool mtmd_image_preprocessor_deepseekocr2::preprocess(const clip_image_u8 & img,
|
||||
// emit 768x768 local tiles when the image is larger than a tile in either
|
||||
// dimension, then always a 1024x1024 global view. order: [tiles..., global].
|
||||
|
||||
if (img.nx > tile_size || img.ny > tile_size) {
|
||||
const float aspect_ratio = static_cast<float>(img.nx) / img.ny;
|
||||
const auto img_size = img.get_size();
|
||||
if (img_size.width > tile_size || img_size.height > tile_size) {
|
||||
const float aspect_ratio = static_cast<float>(img_size.width) / img_size.height;
|
||||
const auto target_ratios = get_target_ratios();
|
||||
const clip_image_size grid = find_closest_aspect_ratio(aspect_ratio, target_ratios, img.nx, img.ny);
|
||||
const clip_image_size grid = find_closest_aspect_ratio(aspect_ratio, target_ratios, img_size.width, img_size.height);
|
||||
|
||||
// stretch onto the grid (no aspect preserve), then crop tiles row-major.
|
||||
clip_image_u8 refined;
|
||||
@@ -1247,50 +1256,57 @@ void mtmd_image_preprocessor_step3vl::img_u8_resize_bilinear_to_f32(
|
||||
int target_height,
|
||||
const float mean[3],
|
||||
const float std[3]) {
|
||||
if (src.nx == target_width && src.ny == target_height) {
|
||||
const auto src_size = src.get_size();
|
||||
if (src_size.width == target_width && src_size.height == target_height) {
|
||||
img_u8_to_f32(src, dst, mean, std);
|
||||
return;
|
||||
}
|
||||
|
||||
dst.nx = target_width;
|
||||
dst.ny = target_height;
|
||||
dst.buf.resize(3 * target_width * target_height);
|
||||
dst.set_size({target_width, target_height}, false, false);
|
||||
|
||||
const float scale_x = static_cast<float>(src.nx) / target_width;
|
||||
const float scale_y = static_cast<float>(src.ny) / target_height;
|
||||
if (src.is_placeholder()) {
|
||||
// no-op for placeholder image, just set the size and return
|
||||
return;
|
||||
}
|
||||
|
||||
const float scale_x = static_cast<float>(src_size.width) / target_width;
|
||||
const float scale_y = static_cast<float>(src_size.height) / target_height;
|
||||
|
||||
std::vector<float> local_buf(3 * target_width * target_height);
|
||||
|
||||
for (int y = 0; y < target_height; ++y) {
|
||||
const float src_y = (static_cast<float>(y) + 0.5f) * scale_y - 0.5f;
|
||||
const int y0_floor = static_cast<int>(std::floor(src_y));
|
||||
const int y0 = std::max(0, std::min(y0_floor, src.ny - 1));
|
||||
const int y1 = std::max(0, std::min(y0_floor + 1, src.ny - 1));
|
||||
const int y0 = std::max(0, std::min(y0_floor, src_size.height - 1));
|
||||
const int y1 = std::max(0, std::min(y0_floor + 1, src_size.height - 1));
|
||||
const float ly = src_y - y0_floor;
|
||||
|
||||
for (int x = 0; x < target_width; ++x) {
|
||||
const float src_x = (static_cast<float>(x) + 0.5f) * scale_x - 0.5f;
|
||||
const int x0_floor = static_cast<int>(std::floor(src_x));
|
||||
const int x0 = std::max(0, std::min(x0_floor, src.nx - 1));
|
||||
const int x1 = std::max(0, std::min(x0_floor + 1, src.nx - 1));
|
||||
const int x0 = std::max(0, std::min(x0_floor, src_size.width - 1));
|
||||
const int x1 = std::max(0, std::min(x0_floor + 1, src_size.width - 1));
|
||||
const float lx = src_x - x0_floor;
|
||||
|
||||
const size_t idx00 = 3 * (y0 * src.nx + x0);
|
||||
const size_t idx01 = 3 * (y0 * src.nx + x1);
|
||||
const size_t idx10 = 3 * (y1 * src.nx + x0);
|
||||
const size_t idx11 = 3 * (y1 * src.nx + x1);
|
||||
const size_t idx_dst = 3 * (y * target_width + x);
|
||||
const auto p00 = src.get_pixel(x0, y0);
|
||||
const auto p01 = src.get_pixel(x1, y0);
|
||||
const auto p10 = src.get_pixel(x0, y1);
|
||||
const auto p11 = src.get_pixel(x1, y1);
|
||||
|
||||
const size_t idx_dst = 3 * (y * target_width + x);
|
||||
for (int c = 0; c < 3; ++c) {
|
||||
const float v00 = (static_cast<float>(src.buf[idx00 + c]) / 255.0f - mean[c]) / std[c];
|
||||
const float v01 = (static_cast<float>(src.buf[idx01 + c]) / 255.0f - mean[c]) / std[c];
|
||||
const float v10 = (static_cast<float>(src.buf[idx10 + c]) / 255.0f - mean[c]) / std[c];
|
||||
const float v11 = (static_cast<float>(src.buf[idx11 + c]) / 255.0f - mean[c]) / std[c];
|
||||
const float v00 = (static_cast<float>(p00[c]) / 255.0f - mean[c]) / std[c];
|
||||
const float v01 = (static_cast<float>(p01[c]) / 255.0f - mean[c]) / std[c];
|
||||
const float v10 = (static_cast<float>(p10[c]) / 255.0f - mean[c]) / std[c];
|
||||
const float v11 = (static_cast<float>(p11[c]) / 255.0f - mean[c]) / std[c];
|
||||
|
||||
const float top = v00 + (v01 - v00) * lx;
|
||||
const float bot = v10 + (v11 - v10) * lx;
|
||||
dst.buf[idx_dst + c] = top + (bot - top) * ly;
|
||||
local_buf[idx_dst + c] = top + (bot - top) * ly;
|
||||
}
|
||||
}
|
||||
}
|
||||
dst.cpy_buf(local_buf);
|
||||
}
|
||||
|
||||
int mtmd_image_preprocessor_step3vl::get_image_longest_edge(const clip_hparams & params) {
|
||||
@@ -1341,26 +1357,26 @@ std::vector<int> mtmd_image_preprocessor_step3vl::calc_grid(int length, int wind
|
||||
|
||||
clip_image_u8 mtmd_image_preprocessor_step3vl::prepare_image(const clip_image_u8 & img, const clip_hparams & params) {
|
||||
clip_image_u8 resized = img;
|
||||
const float aspect_ratio = img.ny > 0 ? static_cast<float>(img.nx) / img.ny : 1.0f;
|
||||
if (std::min(img.nx, img.ny) < 32 &&
|
||||
const auto img_size = img.get_size();
|
||||
const float aspect_ratio = img_size.height > 0 ? static_cast<float>(img_size.width) / img_size.height : 1.0f;
|
||||
if (std::min(img_size.width, img_size.height) < 32 &&
|
||||
(aspect_ratio > wide_aspect_ratio_limit ||
|
||||
aspect_ratio < 1.0f / wide_aspect_ratio_limit)) {
|
||||
const int square_size = std::max(img.nx, img.ny);
|
||||
const int square_size = std::max(img_size.width, img_size.height);
|
||||
clip_image_u8 padded;
|
||||
padded.nx = square_size;
|
||||
padded.ny = square_size;
|
||||
padded.buf.resize(3 * square_size * square_size);
|
||||
padded.set_size({square_size, square_size}, false);
|
||||
img_tool::fill(padded, {0, 0, 0});
|
||||
img_tool::composite(padded, img, 0, 0);
|
||||
resized = std::move(padded);
|
||||
}
|
||||
|
||||
const int max_image_size = get_image_longest_edge(params);
|
||||
if (std::max(resized.nx, resized.ny) > max_image_size) {
|
||||
const float scale = static_cast<float>(max_image_size) / std::max(resized.nx, resized.ny);
|
||||
const auto resized_size = resized.get_size();
|
||||
if (std::max(resized_size.width, resized_size.height) > max_image_size) {
|
||||
const float scale = static_cast<float>(max_image_size) / std::max(resized_size.width, resized_size.height);
|
||||
const clip_image_size new_size = {
|
||||
std::max(1, static_cast<int>(std::floor(resized.nx * scale))),
|
||||
std::max(1, static_cast<int>(std::floor(resized.ny * scale))),
|
||||
std::max(1, static_cast<int>(std::floor(resized_size.width * scale))),
|
||||
std::max(1, static_cast<int>(std::floor(resized_size.height * scale))),
|
||||
};
|
||||
clip_image_u8 scaled;
|
||||
img_tool::resize(resized, scaled, new_size, RESIZE_ALGO_BILINEAR, PAD_NONE);
|
||||
@@ -1372,14 +1388,14 @@ clip_image_u8 mtmd_image_preprocessor_step3vl::prepare_image(const clip_image_u8
|
||||
|
||||
clip_image_u8 mtmd_image_preprocessor_step3vl::crop_with_black_padding(const clip_image_u8 & image, int x, int y, int w, int h) {
|
||||
clip_image_u8 dst;
|
||||
dst.nx = w;
|
||||
dst.ny = h;
|
||||
dst.buf.resize(3 * w * h, 0);
|
||||
dst.set_size({w, h}, false);
|
||||
img_tool::fill(dst, {0, 0, 0});
|
||||
|
||||
const auto img_size = image.get_size();
|
||||
const int src_x0 = std::max(0, x);
|
||||
const int src_y0 = std::max(0, y);
|
||||
const int src_x1 = std::min(image.nx, x + w);
|
||||
const int src_y1 = std::min(image.ny, y + h);
|
||||
const int src_x1 = std::min(img_size.width, x + w);
|
||||
const int src_y1 = std::min(img_size.height, y + h);
|
||||
|
||||
if (src_x0 >= src_x1 || src_y0 >= src_y1) {
|
||||
return dst;
|
||||
@@ -1390,11 +1406,7 @@ clip_image_u8 mtmd_image_preprocessor_step3vl::crop_with_black_padding(const cli
|
||||
|
||||
for (int yy = 0; yy < src_y1 - src_y0; ++yy) {
|
||||
for (int xx = 0; xx < src_x1 - src_x0; ++xx) {
|
||||
const int src_idx = 3 * ((src_y0 + yy) * image.nx + (src_x0 + xx));
|
||||
const int dst_idx = 3 * ((dst_y0 + yy) * w + (dst_x0 + xx));
|
||||
dst.buf[dst_idx + 0] = image.buf[src_idx + 0];
|
||||
dst.buf[dst_idx + 1] = image.buf[src_idx + 1];
|
||||
dst.buf[dst_idx + 2] = image.buf[src_idx + 2];
|
||||
dst.set_pixel(dst_x0 + xx, dst_y0 + yy, image.get_pixel(src_x0 + xx, src_y0 + yy));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1443,7 +1455,7 @@ mtmd_image_preprocessor_step3vl::slice_instructions mtmd_image_preprocessor_step
|
||||
|
||||
bool mtmd_image_preprocessor_step3vl::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
|
||||
clip_image_u8 prepared = prepare_image(img, hparams);
|
||||
const auto instructions = build_slice_instructions(hparams, {prepared.nx, prepared.ny});
|
||||
const auto instructions = build_slice_instructions(hparams, prepared.get_size());
|
||||
|
||||
clip_image_f32_ptr overview_f32(clip_image_f32_init());
|
||||
img_u8_resize_bilinear_to_f32(
|
||||
@@ -1462,7 +1474,8 @@ bool mtmd_image_preprocessor_step3vl::preprocess(const clip_image_u8 & img, clip
|
||||
}
|
||||
|
||||
clip_image_u8 img_for_crop = prepared;
|
||||
if (instructions.refined_size.width != prepared.nx || instructions.refined_size.height != prepared.ny) {
|
||||
const auto prepared_size = prepared.get_size();
|
||||
if (instructions.refined_size.width != prepared_size.width || instructions.refined_size.height != prepared_size.height) {
|
||||
clip_image_u8 refined;
|
||||
img_tool::resize(prepared, refined, instructions.refined_size, RESIZE_ALGO_BILINEAR, PAD_NONE);
|
||||
img_for_crop = std::move(refined);
|
||||
@@ -1503,9 +1516,10 @@ bool mtmd_image_preprocessor_youtuvl::preprocess(const clip_image_u8 & img, clip
|
||||
hparams.image_max_pixels / (patch_size * patch_size) : 256;
|
||||
|
||||
// Linear search for optimal scale to fit within max_num_patches
|
||||
const auto img_size = img.get_size();
|
||||
float scale = 1.0f;
|
||||
int target_height = img.ny;
|
||||
int target_width = img.nx;
|
||||
int target_height = img_size.height;
|
||||
int target_width = img_size.width;
|
||||
|
||||
auto get_scaled_image_size = [align_size](float scale, int size) -> int {
|
||||
float scaled_size = size * scale;
|
||||
@@ -1517,8 +1531,8 @@ bool mtmd_image_preprocessor_youtuvl::preprocess(const clip_image_u8 & img, clip
|
||||
|
||||
// Linear search with 0.02 step size
|
||||
while (scale > 0.0f) {
|
||||
target_height = get_scaled_image_size(scale, img.ny);
|
||||
target_width = get_scaled_image_size(scale, img.nx);
|
||||
target_height = get_scaled_image_size(scale, img_size.height);
|
||||
target_width = get_scaled_image_size(scale, img_size.width);
|
||||
|
||||
int num_patches_h = target_height / patch_size;
|
||||
int num_patches_w = target_width / patch_size;
|
||||
|
||||
+152
-56
@@ -26,12 +26,46 @@
|
||||
|
||||
// represents raw image data, layout is RGBRGBRGB...
|
||||
// length of data must be nx * ny * 3
|
||||
// for audio bitmap: nx = sample count, ny = 1, layout is F32 F32 F32 ...
|
||||
// length of data must be nx * sizeof(float)
|
||||
struct mtmd_bitmap {
|
||||
uint32_t nx;
|
||||
uint32_t ny;
|
||||
std::vector<unsigned char> data;
|
||||
uint32_t nx = 0;
|
||||
uint32_t ny = 0;
|
||||
std::string id; // optional user-defined id, for ex: can be set to image hash, useful for KV cache tracking
|
||||
bool is_audio = false; // true if the bitmap is audio
|
||||
|
||||
mtmd_bitmap(const unsigned char * data, uint32_t nx, uint32_t ny)
|
||||
: nx(nx), ny(ny) {
|
||||
if (data) {
|
||||
size_t data_size = (size_t)nx * ny * 3;
|
||||
this->data.resize(data_size);
|
||||
std::memcpy(this->data.data(), data, data_size);
|
||||
}
|
||||
}
|
||||
|
||||
mtmd_bitmap(const unsigned char * data, uint32_t n_samples)
|
||||
: nx(n_samples), ny(1), is_audio(true) {
|
||||
if (data) {
|
||||
size_t data_size = (size_t)nx * sizeof(float);
|
||||
this->data.resize(data_size);
|
||||
std::memcpy(this->data.data(), data, data_size);
|
||||
}
|
||||
}
|
||||
|
||||
const std::vector<unsigned char> & get_ro_buf() const {
|
||||
return data;
|
||||
}
|
||||
|
||||
bool is_placeholder() const {
|
||||
return data.empty();
|
||||
}
|
||||
|
||||
size_t n_bytes() const {
|
||||
return data.size();
|
||||
}
|
||||
|
||||
private:
|
||||
std::vector<unsigned char> data;
|
||||
};
|
||||
|
||||
// position indexing for decoder model
|
||||
@@ -42,8 +76,8 @@ enum mtmd_pos_type {
|
||||
};
|
||||
|
||||
struct mtmd_image_tokens {
|
||||
uint32_t nx; // number of tokens in x direction
|
||||
uint32_t ny; // number of tokens in y direction
|
||||
uint32_t nx = 0; // number of tokens in x direction
|
||||
uint32_t ny = 0; // number of tokens in y direction
|
||||
mtmd_pos_type pos = MTMD_POS_TYPE_NORMAL;
|
||||
uint32_t image_idx = 0; // 0-based position of this image among image chunks in the prompt(used by pos == MTMD_POS_TYPE_HUNYUANVL)
|
||||
uint32_t n_tokens() const {
|
||||
@@ -56,6 +90,16 @@ struct mtmd_image_tokens {
|
||||
clip_image_f32_batch batch_f32; // preprocessed image patches
|
||||
std::string id; // optional user-defined ID, useful for KV cache tracking
|
||||
|
||||
// true if any entry in batch_f32 is a placeholder
|
||||
bool is_placeholder() const {
|
||||
for (const auto & entry : batch_f32.entries) {
|
||||
if (entry->is_placeholder()) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
mtmd_image_tokens clone() {
|
||||
return mtmd_image_tokens{
|
||||
nx,
|
||||
@@ -70,10 +114,20 @@ struct mtmd_image_tokens {
|
||||
using mtmd_image_tokens_ptr = std::unique_ptr<mtmd_image_tokens>;
|
||||
|
||||
struct mtmd_audio_tokens {
|
||||
uint32_t n_tokens; // number of tokens
|
||||
uint32_t n_tokens = 0; // number of tokens
|
||||
clip_image_f32_batch batch_f32; // preprocessed image patches
|
||||
std::string id; // optional user-defined ID, useful for KV cache tracking
|
||||
|
||||
// true if any entry in batch_f32 is a placeholder
|
||||
bool is_placeholder() const {
|
||||
for (const auto & entry : batch_f32.entries) {
|
||||
if (entry->is_placeholder()) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
mtmd_audio_tokens clone() {
|
||||
return mtmd_audio_tokens{
|
||||
n_tokens,
|
||||
@@ -795,16 +849,19 @@ struct mtmd_tokenizer {
|
||||
}
|
||||
|
||||
// sanity check
|
||||
GGML_ASSERT(bitmap->nx > 0 && bitmap->ny > 0);
|
||||
GGML_ASSERT(bitmap->data.size() == (size_t)bitmap->nx * bitmap->ny * 3);
|
||||
if (bitmap->nx <= 0 || bitmap->ny <= 0) {
|
||||
LOG_ERR("%s: error: invalid bitmap dimensions: nx = %d, ny = %d\n",
|
||||
__func__, bitmap->nx, bitmap->ny);
|
||||
return 2;
|
||||
}
|
||||
GGML_ASSERT(ctx->image_preproc != nullptr);
|
||||
|
||||
// convert mtmd_bitmap to clip_image_u8
|
||||
clip_image_u8_ptr img_u8(clip_image_u8_init());
|
||||
img_u8->nx = bitmap->nx;
|
||||
img_u8->ny = bitmap->ny;
|
||||
img_u8->buf.resize(bitmap->data.size());
|
||||
std::memcpy(img_u8->buf.data(), bitmap->data.data(), img_u8->nx * img_u8->ny * 3);
|
||||
img_u8->set_size(
|
||||
{(int)bitmap->nx, (int)bitmap->ny},
|
||||
bitmap->is_placeholder());
|
||||
img_u8->cpy_buf(bitmap->get_ro_buf());
|
||||
|
||||
// preprocess image
|
||||
clip_image_f32_batch batch_f32;
|
||||
@@ -949,7 +1006,7 @@ struct mtmd_tokenizer {
|
||||
return 2;
|
||||
}
|
||||
|
||||
if (bitmap->data.size() == 0) {
|
||||
if (bitmap->nx == 0) {
|
||||
LOG_ERR("%s: error: empty audio data\n", __func__);
|
||||
return 2;
|
||||
}
|
||||
@@ -960,26 +1017,46 @@ struct mtmd_tokenizer {
|
||||
|
||||
// sanity check
|
||||
GGML_ASSERT(ctx->audio_preproc != nullptr);
|
||||
GGML_ASSERT(bitmap->data.size() > sizeof(float));
|
||||
GGML_ASSERT(bitmap->data.size() % sizeof(float) == 0);
|
||||
|
||||
// preprocess audio
|
||||
std::vector<mtmd_audio_mel> mel_spec_chunks;
|
||||
const float * samples = (const float *)bitmap->data.data();
|
||||
size_t n_samples = bitmap->data.size() / sizeof(float);
|
||||
bool ok = ctx->audio_preproc->preprocess(samples, n_samples, mel_spec_chunks);
|
||||
if (!ok) {
|
||||
LOG_ERR("Unable to preprocess audio\n");
|
||||
return 2;
|
||||
{
|
||||
std::vector<float> dummy;
|
||||
const float * samples = nullptr;
|
||||
size_t n_samples = 0;
|
||||
if (bitmap->is_placeholder()) {
|
||||
// TODO @ngxson : skip underlying processing if bitmap is placeholder
|
||||
GGML_ASSERT(bitmap->ny == 1);
|
||||
|
||||
dummy.resize(bitmap->nx);
|
||||
samples = dummy.data();
|
||||
n_samples = dummy.size();
|
||||
} else {
|
||||
const auto & buf = bitmap->get_ro_buf();
|
||||
GGML_ASSERT(buf.size() > sizeof(float));
|
||||
GGML_ASSERT(buf.size() % sizeof(float) == 0);
|
||||
|
||||
samples = (const float *)buf.data();
|
||||
n_samples = buf.size() / sizeof(float);
|
||||
}
|
||||
bool ok = ctx->audio_preproc->preprocess(samples, n_samples, mel_spec_chunks);
|
||||
if (!ok) {
|
||||
LOG_ERR("Unable to preprocess audio\n");
|
||||
return 2;
|
||||
}
|
||||
}
|
||||
|
||||
// consider each mel_spec as a separate audio chunk
|
||||
// TODO: maybe support batching, but this may come with memory cost
|
||||
for (auto & mel_spec : mel_spec_chunks) {
|
||||
const bool is_placeholder = mel_spec.data.empty();
|
||||
|
||||
clip_image_f32_ptr mel_f32(clip_image_f32_init());
|
||||
mel_f32->nx = mel_spec.n_len;
|
||||
mel_f32->ny = mel_spec.n_mel;
|
||||
mel_f32->buf = std::move(mel_spec.data);
|
||||
mel_f32->set_size(
|
||||
{mel_spec.n_len, mel_spec.n_mel},
|
||||
is_placeholder, /* is_audio */ true);
|
||||
mel_f32->cpy_buf(mel_spec.data);
|
||||
|
||||
size_t n_tokens = clip_n_output_tokens(ctx->ctx_a, mel_f32.get());
|
||||
|
||||
clip_image_f32_batch batch_f32;
|
||||
@@ -1098,12 +1175,28 @@ int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) {
|
||||
LOG_ERR("%s: model does not support vision input\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
if (chunk->tokens_image == nullptr) {
|
||||
LOG_ERR("%s: image tokens are null\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
if (chunk->tokens_image->is_placeholder()) {
|
||||
LOG_ERR("%s: image tokens batch is placeholder\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
return mtmd_encode(ctx, chunk->tokens_image.get());
|
||||
} else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
|
||||
if (!ctx->ctx_a) {
|
||||
LOG_ERR("%s: model does not support audio input\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
if (chunk->tokens_audio == nullptr) {
|
||||
LOG_ERR("%s: audio tokens are null\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
if (chunk->tokens_audio->is_placeholder()) {
|
||||
LOG_ERR("%s: audio tokens batch is placeholder\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
int n_mmproj_embd = ctx->n_embd_text;
|
||||
ctx->image_embd_v.resize(chunk->tokens_audio->n_tokens * n_mmproj_embd);
|
||||
bool ok = clip_image_batch_encode(
|
||||
@@ -1141,6 +1234,10 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens)
|
||||
// e.g., DeepSeek-OCR-2: 144 per tile views, 257 for the global view
|
||||
size_t offset = 0;
|
||||
for (size_t i = 0; i < entries.size(); i++) {
|
||||
if (entries[i]->is_placeholder()) {
|
||||
LOG_ERR("%s: image tokens batch entry %zu is placeholder\n", __func__, i);
|
||||
return 1;
|
||||
}
|
||||
int n_tokens_per_image = clip_n_output_tokens(ctx_clip, entries[i].get());
|
||||
ok = clip_image_encode(
|
||||
ctx_clip,
|
||||
@@ -1150,6 +1247,10 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens)
|
||||
offset += static_cast<size_t>(n_mmproj_embd) * n_tokens_per_image;
|
||||
}
|
||||
} else {
|
||||
if (image_tokens->is_placeholder()) {
|
||||
LOG_ERR("%s: image tokens batch is placeholder\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
ok = clip_image_batch_encode(
|
||||
ctx_clip,
|
||||
ctx->n_threads,
|
||||
@@ -1207,24 +1308,17 @@ int mtmd_get_audio_sample_rate(const mtmd_context * ctx) {
|
||||
mtmd_bitmap * mtmd_bitmap_init(uint32_t nx,
|
||||
uint32_t ny,
|
||||
const unsigned char * data) {
|
||||
mtmd_bitmap * bitmap = new mtmd_bitmap;
|
||||
bitmap->nx = nx;
|
||||
bitmap->ny = ny;
|
||||
size_t data_size = (size_t)nx * ny * 3;
|
||||
bitmap->data.resize(data_size);
|
||||
std::memcpy(bitmap->data.data(), data, data_size);
|
||||
mtmd_bitmap * bitmap = new mtmd_bitmap(data, nx, ny);
|
||||
return bitmap;
|
||||
}
|
||||
|
||||
mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples,
|
||||
const float * data) {
|
||||
mtmd_bitmap * bitmap = new mtmd_bitmap;
|
||||
bitmap->nx = n_samples;
|
||||
bitmap->ny = 1;
|
||||
bitmap->is_audio = true;
|
||||
size_t data_size = n_samples * sizeof(float);
|
||||
bitmap->data.resize(data_size);
|
||||
std::memcpy(bitmap->data.data(), data, data_size);
|
||||
mtmd_bitmap * bitmap = new mtmd_bitmap((const unsigned char *)data, n_samples);
|
||||
GGML_ASSERT(bitmap->is_audio);
|
||||
if (!bitmap->is_placeholder()) {
|
||||
GGML_ASSERT(bitmap->get_ro_buf().size() == n_samples * sizeof(float));
|
||||
}
|
||||
return bitmap;
|
||||
}
|
||||
|
||||
@@ -1237,11 +1331,11 @@ uint32_t mtmd_bitmap_get_ny(const mtmd_bitmap * bitmap) {
|
||||
}
|
||||
|
||||
const unsigned char * mtmd_bitmap_get_data(const mtmd_bitmap * bitmap) {
|
||||
return bitmap->data.data();
|
||||
return bitmap->get_ro_buf().data();
|
||||
}
|
||||
|
||||
size_t mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap) {
|
||||
return bitmap->data.size();
|
||||
return bitmap->get_ro_buf().size();
|
||||
}
|
||||
|
||||
bool mtmd_bitmap_is_audio(const mtmd_bitmap * bitmap) {
|
||||
@@ -1535,14 +1629,16 @@ void mtmd_debug_encode_image(mtmd_context * ctx, const std::vector<std::vector<f
|
||||
LOG_ERR("%s: model does not support vision input\n", __func__);
|
||||
return;
|
||||
}
|
||||
clip_image_f32 inp_image;
|
||||
inp_image.nx = image.size();
|
||||
inp_image.ny = inp_image.nx;
|
||||
inp_image.buf.reserve(inp_image.nx * inp_image.ny);
|
||||
const int img_sz = (int)image.size();
|
||||
std::vector<float> img_buf;
|
||||
img_buf.reserve(img_sz * img_sz);
|
||||
for (const auto & row : image) {
|
||||
inp_image.buf.insert(inp_image.buf.end(), row.begin(), row.end());
|
||||
img_buf.insert(img_buf.end(), row.begin(), row.end());
|
||||
}
|
||||
LOG_INF("%s: created input image with nx=%d, ny=%d\n", __func__, inp_image.nx, inp_image.ny);
|
||||
clip_image_f32 inp_image;
|
||||
inp_image.set_size({img_sz, img_sz}, false, false);
|
||||
inp_image.cpy_buf(img_buf);
|
||||
LOG_INF("%s: created input image with nx=%d, ny=%d\n", __func__, img_sz, img_sz);
|
||||
mtmd_debug_encode_impl(ctx, ctx->ctx_v, inp_image);
|
||||
}
|
||||
|
||||
@@ -1552,16 +1648,17 @@ void mtmd_debug_encode_audio(mtmd_context * ctx, const std::vector<float> & inpu
|
||||
return;
|
||||
}
|
||||
int n_mel = clip_get_hparams(ctx->ctx_a)->n_mel_bins;
|
||||
clip_image_f32 inp_audio;
|
||||
inp_audio.nx = input.size();
|
||||
inp_audio.ny = n_mel;
|
||||
inp_audio.buf.resize(input.size() * n_mel);
|
||||
for (size_t i = 0; i < input.size(); i++) {
|
||||
const int audio_nx = (int)input.size();
|
||||
std::vector<float> audio_buf(audio_nx * n_mel);
|
||||
for (int i = 0; i < audio_nx; i++) {
|
||||
for (int j = 0; j < n_mel; j++) {
|
||||
inp_audio.buf[j * inp_audio.nx + i] = input[i];
|
||||
audio_buf[j * audio_nx + i] = input[i];
|
||||
}
|
||||
}
|
||||
LOG_INF("%s: created input audio with nx=%d, ny=%d\n", __func__, inp_audio.nx, inp_audio.ny);
|
||||
clip_image_f32 inp_audio;
|
||||
inp_audio.set_size({audio_nx, n_mel}, false, true);
|
||||
inp_audio.cpy_buf(audio_buf);
|
||||
LOG_INF("%s: created input audio with nx=%d, ny=%d\n", __func__, audio_nx, n_mel);
|
||||
mtmd_debug_encode_impl(ctx, ctx->ctx_a, inp_audio);
|
||||
}
|
||||
|
||||
@@ -1571,9 +1668,8 @@ void mtmd_debug_preprocess_image(mtmd_context * ctx, const std::vector<uint8_t>
|
||||
return;
|
||||
}
|
||||
clip_image_u8 img_u8;
|
||||
img_u8.nx = nx;
|
||||
img_u8.ny = ny;
|
||||
img_u8.buf = rgb_values;
|
||||
img_u8.set_size({nx, ny}, false);
|
||||
img_u8.cpy_buf(rgb_values);
|
||||
clip_image_f32_batch batch_f32;
|
||||
GGML_ASSERT(ctx->image_preproc != nullptr);
|
||||
bool ok = ctx->image_preproc->preprocess(img_u8, batch_f32);
|
||||
@@ -1583,7 +1679,7 @@ void mtmd_debug_preprocess_image(mtmd_context * ctx, const std::vector<uint8_t>
|
||||
}
|
||||
LOG_INF("%s: preprocessed image to batch_f32 with %d entries\n", __func__, (int)batch_f32.entries.size());
|
||||
for (size_t i = 0; i < batch_f32.entries.size(); i++) {
|
||||
LOG_INF("%s: entry %zu has nx=%d, ny=%d\n", __func__, i, batch_f32.entries[i]->nx, batch_f32.entries[i]->ny);
|
||||
LOG_INF("%s: entry %zu has nx=%d, ny=%d\n", __func__, i, batch_f32.entries[i]->nx(), batch_f32.entries[i]->ny());
|
||||
// TODO: better way to dump entry content?
|
||||
}
|
||||
}
|
||||
|
||||
@@ -136,6 +136,11 @@ MTMD_API int mtmd_get_audio_sample_rate(const mtmd_context * ctx);
|
||||
// if bitmap is audio:
|
||||
// length of data must be n_samples * sizeof(float)
|
||||
// the data is in float format (PCM F32)
|
||||
//
|
||||
// if data == nullptr:
|
||||
// the bitmap is considered "empty", and will be treated as a placeholder for counting tokens
|
||||
// you can pass the bitmap via mtmd_tokenize(), then call mtmd_*_get_n_tokens() to count the tokens
|
||||
// note: passing a placeholder bitmap to mtmd_encode() will return an error
|
||||
MTMD_API mtmd_bitmap * mtmd_bitmap_init (uint32_t nx, uint32_t ny, const unsigned char * data);
|
||||
MTMD_API mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples, const float * data);
|
||||
MTMD_API uint32_t mtmd_bitmap_get_nx (const mtmd_bitmap * bitmap);
|
||||
|
||||
@@ -1447,6 +1447,36 @@ See [OpenAI Embeddings API documentation](https://platform.openai.com/docs/api-r
|
||||
}'
|
||||
```
|
||||
|
||||
### POST `/v1/responses/input_tokens`: Token Counting
|
||||
|
||||
Similar to the OpenAI [Responses input token counts API](https://developers.openai.com/api/reference/python/resources/responses/subresources/input_tokens/methods/count).
|
||||
|
||||
Example response:
|
||||
|
||||
```json
|
||||
{
|
||||
"object": "response.input_tokens",
|
||||
"input_tokens": 11
|
||||
}
|
||||
```
|
||||
|
||||
### POST `/v1/chat/completions/input_tokens`: Token Counting
|
||||
|
||||
Similar to the OpenAI [Responses input token counts API](https://developers.openai.com/api/reference/python/resources/responses/subresources/input_tokens/methods/count), but accepts a chat completions request body as input.
|
||||
|
||||
Note: This is not an official OpenAI endpoint; it is provided for completeness and convenience.
|
||||
|
||||
Example response:
|
||||
|
||||
```json
|
||||
{
|
||||
"object": "response.input_tokens",
|
||||
"input_tokens": 11
|
||||
}
|
||||
```
|
||||
|
||||
## Anthropic-compatible API Endpoints
|
||||
|
||||
### POST `/v1/messages`: Anthropic-compatible Messages API
|
||||
|
||||
Given a list of `messages`, returns the assistant's response. Streaming is supported via Server-Sent Events. While no strong claims of compatibility with the Anthropic API spec are made, in our experience it suffices to support many apps.
|
||||
|
||||
@@ -713,10 +713,10 @@ static std::string fnv_hash(const uint8_t * data, size_t len) {
|
||||
return std::to_string(hash);
|
||||
}
|
||||
|
||||
server_tokens process_mtmd_prompt(mtmd_context * mctx, std::string prompt, std::vector<raw_buffer> files) {
|
||||
server_tokens process_mtmd_prompt(mtmd_context * mctx, const std::string & prompt, const std::vector<raw_buffer> & files, bool is_placeholder) {
|
||||
mtmd::bitmaps bitmaps;
|
||||
for (auto & file : files) {
|
||||
mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(mctx, file.data(), file.size()));
|
||||
mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(mctx, file.data(), file.size(), is_placeholder));
|
||||
if (!bmp.ptr) {
|
||||
throw std::runtime_error("Failed to load image or audio file");
|
||||
}
|
||||
|
||||
@@ -258,7 +258,8 @@ llama_tokens tokenize_mixed(const llama_vocab * vocab, const json & json_prompt,
|
||||
size_t validate_utf8(const std::string& text);
|
||||
|
||||
// process mtmd prompt, return the server_tokens containing both text tokens and media chunks
|
||||
server_tokens process_mtmd_prompt(mtmd_context * mctx, std::string prompt, std::vector<raw_buffer> files);
|
||||
// if is_placeholder is true, media chunks are treated as placeholders for counting tokens; the output tokens are not usable for actual inference (e.g. for submitting a task to server_queue)
|
||||
server_tokens process_mtmd_prompt(mtmd_context * mctx, const std::string & prompt, const std::vector<raw_buffer> & files, bool is_placeholder = false);
|
||||
|
||||
/**
|
||||
* break the input "prompt" object into multiple prompt if needed, then tokenize them
|
||||
|
||||
@@ -4333,6 +4333,10 @@ void server_routes::init_routes() {
|
||||
TASK_RESPONSE_TYPE_OAI_CHAT);
|
||||
};
|
||||
|
||||
this->post_chat_completions_tok = [this](const server_http_req & req) {
|
||||
return handle_count_tokens(ctx_server.vocab, ctx_server.mctx, req, TASK_RESPONSE_TYPE_OAI_CHAT);
|
||||
};
|
||||
|
||||
this->post_control = [this](const server_http_req & req) {
|
||||
auto res = create_response();
|
||||
const json body = json::parse(req.body);
|
||||
@@ -4388,6 +4392,10 @@ void server_routes::init_routes() {
|
||||
TASK_RESPONSE_TYPE_OAI_RESP);
|
||||
};
|
||||
|
||||
this->post_responses_tok_oai = [this](const server_http_req & req) {
|
||||
return handle_count_tokens(ctx_server.vocab, ctx_server.mctx, req, TASK_RESPONSE_TYPE_OAI_RESP);
|
||||
};
|
||||
|
||||
this->post_transcriptions_oai = [this](const server_http_req & req) {
|
||||
auto res = create_response();
|
||||
|
||||
@@ -4435,20 +4443,7 @@ void server_routes::init_routes() {
|
||||
};
|
||||
|
||||
this->post_anthropic_count_tokens = [this](const server_http_req & req) {
|
||||
auto res = create_response();
|
||||
std::vector<raw_buffer> files;
|
||||
json body = server_chat_convert_anthropic_to_oai(json::parse(req.body));
|
||||
SRV_DBG("%s\n", "Request converted: Anthropic -> OpenAI Chat Completions");
|
||||
SRV_DBG("converted request: %s\n", body.dump().c_str());
|
||||
json body_parsed = oaicompat_chat_params_parse(
|
||||
body,
|
||||
meta->chat_params,
|
||||
files);
|
||||
|
||||
json prompt = body_parsed.at("prompt");
|
||||
llama_tokens tokens = tokenize_mixed(ctx_server.vocab, prompt, true, true);
|
||||
res->ok({{"input_tokens", static_cast<int>(tokens.size())}});
|
||||
return res;
|
||||
return handle_count_tokens(ctx_server.vocab, ctx_server.mctx, req, TASK_RESPONSE_TYPE_ANTHROPIC);
|
||||
};
|
||||
|
||||
// same as handle_chat_completions, but without the inference part
|
||||
@@ -4928,3 +4923,54 @@ std::unique_ptr<server_res_generator> server_routes::handle_embeddings_impl(cons
|
||||
res->ok(root);
|
||||
return res;
|
||||
}
|
||||
|
||||
std::unique_ptr<server_res_generator> server_routes::handle_count_tokens(const llama_vocab * vocab, mtmd_context * mctx, const server_http_req & req, task_response_type res_type) {
|
||||
auto res = create_response();
|
||||
std::vector<raw_buffer> files;
|
||||
json body = json::parse(req.body);
|
||||
bool is_oai = false;
|
||||
|
||||
switch (res_type) {
|
||||
case TASK_RESPONSE_TYPE_OAI_CHAT:
|
||||
{
|
||||
is_oai = true;
|
||||
} break;
|
||||
case TASK_RESPONSE_TYPE_OAI_RESP:
|
||||
{
|
||||
is_oai = true;
|
||||
body = server_chat_convert_responses_to_chatcmpl(body);
|
||||
} break;
|
||||
case TASK_RESPONSE_TYPE_ANTHROPIC:
|
||||
{
|
||||
body = server_chat_convert_anthropic_to_oai(body);
|
||||
} break;
|
||||
default:
|
||||
res->error(format_error_response("invalid res_type", ERROR_TYPE_INVALID_REQUEST));
|
||||
return res;
|
||||
}
|
||||
|
||||
json body_parsed = oaicompat_chat_params_parse(
|
||||
body,
|
||||
meta->chat_params,
|
||||
files);
|
||||
json prompt = body_parsed.at("prompt");
|
||||
// SRV_DBG("prompt = %s\n", prompt.dump().c_str());
|
||||
|
||||
// TODO @ngxson : refactor this code block, move this to server-common and reuse it in other places
|
||||
size_t n_tokens;
|
||||
if (mctx != nullptr) {
|
||||
if (!prompt.is_string()) {
|
||||
throw std::runtime_error("for mtmd, input prompt must be a string.");
|
||||
}
|
||||
n_tokens = process_mtmd_prompt(mctx, prompt.get<std::string>(), files, true).size();
|
||||
} else {
|
||||
n_tokens = tokenize_mixed(vocab, prompt, true, true).size();
|
||||
}
|
||||
|
||||
json response = {{"input_tokens", static_cast<int>(n_tokens)}};
|
||||
if (is_oai) {
|
||||
response["object"] = "response.input_tokens";
|
||||
}
|
||||
res->ok(response);
|
||||
return res;
|
||||
}
|
||||
|
||||
@@ -110,8 +110,10 @@ struct server_routes {
|
||||
server_http_context::handler_t post_completions;
|
||||
server_http_context::handler_t post_completions_oai;
|
||||
server_http_context::handler_t post_chat_completions;
|
||||
server_http_context::handler_t post_chat_completions_tok;
|
||||
server_http_context::handler_t post_control;
|
||||
server_http_context::handler_t post_responses_oai;
|
||||
server_http_context::handler_t post_responses_tok_oai;
|
||||
server_http_context::handler_t post_transcriptions_oai;
|
||||
server_http_context::handler_t post_anthropic_messages;
|
||||
server_http_context::handler_t post_anthropic_count_tokens;
|
||||
@@ -139,6 +141,7 @@ private:
|
||||
std::unique_ptr<server_res_generator> handle_slots_restore(const server_http_req & req, int id_slot);
|
||||
std::unique_ptr<server_res_generator> handle_slots_erase(const server_http_req &, int id_slot);
|
||||
std::unique_ptr<server_res_generator> handle_embeddings_impl(const server_http_req & req, task_response_type res_type);
|
||||
std::unique_ptr<server_res_generator> handle_count_tokens(const llama_vocab * vocab, mtmd_context * mctx, const server_http_req & req, task_response_type res_type);
|
||||
|
||||
// using unique_ptr to allow late initialization of const
|
||||
std::unique_ptr<const server_context_meta> meta;
|
||||
|
||||
@@ -161,6 +161,8 @@ int llama_server(int argc, char ** argv) {
|
||||
routes.post_tokenize = models_routes->proxy_post;
|
||||
routes.post_detokenize = models_routes->proxy_post;
|
||||
routes.post_apply_template = models_routes->proxy_post;
|
||||
routes.post_chat_completions_tok = models_routes->proxy_post;
|
||||
routes.post_responses_tok_oai = models_routes->proxy_post;
|
||||
routes.get_lora_adapters = models_routes->proxy_get;
|
||||
routes.post_lora_adapters = models_routes->proxy_post;
|
||||
routes.get_slots = models_routes->proxy_get;
|
||||
@@ -192,7 +194,6 @@ int llama_server(int argc, char ** argv) {
|
||||
ctx_http.post("/v1/audio/transcriptions", ex_wrapper(routes.post_transcriptions_oai));
|
||||
ctx_http.post("/audio/transcriptions", ex_wrapper(routes.post_transcriptions_oai));
|
||||
ctx_http.post("/v1/messages", ex_wrapper(routes.post_anthropic_messages)); // anthropic messages API
|
||||
ctx_http.post("/v1/messages/count_tokens", ex_wrapper(routes.post_anthropic_count_tokens)); // anthropic token counting
|
||||
ctx_http.post("/infill", ex_wrapper(routes.post_infill));
|
||||
ctx_http.post("/embedding", ex_wrapper(routes.post_embeddings)); // legacy
|
||||
ctx_http.post("/embeddings", ex_wrapper(routes.post_embeddings));
|
||||
@@ -204,6 +205,12 @@ int llama_server(int argc, char ** argv) {
|
||||
ctx_http.post("/tokenize", ex_wrapper(routes.post_tokenize));
|
||||
ctx_http.post("/detokenize", ex_wrapper(routes.post_detokenize));
|
||||
ctx_http.post("/apply-template", ex_wrapper(routes.post_apply_template));
|
||||
// token counting
|
||||
ctx_http.post("/chat/completions/input_tokens", ex_wrapper(routes.post_chat_completions_tok));
|
||||
ctx_http.post("/v1/chat/completions/input_tokens", ex_wrapper(routes.post_chat_completions_tok));
|
||||
ctx_http.post("/responses/input_tokens", ex_wrapper(routes.post_responses_tok_oai));
|
||||
ctx_http.post("/v1/responses/input_tokens", ex_wrapper(routes.post_responses_tok_oai));
|
||||
ctx_http.post("/v1/messages/count_tokens", ex_wrapper(routes.post_anthropic_count_tokens)); // anthropic token counting
|
||||
// LoRA adapters hotswap
|
||||
ctx_http.get ("/lora-adapters", ex_wrapper(routes.get_lora_adapters));
|
||||
ctx_http.post("/lora-adapters", ex_wrapper(routes.post_lora_adapters));
|
||||
|
||||
@@ -573,3 +573,19 @@ def test_chat_completions_multiple_choices():
|
||||
for choice in res.body["choices"]:
|
||||
assert "assistant" == choice["message"]["role"]
|
||||
assert choice["finish_reason"] == "length"
|
||||
|
||||
|
||||
def test_chat_completions_token_count():
# POSTs a system + user chat to /chat/completions/input_tokens and expects
# HTTP 200 with a reported "input_tokens" value greater than 5.
|
||||
global server
|
||||
server.start()
|
||||
# the same token-count request is issued twice (see the range(2) loop below)
# NOTE(review): the previous wording here ("make sure cache can be reused across
# multiple choices and multiple requests") does not match what this test sends or
# asserts - confirm whether the PR reference below still applies to this test
|
||||
# ref: https://github.com/ggml-org/llama.cpp/pull/18663
|
||||
for _ in range(2):
|
||||
res = server.make_request("POST", "/chat/completions/input_tokens", data={
|
||||
"messages": [
|
||||
{"role": "system", "content": "Book"},
|
||||
{"role": "user", "content": "What is the best book"},
|
||||
],
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert res.body["input_tokens"] > 5
|
||||
|
||||
@@ -98,6 +98,25 @@ def test_vision_chat_completion(prompt, image_url, success, re_content):
|
||||
assert res.status_code != 200
|
||||
|
||||
|
||||
def test_vision_chat_completion_token_count():
# POSTs a user message made of a text part and an image_url part to
# /chat/completions/input_tokens and expects HTTP 200 with "input_tokens" > 10.
|
||||
global server
|
||||
server.start()
|
||||
res = server.make_request("POST", "/chat/completions/input_tokens", data={
|
||||
# NOTE(review): sampling parameters presumably have no effect on a count-only
# request - confirm whether they are needed here
"temperature": 0.0,
|
||||
"top_k": 1,
|
||||
"messages": [
|
||||
{"role": "user", "content": [
|
||||
{"type": "text", "text": "What is this:"},
|
||||
{"type": "image_url", "image_url": {
|
||||
"url": get_img_url("IMG_URL_0"),
|
||||
}},
|
||||
]},
|
||||
],
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert res.body["input_tokens"] > 10
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"prompt, image_data, success, re_content",
|
||||
[
|
||||
|
||||
Reference in New Issue
Block a user