Compare commits

...

1 Commits

Author SHA1 Message Date
Xuan-Son Nguyen e37abd6b5f mtmd: add batching API (#24384)
* mtmd: add batching API

* wip

* first working version (gemma4v)

* add arg

* nits

* wire up support_batch()

* fix 0.0 output embd

* fix audio

* nits

* refactor a bit

* nits

* fix non-batching case

* fix comment
2026-06-13 00:10:29 +02:00
14 changed files with 544 additions and 126 deletions
+7
View File
@@ -2243,6 +2243,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.image_max_tokens = value;
}
).set_examples(mmproj_examples).set_env("LLAMA_ARG_IMAGE_MAX_TOKENS"));
// --mtmd-batch-max-tokens N
// Stores N in params.mtmd_batch_max_tokens; the help text prints the value currently held in params as the default.
// The option is registered for LLAMA_EXAMPLE_SERVER only and is tied to LLAMA_ARG_MTMD_BATCH_MAX_TOKENS via set_env().
// NOTE(review): the handler stores the value as-is (no range check) - confirm that zero/negative values are handled downstream.
add_opt(common_arg(
{"--mtmd-batch-max-tokens"}, "N",
string_format("maximum number of image tokens per batch when encoding images (default: %d)", params.mtmd_batch_max_tokens),
[](common_params & params, int value) {
params.mtmd_batch_max_tokens = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MTMD_BATCH_MAX_TOKENS"));
if (llama_supports_rpc()) {
add_opt(common_arg(
{"--rpc"}, "SERVERS",
+1
View File
@@ -575,6 +575,7 @@ struct common_params {
std::vector<std::string> image; // path to image file(s) ; TODO: change the name to "media"
int image_min_tokens = -1;
int image_max_tokens = -1;
int mtmd_batch_max_tokens = 1024;
// finetune
struct lr_opt lr;
+4
View File
@@ -54,6 +54,10 @@ struct clip_graph {
virtual ggml_tensor * build_mm(ggml_tensor * w, ggml_tensor * x) const;
// TODO: build_mm(w, b, x) to support bias
// Tells whether this graph builder can encode several inputs in one graph.
// The base implementation answers "no"; a builder that handles a batch dimension overrides it.
virtual bool support_batch() const { return false; }
//
// utility functions
//
+54 -23
View File
@@ -171,6 +171,8 @@ struct clip_ctx {
std::map<ggml_backend_dev_t, size_t> mem_usage;
std::map<ggml_backend_dev_t, size_t> mem_compute;
bool support_batch = false;
clip_ctx(clip_context_params & ctx_params) {
flash_attn_type = ctx_params.flash_attn_type;
no_alloc = ctx_params.no_alloc;
@@ -314,7 +316,7 @@ ggml_tensor * clip_graph::build_vit(
std::function<ggml_tensor *(ggml_tensor *, const clip_layer &)> add_pos,
const build_vit_opts & opts
) {
// batch dim: inp is [n_embd, n_pos] (B==1) or [n_embd, n_pos, B] (multi-tile encode)
// batch dim: inp is [n_embd, n_pos, B]
const int64_t B = inp->ne[2];
if (learned_pos_embd) {
@@ -862,7 +864,7 @@ ggml_tensor * clip_graph::build_patch_merge_permute(ggml_tensor * cur, int scale
return cur;
}
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch & imgs) {
static std::unique_ptr<clip_graph> clip_get_graph_builder(clip_ctx * ctx, const clip_image_f32_batch & imgs) {
const clip_image_f32 & img = *imgs.entries[0];
std::unique_ptr<clip_graph> builder;
@@ -1025,7 +1027,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
// TODO [QWEN_VIDEO]: improve this in the future
builder->n_batch = imgs.entries.size();
return builder->build();
return builder;
}
//
@@ -2819,7 +2821,7 @@ struct clip_model_loader {
std::vector<support_info_op> ops;
};
static void warmup(clip_ctx & ctx_clip) {
static clip_image_f32_batch get_dummy_batch(clip_ctx & ctx_clip) {
// create a fake batch
const auto & hparams = ctx_clip.model.hparams;
clip_image_f32_batch batch;
@@ -2833,6 +2835,20 @@ struct clip_model_loader {
LOG_INF("%s: warmup with audio size = %d\n", __func__, hparams.warmup_audio_size);
}
batch.entries.push_back(std::move(img));
return batch;
}
// Prepares a context whose tensors have been loaded:
//   1. sizes the graph metadata buffer for max_nodes tensors plus one graph
//   2. records whether the model-specific graph builder supports batching
static void init_ctx(clip_ctx & ctx_clip) {
    const auto meta_size = ctx_clip.max_nodes * ggml_tensor_overhead() + ggml_graph_overhead();
    ctx_clip.buf_compute_meta.resize(meta_size);

    // a dummy input is enough to obtain the graph builder, which is then asked about batching
    clip_image_f32_batch dummy = get_dummy_batch(ctx_clip);
    ctx_clip.support_batch = clip_get_graph_builder(&ctx_clip, dummy)->support_batch();
}
// Convenience overload: warms up with a generated dummy input instead of a caller-provided batch.
static void warmup(clip_ctx & ctx_clip) {
    clip_image_f32_batch dummy = get_dummy_batch(ctx_clip);
    warmup(ctx_clip, dummy);
}
@@ -2905,9 +2921,7 @@ struct clip_model_loader {
// only initialize backend buffers, but do not allocate them yet
static support_info_graph reserve_compute_meta(clip_ctx & ctx_clip, const clip_image_f32_batch & batch) {
ctx_clip.buf_compute_meta.resize(ctx_clip.max_nodes * ggml_tensor_overhead() + ggml_graph_overhead());
ggml_cgraph * gf = clip_image_build_graph(&ctx_clip, batch);
ggml_cgraph * gf = clip_get_graph_builder(&ctx_clip, batch)->build();
ggml_backend_sched_reserve(ctx_clip.sched.get(), gf);
ctx_clip.mem_compute.clear();
@@ -3070,6 +3084,7 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
ctx_vision = new clip_ctx(ctx_params);
loader.load_hparams(ctx_vision->model, CLIP_MODALITY_VISION);
loader.load_tensors(*ctx_vision);
loader.init_ctx(*ctx_vision);
if (ctx_params.warmup) {
loader.warmup(*ctx_vision);
}
@@ -3083,6 +3098,7 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
ctx_audio = new clip_ctx(ctx_params);
loader.load_hparams(ctx_audio->model, CLIP_MODALITY_AUDIO);
loader.load_tensors(*ctx_audio);
loader.init_ctx(*ctx_audio);
if (ctx_params.warmup) {
loader.warmup(*ctx_audio);
}
@@ -3484,25 +3500,22 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
return n_patches;
}
bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, std::vector<float> & out_vec) {
clip_image_f32_batch imgs;
clip_image_f32_ptr img_copy(clip_image_f32_init());
*img_copy = *img;
imgs.entries.push_back(std::move(img_copy));
return clip_image_batch_encode(ctx, n_threads, &imgs, vec);
return clip_image_batch_encode(ctx, n_threads, &imgs, out_vec);
}
bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, float * vec) {
bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, std::vector<float> & out_batch_embd) {
const clip_image_f32_batch & imgs = *imgs_c_ptr;
int n_batch_cur = imgs.entries.size();
// maximum supported batch size, usually == 2 for qwen-vl-based models
int n_batch_max = clip_model_n_batch_max(ctx);
// TODO @ngxson : implement batch size > 1 as a loop
// we don't need true batching support because the cgraph will gonna be big anyway
if (n_batch_cur > n_batch_max) {
// [QWEN_VIDEO] for video models, the batch dimension is used as temporal dimension for merged frames
if (!ctx->support_batch && n_batch_cur > clip_model_n_temporal_merge(ctx)) {
LOG_ERR("%s: batch size %d exceeds maximum supported batch/temporal-merge size %d\n", __func__, n_batch_cur, clip_model_n_temporal_merge(ctx));
return false;
}
@@ -3513,7 +3526,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
// build the inference graph
ggml_backend_sched_reset(ctx->sched.get());
ggml_cgraph * gf = clip_image_build_graph(ctx, imgs);
ggml_cgraph * gf = clip_get_graph_builder(ctx, imgs)->build();
ggml_backend_sched_alloc_graph(ctx->sched.get(), gf);
// set inputs
@@ -3582,6 +3595,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
const int n = nx * ny;
for (int b = 0; b < n_batch_cur; b++) {
LOG_DBG("%s: copying image %d/%d to input buffer (nx=%d, ny=%d)\n", __func__, b+1, n_batch_cur, nx, ny);
const auto & buf = imgs.entries[b]->get_ro_buf();
float * batch_entry = inp_raw.data() + b * (3*n);
for (int y = 0; y < ny; y++) {
@@ -4416,7 +4430,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
// the last node is the embedding tensor
ggml_tensor * embeddings = ggml_graph_node(gf, -1);
// sanity check (only support batch size of 1 for now)
// sanity check (assuming that all images in batch have the same number of tokens, so we only check the first one)
const int n_tokens_out = embeddings->ne[1];
const int expected_n_tokens_out = clip_n_output_tokens(ctx, imgs.entries[0].get());
if (n_tokens_out != expected_n_tokens_out) {
@@ -4424,16 +4438,26 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
GGML_ABORT("Invalid number of output tokens");
}
// copy the embeddings to the location passed by the user
if (vec != nullptr) {
ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
LOG_DBG("%s: output embedding shape [%d, %d, %d]\n", __func__,
(int)embeddings->ne[0], (int)embeddings->ne[1], (int)embeddings->ne[2]);
// copy output to user buffer if provided
// if output is empty, skip the copy
if (!out_batch_embd.empty()) {
if (out_batch_embd.size() != (size_t)ggml_nelements(embeddings)) {
LOG_ERR("%s: output buffer has %zu elements but expected %zu\n", __func__, out_batch_embd.size(), (size_t)ggml_nelements(embeddings));
GGML_ABORT("Output buffer size mismatch");
}
ggml_backend_tensor_get(embeddings, out_batch_embd.data(), 0, ggml_nbytes(embeddings));
} else {
LOG_WRN("%s: output buffer is empty, skipping copy\n", __func__);
}
// Debug: dump final embeddings if MTMD_DEBUG_EMBEDDINGS is set
if (ctx->debug_output_embeddings) {
const int64_t n_embd = embeddings->ne[0];
const int64_t n_tokens = embeddings->ne[1];
std::vector<float> emb_data(n_embd * n_tokens);
std::vector<float> emb_data(ggml_nelements(embeddings));
ggml_backend_tensor_get(embeddings, emb_data.data(), 0, ggml_nbytes(embeddings));
LOG_INF("\n=== MTMD_DEBUG_EMBEDDINGS ===\n");
@@ -4570,7 +4594,14 @@ bool clip_has_audio_encoder(const struct clip_ctx * ctx) {
return ctx->model.modality == CLIP_MODALITY_AUDIO;
}
int clip_model_n_batch_max(const struct clip_ctx * ctx) {
// Returns the batching capability stored on the context (the support_batch flag).
bool clip_support_batch(const struct clip_ctx * ctx) {
    const bool batching_supported = ctx->support_batch;
    return batching_supported;
}
// TODO @ngxson : this is no longer correct with the mtmd_batch API
// this was only meant to be used by qwen-vl-based models, to fuse 2 input images into one (qwen-vl video support)
// this logic should be refactored in the near future to handle "merge frames" and "batching" as two distinct cases
int clip_model_n_temporal_merge(const struct clip_ctx * ctx) {
switch (ctx->proj_type()) {
case PROJECTOR_TYPE_QWEN2VL:
case PROJECTOR_TYPE_QWEN25VL:
+5 -3
View File
@@ -97,8 +97,8 @@ size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int id
size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->ny
struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->data
bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);
bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, std::vector<float> & out_vec);
bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, std::vector<float> & out_batch_embd);
bool clip_is_llava(const struct clip_ctx * ctx);
// note for contributor: this clip_is_(model) pattern is deprecated
@@ -107,7 +107,9 @@ bool clip_is_llava(const struct clip_ctx * ctx);
bool clip_has_vision_encoder(const struct clip_ctx * ctx);
bool clip_has_audio_encoder(const struct clip_ctx * ctx);
int clip_model_n_batch_max(const struct clip_ctx * ctx);
bool clip_support_batch(const struct clip_ctx * ctx);
int clip_model_n_temporal_merge(const struct clip_ctx * ctx); // TODO @ngxson : remove, refactor this
std::map<ggml_backend_dev_t, size_t> clip_get_mem_usage(const struct clip_ctx * ctx);
+11 -9
View File
@@ -10,7 +10,7 @@ ggml_cgraph * clip_graph_gemma4v::build() {
ggml_set_name(inp_raw, "inp_raw_scaled");
ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
inp = ggml_reshape_2d(ctx0, inp, n_patches, n_embd);
inp = ggml_reshape_3d(ctx0, inp, n_patches, n_embd, n_batch);
inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp));
ggml_set_name(inp, "inp");
// note: no patch bias
@@ -51,10 +51,11 @@ ggml_cgraph * clip_graph_gemma4v::build() {
// first half
ggml_tensor * first;
{
first = ggml_view_3d(ctx0, cur,
n_dim/2, n_head, n_pos,
first = ggml_view_4d(ctx0, cur,
n_dim/2, n_head, n_pos, n_batch,
cur->nb[1],
cur->nb[2],
cur->nb[3],
0);
first = ggml_rope_ext(
ctx0,
@@ -70,10 +71,11 @@ ggml_cgraph * clip_graph_gemma4v::build() {
// second half
ggml_tensor * second;
{
second = ggml_view_3d(ctx0, cur,
n_dim/2, n_head, n_pos,
second = ggml_view_4d(ctx0, cur,
n_dim/2, n_head, n_pos, n_batch,
cur->nb[1],
cur->nb[2],
cur->nb[3],
n_dim/2 * ggml_element_size(cur));
second = ggml_rope_ext(
ctx0,
@@ -103,14 +105,14 @@ ggml_cgraph * clip_graph_gemma4v::build() {
const int kernel_size = hparams.n_merge;
GGML_ASSERT(kernel_size > 0);
// [n_embd, n_patches] -> [n_patches_x, n_patches_y, n_embd, 1]
cur = ggml_cont_4d(ctx0, ggml_transpose(ctx0, cur), n_patches_x, n_patches_y, n_embd, 1);
// [n_embd, n_patches] -> [n_patches_x, n_patches_y, n_embd, n_batch]
cur = ggml_cont_4d(ctx0, ggml_transpose(ctx0, cur), n_patches_x, n_patches_y, n_embd, n_batch);
cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG,
kernel_size, kernel_size, kernel_size, kernel_size, 0, 0);
const int out_x = n_patches_x / kernel_size;
const int out_y = n_patches_y / kernel_size;
// [out_x, out_y, n_embd, 1] -> [n_embd, out_x * out_y]
cur = ggml_reshape_3d(ctx0, cur, out_x * out_y, n_embd, 1);
// [out_x, out_y, n_embd, n_batch] -> [n_embd, out_x * out_y, n_batch]
cur = ggml_reshape_3d(ctx0, cur, out_x * out_y, n_embd, n_batch);
cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
cur = ggml_scale(ctx0, cur, sqrtf((float)n_embd));
cb(cur, "pooled", -1);
+1
View File
@@ -16,6 +16,7 @@ struct clip_graph_gemma4v : clip_graph {
clip_graph_gemma4v(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
ggml_tensor * build_mm(ggml_tensor * w, ggml_tensor * x) const override;
// the gemma4v graph is built with an n_batch dimension, so this builder reports batching support
bool support_batch() const override { return true; }
};
struct clip_graph_gemma4uv : clip_graph {
+8 -5
View File
@@ -67,8 +67,8 @@ MTMD_API void mtmd_helper_image_get_decoder_pos(const mtmd_image_tokens * image,
// helper function that automatically:
// 1. run llama_decode() on text chunks
// 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode()
// if any of the mtmd_encode() or llama_decode() calls return non-zero, stop and forward the error
// 2. run mtmd_encode_chunk() on image chunks, then mtmd_get_output_embd() and then llama_decode()
// if any of the mtmd_encode_chunk() or llama_decode() calls return non-zero, stop and forward the error
// otherwise, returns 0 on success
// this function is NOT thread-safe
MTMD_API int32_t mtmd_helper_eval_chunks(mtmd_context * ctx,
@@ -157,13 +157,16 @@ MTMD_API int32_t mtmd_helper_video_read_next(mtmd_helper_video * ctx,
} // extern "C"
#endif
#ifdef __cplusplus
#include <set>
#include <memory>
namespace mtmd_helper {
//
// C++ wrappers
//
#ifdef __cplusplus
namespace mtmd_helper {
// video-related C++ wrappers
struct mtmd_helper_video_deleter {
void operator()(mtmd_helper_video * val) { mtmd_helper_video_free(val); }
+296 -66
View File
@@ -69,8 +69,8 @@ struct mtmd_bitmap {
return data.size();
}
bool can_batch_with(const mtmd_bitmap & other) const {
// [QWEN_VIDEO] can batch if both are images with same size
bool can_merge_with(const mtmd_bitmap & other) const {
// [QWEN_VIDEO] can (temporal) merge if both are images with same size
return !is_audio && !other.is_audio && nx == other.nx && ny == other.ny;
}
@@ -90,12 +90,24 @@ struct mtmd_image_tokens {
uint32_t ny = 0; // number of tokens in y direction
mtmd_pos_type pos = MTMD_POS_TYPE_NORMAL;
uint32_t image_idx = 0; // 0-based position of this image among image chunks in the prompt(used by pos == MTMD_POS_TYPE_HUNYUANVL)
uint32_t n_temporal_merge = 1; // for qwen-vl style temporal merge
uint32_t n_tokens() const {
if (pos == MTMD_POS_TYPE_HUNYUANVL) {
// [BOI] [row0 tokens + newline] ... [row(ny-1) tokens + newline] [EOI]
return (nx + 1) * ny + 2;
}
return nx * ny;
// [QWEN_VIDEO] this logic is quite ugly, it's mostly to make qwen-vl temporal merge work, can be improved in the future
if (batch_f32.entries.size() == 1 || n_temporal_merge == 1) {
return nx * ny;
}
uint32_t nz = batch_f32.entries.size();
// TODO: simplify this by repeating the last frame until it fits the temporal merge
if (nz % n_temporal_merge != 0) {
nz = nz / n_temporal_merge + 1;
} else {
nz = nz / n_temporal_merge;
}
return nx * ny * nz;
}
clip_image_f32_batch batch_f32; // preprocessed image patches
std::string id; // optional user-defined ID, useful for KV cache tracking
@@ -110,12 +122,17 @@ struct mtmd_image_tokens {
return false;
}
bool can_batch_with(const mtmd_image_tokens & other) {
return nx == other.nx && ny == other.ny && pos == other.pos;
}
mtmd_image_tokens clone() {
return mtmd_image_tokens{
nx,
ny,
pos,
image_idx,
n_temporal_merge,
batch_f32.clone(),
id
};
@@ -153,12 +170,49 @@ struct mtmd_input_chunk {
std::vector<llama_token> tokens_text;
mtmd_image_tokens_ptr tokens_image;
mtmd_audio_tokens_ptr tokens_audio;
// True when `other` can be encoded in the same batch as this chunk.
// Only pairs of image chunks qualify; the final decision is delegated to the image tokens.
bool can_batch_with(const mtmd_input_chunk & other) const {
    const bool both_have_image = type == other.type && tokens_image && other.tokens_image;
    // TODO: allow batching audio chunks of the same size
    return both_have_image && tokens_image->can_batch_with(*other.tokens_image);
}
// True when this is a media chunk whose token object exists and reports itself as a placeholder.
// Any other chunk type, or a media chunk without tokens, yields false.
bool is_placeholder() const {
    switch (type) {
        case MTMD_INPUT_CHUNK_TYPE_IMAGE:
            return tokens_image && tokens_image->is_placeholder();
        case MTMD_INPUT_CHUNK_TYPE_AUDIO:
            return tokens_audio && tokens_audio->is_placeholder();
        default:
            return false;
    }
}
};
struct mtmd_input_chunks {
std::vector<mtmd_input_chunk> entries;
};
// A group of media chunks that are encoded together.
// The batch stores raw const pointers to the chunks; it does not own them.
struct mtmd_batch {
    mtmd_context * ctx;
    std::vector<const mtmd_input_chunk *> entries;
    std::vector<float> output_embd; // aggregated output embedding for the whole batch

    // explicit: a context pointer must never convert to a batch implicitly
    explicit mtmd_batch(mtmd_context * ctx): ctx(ctx) {}

    // total number of output tokens over all chunks currently in the batch
    int32_t n_tokens() const {
        // accumulate in size_t (the type returned per chunk) and narrow once, visibly, at the end
        size_t n = 0;
        for (const auto * chunk : entries) {
            n += mtmd_input_chunk_get_n_tokens(chunk);
        }
        return static_cast<int32_t>(n);
    }
};
// slice template, used by some llava-uhd models to correctly place the special tokens around image embeddings
// models not having it (llava-1.6) will process embeddings without any special tokens in-between
enum mtmd_slice_tmpl {
@@ -197,6 +251,7 @@ mtmd_context_params mtmd_context_params_default() {
/* image_max_tokens */ -1,
/* cb_eval */ nullptr,
/* cb_eval_user_data */ nullptr,
/* batch_max_tokens */ 1024,
};
return params;
}
@@ -204,7 +259,7 @@ mtmd_context_params mtmd_context_params_default() {
struct mtmd_context {
struct clip_ctx * ctx_v; // vision
struct clip_ctx * ctx_a; // audio
std::vector<float> image_embd_v; // image embedding vector
std::vector<float> out_embd; // image embedding vector
bool print_timings;
int n_threads;
@@ -239,17 +294,21 @@ struct mtmd_context {
std::unique_ptr<mtmd_audio_preprocessor> audio_preproc;
std::unique_ptr<mtmd_image_preprocessor> image_preproc;
// batching
int32_t batch_max_tokens;
// TODO @ngxson : add timings
mtmd_context(const char * mmproj_fname,
const llama_model * text_model,
const mtmd_context_params & ctx_params,
bool no_alloc = false) :
print_timings(ctx_params.print_timings),
n_threads (ctx_params.n_threads),
media_marker (ctx_params.media_marker),
n_embd_text (text_model ? llama_model_n_embd_inp(text_model) : -1),
vocab (text_model ? llama_model_get_vocab(text_model) : nullptr)
print_timings (ctx_params.print_timings),
n_threads (ctx_params.n_threads),
media_marker (ctx_params.media_marker),
n_embd_text (text_model ? llama_model_n_embd_inp(text_model) : -1),
vocab (text_model ? llama_model_get_vocab(text_model) : nullptr),
batch_max_tokens(ctx_params.batch_max_tokens)
{
if (ctx_params.image_marker != nullptr) {
throw std::runtime_error("custom image_marker is not supported anymore, use media_marker instead");
@@ -680,6 +739,16 @@ struct mtmd_context {
return ctx_a ? clip_get_projector_type(ctx_a) : PROJECTOR_TYPE_UNKNOWN;
}
// Embedding width reported by the loaded projector; the vision context is preferred over the audio one.
// Throws std::runtime_error when neither context is loaded.
int64_t n_embd_out() const {
    auto * loaded = ctx_v ? ctx_v : ctx_a;
    if (!loaded) {
        throw std::runtime_error("no CLIP model loaded");
    }
    return clip_n_mmproj_embd(loaded);
}
~mtmd_context() {
clip_free(ctx_a);
clip_free(ctx_v);
@@ -845,7 +914,7 @@ struct mtmd_tokenizer {
// [QWEN_VIDEO] handle frame merging for models that support it (i.e. qwen-vl)
int n_merge_frames = 1;
if (ctx->ctx_v) {
n_merge_frames = clip_model_n_batch_max(ctx->ctx_v);
n_merge_frames = clip_model_n_temporal_merge(ctx->ctx_v);
GGML_ASSERT(n_merge_frames <= 2 && "we only support merging maximum 2 images for now; open an issue if this model supports merging more");
}
@@ -860,7 +929,7 @@ struct mtmd_tokenizer {
if (i + 1 < parts.size() && parts[i + 1].bitmap != nullptr) {
const mtmd_bitmap * bm_a = parts[i].bitmap;
const mtmd_bitmap * bm_b = parts[i + 1].bitmap;
if (bm_a->can_batch_with(*bm_b)) {
if (bm_a->can_merge_with(*bm_b)) {
LOG_DBG("%s: merging 2 frames at part index %zu and %zu\n", __func__, i, i + 1);
merged_bitmaps.push_back({bm_a, bm_b});
parts.erase(parts.begin() + i + 1); // collapse the second bitmap part
@@ -1103,13 +1172,17 @@ struct mtmd_tokenizer {
size_t n_tokens = 0;
for (const auto & e : batch_f32.entries) {
n_tokens += clip_n_output_tokens(ctx->ctx_v, e.get());
if (clip_model_n_batch_max(ctx->ctx_v) == 2) {
if (clip_model_n_temporal_merge(ctx->ctx_v) == 2) {
// [QWEN_VIDEO] pair input is merged to the same embd, so only count as one image
break;
}
}
mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
// [QWEN_VIDEO] improve this in the future
image_tokens->n_temporal_merge = clip_model_n_temporal_merge(ctx->ctx_v);
if (mtmd_decode_use_mrope(ctx)) {
// for Qwen2VL, we need this information for M-RoPE decoding positions
image_tokens->nx = clip_n_output_tokens_x(ctx->ctx_v, batch_f32.entries[0].get());
@@ -1327,60 +1400,18 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
}
}
int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) {
if (chunk->type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
LOG_WRN("mtmd_encode_chunk has no effect for text chunks\n");
return 0;
} else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
if (!ctx->ctx_v) {
LOG_ERR("%s: model does not support vision input\n", __func__);
return 1;
}
if (chunk->tokens_image == nullptr) {
LOG_ERR("%s: image tokens are null\n", __func__);
return 1;
}
if (chunk->tokens_image->is_placeholder()) {
LOG_ERR("%s: image tokens batch is placeholder\n", __func__);
return 1;
}
return mtmd_encode(ctx, chunk->tokens_image.get());
} else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
if (!ctx->ctx_a) {
LOG_ERR("%s: model does not support audio input\n", __func__);
return 1;
}
if (chunk->tokens_audio == nullptr) {
LOG_ERR("%s: audio tokens are null\n", __func__);
return 1;
}
if (chunk->tokens_audio->is_placeholder()) {
LOG_ERR("%s: audio tokens batch is placeholder\n", __func__);
return 1;
}
int n_mmproj_embd = ctx->n_embd_text;
ctx->image_embd_v.resize(chunk->tokens_audio->n_tokens * n_mmproj_embd);
bool ok = clip_image_batch_encode(
ctx->ctx_a,
ctx->n_threads,
&chunk->tokens_audio->batch_f32,
ctx->image_embd_v.data());
return ok ? 0 : 1;
}
LOG_ERR("%s: unknown chunk type %d\n", __func__, (int)chunk->type);
return 1;
}
int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) {
static int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * image_tokens, std::vector<float> & out_embd) {
clip_ctx * ctx_clip = ctx->ctx_v;
if (!ctx_clip) {
LOG_ERR("%s: this API does not support non-vision input, please use mtmd_encode_chunk instead\n", __func__);
return 1;
}
auto proj_type = clip_get_projector_type(ctx_clip);
int n_mmproj_embd = clip_n_mmproj_embd(ctx_clip);
ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
int n_embd_out = ctx->n_embd_out();
auto n_tokens_out = image_tokens->n_tokens();
out_embd.resize((size_t)n_embd_out * n_tokens_out);
bool ok = false;
if (clip_is_llava(ctx_clip)
@@ -1400,12 +1431,19 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens)
return 1;
}
int n_tokens_per_image = clip_n_output_tokens(ctx_clip, entries[i].get());
ok = clip_image_encode(
std::vector<float> tmp_embd((size_t)n_tokens_per_image * n_embd_out);
bool ok_i = clip_image_encode(
ctx_clip,
ctx->n_threads,
entries[i].get(),
ctx->image_embd_v.data() + offset);
offset += static_cast<size_t>(n_mmproj_embd) * n_tokens_per_image;
tmp_embd);
if (!ok_i) {
LOG_ERR("%s: failed to encode image %zu\n", __func__, i);
return 1;
}
ok = true;
std::copy(tmp_embd.begin(), tmp_embd.end(), out_embd.begin() + offset);
offset += static_cast<size_t>(n_embd_out) * n_tokens_per_image;
}
} else {
if (image_tokens->is_placeholder()) {
@@ -1416,14 +1454,206 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens)
ctx_clip,
ctx->n_threads,
&image_tokens->batch_f32,
ctx->image_embd_v.data());
out_embd);
}
return ok ? 0 : 1;
}
// Encodes one chunk and writes the embeddings to out_embd.
//   text  : nothing to encode; a warning is logged and 0 is returned
//   image : validated, then forwarded to mtmd_encode_impl()
//   audio : validated, out_embd is resized to n_tokens * n_embd_out(), then the audio encoder is run
// Returns 0 on success and 1 on failure (missing encoder, null or placeholder tokens,
// encoder failure, unknown chunk type).
static int32_t mtmd_encode_chunk_impl(mtmd_context * ctx, const mtmd_input_chunk * chunk, std::vector<float> & out_embd) {
    switch (chunk->type) {
        case MTMD_INPUT_CHUNK_TYPE_TEXT:
            {
                LOG_WRN("mtmd_encode_chunk has no effect for text chunks\n");
                return 0;
            }
        case MTMD_INPUT_CHUNK_TYPE_IMAGE:
            {
                if (!ctx->ctx_v) {
                    LOG_ERR("%s: model does not support vision input\n", __func__);
                    return 1;
                }
                if (chunk->tokens_image == nullptr) {
                    LOG_ERR("%s: image tokens are null\n", __func__);
                    return 1;
                }
                if (chunk->tokens_image->is_placeholder()) {
                    LOG_ERR("%s: image tokens batch is placeholder\n", __func__);
                    return 1;
                }
                return mtmd_encode_impl(ctx, chunk->tokens_image.get(), out_embd);
            }
        case MTMD_INPUT_CHUNK_TYPE_AUDIO:
            {
                if (!ctx->ctx_a) {
                    LOG_ERR("%s: model does not support audio input\n", __func__);
                    return 1;
                }
                if (chunk->tokens_audio == nullptr) {
                    LOG_ERR("%s: audio tokens are null\n", __func__);
                    return 1;
                }
                if (chunk->tokens_audio->is_placeholder()) {
                    LOG_ERR("%s: audio tokens batch is placeholder\n", __func__);
                    return 1;
                }
                // the encoder expects the output buffer to be sized before the call
                int n_mmproj_embd = ctx->n_embd_out();
                out_embd.resize((size_t)chunk->tokens_audio->n_tokens * n_mmproj_embd);
                const bool encoded = clip_image_batch_encode(
                    ctx->ctx_a,
                    ctx->n_threads,
                    &chunk->tokens_audio->batch_f32,
                    out_embd);
                return encoded ? 0 : 1;
            }
        default:
            break;
    }

    LOG_ERR("%s: unknown chunk type %d\n", __func__, (int)chunk->type);
    return 1;
}
// Non-batching entry point: encodes one chunk into the context-owned output buffer.
// Exceptions derived from std::exception are logged and reported as return code 1.
int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) {
    int32_t status = 1;
    try {
        status = mtmd_encode_chunk_impl(ctx, chunk, ctx->out_embd);
    } catch (const std::exception & err) {
        LOG_ERR("%s: error: %s\n", __func__, err.what());
        status = 1;
    }
    return status;
}
// Encodes image tokens into the context-owned output buffer.
// Exceptions derived from std::exception are logged and reported as return code 1.
int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) {
    int32_t status = 1;
    try {
        status = mtmd_encode_impl(ctx, image_tokens, ctx->out_embd);
    } catch (const std::exception & err) {
        LOG_ERR("%s: error: %s\n", __func__, err.what());
        status = 1;
    }
    return status;
}
float * mtmd_get_output_embd(mtmd_context * ctx) {
return ctx->image_embd_v.data();
return ctx->out_embd.data();
}
// Allocates an empty batch bound to ctx; release it with mtmd_batch_free().
mtmd_batch * mtmd_batch_init(mtmd_context * ctx) {
    mtmd_batch * batch = new mtmd_batch(ctx);
    return batch;
}
// Destroys a batch created by mtmd_batch_init(); passing nullptr is allowed.
void mtmd_batch_free(mtmd_batch * batch) {
    // deleting a null pointer is a no-op, so no explicit check is needed
    delete batch;
}
// Tries to append a media chunk to the batch.
// Return codes:
//   0 - chunk added
//   1 - generic error (text chunk, or no encoder for this chunk type)
//   2 - batch too large (the encoder has no batching support and the batch already holds a chunk,
//       or the token budget batch_max_tokens would be exceeded)
//   3 - chunk cannot be batched with the first chunk of the batch
// The first chunk of an empty batch is accepted without the batching-support and budget checks.
int32_t mtmd_batch_add_chunk(mtmd_batch * batch, const mtmd_input_chunk * chunk) {
    if (chunk->type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
        LOG_ERR("%s: text chunk is not supported in batch\n", __func__);
        return 1;
    }

    auto * encoder = batch->ctx->get_clip_ctx(chunk);
    if (!encoder) {
        LOG_ERR("%s: model does not support input chunk type %d\n", __func__, (int)chunk->type);
        return 1;
    }

    auto & entries = batch->entries;

    // batch must have at least one chunk
    if (entries.empty()) {
        entries.push_back(chunk);
        return 0;
    }

    // if no batching support, batch can only have one single chunk
    if (!clip_support_batch(encoder)) {
        return 2; // "batch too large" error code
    }

    const int32_t n_tokens_total = batch->n_tokens() + (int32_t)mtmd_input_chunk_get_n_tokens(chunk);
    if (n_tokens_total > batch->ctx->batch_max_tokens) {
        return 2; // "batch too large" error code
    }

    if (!entries[0]->can_batch_with(*chunk)) {
        return 3; // "cannot batch" error code
    }

    entries.push_back(chunk);
    return 0;
}
// Encodes every chunk of the batch in one encoder call.
// The batch is represented as a copy of its first chunk, to which the preprocessed entries
// of all remaining chunks are appended; the result lands in batch->output_embd.
// Returns 0 on success, 1 on error (empty batch, placeholder chunk, copy failure,
// unsupported chunk type, or encoder failure).
static int32_t mtmd_batch_encode_impl(mtmd_batch * batch) {
    if (batch->entries.empty()) {
        LOG_ERR("%s: batch is empty\n", __func__);
        return 1;
    }

    for (const auto * chunk : batch->entries) {
        if (chunk->is_placeholder()) {
            LOG_ERR("%s: chunk is placeholder\n", __func__);
            return 1;
        }
    }

    // represent the whole batch as one single chunk
    mtmd::input_chunk_ptr batch_chunk(mtmd_input_chunk_copy(batch->entries[0]));
    if (!batch_chunk) {
        // do not dereference a failed copy
        LOG_ERR("%s: failed to copy the first chunk\n", __func__);
        return 1;
    }

    const bool is_image = batch_chunk->tokens_image != nullptr;
    const bool is_audio = !is_image && batch_chunk->tokens_audio != nullptr;
    if (!is_image && !is_audio) {
        LOG_ERR("%s: unsupported chunk type\n", __func__);
        return 1;
    }

    // appends a private copy of every preprocessed entry of `src` to `dst`
    // (shared by the image and audio paths, which only differ in the member they read)
    const auto append_cloned = [](auto & dst, auto & src) {
        auto cloned = src.clone();
        for (auto & entry : cloned.entries) {
            dst.entries.push_back(std::move(entry));
        }
    };

    // note: skip first entry because it's already in batch_chunk
    for (size_t ic = 1; ic < batch->entries.size(); ic++) {
        const auto * chunk = batch->entries[ic];
        if (is_image) {
            GGML_ASSERT(chunk->tokens_image);
            append_cloned(batch_chunk->tokens_image->batch_f32, chunk->tokens_image->batch_f32);
        } else {
            GGML_ASSERT(chunk->tokens_audio);
            append_cloned(batch_chunk->tokens_audio->batch_f32, chunk->tokens_audio->batch_f32);
        }
    }

    LOG_DBG("%s: encoding batch with %zu entries and total %zu tokens\n",
        __func__, batch->entries.size(), mtmd_input_chunk_get_n_tokens(batch_chunk.get()));

    return mtmd_encode_chunk_impl(
        batch->ctx,
        batch_chunk.get(),
        batch->output_embd);
}
// Public entry point for batch encoding.
// Exceptions derived from std::exception are logged and reported as return code 1.
int32_t mtmd_batch_encode(mtmd_batch * batch) {
    int32_t status = 1;
    try {
        status = mtmd_batch_encode_impl(batch);
    } catch (const std::exception & err) {
        LOG_ERR("%s: error: %s\n", __func__, err.what());
        status = 1;
    }
    return status;
}
// Locates the embeddings of `chunk` inside the aggregated batch output.
// Chunks are laid out in the order they were added, each taking n_tokens * n_embd floats.
// Returns nullptr when the output buffer is empty or when `chunk` is not part of the batch.
float * mtmd_batch_get_output_embd(mtmd_batch * batch, const mtmd_input_chunk * chunk) {
    if (batch->output_embd.empty()) {
        LOG_ERR("%s: batch has not been encoded yet\n", __func__);
        return nullptr;
    }

    const size_t n_embd = batch->ctx->n_embd_out();

    // walk the entries, tracking where each one starts (offset_prev) and ends (offset)
    size_t offset = 0;
    for (const auto * entry : batch->entries) {
        const size_t offset_prev = offset;
        offset += mtmd_input_chunk_get_n_tokens(entry) * n_embd;

        GGML_ASSERT(offset_prev < batch->output_embd.size());
        GGML_ASSERT(offset <= batch->output_embd.size());

        if (entry == chunk) {
            return batch->output_embd.data() + offset_prev;
        }
    }

    return nullptr; // not found
}
bool mtmd_decode_use_non_causal(const mtmd_context * ctx, const mtmd_input_chunk * chunk) {
@@ -1801,7 +2031,7 @@ static void mtmd_debug_encode_impl(mtmd_context * ctx, clip_ctx * ctx_clip, clip
ctx_clip,
ctx->n_threads,
&image,
embd_output.data());
embd_output);
if (!ok) {
LOG_ERR("%s: failed to encode image\n", __func__);
}
+36 -4
View File
@@ -63,6 +63,7 @@ struct mtmd_bitmap;
struct mtmd_image_tokens;
struct mtmd_input_chunk;
struct mtmd_input_chunks;
struct mtmd_batch;
struct mtmd_input_text {
const char * text;
@@ -80,6 +81,7 @@ typedef struct mtmd_image_tokens mtmd_image_tokens;
typedef struct mtmd_input_chunk mtmd_input_chunk;
typedef struct mtmd_input_chunks mtmd_input_chunks;
typedef struct mtmd_input_text mtmd_input_text;
typedef struct mtmd_batch mtmd_batch;
struct mtmd_context_params {
bool use_gpu;
@@ -97,6 +99,11 @@ struct mtmd_context_params {
// callback function passed over to mtmd proper
ggml_backend_sched_eval_callback cb_eval;
void * cb_eval_user_data;
// batching params
int32_t batch_max_tokens; // maximum number of output tokens in a batch
// (note: this is not a hard-limit, the first image will always be added even if it exceeds this limit)
// (default: 1024)
};
MTMD_API const char * mtmd_default_marker(void);
@@ -265,12 +272,12 @@ MTMD_API int32_t mtmd_tokenize(mtmd_context * ctx,
const mtmd_bitmap ** bitmaps,
size_t n_bitmaps);
// returns 0 on success
// TODO: deprecate
MTMD_API int32_t mtmd_encode(mtmd_context * ctx,
const mtmd_image_tokens * image_tokens);
DEPRECATED(MTMD_API int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens),
"use mtmd_encode_chunk() instead");
// text chunk will be ignored silently, only media chunk will be encoded
// returns 0 on success
// returns 1 on generic error
MTMD_API int32_t mtmd_encode_chunk(mtmd_context * ctx,
const mtmd_input_chunk * chunk);
@@ -279,6 +286,26 @@ MTMD_API int32_t mtmd_encode_chunk(mtmd_context * ctx,
// llama_model_n_embd_inp(model) * mtmd_input_chunk_get_n_tokens(chunk) * sizeof(float)
MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
// batch encoding API
// typical flow: mtmd_batch_init() -> mtmd_batch_add_chunk() (one or more times) -> mtmd_batch_encode()
//               -> mtmd_batch_get_output_embd() for each chunk -> mtmd_batch_free()
// chunks are not owned by the batch, they will not be freed by mtmd_batch_free()
// a batch is only valid for the context it was created with, it cannot be shared across contexts
MTMD_API mtmd_batch * mtmd_batch_init(mtmd_context * ctx);
MTMD_API void mtmd_batch_free(mtmd_batch * batch);
// add a chunk to the batch; only media chunks are allowed, text chunks will be rejected
// returns 0 on success
// returns 1 on generic error
// returns 2 if the batch is too large (chunk won't be added)
//           (see mtmd_context_params::batch_max_tokens; as documented there, the first chunk is always accepted)
// returns 3 if it cannot be batched with the existing chunks in the batch
MTMD_API int32_t mtmd_batch_add_chunk(mtmd_batch * batch, const mtmd_input_chunk * chunk);
// encode all chunks that were added to the batch
// returns 0 on success
// returns 1 on generic error
MTMD_API int32_t mtmd_batch_encode(mtmd_batch * batch);
// get the output embeddings of the given chunk from the batch
// the result may be NULL: callers must check it (the server treats NULL as "this chunk has no embeddings in this batch")
// NOTE(review): the buffer size is presumably n_embd * mtmd_input_chunk_get_n_tokens(chunk) floats,
//               like mtmd_get_output_embd() - confirm against the implementation
MTMD_API float * mtmd_batch_get_output_embd(mtmd_batch * batch, const mtmd_input_chunk * chunk);
// Set callback for all future logging events.
// If this is not called, or NULL is supplied, everything is output on stderr.
MTMD_API void mtmd_log_set(ggml_log_callback log_callback, void * user_data);
@@ -336,6 +363,11 @@ struct mtmd_input_chunk_deleter {
};
using input_chunk_ptr = std::unique_ptr<mtmd_input_chunk, mtmd_input_chunk_deleter>;
// Deleter functor: lets std::unique_ptr release a batch through mtmd_batch_free().
struct mtmd_batch_deleter {
    void operator()(mtmd_batch * batch) {
        mtmd_batch_free(batch);
    }
};
// Owning handle for a mtmd_batch; the batch is freed when the handle is reset or destroyed.
using batch_ptr = std::unique_ptr<mtmd_batch, mtmd_batch_deleter>;
struct bitmap {
bitmap_ptr ptr;
bitmap() : ptr(nullptr) {}
+8
View File
@@ -344,6 +344,14 @@ const mtmd::input_chunk_ptr & server_tokens::find_chunk(size_t idx) const {
throw std::runtime_error("Chunk not found");
}
// Look up the first entry of map_idx_to_media that upper_bound() places after `idx`.
// Returns { pointer to the chunk, key of that entry }, or { nullptr, 0 } when no such entry exists.
std::pair<const mtmd::input_chunk_ptr *, size_t> server_tokens::find_next_media_chunk(size_t idx) const {
    const auto next = map_idx_to_media.upper_bound(idx);

    // nothing after idx
    if (next == map_idx_to_media.end()) {
        return { nullptr, 0 };
    }

    // the pointer refers to the element stored inside the map, no copy is made
    return { &next->second, next->first };
}
void server_tokens::push_back(llama_token tok) {
if (tok == LLAMA_TOKEN_NULL) {
throw std::runtime_error("Invalid token");
+4
View File
@@ -180,6 +180,10 @@ public:
const mtmd::input_chunk_ptr & find_chunk(size_t idx) const;
// find next media chunk after idx
// returns a pair of pointer to the chunk (nullptr if not found) and its start index in tokens
std::pair<const mtmd::input_chunk_ptr *, size_t> find_next_media_chunk(size_t idx) const;
void push_back(llama_token tok);
// will create a copy of the chunk if it contains non-text data
+108 -15
View File
@@ -80,6 +80,8 @@ struct server_slot {
// multimodal
mtmd_context * mctx = nullptr;
mtmd::batch_ptr mbatch = nullptr;
std::array<llama_context *, 2> mtgt = {nullptr, nullptr}; // [0] for main context, [1] for optional draft context
// speculative decoding
common_speculative * spec;
@@ -239,6 +241,18 @@ struct server_slot {
// clear alora start
alora_invocation_start = -1;
// clear multimodal state
mbatch.reset();
mtgt[0] = ctx_tgt;
mtgt[1] = nullptr;
if (ctx_dft && llama_get_ctx_other(ctx_dft) != ctx_tgt) {
// TODO: in the future, figure out how to infuse target embeddings to the images
// for now, we re-decode the same chunk in both ctx_tgt and ctx_dft
// maybe we simply need to call `common_speculative_process()` ?
// [TAG_MTMD_DRAFT_PROCESSING]
mtgt[1] = ctx_dft;
}
}
void init_sampler() const {
@@ -578,6 +592,87 @@ struct server_slot {
other.prompt = prompt.clone();
other.init_sampler();
}
// Decode the media chunk located at token index `idx` into every non-null context of `mtgt`.
// If the current batch (`mbatch`) has no embeddings for this chunk, a new batch is created,
// filled with this chunk plus as many following media chunks as mtmd_batch_add_chunk() accepts,
// encoded, and then the chunk is decoded from it.
//   idx          : index passed to task->tokens.find_chunk() to locate the chunk
//   n_tokens_out : set to mtmd_input_chunk_get_n_tokens(chunk) once the chunk has been decoded
// returns 0 on success
// returns a negative value if encoding or decoding fails
// NOTE(review): the final try_decode() can also return 1 if the freshly encoded batch still has no
//               embeddings for this chunk; the caller in update_slots checks `res != 0`, so it is handled as a failure
// the caller needs to update prompt.tokens after a successful call to keep track of the processing progress
int process_mtmd_chunk(size_t idx, size_t & n_tokens_out) {
GGML_ASSERT(mctx);
const auto & input_tokens = task->tokens;
auto & chunk = input_tokens.find_chunk(idx);
int32_t res = 0;
// helper: decode `chunk` using the embeddings stored in the current batch
// returns  0 if the chunk was decoded into all contexts
// returns -1 if decoding failed
// returns  1 if there is no batch, or the batch has no embeddings for this chunk
// note: `res` is captured by reference and overwritten by the decode call
auto try_decode = [&]() -> int32_t {
if (mbatch) {
float * embd = mtmd_batch_get_output_embd(mbatch.get(), chunk.get());
if (embd) {
// mtgt holds the main context and the optional draft context; unused entries are nullptr
for (auto * lctx : mtgt) {
if (lctx == nullptr) {
continue;
}
llama_pos new_n_past; // unused for now
res = mtmd_helper_decode_image_chunk(
mctx,
lctx,
chunk.get(),
embd,
prompt.tokens.pos_next(),
id,
llama_n_batch(lctx),
&new_n_past
);
if (res != 0) {
SLT_ERR(*this, "failed to decode mtmd chunk, idx = %zu, res = %d\n", idx, res);
return -1;
}
}
n_tokens_out = mtmd_input_chunk_get_n_tokens(chunk.get());
return 0; // success
}
}
return 1; // (non-error) need to create & encode batch
};
// if a batch already exists, look the chunk up in it and decode it
res = try_decode();
if (res == 0) {
return 0;
} else if (res < 0) {
// fatal error
return res;
}
// otherwise, the batch is either uninitialized or is used up
// we need to create & encode a new batch
mbatch.reset(mtmd_batch_init(mctx));
res = mtmd_batch_add_chunk(mbatch.get(), chunk.get());
GGML_ASSERT(res == 0); // we should never have an empty batch
// try batching as much as possible
int n_added = 1; // number of chunks in the batch, used for logging only
size_t idx_cur = idx;
while (res == 0) {
// find_next_media_chunk() returns the media chunk after idx_cur together with its start index
auto [next_chunk, next_idx] = input_tokens.find_next_media_chunk(idx_cur);
if (next_chunk == nullptr) {
// no more media chunks in the prompt
break;
}
res = mtmd_batch_add_chunk(mbatch.get(), next_chunk->get());
n_added += (res == 0 ? 1 : 0);
idx_cur = next_idx;
SLT_DBG(*this, "try adding media chunk idx = %zu to batch, res = %d\n", next_idx, res);
// if res != 0, batch is full or chunk is not compatible -> this loop breaks
}
// TODO @ngxson : move this log line to debug when it becomes more stable
SLT_INF(*this, "encoding mtmd batch from idx = %zu, n_chunks = %d\n", idx, n_added);
// `res` may be non-zero here (last add was rejected); it is overwritten by the encode result
res = mtmd_batch_encode(mbatch.get());
if (res != 0) {
SLT_ERR(*this, "failed to encode mtmd batch for chunk idx = %zu, res = %d\n", idx, res);
return -1;
}
// the batch now contains this chunk, decode it
return try_decode();
}
};
@@ -781,6 +876,7 @@ private:
mparams.warmup = params_base.warmup;
mparams.image_min_tokens = params_base.image_min_tokens;
mparams.image_max_tokens = params_base.image_max_tokens;
mparams.batch_max_tokens = params_base.mtmd_batch_max_tokens;
mparams.media_marker = get_media_marker();
}
@@ -2928,7 +3024,7 @@ private:
send_partial_response(slot, {}, false, true);
}
}
}
} // end of SLOT_STATE_STARTED
if (!slot.can_split()) {
// cannot fit the prompt in the current batch - will try next iter
@@ -2983,10 +3079,18 @@ private:
bool has_mtmd = false;
// check if we should process the image
while (slot.prompt.n_tokens() < slot.task->n_tokens() && input_tokens[slot.prompt.n_tokens()] == LLAMA_TOKEN_NULL) {
while (true) {
auto cur_token_idx = slot.prompt.n_tokens();
if (
cur_token_idx >= slot.task->n_tokens() ||
input_tokens[cur_token_idx] != LLAMA_TOKEN_NULL // encountered a text token
) {
break;
}
// process the image
size_t n_tokens_out = 0;
int32_t res = input_tokens.process_chunk(ctx_tgt, mctx, slot.prompt.n_tokens(), slot.prompt.tokens.pos_next(), slot.id, n_tokens_out);
int32_t res = slot.process_mtmd_chunk(cur_token_idx, n_tokens_out);
if (res != 0) {
SLT_ERR(slot, "failed to process image, res = %d\n", res);
send_error(slot, "failed to process image", ERROR_TYPE_SERVER);
@@ -2994,22 +3098,11 @@ private:
continue;
}
if (ctx_dft && llama_get_ctx_other(ctx_dft.get()) != ctx_tgt) {
// TODO: in the future, figure out how to infuse target embeddings to the images
// for now, we skip this for simplicity
// maybe we simply need to call `common_speculative_process()` on the mtmd batches in the `process_chunk` above?
// [TAG_MTMD_DRAFT_PROCESSING]
res = input_tokens.process_chunk(ctx_dft.get(), mctx, slot.prompt.n_tokens(), slot.prompt.tokens.pos_next(), slot.id, n_tokens_out);
if (res != 0) {
GGML_ABORT("failed to process multi-modal data on draft context\n");
}
}
slot.n_prompt_tokens_processed += n_tokens_out;
// add the image chunk to cache
{
const auto & chunk = input_tokens.find_chunk(slot.prompt.n_tokens());
const auto & chunk = input_tokens.find_chunk(cur_token_idx);
slot.prompt.tokens.push_back(chunk.get()); // copy
}
+1 -1
View File
@@ -113,7 +113,7 @@ bool server_http_context::init(const common_params & params) {
#endif
srv->set_default_headers({{"Server", "llama.cpp"}});
srv->set_logger(log_server_request);
// srv->set_logger(log_server_request); // TODO @ngxson : this is too spammy and not very useful; improve it in the future
srv->set_exception_handler([](const httplib::Request &, httplib::Response & res, const std::exception_ptr & ep) {
// this is fail-safe; exceptions should already handled by `ex_wrapper`