Compare commits

..

5 Commits

Author SHA1 Message Date
Xuan-Son Nguyen db52540f73 mtmd: add batching support for internvl (#24775) 2026-06-19 01:16:16 +02:00
Pascal 3a3edc9ac6 Ggml/cuda col2im 1d (#24417)
* cuda: add GGML_OP_COL2IM_1D, follow-up to the CPU op

* cuda: col2im_1d use fast_div_modulo for the index decomposition

* cuda: col2im_1d tighten supports_op, type match and contiguous dst
2026-06-18 22:23:01 +02:00
Reguna 40f3aafc45 server: add "X-Accel-Buffering": "no" header to streaming endpoints (#24774)
* server: add "X-Accel-Buffering": "no" header to streaming endpoints

This header tells Nginx (as a reverse proxy) to NOT buffer responses. (only affects streaming endpoints)
Without it, Nginx will break streaming with certain applications (notably the Pi coding harness).
2026-06-18 22:01:24 +02:00
Xuan-Son Nguyen a6b3260a42 mtmd: add batching for mtmd-cli, add video tests (#24778) 2026-06-18 21:55:04 +02:00
o7si 32eddaf2ea cmake : fix ui build with read-only source (#24752) 2026-06-18 18:59:18 +02:00
10 changed files with 279 additions and 36 deletions
+81
View File
@@ -0,0 +1,81 @@
#include "col2im-1d.cuh"
#include "convert.cuh"
// col2im_1d: scatter-add GEMM columns to 1D signal (gather approach)
// columns: [K*OC, T_in] -> output: [T_out, OC]
// Supports F32, F16, BF16 data with F32 accumulator.
template <typename T>
static __global__ void col2im_1d_kernel(
const T * __restrict__ col,
T * __restrict__ dst,
const int T_in, const uint3 T_out_fd,
const int OC, const int K, const int K_OC,
const int s0, const int p0, const int total) {
const int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx >= total) return;
// dst layout: [T_out, OC], ne[0]=T_out fastest
const uint2 qr = fast_div_modulo((uint32_t)idx, T_out_fd); // qr.x = idx / T_out, qr.y = idx % T_out
const int oc = (int)qr.x;
const int t_out = (int)qr.y;
const int t_abs = t_out + p0; // absolute position in uncropped signal
// Gather: find all (t_in, k) where t_in*s + k == t_abs, 0 <= k < K
int t_in_min = (t_abs - K + s0) / s0; // ceil((t_abs - K + 1) / s)
if (t_in_min < 0) t_in_min = 0;
int t_in_max = t_abs / s0;
if (t_in_max >= T_in) t_in_max = T_in - 1;
float sum = 0.0f;
for (int t_in = t_in_min; t_in <= t_in_max; t_in++) {
const int k = t_abs - t_in * s0;
// col layout: [K*OC, T_in], column index = oc * K + k
sum += ggml_cuda_cast<float>(col[(oc * K + k) + t_in * K_OC]);
}
dst[idx] = ggml_cuda_cast<T>(sum);
}
void ggml_cuda_op_col2im_1d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
cudaStream_t stream = ctx.stream();
GGML_ASSERT(ggml_is_contiguous(src0));
const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
const int32_t OC = ((const int32_t *)(dst->op_params))[1];
const int32_t p0 = ((const int32_t *)(dst->op_params))[2];
const int K_OC = (int) src0->ne[0];
const int T_in = (int) src0->ne[1];
const int K = K_OC / OC;
const int T_out = (int) dst->ne[0];
const uint3 T_out_fd = init_fastdiv_values((uint32_t)T_out);
const int total = T_out * OC;
const int block_size = 256;
const int num_blocks = (total + block_size - 1) / block_size;
switch (src0->type) {
case GGML_TYPE_F32: {
col2im_1d_kernel<<<num_blocks, block_size, 0, stream>>>(
(const float *)src0->data, (float *)dst->data,
T_in, T_out_fd, OC, K, K_OC, s0, p0, total);
} break;
case GGML_TYPE_F16: {
col2im_1d_kernel<<<num_blocks, block_size, 0, stream>>>(
(const half *)src0->data, (half *)dst->data,
T_in, T_out_fd, OC, K, K_OC, s0, p0, total);
} break;
case GGML_TYPE_BF16: {
col2im_1d_kernel<<<num_blocks, block_size, 0, stream>>>(
(const nv_bfloat16 *)src0->data, (nv_bfloat16 *)dst->data,
T_in, T_out_fd, OC, K, K_OC, s0, p0, total);
} break;
default:
GGML_ABORT("col2im_1d: unsupported type");
}
}
+3
View File
@@ -0,0 +1,3 @@
#include "common.cuh"
void ggml_cuda_op_col2im_1d(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+12
View File
@@ -11,6 +11,7 @@
#include "ggml-cuda/argsort.cuh"
#include "ggml-cuda/binbcast.cuh"
#include "ggml-cuda/clamp.cuh"
#include "ggml-cuda/col2im-1d.cuh"
#include "ggml-cuda/concat.cuh"
#include "ggml-cuda/conv-transpose-1d.cuh"
#include "ggml-cuda/conv2d.cuh"
@@ -3051,6 +3052,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_CONV_TRANSPOSE_1D:
ggml_cuda_op_conv_transpose_1d(ctx,dst);
break;
case GGML_OP_COL2IM_1D:
ggml_cuda_op_col2im_1d(ctx, dst);
break;
case GGML_OP_POOL_2D:
ggml_cuda_op_pool2d(ctx, dst);
break;
@@ -5316,6 +5320,14 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
}
return false;
} break;
case GGML_OP_COL2IM_1D:
{
ggml_type src0_type = op->src[0]->type;
return (src0_type == GGML_TYPE_F32 || src0_type == GGML_TYPE_F16 || src0_type == GGML_TYPE_BF16) &&
op->type == src0_type &&
ggml_is_contiguous(op->src[0]) &&
ggml_is_contiguous(op);
} break;
case GGML_OP_SILU_BACK:
return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
break;
+23 -4
View File
@@ -20,6 +20,7 @@ set(LLAMA_UI_GZIP "" CACHE STRING "Apply gzip compress to assets to save ban
set(DIST_DIR "${UI_BINARY_DIR}/dist")
set(SRC_DIST_DIR "${UI_SOURCE_DIR}/dist")
set(WORK_DIR "${UI_BINARY_DIR}/ui-src")
set(STAMP_FILE "${UI_BINARY_DIR}/.ui-stamp")
set(UI_CPP "${UI_BINARY_DIR}/ui.cpp")
set(UI_H "${UI_BINARY_DIR}/ui.h")
@@ -64,6 +65,22 @@ function(npm_build_should_skip out_var)
set(${out_var} TRUE PARENT_SCOPE)
endfunction()
function(stage_sources)
if(EXISTS "${WORK_DIR}")
file(GLOB staged RELATIVE "${WORK_DIR}" "${WORK_DIR}/*")
list(REMOVE_ITEM staged "node_modules")
foreach(entry ${staged})
file(REMOVE_RECURSE "${WORK_DIR}/${entry}")
endforeach()
endif()
file(COPY "${UI_SOURCE_DIR}/"
DESTINATION "${WORK_DIR}"
NO_SOURCE_PERMISSIONS
PATTERN "node_modules" EXCLUDE
)
endfunction()
function(npm_build out_var)
set(${out_var} FALSE PARENT_SCOPE)
@@ -89,14 +106,16 @@ function(npm_build out_var)
return()
endif()
stage_sources()
# npm writes node_modules/.package-lock.json on every successful install,
# so a package-lock.json newer than this marker means node_modules is stale
set(NPM_MARKER "${UI_SOURCE_DIR}/node_modules/.package-lock.json")
set(NPM_MARKER "${WORK_DIR}/node_modules/.package-lock.json")
set(need_install FALSE)
if(NOT EXISTS "${NPM_MARKER}")
set(need_install TRUE)
else()
file(TIMESTAMP "${UI_SOURCE_DIR}/package-lock.json" lock_ts)
file(TIMESTAMP "${WORK_DIR}/package-lock.json" lock_ts)
file(TIMESTAMP "${NPM_MARKER}" marker_ts)
if(lock_ts STRGREATER marker_ts)
set(need_install TRUE)
@@ -107,7 +126,7 @@ function(npm_build out_var)
message(STATUS "UI: running npm install")
execute_process(
COMMAND ${NPM_EXECUTABLE} install
WORKING_DIRECTORY "${UI_SOURCE_DIR}"
WORKING_DIRECTORY "${WORK_DIR}"
RESULT_VARIABLE rc
ERROR_VARIABLE err
)
@@ -124,7 +143,7 @@ function(npm_build out_var)
execute_process(
COMMAND ${CMAKE_COMMAND} -E env "LLAMA_UI_OUT_DIR=${DIST_DIR}" "LLAMA_UI_VERSION=${HF_VERSION}" "LLAMA_BUILD_NUMBER=${LLAMA_BUILD_NUMBER}"
${NPM_EXECUTABLE} run build
WORKING_DIRECTORY "${UI_SOURCE_DIR}"
WORKING_DIRECTORY "${WORK_DIR}"
RESULT_VARIABLE rc
ERROR_VARIABLE err
)
+1 -1
View File
@@ -534,7 +534,7 @@ ggml_tensor * clip_graph::build_vit(
ggml_tensor * clip_graph::build_inp() {
ggml_tensor * inp_raw = build_inp_raw();
ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
inp = ggml_reshape_2d(ctx0, inp, n_patches, n_embd);
inp = ggml_reshape_3d(ctx0, inp, n_patches, n_embd, n_batch);
inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp));
if (model.patch_bias) {
inp = ggml_add(ctx0, inp, model.patch_bias);
+11 -7
View File
@@ -8,7 +8,9 @@ ggml_cgraph * clip_graph_internvl::build() {
ggml_tensor * inp = build_inp();
// add CLS token
inp = ggml_concat(ctx0, inp, model.class_embedding, 1);
ggml_tensor * cls_repeated = ggml_repeat_4d(ctx0, model.class_embedding,
model.class_embedding->ne[0], 1, n_batch, 1);
inp = ggml_concat(ctx0, inp, cls_repeated, 1);
// The larger models use a different ViT, which uses RMS norm instead of layer norm
// ref: https://github.com/ggml-org/llama.cpp/pull/13443#issuecomment-2869786188
@@ -24,14 +26,15 @@ ggml_cgraph * clip_graph_internvl::build() {
nullptr);
// remove CLS token
cur = ggml_view_2d(ctx0, cur,
n_embd, n_patches,
ggml_row_size(cur->type, n_embd), 0);
cur = ggml_view_3d(ctx0, cur,
n_embd, n_patches, n_batch,
cur->nb[1], cur->nb[2], 0);
cur = ggml_cont(ctx0, cur);
// pixel shuffle
{
const int scale_factor = model.hparams.n_merge;
const int bsz = 1; // batch size, always 1 for now since we don't support batching
const int bsz = n_batch;
const int height = n_patches_y;
const int width = n_patches_x;
GGML_ASSERT(scale_factor > 0);
@@ -44,9 +47,10 @@ ggml_cgraph * clip_graph_internvl::build() {
bsz);
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
// flatten to 2D
cur = ggml_cont_2d(ctx0, cur,
cur = ggml_cont_3d(ctx0, cur,
n_embd * scale_factor * scale_factor,
cur->ne[1] * cur->ne[2]);
cur->ne[1] * cur->ne[2],
cur->ne[3]);
}
// projector (always using GELU activation)
+1
View File
@@ -80,6 +80,7 @@ struct clip_graph_minicpmv4_6 : clip_graph {
struct clip_graph_internvl : clip_graph {
clip_graph_internvl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
bool support_batch() const override { return true; }
};
struct clip_graph_nemotron_v2_vl : clip_graph {
+102 -15
View File
@@ -32,9 +32,9 @@ static volatile bool g_is_generating = false;
static volatile bool g_is_interrupted = false;
/**
* Please note that this is NOT a production-ready stuff.
* Please note that this is NOT a production-ready binary.
* It is a playground for trying multimodal support in llama.cpp.
* For contributors: please keep this code simple and easy to understand.
* For contributors: please keep this code simple and easy to understand. Do not add unnecessary complexity. The goal is to have a simple CLI for testing multimodal support.
*/
static void show_additional_info(int /*argc*/, char ** argv) {
@@ -65,6 +65,14 @@ static void sigint_handler(int signo) {
}
#endif
// this is only used by tests.sh to capture the response ; it's not meant to be used in production
static void inject_test_response_marker() {
const char * env = std::getenv("MTMD_TEST_RESPONSE_MARKER");
if (env) {
LOG("%s\n", env);
}
}
struct mtmd_cli_context {
mtmd::context_ptr ctx_vision;
common_init_result_ptr llama_init;
@@ -79,6 +87,8 @@ struct mtmd_cli_context {
mtmd::bitmaps bitmaps;
std::vector<mtmd_helper::video_ptr> videos;
mtmd::batch_ptr mbatch;
// chat template
common_chat_templates_ptr tmpls;
std::vector<common_chat_msg> chat_history;
@@ -233,6 +243,8 @@ static std::string chat_add_and_format(mtmd_cli_context & ctx, common_chat_msg &
}
static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg) {
inject_test_response_marker();
bool add_bos = ctx.chat_history.empty();
auto formatted_chat = chat_add_and_format(ctx, msg);
LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.c_str());
@@ -259,20 +271,95 @@ static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg) {
ctx.bitmaps.entries.clear();
ctx.videos.clear();
llama_pos new_n_past;
if (mtmd_helper_eval_chunks(ctx.ctx_vision.get(),
ctx.lctx, // lctx
chunks.ptr.get(), // chunks
ctx.n_past, // n_past
0, // seq_id
ctx.n_batch, // n_batch
true, // logits_last
&new_n_past)) {
LOG_ERR("Unable to eval prompt\n");
return 1;
}
// batch encode all media chunks, then decode each
size_t n_chunks = mtmd_input_chunks_size(chunks.ptr.get());
for (size_t i = 0; i < n_chunks; i++) {
auto chunk = mtmd_input_chunks_get(chunks.ptr.get(), i);
auto chunk_type = mtmd_input_chunk_get_type(chunk);
ctx.n_past = new_n_past;
if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
// decode text chunk
llama_pos new_n_past = ctx.n_past;
res = mtmd_helper_eval_chunk_single(ctx.ctx_vision.get(),
ctx.lctx,
chunk,
ctx.n_past,
0, // seq_id
ctx.n_batch,
i == n_chunks - 1, // logits_last
&new_n_past);
if (res != 0) {
LOG_ERR("Unable to eval text chunk %zu\n", i);
return 1;
}
ctx.n_past = new_n_past;
} else {
// media chunk: try to get embd from existing batch, or create a new batch
float * embd = nullptr;
if (ctx.mbatch) {
embd = mtmd_batch_get_output_embd(ctx.mbatch.get(), chunk);
if (embd) {
LOG_DBG("found embd for media chunk %zu in existing batch\n", i);
} else {
LOG_DBG("media chunk %zu not found in existing batch, creating new batch\n", i);
}
}
if (!embd) {
// create and encode a new batch with as many media chunks as possible
ctx.mbatch.reset(mtmd_batch_init(ctx.ctx_vision.get()));
res = mtmd_batch_add_chunk(ctx.mbatch.get(), chunk);
GGML_ASSERT(res == 0); // first chunk must always succeed
int n_added = 1;
// add as many subsequent media chunks as possible
for (size_t j = i + 1; j < n_chunks; j++) {
auto next_chunk = mtmd_input_chunks_get(chunks.ptr.get(), j);
auto next_type = mtmd_input_chunk_get_type(next_chunk);
if (next_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
break; // text chunk splits the batch
}
res = mtmd_batch_add_chunk(ctx.mbatch.get(), next_chunk);
if (res != 0) {
break; // batch full or incompatible
}
n_added++;
}
int64_t time_start = ggml_time_ms();
LOG_INF("encoding mtmd batch, n_chunks = %d (done = %zu, total = %zu)\n", n_added, i, n_chunks);
res = mtmd_batch_encode(ctx.mbatch.get());
if (res != 0) {
LOG_ERR("Failed to encode mtmd batch, res = %d\n", res);
return 1;
}
LOG_INF("mtmd batch encoding done in %d ms\n", (int)(ggml_time_ms() - time_start));
embd = mtmd_batch_get_output_embd(ctx.mbatch.get(), chunk);
}
GGML_ASSERT(embd != nullptr);
llama_pos new_n_past = ctx.n_past;
res = mtmd_helper_decode_image_chunk(ctx.ctx_vision.get(),
ctx.lctx,
chunk,
embd,
ctx.n_past,
0, // seq_id
ctx.n_batch,
&new_n_past,
nullptr, // callback
nullptr // user_data
);
if (res != 0) {
LOG_ERR("Unable to decode media chunk %zu\n", i);
return 1;
}
ctx.n_past = new_n_past;
}
}
LOG("\n");
+43 -9
View File
@@ -13,6 +13,8 @@ mkdir -p $SCRIPT_DIR/output
PROJ_ROOT="$SCRIPT_DIR/../.."
cd $PROJ_ROOT
export MTMD_TEST_RESPONSE_MARKER="<MTMD_TEST_RESPONSE_MARKER>"
# Check if the first argument is "big", then run test with big models
# This is useful if we're running the script on a larger machine, so we can test the big models
RUN_BIG_TESTS=false
@@ -28,6 +30,15 @@ if [ "${1:-}" = "huge" ]; then
echo "Include BIG and HUGE models..."
fi
USE_VIDEO=false
if [ "${1:-}" = "video" ]; then
USE_VIDEO=true
echo "Using video as input..."
# behavior of USE_VIDEO:
# do NOT check if the output contains "new york", only verify if the exit code is 0
# when printing the result, print the OK/FAIL line then print the generated text
fi
# Check if the second argument is "flash", then enable flash attention
# This is useful to test if flash attention off works correctly
FLASH_ATTN="on"
@@ -50,13 +61,20 @@ add_test_vision() {
if [ $# -gt 0 ]; then
extra_args=$(printf " %q" "$@")
fi
if [ "$USE_VIDEO" = true ]; then
arr_file+=("test-3.mp4")
else
arr_file+=("test-1.jpeg")
fi
arr_prefix+=("[vision]")
arr_hf+=("$hf")
arr_extra_args+=("$extra_args")
arr_file+=("test-1.jpeg")
}
add_test_audio() {
if [ "$USE_VIDEO" = true ]; then
return 0
fi
local hf=$1
shift
local extra_args=""
@@ -166,19 +184,35 @@ for i in "${!arr_hf[@]}"; do
cmd+=" -p \"what is the publisher name of the newspaper?\""
fi
output=$(eval "$cmd" 2>&1 | tee /dev/tty)
exit_code=0
output=$(eval "$cmd" 2>&1 | tee /dev/tty) || exit_code=$?
echo "$output" > $SCRIPT_DIR/output/$bin-$(echo "$hf" | tr '/' '-').log
# either contains "new york" or both "men" and "walk"
if echo "$output" | grep -iq "new york" \
|| (echo "$output" | grep -iq "men" && echo "$output" | grep -iq "walk")
then
result="$prefix \033[32mOK\033[0m: $hf"
if [ "$USE_VIDEO" = true ]; then
# for video, only check exit code; do not grep for "new york"
if [ $exit_code -eq 0 ]; then
result="$prefix \033[32mOK\033[0m: $hf"
else
result="$prefix \033[31mFAIL\033[0m: $hf"
fi
# append generated text (after the response marker)
generated_text=$(echo "$output" | sed "1,/${MTMD_TEST_RESPONSE_MARKER}/d" | tail -10)
if [ -n "$generated_text" ]; then
result+="\n$generated_text"
fi
echo -e "$result"
else
result="$prefix \033[31mFAIL\033[0m: $hf"
# either contains "new york" or both "men" and "walk"
if echo "$output" | grep -iq "new york" \
|| (echo "$output" | grep -iq "men" && echo "$output" | grep -iq "walk")
then
result="$prefix \033[32mOK\033[0m: $hf"
else
result="$prefix \033[31mFAIL\033[0m: $hf"
fi
echo -e "$result"
fi
echo -e "$result"
arr_res+=("$result")
echo ""
+2
View File
@@ -492,6 +492,8 @@ using server_http_req_ptr = std::unique_ptr<server_http_req>;
static void process_handler_response(server_http_req_ptr && request, server_http_res_ptr & response, httplib::Response & res) {
if (response->is_stream()) {
res.status = response->status;
// Tell Nginx to not buffer any streamed response
response->headers["X-Accel-Buffering"] = "no";
set_headers(res, response->headers);
const std::string content_type = response->content_type;
// convert to shared_ptr as both chunked_content_provider() and on_complete() need to use it