mtmd : add video input support (#24269)

* wip

* ok: lazy bitmap API

* remember to free lazy text

* wip

* add mtmd_helper_video

* support video input on server (base64 input)

* add MTMD_VIDEO config

* add timestamp

* update CLI

* cli: allow auto-completion for video

* add --video arg

* fix build

* update docs

* rename as suggested
This commit is contained in:
Xuan-Son Nguyen
2026-06-08 13:40:12 +02:00
committed by GitHub
parent c2b1518fd4
commit 8f83d6c271
16 changed files with 807 additions and 77 deletions
+2 -2
View File
@@ -2221,8 +2221,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_OFFLOAD"));
add_opt(common_arg(
{"--image", "--audio"}, "FILE",
"path to an image or audio file. use with multimodal models, use comma-separated values for multiple files\n",
{"--image", "--audio", "--video"}, "FILE",
"path to an image, audio, or video file. use with multimodal models, use comma-separated values for multiple files\n",
[](common_params & params, const std::string & value) {
for (const auto & item : parse_csv_row(value)) {
params.image.emplace_back(item);
+1 -1
View File
@@ -571,7 +571,7 @@ struct common_params {
struct common_params_model mmproj;
bool mmproj_use_gpu = true; // use GPU for multimodal model
bool no_mmproj = false; // explicitly disable multimodal model
std::vector<std::string> image; // path to image file(s)
std::vector<std::string> image; // path to image file(s) ; TODO: change the name to "media"
int image_min_tokens = -1;
int image_max_tokens = -1;
+6
View File
@@ -2,6 +2,7 @@
#include <assert.h>
#include "mtmd.h"
#include "mtmd-helper.h"
int main(void) {
printf("\n\nTesting libmtmd C API...\n");
@@ -17,6 +18,11 @@ int main(void) {
return 1;
}
// simple test for the helper
size_t n_tokens_total = mtmd_helper_get_n_tokens(chunks);
printf("Total tokens in chunks: %zu\n", n_tokens_total);
assert(n_tokens_total > 0);
size_t n_chunks = mtmd_input_chunks_size(chunks);
printf("Number of chunks: %zu\n", n_chunks);
assert(n_chunks > 0);
+7 -2
View File
@@ -235,7 +235,7 @@ struct cli_context {
};
// TODO?: Make this reusable, enums, docs
static const std::array<std::string_view, 7> cmds = {
static const std::array<std::string_view, 8> cmds = {
"/audio ",
"/clear",
"/exit",
@@ -243,6 +243,7 @@ static const std::array<std::string_view, 7> cmds = {
"/image ",
"/read ",
"/regen",
"/video ",
};
static std::vector<std::pair<std::string, size_t>> auto_completion_callback(std::string_view line, size_t cursor_byte_pos) {
@@ -457,6 +458,9 @@ int llama_cli(int argc, char ** argv) {
if (inf.has_inp_audio) {
console::log(" /audio <file> add an audio file\n");
}
if (inf.has_inp_video) {
console::log(" /video <file> add a video file\n");
}
console::log("\n");
// interactive loop
@@ -553,7 +557,8 @@ int llama_cli(int argc, char ** argv) {
continue;
} else if (
(string_starts_with(buffer, "/image ") && inf.has_inp_image) ||
(string_starts_with(buffer, "/audio ") && inf.has_inp_audio)) {
(string_starts_with(buffer, "/audio ") && inf.has_inp_audio) ||
(string_starts_with(buffer, "/video ") && inf.has_inp_video)) {
// just in case (bad copy-paste for example), we strip all trailing/leading spaces
std::string fname = string_strip(buffer.substr(7));
std::string marker = ctx_cli.load_input_file(fname, true);
+7
View File
@@ -1,5 +1,8 @@
# mtmd
set(MTMD_VIDEO ON CACHE BOOL "enable video support in mtmd (requires ffmpeg binary in PATH)")
# TODO: add MTMD_VIDEO_METHOD in the future to select between ffmpeg and other backends
find_package(Threads REQUIRED)
add_library(mtmd
@@ -63,6 +66,10 @@ target_include_directories(mtmd PRIVATE ../..)
target_include_directories(mtmd PRIVATE ../../vendor)
target_compile_features (mtmd PRIVATE cxx_std_17)
if (MTMD_VIDEO)
target_compile_definitions(mtmd PRIVATE MTMD_VIDEO)
endif()
if (BUILD_SHARED_LIBS)
set_target_properties (mtmd PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(mtmd PRIVATE LLAMA_BUILD)
+14 -5
View File
@@ -77,6 +77,7 @@ struct mtmd_cli_context {
int n_batch;
mtmd::bitmaps bitmaps;
std::vector<mtmd_helper::video_ptr> videos;
// chat template
common_chat_templates_ptr tmpls;
@@ -166,11 +167,14 @@ struct mtmd_cli_context {
}
bool load_media(const std::string & fname) {
mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(ctx_vision.get(), fname.c_str(), false));
if (!bmp.ptr) {
auto res = mtmd_helper_bitmap_init_from_file(ctx_vision.get(), fname.c_str(), false);
if (!res.bitmap) {
return false;
}
bitmaps.entries.push_back(std::move(bmp));
bitmaps.entries.emplace_back(res.bitmap);
if (res.video_ctx) {
videos.emplace_back(res.video_ctx);
}
return true;
}
};
@@ -253,6 +257,7 @@ static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg) {
}
ctx.bitmaps.entries.clear();
ctx.videos.clear();
llama_pos new_n_past;
if (mtmd_helper_eval_chunks(ctx.ctx_vision.get(),
@@ -373,6 +378,9 @@ int main(int argc, char ** argv) {
if (mtmd_support_audio(ctx.ctx_vision.get())) {
LOG("\n /audio <path> load an audio");
}
if (mtmd_helper_support_video(ctx.ctx_vision.get())) {
LOG("\n /video <path> load a video");
}
LOG("\n /clear clear the chat history");
LOG("\n /quit or /exit exit the program");
LOG("\n");
@@ -407,14 +415,15 @@ int main(int argc, char ** argv) {
g_is_generating = true;
bool is_image = line == "/image" || line.find("/image ") == 0;
bool is_audio = line == "/audio" || line.find("/audio ") == 0;
if (is_image || is_audio) {
bool is_video = line == "/video" || line.find("/video ") == 0;
if (is_image || is_audio || is_video) {
if (line.size() < 8) {
LOG_ERR("ERR: Missing media filename\n");
continue;
}
std::string media_path = line.substr(7);
if (ctx.load_media(media_path)) {
LOG("%s %s loaded\n", media_path.c_str(), is_image ? "image" : "audio");
LOG("%s %s loaded\n", media_path.c_str(), is_image ? "image" : is_audio ? "audio" : "video");
content += mtmd_default_marker();
}
// else, error is already printed by libmtmd
+490 -16
View File
@@ -36,6 +36,11 @@
#error "mtmd-helper is a public library outside of mtmd. it must not include internal headers"
#endif
#ifdef MTMD_VIDEO
#include "sheredom/subprocess.h"
#include <thread>
#endif
//
// internal logging functions
//
@@ -79,6 +84,7 @@ struct mtmd_helper_logger {
}
} g_logger;
#define LOG_DBG(...) g_logger.log(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
#define LOG_INF(...) g_logger.log(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
#define LOG_WRN(...) g_logger.log(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
#define LOG_ERR(...) g_logger.log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
@@ -478,42 +484,94 @@ static bool decode_audio_from_buf(const unsigned char * buf_in, size_t len, int
} // namespace audio_helpers
mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len, bool placeholder) {
// 64-bit FNV-1a digest of the `len` bytes starting at `data`.
// The digest is returned as its decimal string representation.
static std::string fnv_hash(const uint8_t * data, size_t len) {
    constexpr uint64_t k_offset_basis = 0xcbf29ce484222325ULL;
    constexpr uint64_t k_prime        = 0x100000001b3ULL;

    uint64_t acc = k_offset_basis;
    const uint8_t * const end = data + len;
    for (const uint8_t * it = data; it != end; ++it) {
        // FNV-1a: xor the byte in first, then multiply by the prime
        acc = (acc ^ *it) * k_prime;
    }
    return std::to_string(acc);
}
mtmd_helper_bitmap_wrapper mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len, bool placeholder) {
// calculate the hash if needed
std::string id;
mtmd_bitmap * result = nullptr;
if (!placeholder) {
id = fnv_hash(buf, len);
}
if (audio_helpers::is_audio_file((const char *)buf, len)) {
std::vector<float> pcmf32;
const int sample_rate = mtmd_get_audio_sample_rate(ctx);
if (sample_rate < 0) {
LOG_ERR("This model does not support audio input\n");
return nullptr;
return {nullptr, nullptr};
}
if (!audio_helpers::decode_audio_from_buf(buf, len, sample_rate, pcmf32)) {
LOG_ERR("Unable to read WAV audio file from buffer\n");
return nullptr;
return {nullptr, nullptr};
}
return mtmd_bitmap_init_from_audio(pcmf32.size(), placeholder ? nullptr : pcmf32.data());
result = mtmd_bitmap_init_from_audio(pcmf32.size(), placeholder ? nullptr : pcmf32.data());
mtmd_bitmap_set_id(result, id.empty() ? nullptr : id.c_str());
return {result, nullptr};
}
// otherwise, we assume it's an image
mtmd_bitmap * result = nullptr;
{
if (!result) {
int nx, ny, nc;
auto * data = stbi_load_from_memory(buf, len, &nx, &ny, &nc, 3);
if (!data) {
LOG_ERR("%s: failed to decode image bytes\n", __func__);
return nullptr;
if (data) {
result = mtmd_bitmap_init(nx, ny, placeholder ? nullptr : data);
mtmd_bitmap_set_id(result, id.empty() ? nullptr : id.c_str());
stbi_image_free(data);
return {result, nullptr};
}
result = mtmd_bitmap_init(nx, ny, placeholder ? nullptr : data);
stbi_image_free(data);
// otherwise, fallthrough to video decoding (if supported)
}
return result;
// last try: load as video
#ifdef MTMD_VIDEO
if (!result) {
auto params = mtmd_helper_video_init_params_default();
auto video_ctx = mtmd_helper_video_init_from_buf(ctx, buf, len, params);
if (!video_ctx) {
LOG_ERR("%s: failed to decode buffer as either image/audio/video\n", __func__);
return {nullptr, nullptr};
}
result = mtmd_bitmap_init_lazy(ctx,
id.empty() ? nullptr : id.c_str(),
video_ctx,
[](size_t, void * user_data, mtmd_bitmap ** out_bitmap, char ** out_text) -> int {
auto * vctx = static_cast<mtmd_helper_video *>(user_data);
char * text = nullptr;
int ret = mtmd_helper_video_read_next(vctx, out_bitmap, &text);
*out_text = text; // heap-allocated by read_next; freed automatically by mtmd
return ret;
});
return {result, video_ctx};
}
#else
if (!result) {
LOG_ERR("%s: failed to decode buffer as either image or audio (video support not compiled in)\n", __func__);
return {nullptr, nullptr};
}
#endif
// should not reach here
return {nullptr, nullptr};
}
mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname, bool placeholder) {
mtmd_helper_bitmap_wrapper mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname, bool placeholder) {
std::vector<unsigned char> buf;
FILE * f = fopen(fname, "rb");
if (!f) {
LOG_ERR("Unable to open file %s: %s\n", fname, strerror(errno));
return nullptr;
return {nullptr, nullptr};
}
fseek(f, 0, SEEK_END);
@@ -522,7 +580,7 @@ mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char *
if (file_size < 0) {
LOG_ERR("Failed to get file size of %s\n", fname);
fclose(f);
return nullptr;
return {nullptr, nullptr};
}
buf.resize(file_size);
@@ -530,9 +588,425 @@ mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char *
fclose(f);
if (n_read != (size_t)file_size) {
LOG_ERR("Failed to read entire file %s", fname);
return nullptr;
return {nullptr, nullptr};
}
return mtmd_helper_bitmap_init_from_buf(ctx, buf.data(), buf.size(), placeholder);
}
// Reports whether video input can be used with the given context.
// When MTMD_VIDEO is defined at compile time the answer is whatever
// mtmd_support_vision(ctx) returns; otherwise it is always false.
bool mtmd_helper_support_video(mtmd_context * ctx) {
#ifdef MTMD_VIDEO
return mtmd_support_vision(ctx);
#else
return false;
#endif
}
//
// Video input helpers
//
#ifdef MTMD_VIDEO
struct mtmd_helper_video {
mtmd_context * mctx;
std::string path;
std::vector<uint8_t> input_buf; // non-empty when initialized from buffer
std::string ffmpeg_bin;
std::string ffprobe_bin;
float fps_target = 0.0f;
mtmd_helper_video_info info = {};
struct subprocess_s proc = {};
bool proc_alive = false;
int32_t current_frame = 0;
std::thread feeder_thread;
std::string prompt_start = "Video:";
int32_t timestamp_interval_ms = 5000; // emit a timestamp text every N ms (0 = disabled)
float next_timestamp_ms = 0.0f; // next elapsed-ms threshold at which to emit
std::vector<uint8_t> frame_buf;
std::string pending_text; // text queued to be returned before the next frame
bool start_emitted = false;
bool is_buf_input() const { return !input_buf.empty(); }
// must run in a separate thread alongside stdout reading to avoid pipe deadlock
void feed_stdin(struct subprocess_s * sp) {
FILE * f = subprocess_stdin(sp);
if (!f) {
LOG_DBG("%s: subprocess has no stdin pipe\n", __func__);
return;
}
LOG_DBG("%s: feeding %zu bytes to stdin\n", __func__, input_buf.size());
size_t written = fwrite(input_buf.data(), 1, input_buf.size(), f);
LOG_DBG("%s: wrote %zu bytes, closing stdin\n", __func__, written);
fclose(f);
}
bool probe(float fps_target_arg) {
const char * input_arg = is_buf_input() ? "pipe:0" : path.c_str();
const char * cmd[] = {
ffprobe_bin.c_str(),
"-v", "quiet",
"-show_entries", "stream=width,height,r_frame_rate,nb_frames,duration",
"-select_streams", "v:0",
"-of", "default=noprint_wrappers=1",
input_arg,
nullptr,
};
LOG_DBG("%s: launching:", __func__);
for (size_t i = 0; cmd[i]; i++) { LOG_DBG(" %s", cmd[i]); }
LOG_DBG("\n");
struct subprocess_s fprobe;
if (subprocess_create(cmd,
subprocess_option_search_user_path | subprocess_option_inherit_environment,
&fprobe) != 0) {
LOG_ERR("%s: failed to launch ffprobe\n", __func__);
return false;
}
std::thread probe_feeder;
if (is_buf_input()) {
probe_feeder = std::thread([this, &fprobe]() { feed_stdin(&fprobe); });
}
uint32_t width = 0;
uint32_t height = 0;
float orig_fps = 0.0f;
float duration = -1.0f;
int32_t n_frames_orig = -1;
char line[256];
FILE * fp = subprocess_stdout(&fprobe);
while (fgets(line, sizeof(line), fp)) {
char * eq = strchr(line, '=');
if (!eq) continue;
*eq = '\0';
const char * key = line;
const char * val = eq + 1;
char * nl = (char *)strchr(val, '\n');
if (nl) *nl = '\0';
if (strcmp(key, "width") == 0) {
width = (uint32_t)atoi(val);
} else if (strcmp(key, "height") == 0) {
height = (uint32_t)atoi(val);
} else if (strcmp(key, "r_frame_rate") == 0) {
orig_fps = parse_rational(val);
} else if (strcmp(key, "nb_frames") == 0 && strcmp(val, "N/A") != 0) {
n_frames_orig = atoi(val);
} else if (strcmp(key, "duration") == 0 && strcmp(val, "N/A") != 0) {
duration = (float)atof(val);
}
}
if (probe_feeder.joinable()) {
probe_feeder.join();
}
int ret_code;
subprocess_join(&fprobe, &ret_code);
subprocess_destroy(&fprobe);
if (width == 0 || height == 0 || orig_fps <= 0.0f) {
return false;
}
if (duration < 0.0f && n_frames_orig > 0) {
duration = (float)n_frames_orig / orig_fps;
}
fps_target = fps_target_arg > 0.0f ? fps_target_arg : orig_fps;
info.width = width;
info.height = height;
info.fps = fps_target;
LOG_DBG("%s: %ux%u fps=%.2f duration=%.2fs n_frames=%d\n",
__func__, width, height, fps_target, duration, info.n_frames);
info.n_frames = duration > 0.0f ? (int32_t)(duration * fps_target + 0.5f) : -1;
frame_buf.resize((size_t)width * height * 3);
return true;
}
bool start_ffmpeg(float seek_seconds) {
char seek_buf[64];
char fps_buf[64];
std::vector<const char *> cmd;
cmd.push_back(ffmpeg_bin.c_str());
if (!is_buf_input() && seek_seconds > 0.0f) {
// input-side seek: fast, keyframe-accurate; only valid for seekable file inputs
snprintf(seek_buf, sizeof(seek_buf), "%.6f", seek_seconds);
cmd.push_back("-ss");
cmd.push_back(seek_buf);
}
cmd.push_back("-i");
// cache:pipe:0 wraps stdin with a seekable in-memory cache, letting ffmpeg seek
// backwards for container headers (e.g. MP4 moov atom at end of file)
cmd.push_back(is_buf_input() ? "cache:pipe:0" : path.c_str());
if (seek_seconds > 0.0f && is_buf_input()) {
// output-side seek: frame-accurate but decodes and discards frames up to seek point
snprintf(seek_buf, sizeof(seek_buf), "%.6f", seek_seconds);
cmd.push_back("-ss");
cmd.push_back(seek_buf);
}
if (fps_target > 0.0f) {
snprintf(fps_buf, sizeof(fps_buf), "fps=%.6f", fps_target);
cmd.push_back("-vf");
cmd.push_back(fps_buf);
}
cmd.push_back("-f");
cmd.push_back("rawvideo");
cmd.push_back("-pix_fmt");
cmd.push_back("rgb24");
cmd.push_back("pipe:1");
cmd.push_back("-loglevel");
cmd.push_back("error");
cmd.push_back(nullptr);
LOG_DBG("%s: launching:", __func__);
for (size_t i = 0; cmd[i]; i++) {
LOG_DBG(" %s", cmd[i]);
}
LOG_DBG("\n");
int ret = subprocess_create(
cmd.data(),
subprocess_option_search_user_path | subprocess_option_inherit_environment,
&proc);
proc_alive = (ret == 0);
LOG_DBG("%s: subprocess_create ret=%d proc_alive=%d\n", __func__, ret, (int)proc_alive);
if (proc_alive && is_buf_input()) {
LOG_DBG("%s: starting feeder thread for %zu-byte buffer\n", __func__, input_buf.size());
feeder_thread = std::thread([this]() { feed_stdin(&proc); });
}
return proc_alive;
}
void stop_ffmpeg() {
if (proc_alive) {
subprocess_terminate(&proc);
subprocess_destroy(&proc);
proc_alive = false;
}
if (feeder_thread.joinable()) {
feeder_thread.join();
}
}
mtmd_bitmap * read_next_frame() {
if (!proc_alive) return nullptr;
FILE * fp = subprocess_stdout(&proc);
const size_t frame_size = (size_t)info.width * info.height * 3;
LOG_DBG("%s: reading frame %d, expecting %zu bytes (%ux%u)\n",
__func__, current_frame, frame_size, info.width, info.height);
size_t total_read = 0;
while (total_read < frame_size) {
size_t n = fread(frame_buf.data() + total_read, 1, frame_size - total_read, fp);
if (n == 0) {
// clean EOF only if no bytes read yet; partial frame is an error
LOG_DBG("%s: fread returned 0 after %zu/%zu bytes (ferror=%d)\n",
__func__, total_read, frame_size, ferror(fp));
proc_alive = false;
return nullptr;
}
total_read += n;
}
LOG_DBG("%s: frame %d read OK\n", __func__, current_frame);
current_frame++;
return mtmd_bitmap_init(info.width, info.height, frame_buf.data());
}
int32_t read_next(mtmd_bitmap ** out_bitmap, char ** out_text) {
*out_bitmap = nullptr;
*out_text = nullptr;
if (!pending_text.empty()) {
*out_text = strdup(pending_text.c_str());
pending_text.clear();
return *out_text ? 0 : -2;
}
LOG_DBG("%s: proc_alive=%d start_emitted=%d current_frame=%d\n",
__func__, (int)proc_alive, (int)start_emitted, current_frame);
if (!proc_alive) {
return (current_frame == 0) ? -2 : -1;
}
if (!start_emitted) {
start_emitted = true;
if (!prompt_start.empty()) {
*out_text = strdup(prompt_start.c_str());
return *out_text ? 0 : -2;
}
}
mtmd_bitmap * frame = read_next_frame();
if (!frame) return -1;
*out_bitmap = frame;
if (timestamp_interval_ms > 0) {
// current_frame was already incremented by read_next_frame(); undo for elapsed calc
float elapsed_ms = (float)(current_frame - 1) / info.fps * 1000.0f;
if (elapsed_ms >= next_timestamp_ms) {
char ts_buf[32];
float elapsed_s = elapsed_ms / 1000.0f;
int minutes = (int)(elapsed_s / 60);
float seconds = elapsed_s - minutes * 60.0f;
snprintf(ts_buf, sizeof(ts_buf), "[%dm%.2fs]", minutes, seconds);
pending_text = ts_buf;
next_timestamp_ms += (float)timestamp_interval_ms;
}
}
return 0;
}
static float parse_rational(const char * s) {
int num = 0, den = 1;
if (sscanf(s, "%d/%d", &num, &den) == 2 && den > 0) {
return (float)num / (float)den;
}
float val;
if (sscanf(s, "%f", &val) == 1) {
return val;
}
return 0.0f;
}
};
#endif
// Default parameters for the video helpers: sample at 4 fps, look up the
// ffmpeg/ffprobe binaries through PATH, and use a 5000 ms timestamp interval.
mtmd_helper_video_init_params mtmd_helper_video_init_params_default() {
    mtmd_helper_video_init_params defaults = {};
    defaults.fps_target            = 4.0f;
    defaults.ffmpeg_bin_dir        = nullptr;
    defaults.timestamp_interval_ms = 5000;
    return defaults;
}
// Builds the command used to launch an ffmpeg-family executable.
// Without a directory (null or empty) the bare `name` is returned so that the lookup
// goes through PATH. Otherwise `name` is appended to `bin_dir`, inserting the platform
// separator when `bin_dir` does not already end with '/' or '\\', plus ".exe" on Windows.
static std::string video_resolve_bin(const char * bin_dir, const char * name) {
    const bool has_dir = bin_dir != nullptr && bin_dir[0] != '\0';
    if (!has_dir) {
        return name; // rely on PATH
    }

#ifdef _WIN32
    const char   separator = '\\';
    const char * suffix    = ".exe";
#else
    const char   separator = '/';
    const char * suffix    = "";
#endif

    std::string full_path(bin_dir);
    const char tail = full_path.back();
    const bool ends_with_separator = (tail == '/') || (tail == '\\');
    if (!ends_with_separator) {
        full_path.push_back(separator);
    }
    full_path.append(name);
    full_path.append(suffix);
    return full_path;
}
// Opens the video file at `path`: runs ffprobe to read the stream information, then
// starts ffmpeg so that frames can be pulled with mtmd_helper_video_read_next().
// Returns a context to be released with mtmd_helper_video_free(), or nullptr when
// `path` is null/empty, ffprobe or ffmpeg fails, or the build has no video support.
mtmd_helper_video * mtmd_helper_video_init(
        mtmd_context * mctx,
        const char * path,
        mtmd_helper_video_init_params params) {
#ifdef MTMD_VIDEO
    // assigning a null pointer to std::string is undefined behaviour, reject it up front
    if (!path || path[0] == '\0') {
        LOG_ERR("%s: no video path given\n", __func__);
        return nullptr;
    }

    auto * ctx = new mtmd_helper_video();
    ctx->mctx = mctx;
    ctx->path = path;
    ctx->ffmpeg_bin = video_resolve_bin(params.ffmpeg_bin_dir, "ffmpeg");
    ctx->ffprobe_bin = video_resolve_bin(params.ffmpeg_bin_dir, "ffprobe");
    ctx->timestamp_interval_ms = params.timestamp_interval_ms;

    if (!ctx->probe(params.fps_target)) {
        LOG_ERR("%s: ffprobe failed for '%s' (is ffprobe in PATH?)\n", __func__, path);
        delete ctx;
        return nullptr;
    }

    if (!ctx->start_ffmpeg(0.0f)) {
        LOG_ERR("%s: failed to start ffmpeg for '%s' (is ffmpeg in PATH?)\n", __func__, path);
        delete ctx;
        return nullptr;
    }

    return ctx;
#else
    LOG_ERR("%s: video is not supported in this build (MTMD_VIDEO is set to OFF)\n", __func__);
    return nullptr;
#endif
}
// Same as mtmd_helper_video_init(), but the video is taken from the `len` bytes at `buf`.
// The bytes are copied into the context, which later pipes them to ffprobe/ffmpeg.
// Returns nullptr when the buffer is null/empty, ffprobe or ffmpeg fails, or the build
// has no video support.
mtmd_helper_video * mtmd_helper_video_init_from_buf(
        mtmd_context * mctx,
        const unsigned char * buf, size_t len,
        mtmd_helper_video_init_params params) {
#ifdef MTMD_VIDEO
    // an empty input_buf would make the context look like a file input with an empty path
    if (!buf || len == 0) {
        LOG_ERR("%s: empty video buffer\n", __func__);
        return nullptr;
    }

    auto * ctx = new mtmd_helper_video();
    ctx->mctx = mctx;
    ctx->input_buf.assign(buf, buf + len);
    ctx->ffmpeg_bin = video_resolve_bin(params.ffmpeg_bin_dir, "ffmpeg");
    ctx->ffprobe_bin = video_resolve_bin(params.ffmpeg_bin_dir, "ffprobe");
    ctx->timestamp_interval_ms = params.timestamp_interval_ms;

    if (!ctx->probe(params.fps_target)) {
        LOG_ERR("%s: ffprobe failed on buffer (is ffprobe in PATH?)\n", __func__);
        delete ctx;
        return nullptr;
    }

    if (!ctx->start_ffmpeg(0.0f)) {
        LOG_ERR("%s: failed to start ffmpeg on buffer (is ffmpeg in PATH?)\n", __func__);
        delete ctx;
        return nullptr;
    }

    return ctx;
#else
    LOG_ERR("%s: video is not supported in this build (MTMD_VIDEO is set to OFF)\n", __func__);
    return nullptr;
#endif
}
// Stops the ffmpeg process owned by `ctx` and releases the context.
// Passing nullptr is a no-op in every build configuration, so callers can free
// unconditionally (the init functions return nullptr when video is not compiled in).
void mtmd_helper_video_free(mtmd_helper_video * ctx) {
    if (!ctx) {
        return;
    }
#ifdef MTMD_VIDEO
    ctx->stop_ffmpeg();
    delete ctx;
#else
    LOG_ERR("%s: video is not supported in this build (MTMD_VIDEO is set to OFF)\n", __func__);
#endif
}
// Returns a copy of the stream information stored in ctx->info.
// ctx is dereferenced without a null check, so it must not be null.
// In builds without MTMD_VIDEO this trips GGML_ASSERT(false) instead of returning a value.
mtmd_helper_video_info mtmd_helper_video_get_info(const mtmd_helper_video * ctx) {
#ifdef MTMD_VIDEO
return ctx->info;
#else
GGML_ASSERT(false && "video is not supported in this build (MTMD_VIDEO is set to OFF)");
#endif
}
// Fetches the next item of the video stream by forwarding to ctx->read_next().
// Returns -2 when ctx is null; the output pointers are left untouched in that case.
// In builds without MTMD_VIDEO this trips GGML_ASSERT(false) instead of returning a value.
int32_t mtmd_helper_video_read_next(mtmd_helper_video * ctx,
mtmd_bitmap ** out_bitmap, char ** out_text) {
#ifdef MTMD_VIDEO
if (!ctx) return -2;
return ctx->read_next(out_bitmap, out_text);
#else
GGML_ASSERT(false && "video is not supported in this build (MTMD_VIDEO is set to OFF)");
#endif
}
+79 -3
View File
@@ -20,25 +20,39 @@ extern "C" {
// BREAKING CHANGES are expected.
//
struct mtmd_helper_video;
typedef struct mtmd_helper_video mtmd_helper_video;
// Set callback for all future logging events.
// If this is not called, or NULL is supplied, everything is output on stderr.
// Note: this also call mtmd_log_set() internally
MTMD_API void mtmd_helper_log_set(ggml_log_callback log_callback, void * user_data);
// Returns true if video input can be used: this build must include video support (MTMD_VIDEO was ON at compile time) and ctx must support vision input.
MTMD_API bool mtmd_helper_support_video(mtmd_context * ctx);
struct mtmd_helper_bitmap_wrapper {
mtmd_bitmap * bitmap;
mtmd_helper_video * video_ctx;
};
// helper function to construct a mtmd_bitmap from a file
// it calls mtmd_helper_bitmap_init_from_buf() internally
// on failure, both members of the returned wrapper (bitmap and video_ctx) are nullptr
// this function is thread-safe
MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname, bool placeholder);
MTMD_API struct mtmd_helper_bitmap_wrapper mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname, bool placeholder);
// helper function to construct a mtmd_bitmap from a buffer containing a file
// supported formats:
// image: formats supported by stb_image: jpg, png, bmp, gif, etc.
// audio: formats supported by miniaudio: wav, mp3, flac
// note: audio files will be auto-detected based on magic bytes
// note:
// - for now, video input is only supported via C++ helper functions
// - audio files will be auto-detected based on magic bytes
// - output bitmap will have FNV hash as the ID
// on failure, both members of the returned wrapper (bitmap and video_ctx) are nullptr
// this function is thread-safe
MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len, bool placeholder);
MTMD_API struct mtmd_helper_bitmap_wrapper mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len, bool placeholder);
// helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache
MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks);
@@ -89,6 +103,56 @@ MTMD_API int32_t mtmd_helper_decode_image_chunk(mtmd_context * ctx,
int32_t n_batch,
llama_pos * new_n_past);
//
// video input helpers (requires ffmpeg/ffprobe installed on the system)
// the notion of video only exists at the helper level, it is not visible to the core mtmd library
//
// NOTE: this implementation is model-agnostic, it can be used with any vision-capable model
// however, it may not be accurate for some specific models
// (this is expected for now, to keep the implementation simple)
//
struct mtmd_helper_video_info {
uint32_t width;
uint32_t height;
float fps; // effective fps (fps_target if set, else original video fps)
int32_t n_frames; // estimated total frames at effective fps (-1 if unknown)
};
// Options for mtmd_helper_video_init() / mtmd_helper_video_init_from_buf().
// Use mtmd_helper_video_init_params_default() to obtain the default values.
struct mtmd_helper_video_init_params {
float fps_target; // desired output fps; <= 0 means use the video's native fps, defaulted to 4.0f
const char * ffmpeg_bin_dir; // directory containing ffmpeg/ffprobe binaries; NULL means search PATH
int64_t timestamp_interval_ms; // interval for adding timestamp as text chunk (example: "[10m50.50s]"); <= 0 means no timestamp, defaulted to 5000ms
// TODO @ngxson : allow "placeholder" bitmap output for counting tokens
};
MTMD_API struct mtmd_helper_video_init_params mtmd_helper_video_init_params_default(void);
// returns NULL on failure (ffprobe not found, file unreadable, etc.)
MTMD_API mtmd_helper_video * mtmd_helper_video_init(
struct mtmd_context * mctx,
const char * path,
struct mtmd_helper_video_init_params params);
// Same as mtmd_helper_video_init(), but reads from an in-memory buffer.
// The buffer is copied internally; the caller does not need to keep it alive.
// Note: pipe input is not seekable, so seeking will use output-side seeking
// (ffmpeg decodes and discards frames up to the target position).
MTMD_API mtmd_helper_video * mtmd_helper_video_init_from_buf(
struct mtmd_context * mctx,
const unsigned char * buf, size_t len,
struct mtmd_helper_video_init_params params);
MTMD_API void mtmd_helper_video_free(mtmd_helper_video * ctx);
MTMD_API struct mtmd_helper_video_info mtmd_helper_video_get_info(const mtmd_helper_video * ctx);
// Read the next item from the video stream; exactly one of out_bitmap or out_text is set per call.
// *out_bitmap - heap-allocated; caller must free with mtmd_bitmap_free()
// *out_text - heap-allocated (always via strdup/malloc); caller must free with free()
// returns 0 on success, -1 on EOF, -2 on error
MTMD_API int32_t mtmd_helper_video_read_next(mtmd_helper_video * ctx,
mtmd_bitmap ** out_bitmap,
char ** out_text);
#ifdef __cplusplus
} // extern "C"
#endif
@@ -97,4 +161,16 @@ MTMD_API int32_t mtmd_helper_decode_image_chunk(mtmd_context * ctx,
// C++ wrappers
//
#ifdef __cplusplus
namespace mtmd_helper {
// video-related C++ wrappers
struct mtmd_helper_video_deleter {
void operator()(mtmd_helper_video * val) { mtmd_helper_video_free(val); }
};
using video_ptr = std::unique_ptr<mtmd_helper_video, mtmd_helper_video_deleter>;
} // namespace mtmd_helper
#endif
#endif
+138 -29
View File
@@ -35,6 +35,10 @@ struct mtmd_bitmap {
std::string id; // optional user-defined id, for ex: can be set to image hash, useful for KV cache tracking
bool is_audio = false; // true if the bitmap is audio
// lazy-loaded bitmap
mtmd_bitmap_lazy_callback lazy_callback = nullptr;
void * lazy_user_data = nullptr;
mtmd_bitmap(const unsigned char * data, uint32_t nx, uint32_t ny)
: nx(nx), ny(ny), is_audio(false) {
if (data) {
@@ -732,30 +736,111 @@ void mtmd_free(mtmd_context * ctx) {
struct mtmd_tokenizer {
mtmd_context * ctx;
std::vector<const mtmd_bitmap *> bitmaps;
std::string input_text;
bool add_special;
bool parse_special;
const llama_vocab * vocab;
struct part {
std::string text;
const mtmd_bitmap * bitmap;
};
std::vector<part> parts;
// these will be freed when mtmd_tokenizer finishes
std::vector<mtmd::bitmap> bm_from_lazy; // TODO @ngxson : refactor, free bm_from_lazy progressively
std::vector<const char *> text_from_lazy;
mtmd_input_chunks cur;
uint32_t n_images_added = 0; // 0-based index assigned to the next image chunk
~mtmd_tokenizer() {
// note: mtmd::bitmap is already RAII
for (auto & str : text_from_lazy) {
free((void *)str);
}
}
mtmd_tokenizer(mtmd_context * ctx,
const mtmd_input_text * text,
const mtmd_bitmap ** bitmaps,
size_t n_bitmaps) : ctx(ctx), bitmaps(bitmaps, bitmaps + n_bitmaps) {
const mtmd_bitmap ** bmps,
size_t n_bitmaps) : ctx(ctx) {
add_special = text->add_special;
parse_special = text->parse_special;
input_text = text->text;
vocab = ctx->vocab;
std::vector<const mtmd_bitmap *> bitmaps(bmps, bmps + n_bitmaps);
auto parts_str = split_text(input_text, ctx->media_marker);
size_t i_bm = 0;
for (const auto & part : parts_str) {
if (part == ctx->media_marker) {
if (i_bm >= bitmaps.size()) {
throw std::runtime_error(string_format("number of media markers in text (%zu) exceeds number of bitmaps (%zu)", i_bm + 1, bitmaps.size()));
}
parts.push_back({"", bitmaps[i_bm++]});
} else {
parts.push_back({std::move(part), nullptr});
}
}
size_t n_markers = 0;
for (const auto & part : parts) {
if (part.bitmap != nullptr) {
n_markers++;
}
}
if (n_markers != bitmaps.size()) {
throw std::runtime_error(string_format("number of media markers in text (%zu) does not match number of bitmaps (%zu)", n_markers, bitmaps.size()));
}
expand_lazy_bitmaps();
}
void expand_lazy_bitmaps() {
std::vector<part> expanded;
expanded.reserve(parts.size());
for (auto & p : parts) {
if (p.bitmap != nullptr && p.bitmap->lazy_callback) {
LOG_DBG("%s: expanding lazy bitmap\n", __func__);
for (size_t i = 0;; i++) {
char * out_str = nullptr;
mtmd_bitmap * out_bm = nullptr;
int res = p.bitmap->lazy_callback(i,
p.bitmap->lazy_user_data,
&out_bm,
&out_str);
if (out_bm && out_str) {
throw std::runtime_error(string_format("lazy callback cannot return both bitmap and text"));
}
if (res == 0) {
// OK, append the returned chunk; lazy part is not yet added
if (out_bm) {
auto & ptr = bm_from_lazy.emplace_back(out_bm); // remember to free it later
expanded.push_back({"", ptr.ptr.get()});
LOG_DBG("%s: lazy callback returned bitmap with dimensions %d x %d\n", __func__, out_bm->nx, out_bm->ny);
} else if (out_str) {
auto & ptr = text_from_lazy.emplace_back(out_str); // remember to free it later
expanded.push_back({ptr, nullptr});
LOG_DBG("%s: lazy callback returned text: %s\n", __func__, out_str);
}
} else if (res == -1) {
// EOF: lazy part removes itself (not added to expanded)
break;
} else if (res == -2) {
// error
throw std::runtime_error(string_format("lazy callback returned error"));
}
}
} else {
expanded.push_back(std::move(p));
}
}
parts = std::move(expanded);
}
int32_t tokenize(mtmd_input_chunks * output) {
cur.entries.clear();
std::vector<std::string> parts = split_text(input_text, ctx->media_marker);
size_t i_bm = 0; // index of the current bitmap
// [QWEN_VIDEO] handle frame merging for models that support it (i.e. qwen-vl)
int n_merge_frames = 1;
@@ -764,53 +849,50 @@ struct mtmd_tokenizer {
GGML_ASSERT(n_merge_frames <= 2 && "we only support merging maximum 2 images for now; open an issue if this model supports merging more");
}
// Build merged_bitmaps: each entry is a group of 1 or 2 bitmaps.
// For consecutive mergeable bitmap parts, merge them and collapse the second part out of this->parts.
std::vector<std::vector<const mtmd_bitmap *>> merged_bitmaps;
if (n_merge_frames > 1) {
size_t i_bm_scan = 0;
for (size_t i = 0; i < parts.size(); ++i) {
if (parts[i] != ctx->media_marker) {
if (parts[i].bitmap == nullptr) {
continue;
}
if (i + 1 < parts.size()
&& parts[i + 1] == ctx->media_marker
&& i_bm_scan + 1 < bitmaps.size()) {
const mtmd_bitmap * bm_a = bitmaps[i_bm_scan];
const mtmd_bitmap * bm_b = bitmaps[i_bm_scan + 1];
if (i + 1 < parts.size() && parts[i + 1].bitmap != nullptr) {
const mtmd_bitmap * bm_a = parts[i].bitmap;
const mtmd_bitmap * bm_b = parts[i + 1].bitmap;
if (bm_a->can_batch_with(*bm_b)) {
LOG_DBG("%s: merging 2 frames at bitmap index %zu and %zu\n", __func__, i_bm_scan, i_bm_scan + 1);
LOG_DBG("%s: merging 2 frames at part index %zu and %zu\n", __func__, i, i + 1);
merged_bitmaps.push_back({bm_a, bm_b});
parts.erase(parts.begin() + i + 1); // remove the second marker
i_bm_scan += 2;
parts.erase(parts.begin() + i + 1); // collapse the second bitmap part
continue;
}
}
LOG_DBG("%s: no merging for bitmap index %zu\n", __func__, i_bm_scan);
merged_bitmaps.push_back({bitmaps[i_bm_scan]});
++i_bm_scan;
LOG_DBG("%s: no merging for part index %zu\n", __func__, i);
merged_bitmaps.push_back({parts[i].bitmap});
}
} else {
for (size_t i = 0; i < bitmaps.size(); ++i) {
merged_bitmaps.push_back({bitmaps[i]});
for (const auto & p : parts) {
if (p.bitmap != nullptr) {
merged_bitmaps.push_back({p.bitmap});
}
}
}
i_bm = 0;
for (auto & part : parts) {
if (part == ctx->media_marker) {
// this is a marker, we should add the next bitmap
size_t i_bm = 0;
for (const auto & p : parts) {
if (p.bitmap != nullptr) {
if (i_bm >= merged_bitmaps.size()) {
LOG_ERR("%s: error: number of bitmaps (%zu) does not match number of markers (%zu)\n",
__func__, merged_bitmaps.size(), parts.size() - 1);
return 1;
}
auto & bmps = merged_bitmaps[i_bm++];
auto bmps = merged_bitmaps[i_bm++];
int32_t res = add_media(bmps);
if (res != 0) {
return res;
}
} else {
// this is a text part, we should add it as text
add_text(part, parse_special);
add_text(p.text, parse_special);
}
}
@@ -1236,8 +1318,13 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
const mtmd_input_text * text,
const mtmd_bitmap ** bitmaps,
size_t n_bitmaps) {
mtmd_tokenizer tokenizer(ctx, text, bitmaps, n_bitmaps);
return tokenizer.tokenize(output);
try {
mtmd_tokenizer tokenizer(ctx, text, bitmaps, n_bitmaps);
return tokenizer.tokenize(output);
} catch (const std::exception & e) {
LOG_ERR("%s: error: %s\n", __func__, e.what());
return 2;
}
}
int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) {
@@ -1373,6 +1460,10 @@ int mtmd_get_audio_sample_rate(const mtmd_context * ctx) {
return clip_get_hparams(ctx->ctx_a)->audio_sample_rate;
}
// Return the media marker string currently configured on `ctx`.
// The returned pointer refers to the string stored inside the context (not a copy).
const char * mtmd_get_marker(const mtmd_context * ctx) {
    const auto & marker = ctx->media_marker;
    return marker.c_str();
}
//
// public API functions
//
@@ -1405,10 +1496,16 @@ uint32_t mtmd_bitmap_get_ny(const mtmd_bitmap * bitmap) {
}
// Read-only pointer to the bitmap's data buffer.
// A placeholder bitmap yields nullptr; its buffer is never queried.
const unsigned char * mtmd_bitmap_get_data(const mtmd_bitmap * bitmap) {
    const bool has_buffer = !bitmap->is_placeholder();
    if (has_buffer) {
        return bitmap->get_ro_buf().data();
    }
    return nullptr;
}
// Size in bytes of the bitmap's data buffer.
// A placeholder bitmap reports 0; its buffer is never queried.
size_t mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap) {
    const bool has_buffer = !bitmap->is_placeholder();
    if (has_buffer) {
        return bitmap->get_ro_buf().size();
    }
    return 0;
}
@@ -1428,6 +1525,18 @@ void mtmd_bitmap_set_id(mtmd_bitmap * bitmap, const char * id) {
}
}
// Create a lazy bitmap: it is constructed without any data buffer and only
// records the callback (plus its user data) and the given id.
//   ctx       - currently unused, reserved for future use
//   id        - forwarded to mtmd_bitmap_set_id()
//   user_data - stored on the bitmap as lazy_user_data
//   callback  - stored on the bitmap as lazy_callback
// The returned object is allocated with `new` and handed to the caller.
mtmd_bitmap * mtmd_bitmap_init_lazy(mtmd_context * ctx,
                                    const char * id,
                                    void * user_data,
                                    mtmd_bitmap_lazy_callback callback) {
    GGML_UNUSED(ctx); // reserved for future use

    auto * lazy_bm = new mtmd_bitmap(nullptr, 0, 0);
    lazy_bm->lazy_user_data = user_data;
    lazy_bm->lazy_callback  = callback;
    mtmd_bitmap_set_id(lazy_bm, id);

    return lazy_bm;
}
void mtmd_bitmap_free(mtmd_bitmap * bitmap) {
if (bitmap) {
delete bitmap;
+31
View File
@@ -128,6 +128,9 @@ MTMD_API bool mtmd_support_audio(const mtmd_context * ctx);
// return -1 if audio is not supported
MTMD_API int mtmd_get_audio_sample_rate(const mtmd_context * ctx);
// get the current marker string
MTMD_API const char * mtmd_get_marker(const mtmd_context * ctx);
// mtmd_bitmap
//
// if bitmap is image:
@@ -156,6 +159,34 @@ MTMD_API void mtmd_bitmap_free (mtmd_bitmap * bitmap);
MTMD_API const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap);
MTMD_API void mtmd_bitmap_set_id(mtmd_bitmap * bitmap, const char * id);
// mtmd_bitmap lazy
//
// this is a special bitmap that:
// - does not hold the actual data
// - can be expanded into one or more chunks (either media or text chunks)
// user must provide a callback to fill in the data when mtmd_tokenize() is called
// this is useful for large video inputs:
// - allow reading video frame by frame, without loading the entire video into memory
// - allow tracking the whole video with a single ID (for example, the file hash)
// set (*out_bitmap) to non-nullptr to emit a bitmap chunk; it will be freed automatically
// set (*out_text) to non-nullptr to emit a text chunk; it must be heap-allocated, null-terminated and will be freed automatically
// either out_bitmap or out_text can be set, but not both
// out_bitmap cannot be another lazy bitmap (no nested lazy allowed)
// return value:
// 0 on success
// -1 on EOF (signal to mtmd_tokenize to move on)
// -2 on error (signal to mtmd_tokenize to abort)
// signature of the user-provided callback that fills in the data of a lazy bitmap
// it returns 0 (chunk emitted), -1 (EOF) or -2 (error); it must not set both output pointers
typedef int(* mtmd_bitmap_lazy_callback)(
size_t chunk_idx, // NOTE(review): presumably the index of the chunk being requested - confirm
void * user_data, // NOTE(review): presumably the pointer given to mtmd_bitmap_init_lazy() - confirm
mtmd_bitmap ** out_bitmap, // set (*out_bitmap) to emit a bitmap chunk; must not be a lazy bitmap
char ** out_text); // set (*out_text) to emit a text chunk; heap-allocated and null-terminated
// create a lazy bitmap that stores `callback` and `user_data` for use during mtmd_tokenize()
// `ctx` is not used by the current implementation (reserved for future use)
// NOTE(review): presumably the returned bitmap is released with mtmd_bitmap_free() - confirm
MTMD_API mtmd_bitmap * mtmd_bitmap_init_lazy(mtmd_context * ctx,
const char * id, // usually set to file hash
void * user_data,
mtmd_bitmap_lazy_callback callback);
// mtmd_input_chunks
//
Binary file not shown.
+4
View File
@@ -1252,6 +1252,10 @@ The `response_format` parameter supports both plain JSON output (e.g. `{"type":
`parallel_tool_calls` : Whether to enable parallel/multiple tool calls (only supported on some models, verification is based on jinja template).
For multimodal input:
- Content types `image_url` and `input_audio` follow the OAI schema
- Content type `input_video` is an extension to the OAI schema. For now, it only accepts base64-encoded input
*Examples:*
You can use either Python `openai` library with appropriate checkpoints:
+22 -18
View File
@@ -701,29 +701,19 @@ size_t validate_utf8(const std::string& text) {
return len;
}
// 64-bit FNV-1a over `len` bytes starting at `data`.
// The resulting hash is returned as its decimal string representation;
// an empty input yields the FNV offset basis.
static std::string fnv_hash(const uint8_t * data, size_t len) {
    constexpr uint64_t k_offset_basis = 0xcbf29ce484222325ULL;
    constexpr uint64_t k_prime        = 0x100000001b3ULL;

    uint64_t acc = k_offset_basis;
    const uint8_t * const last = data + len;
    for (const uint8_t * cur = data; cur != last; ++cur) {
        // FNV-1a order: xor the byte in first, then multiply by the prime
        acc = (acc ^ *cur) * k_prime;
    }
    return std::to_string(acc);
}
server_tokens process_mtmd_prompt(mtmd_context * mctx, const std::string & prompt, const std::vector<raw_buffer> & files, bool is_placeholder) {
// these will be freed upon going out of scope
mtmd::bitmaps bitmaps;
std::vector<mtmd_helper::video_ptr> videos;
for (auto & file : files) {
mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(mctx, file.data(), file.size(), is_placeholder));
if (!bmp.ptr) {
auto out = mtmd_helper_bitmap_init_from_buf(mctx, file.data(), file.size(), is_placeholder);
if (!out.bitmap) {
throw std::runtime_error("Failed to load image or audio file");
}
// calculate bitmap hash (for KV caching)
std::string hash = fnv_hash(bmp.data(), bmp.n_bytes());
bmp.set_id(hash.c_str());
bitmaps.entries.push_back(std::move(bmp));
bitmaps.entries.emplace_back(out.bitmap);
if (out.video_ctx) {
videos.emplace_back(out.video_ctx);
}
}
// process prompt
std::vector<server_tokens> inputs;
@@ -1023,6 +1013,20 @@ json oaicompat_chat_params_parse(
p["text"] = get_media_marker();
p.erase("input_audio");
} else if (type == "input_video") {
if (!opt.allow_video) {
throw std::runtime_error("video input is not supported - hint: if this is unexpected, you may need to provide the mmproj");
}
json input_video = json_value(p, "input_video", json::object());
std::string data = json_value(input_video, "data", std::string());
auto decoded_data = base64_decode(data); // expected to be base64 encoded
out_files.push_back(decoded_data);
p["type"] = "media_marker";
p["text"] = get_media_marker();
p.erase("input_video");
} else if (type != "text") {
throw std::invalid_argument("unsupported content[].type");
}
+1
View File
@@ -294,6 +294,7 @@ struct server_chat_params {
common_chat_templates_ptr tmpls;
bool allow_image;
bool allow_audio;
bool allow_video;
bool enable_thinking = true;
int reasoning_budget = -1;
std::string reasoning_budget_message;
+4 -1
View File
@@ -1247,6 +1247,7 @@ private:
/* tmpls */ std::move(chat_templates),
/* allow_image */ mctx ? mtmd_support_vision(mctx) : false,
/* allow_audio */ mctx ? mtmd_support_audio (mctx) : false,
/* allow_video */ mctx ? mtmd_helper_support_video(mctx) : false,
/* enable_thinking */ enable_thinking,
/* reasoning_budget */ params_base.sampling.reasoning_budget_tokens,
/* reasoning_budget_msg */ params_base.sampling.reasoning_budget_message,
@@ -3586,6 +3587,7 @@ server_context_meta server_context::get_meta() const {
/* has_mtmd */ impl->mctx != nullptr,
/* has_inp_image */ impl->chat_params.allow_image,
/* has_inp_audio */ impl->chat_params.allow_audio,
/* has_inp_video */ impl->chat_params.allow_video,
/* json_ui_settings */ impl->json_ui_settings,
/* json_webui_settings */ impl->json_webui_settings, // Deprecated
/* slot_n_ctx */ impl->get_slot_n_ctx(),
@@ -4183,6 +4185,7 @@ void server_routes::init_routes() {
{ "model_path", meta->model_path },
{ "modalities", json {
{"vision", meta->has_inp_image},
{"video", meta->has_inp_video},
{"audio", meta->has_inp_audio},
} },
{ "media_marker", get_media_marker() },
@@ -4976,7 +4979,7 @@ std::unique_ptr<server_res_generator> server_routes::handle_count_tokens(const l
n_tokens = tokenize_mixed(vocab, prompt, true, true).size();
}
json response = {{"input_tokens", static_cast<int>(n_tokens)}};
json response = {{"input_tokens", static_cast<int64_t>(n_tokens)}};
if (is_oai) {
response["object"] = "response.input_tokens";
}
+1
View File
@@ -21,6 +21,7 @@ struct server_context_meta {
bool has_mtmd;
bool has_inp_image;
bool has_inp_audio;
bool has_inp_video;
json json_ui_settings; // Primary: new name
json json_webui_settings; // Deprecated: use json_ui_settings instead (kept for backward compat)
int slot_n_ctx;