mtmd : add video input support (#24269)

* wip * ok: lazy bitmap API * remember to free lazy text * wip * add mtmd_helper_video * support video input on server (base64 input) * add MTMD_VIDEO config * add timestamp * update CLI * cli: allow auto-completion for video * add --video arg * fix build * update docs * rename as suggested
2026-06-09 07:16:44 +02:00 · 2026-06-08 13:40:12 +02:00
parent c2b1518fd4
commit 8f83d6c271
16 changed files with 807 additions and 77 deletions
@@ -2221,8 +2221,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        }
    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_OFFLOAD"));
    add_opt(common_arg(
-        {"--image", "--audio"}, "FILE",
+        {"--image", "--audio", "--video"}, "FILE",
-        "path to an image or audio file. use with multimodal models, use comma-separated values for multiple files\n",
+        "path to an image, audio, or video file. use with multimodal models, use comma-separated values for multiple files\n",
        [](common_params & params, const std::string & value) {
            for (const auto & item : parse_csv_row(value)) {
                params.image.emplace_back(item);
@@ -571,7 +571,7 @@ struct common_params {
    struct common_params_model mmproj;
    bool mmproj_use_gpu = true;     // use GPU for multimodal model
    bool no_mmproj = false;         // explicitly disable multimodal model
-    std::vector<std::string> image; // path to image file(s)
+    std::vector<std::string> image; // path to image file(s) ; TODO: change the name to "media"
    int image_min_tokens = -1;
    int image_max_tokens = -1;
@@ -2,6 +2,7 @@
 #include <assert.h>
 #include "mtmd.h"
 #include "mtmd-helper.h"
 int main(void) {
    printf("\n\nTesting libmtmd C API...\n");
@@ -17,6 +18,11 @@ int main(void) {
        return 1;
    }
    // simple test for the helper
    size_t n_tokens_total = mtmd_helper_get_n_tokens(chunks);
    printf("Total tokens in chunks: %zu\n", n_tokens_total);
    assert(n_tokens_total > 0);
    size_t n_chunks = mtmd_input_chunks_size(chunks);
    printf("Number of chunks: %zu\n", n_chunks);
    assert(n_chunks > 0);
@@ -235,7 +235,7 @@ struct cli_context {
 };
 // TODO?: Make this reusable, enums, docs
-static const std::array<std::string_view, 7> cmds = {
+static const std::array<std::string_view, 8> cmds = {
    "/audio ",
    "/clear",
    "/exit",
@@ -243,6 +243,7 @@ static const std::array<std::string_view, 7> cmds = {
    "/image ",
    "/read ",
    "/regen",
    "/video ",
 };
 static std::vector<std::pair<std::string, size_t>> auto_completion_callback(std::string_view line, size_t cursor_byte_pos) {
@@ -457,6 +458,9 @@ int llama_cli(int argc, char ** argv) {
    if (inf.has_inp_audio) {
        console::log("  /audio <file>       add an audio file\n");
    }
    if (inf.has_inp_video) {
        console::log("  /video <file>       add a video file\n");
    }
    console::log("\n");
    // interactive loop
@@ -553,7 +557,8 @@ int llama_cli(int argc, char ** argv) {
            continue;
        } else if (
                (string_starts_with(buffer, "/image ") && inf.has_inp_image) ||
-                (string_starts_with(buffer, "/audio ") && inf.has_inp_audio)) {
+                (string_starts_with(buffer, "/audio ") && inf.has_inp_audio) ||
                (string_starts_with(buffer, "/video ") && inf.has_inp_video)) {
            // just in case (bad copy-paste for example), we strip all trailing/leading spaces
            std::string fname = string_strip(buffer.substr(7));
            std::string marker = ctx_cli.load_input_file(fname, true);
@@ -1,5 +1,8 @@
 # mtmd
 set(MTMD_VIDEO ON CACHE BOOL "enable video support in mtmd (requires ffmpeg binary in PATH)")
 # TODO: add MTMD_VIDEO_METHOD in the future to select between ffmpeg and other backends
 find_package(Threads REQUIRED)
 add_library(mtmd
@@ -63,6 +66,10 @@ target_include_directories(mtmd PRIVATE ../..)
 target_include_directories(mtmd PRIVATE ../../vendor)
 target_compile_features   (mtmd PRIVATE cxx_std_17)
 if (MTMD_VIDEO)
    target_compile_definitions(mtmd PRIVATE MTMD_VIDEO)
 endif()
 if (BUILD_SHARED_LIBS)
    set_target_properties     (mtmd PROPERTIES POSITION_INDEPENDENT_CODE ON)
    target_compile_definitions(mtmd PRIVATE LLAMA_BUILD)
@@ -77,6 +77,7 @@ struct mtmd_cli_context {
    int                 n_batch;
    mtmd::bitmaps bitmaps;
    std::vector<mtmd_helper::video_ptr> videos;
    // chat template
    common_chat_templates_ptr tmpls;
@@ -166,11 +167,14 @@ struct mtmd_cli_context {
    }
    bool load_media(const std::string & fname) {
-        mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(ctx_vision.get(), fname.c_str(), false));
+        auto res = mtmd_helper_bitmap_init_from_file(ctx_vision.get(), fname.c_str(), false);
-        if (!bmp.ptr) {
+        if (!res.bitmap) {
            return false;
        }
-        bitmaps.entries.push_back(std::move(bmp));
+        bitmaps.entries.emplace_back(res.bitmap);
        if (res.video_ctx) {
            videos.emplace_back(res.video_ctx);
        }
        return true;
    }
 };
@@ -253,6 +257,7 @@ static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg) {
    }
    ctx.bitmaps.entries.clear();
    ctx.videos.clear();
    llama_pos new_n_past;
    if (mtmd_helper_eval_chunks(ctx.ctx_vision.get(),
@@ -373,6 +378,9 @@ int main(int argc, char ** argv) {
        if (mtmd_support_audio(ctx.ctx_vision.get())) {
            LOG("\n   /audio <path>    load an audio");
        }
        if (mtmd_helper_support_video(ctx.ctx_vision.get())) {
            LOG("\n   /video <path>    load a video");
        }
        LOG("\n   /clear           clear the chat history");
        LOG("\n   /quit or /exit   exit the program");
        LOG("\n");
@@ -407,14 +415,15 @@ int main(int argc, char ** argv) {
            g_is_generating = true;
            bool is_image = line == "/image" || line.find("/image ") == 0;
            bool is_audio = line == "/audio" || line.find("/audio ") == 0;
-            if (is_image || is_audio) {
+            bool is_video = line == "/video" || line.find("/video ") == 0;
            if (is_image || is_audio || is_video) {
                if (line.size() < 8) {
                    LOG_ERR("ERR: Missing media filename\n");
                    continue;
                }
                std::string media_path = line.substr(7);
                if (ctx.load_media(media_path)) {
-                    LOG("%s %s loaded\n", media_path.c_str(), is_image ? "image" : "audio");
+                    LOG("%s %s loaded\n", media_path.c_str(), is_image ? "image" : is_audio ? "audio" : "video");
                    content += mtmd_default_marker();
                }
                // else, error is already printed by libmtmd
@@ -36,6 +36,11 @@
 #error "mtmd-helper is a public library outside of mtmd. it must not include internal headers"
 #endif
 #ifdef MTMD_VIDEO
 #include "sheredom/subprocess.h"
 #include <thread>
 #endif
 //
 // internal logging functions
 //
@@ -79,6 +84,7 @@ struct mtmd_helper_logger {
    }
 } g_logger;
 #define LOG_DBG(...) g_logger.log(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
 #define LOG_INF(...) g_logger.log(GGML_LOG_LEVEL_INFO,  __VA_ARGS__)
 #define LOG_WRN(...) g_logger.log(GGML_LOG_LEVEL_WARN,  __VA_ARGS__)
 #define LOG_ERR(...) g_logger.log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
@@ -478,42 +484,94 @@ static bool decode_audio_from_buf(const unsigned char * buf_in, size_t len, int
 } // namespace audio_helpers
-mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len, bool placeholder) {
+// Computes FNV-1a hash of the data
 static std::string fnv_hash(const uint8_t * data, size_t len) {
    // 64-bit FNV-1a: start from the offset basis, then for every input byte
    // XOR it into the state and multiply by the FNV prime.
    uint64_t digest = 0xcbf29ce484222325ULL;
    const uint8_t * cursor = data;
    for (size_t remaining = len; remaining > 0; --remaining) {
        digest = (digest ^ *cursor) * 0x100000001b3ULL;
        ++cursor;
    }
    // the digest is exposed as its decimal representation
    return std::to_string(digest);
 }
 mtmd_helper_bitmap_wrapper mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len, bool placeholder) {
    // calculate the hash if needed
    std::string id;
    mtmd_bitmap * result = nullptr;
    if (!placeholder) {
        id = fnv_hash(buf, len);
    }
    if (audio_helpers::is_audio_file((const char *)buf, len)) {
        std::vector<float> pcmf32;
        const int sample_rate = mtmd_get_audio_sample_rate(ctx);
        if (sample_rate < 0) {
            LOG_ERR("This model does not support audio input\n");
-            return nullptr;
+            return {nullptr, nullptr};
        }
        if (!audio_helpers::decode_audio_from_buf(buf, len, sample_rate, pcmf32)) {
            LOG_ERR("Unable to read WAV audio file from buffer\n");
-            return nullptr;
+            return {nullptr, nullptr};
        }
-        return mtmd_bitmap_init_from_audio(pcmf32.size(), placeholder ? nullptr : pcmf32.data());
+        result = mtmd_bitmap_init_from_audio(pcmf32.size(), placeholder ? nullptr : pcmf32.data());
        mtmd_bitmap_set_id(result, id.empty() ? nullptr : id.c_str());
        return {result, nullptr};
    }
    // otherwise, we assume it's an image
-    mtmd_bitmap * result = nullptr;
+    if (!result) {
    {
        int nx, ny, nc;
        auto * data = stbi_load_from_memory(buf, len, &nx, &ny, &nc, 3);
-        if (!data) {
+        if (data) {
-            LOG_ERR("%s: failed to decode image bytes\n", __func__);
+            result = mtmd_bitmap_init(nx, ny, placeholder ? nullptr : data);
-            return nullptr;
+            mtmd_bitmap_set_id(result, id.empty() ? nullptr : id.c_str());
            stbi_image_free(data);
            return {result, nullptr};
        }
-        result = mtmd_bitmap_init(nx, ny, placeholder ? nullptr : data);
+        // otherwise, fallthrough to video decoding (if supported)
        stbi_image_free(data);
    }
-    return result;
+
    // last try: load as video
 #ifdef MTMD_VIDEO
    if (!result) {
        auto params = mtmd_helper_video_init_params_default();
        auto video_ctx = mtmd_helper_video_init_from_buf(ctx, buf, len, params);
        if (!video_ctx) {
            LOG_ERR("%s: failed to decode buffer as either image/audio/video\n", __func__);
            return {nullptr, nullptr};
        }
        result = mtmd_bitmap_init_lazy(ctx,
            id.empty() ? nullptr : id.c_str(),
            video_ctx,
            [](size_t, void * user_data, mtmd_bitmap ** out_bitmap, char ** out_text) -> int {
                auto * vctx = static_cast<mtmd_helper_video *>(user_data);
                char * text = nullptr;
                int ret = mtmd_helper_video_read_next(vctx, out_bitmap, &text);
                *out_text = text; // heap-allocated by read_next; freed automatically by mtmd
                return ret;
            });
         return {result, video_ctx};
    }
 #else
    if (!result) {
        LOG_ERR("%s: failed to decode buffer as either image or audio (video support not compiled in)\n", __func__);
        return {nullptr, nullptr};
    }
 #endif
    // should not reach here
    return {nullptr, nullptr};
 }
-mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname, bool placeholder) {
+mtmd_helper_bitmap_wrapper mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname, bool placeholder) {
    std::vector<unsigned char> buf;
    FILE * f = fopen(fname, "rb");
    if (!f) {
        LOG_ERR("Unable to open file %s: %s\n", fname, strerror(errno));
-        return nullptr;
+        return {nullptr, nullptr};
    }
    fseek(f, 0, SEEK_END);
@@ -522,7 +580,7 @@ mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char *
    if (file_size < 0) {
        LOG_ERR("Failed to get file size of %s\n", fname);
        fclose(f);
-        return nullptr;
+        return {nullptr, nullptr};
    }
    buf.resize(file_size);
@@ -530,9 +588,425 @@ mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char *
    fclose(f);
    if (n_read != (size_t)file_size) {
        LOG_ERR("Failed to read entire file %s", fname);
-        return nullptr;
+        return {nullptr, nullptr};
    }
    return mtmd_helper_bitmap_init_from_buf(ctx, buf.data(), buf.size(), placeholder);
 }
 bool mtmd_helper_support_video(mtmd_context * ctx) {
    // Video needs both a build with MTMD_VIDEO enabled and a context that
    // supports vision input; without MTMD_VIDEO the answer is always false.
    bool supported = false;
 #ifdef MTMD_VIDEO
    supported = mtmd_support_vision(ctx);
 #endif
    return supported;
 }
 //
 // Video input helpers
 //
 #ifdef MTMD_VIDEO
 struct mtmd_helper_video {
    mtmd_context * mctx;
    std::string path;
    std::vector<uint8_t> input_buf; // non-empty when initialized from buffer
    std::string ffmpeg_bin;
    std::string ffprobe_bin;
    float fps_target = 0.0f;
    mtmd_helper_video_info info = {};
    struct subprocess_s proc = {};
    bool proc_alive = false;
    int32_t current_frame = 0;
    std::thread feeder_thread;
    std::string prompt_start         = "Video:";
    int32_t     timestamp_interval_ms = 5000; // emit a timestamp text every N ms (0 = disabled)
    float       next_timestamp_ms     = 0.0f; // next elapsed-ms threshold at which to emit
    std::vector<uint8_t> frame_buf;
    std::string pending_text; // text queued to be returned before the next frame
    bool        start_emitted = false;
    bool is_buf_input() const { return !input_buf.empty(); }
    // must run in a separate thread alongside stdout reading to avoid pipe deadlock
    void feed_stdin(struct subprocess_s * sp) {
        FILE * f = subprocess_stdin(sp);
        if (!f) {
            LOG_DBG("%s: subprocess has no stdin pipe\n", __func__);
            return;
        }
        LOG_DBG("%s: feeding %zu bytes to stdin\n", __func__, input_buf.size());
        size_t written = fwrite(input_buf.data(), 1, input_buf.size(), f);
        LOG_DBG("%s: wrote %zu bytes, closing stdin\n", __func__, written);
        fclose(f);
    }
    bool probe(float fps_target_arg) {
        const char * input_arg = is_buf_input() ? "pipe:0" : path.c_str();
        const char * cmd[] = {
            ffprobe_bin.c_str(),
            "-v", "quiet",
            "-show_entries", "stream=width,height,r_frame_rate,nb_frames,duration",
            "-select_streams", "v:0",
            "-of", "default=noprint_wrappers=1",
            input_arg,
            nullptr,
        };
        LOG_DBG("%s: launching:", __func__);
        for (size_t i = 0; cmd[i]; i++) { LOG_DBG(" %s", cmd[i]); }
        LOG_DBG("\n");
        struct subprocess_s fprobe;
        if (subprocess_create(cmd,
                subprocess_option_search_user_path | subprocess_option_inherit_environment,
                &fprobe) != 0) {
            LOG_ERR("%s: failed to launch ffprobe\n", __func__);
            return false;
        }
        std::thread probe_feeder;
        if (is_buf_input()) {
            probe_feeder = std::thread([this, &fprobe]() { feed_stdin(&fprobe); });
        }
        uint32_t width  = 0;
        uint32_t height = 0;
        float orig_fps = 0.0f;
        float duration = -1.0f;
        int32_t n_frames_orig = -1;
        char line[256];
        FILE * fp = subprocess_stdout(&fprobe);
        while (fgets(line, sizeof(line), fp)) {
            char * eq = strchr(line, '=');
            if (!eq) continue;
            *eq = '\0';
            const char * key = line;
            const char * val = eq + 1;
            char * nl = (char *)strchr(val, '\n');
            if (nl) *nl = '\0';
            if (strcmp(key, "width") == 0) {
                width = (uint32_t)atoi(val);
            } else if (strcmp(key, "height") == 0) {
                height = (uint32_t)atoi(val);
            } else if (strcmp(key, "r_frame_rate") == 0) {
                orig_fps = parse_rational(val);
            } else if (strcmp(key, "nb_frames") == 0 && strcmp(val, "N/A") != 0) {
                n_frames_orig = atoi(val);
            } else if (strcmp(key, "duration") == 0 && strcmp(val, "N/A") != 0) {
                duration = (float)atof(val);
            }
        }
        if (probe_feeder.joinable()) {
            probe_feeder.join();
        }
        int ret_code;
        subprocess_join(&fprobe, &ret_code);
        subprocess_destroy(&fprobe);
        if (width == 0 || height == 0 || orig_fps <= 0.0f) {
            return false;
        }
        if (duration < 0.0f && n_frames_orig > 0) {
            duration = (float)n_frames_orig / orig_fps;
        }
        fps_target = fps_target_arg > 0.0f ? fps_target_arg : orig_fps;
        info.width    = width;
        info.height   = height;
        info.fps      = fps_target;
        LOG_DBG("%s: %ux%u fps=%.2f duration=%.2fs n_frames=%d\n",
                __func__, width, height, fps_target, duration, info.n_frames);
        info.n_frames = duration > 0.0f ? (int32_t)(duration * fps_target + 0.5f) : -1;
        frame_buf.resize((size_t)width * height * 3);
        return true;
    }
    bool start_ffmpeg(float seek_seconds) {
        char seek_buf[64];
        char fps_buf[64];
        std::vector<const char *> cmd;
        cmd.push_back(ffmpeg_bin.c_str());
        if (!is_buf_input() && seek_seconds > 0.0f) {
            // input-side seek: fast, keyframe-accurate; only valid for seekable file inputs
            snprintf(seek_buf, sizeof(seek_buf), "%.6f", seek_seconds);
            cmd.push_back("-ss");
            cmd.push_back(seek_buf);
        }
        cmd.push_back("-i");
        // cache:pipe:0 wraps stdin with a seekable in-memory cache, letting ffmpeg seek
        // backwards for container headers (e.g. MP4 moov atom at end of file)
        cmd.push_back(is_buf_input() ? "cache:pipe:0" : path.c_str());
        if (seek_seconds > 0.0f && is_buf_input()) {
            // output-side seek: frame-accurate but decodes and discards frames up to seek point
            snprintf(seek_buf, sizeof(seek_buf), "%.6f", seek_seconds);
            cmd.push_back("-ss");
            cmd.push_back(seek_buf);
        }
        if (fps_target > 0.0f) {
            snprintf(fps_buf, sizeof(fps_buf), "fps=%.6f", fps_target);
            cmd.push_back("-vf");
            cmd.push_back(fps_buf);
        }
        cmd.push_back("-f");
        cmd.push_back("rawvideo");
        cmd.push_back("-pix_fmt");
        cmd.push_back("rgb24");
        cmd.push_back("pipe:1");
        cmd.push_back("-loglevel");
        cmd.push_back("error");
        cmd.push_back(nullptr);
        LOG_DBG("%s: launching:", __func__);
        for (size_t i = 0; cmd[i]; i++) {
            LOG_DBG(" %s", cmd[i]);
        }
        LOG_DBG("\n");
        int ret = subprocess_create(
            cmd.data(),
            subprocess_option_search_user_path | subprocess_option_inherit_environment,
            &proc);
        proc_alive = (ret == 0);
        LOG_DBG("%s: subprocess_create ret=%d proc_alive=%d\n", __func__, ret, (int)proc_alive);
        if (proc_alive && is_buf_input()) {
            LOG_DBG("%s: starting feeder thread for %zu-byte buffer\n", __func__, input_buf.size());
            feeder_thread = std::thread([this]() { feed_stdin(&proc); });
        }
        return proc_alive;
    }
    void stop_ffmpeg() {
        if (proc_alive) {
            subprocess_terminate(&proc);
            subprocess_destroy(&proc);
            proc_alive = false;
        }
        if (feeder_thread.joinable()) {
            feeder_thread.join();
        }
    }
    mtmd_bitmap * read_next_frame() {
        if (!proc_alive) return nullptr;
        FILE * fp = subprocess_stdout(&proc);
        const size_t frame_size = (size_t)info.width * info.height * 3;
        LOG_DBG("%s: reading frame %d, expecting %zu bytes (%ux%u)\n",
                __func__, current_frame, frame_size, info.width, info.height);
        size_t total_read = 0;
        while (total_read < frame_size) {
            size_t n = fread(frame_buf.data() + total_read, 1, frame_size - total_read, fp);
            if (n == 0) {
                // clean EOF only if no bytes read yet; partial frame is an error
                LOG_DBG("%s: fread returned 0 after %zu/%zu bytes (ferror=%d)\n",
                        __func__, total_read, frame_size, ferror(fp));
                proc_alive = false;
                return nullptr;
            }
            total_read += n;
        }
        LOG_DBG("%s: frame %d read OK\n", __func__, current_frame);
        current_frame++;
        return mtmd_bitmap_init(info.width, info.height, frame_buf.data());
    }
    int32_t read_next(mtmd_bitmap ** out_bitmap, char ** out_text) {
        *out_bitmap = nullptr;
        *out_text   = nullptr;
        if (!pending_text.empty()) {
            *out_text = strdup(pending_text.c_str());
            pending_text.clear();
            return *out_text ? 0 : -2;
        }
        LOG_DBG("%s: proc_alive=%d start_emitted=%d current_frame=%d\n",
                __func__, (int)proc_alive, (int)start_emitted, current_frame);
        if (!proc_alive) {
            return (current_frame == 0) ? -2 : -1;
        }
        if (!start_emitted) {
            start_emitted = true;
            if (!prompt_start.empty()) {
                *out_text = strdup(prompt_start.c_str());
                return *out_text ? 0 : -2;
            }
        }
        mtmd_bitmap * frame = read_next_frame();
        if (!frame) return -1;
        *out_bitmap = frame;
        if (timestamp_interval_ms > 0) {
            // current_frame was already incremented by read_next_frame(); undo for elapsed calc
            float elapsed_ms = (float)(current_frame - 1) / info.fps * 1000.0f;
            if (elapsed_ms >= next_timestamp_ms) {
                char ts_buf[32];
                float elapsed_s = elapsed_ms / 1000.0f;
                int   minutes   = (int)(elapsed_s / 60);
                float seconds   = elapsed_s - minutes * 60.0f;
                snprintf(ts_buf, sizeof(ts_buf), "[%dm%.2fs]", minutes, seconds);
                pending_text = ts_buf;
                next_timestamp_ms += (float)timestamp_interval_ms;
            }
        }
        return 0;
    }
    static float parse_rational(const char * s) {
        int num = 0, den = 1;
        if (sscanf(s, "%d/%d", &num, &den) == 2 && den > 0) {
            return (float)num / (float)den;
        }
        float val;
        if (sscanf(s, "%f", &val) == 1) {
            return val;
        }
        return 0.0f;
    }
 };
 #endif
 mtmd_helper_video_init_params mtmd_helper_video_init_params_default() {
    // defaults: sample 4 frames per second, look up ffmpeg/ffprobe via PATH,
    // and emit a timestamp every 5 seconds
    mtmd_helper_video_init_params defaults = {};
    defaults.fps_target            = 4.0f;
    defaults.ffmpeg_bin_dir        = nullptr;
    defaults.timestamp_interval_ms = 5000;
    return defaults;
 }
 static std::string video_resolve_bin(const char * bin_dir, const char * name) {
    // Builds the command used to launch `name`:
    //   - no directory (NULL or empty): the bare name, resolved through PATH
    //   - otherwise: <bin_dir><separator><name>, plus ".exe" on Windows
 #ifdef _WIN32
    const char   native_sep = '\\';
    const char * exe_suffix = ".exe";
 #else
    const char   native_sep = '/';
    const char * exe_suffix = "";
 #endif
    const bool have_dir = bin_dir != nullptr && *bin_dir != '\0';
    if (!have_dir) {
        return std::string(name);
    }

    std::string full_path(bin_dir);
    // either slash style counts as an existing trailing separator
    const char tail = full_path[full_path.size() - 1];
    const bool ends_with_sep = (tail == '/' || tail == '\\');
    if (!ends_with_sep) {
        full_path.push_back(native_sep);
    }
    full_path.append(name);
    full_path.append(exe_suffix);
    return full_path;
 }
 // Opens the video file at `path`: probes it with ffprobe, then starts ffmpeg to
 // decode it. Returns NULL on failure (invalid path, ffprobe/ffmpeg failure, or a
 // build without MTMD_VIDEO); on success the result must be released with
 // mtmd_helper_video_free().
 mtmd_helper_video * mtmd_helper_video_init(
        mtmd_context * mctx,
        const char * path,
        mtmd_helper_video_init_params params) {
 #ifdef MTMD_VIDEO
    // constructing a std::string from a null pointer is undefined behaviour, and
    // an empty path can never name a video: reject both with a clear message
    if (!path || path[0] == '\0') {
        LOG_ERR("%s: video path is null or empty\n", __func__);
        return nullptr;
    }

    auto * ctx = new mtmd_helper_video();
    ctx->mctx                 = mctx;
    ctx->path                 = path;
    ctx->ffmpeg_bin           = video_resolve_bin(params.ffmpeg_bin_dir, "ffmpeg");
    ctx->ffprobe_bin          = video_resolve_bin(params.ffmpeg_bin_dir, "ffprobe");
    ctx->timestamp_interval_ms = params.timestamp_interval_ms;

    if (!ctx->probe(params.fps_target)) {
        LOG_ERR("%s: ffprobe failed for '%s' (is ffprobe in PATH?)\n", __func__, path);
        delete ctx;
        return nullptr;
    }
    if (!ctx->start_ffmpeg(0.0f)) {
        LOG_ERR("%s: failed to start ffmpeg for '%s' (is ffmpeg in PATH?)\n", __func__, path);
        delete ctx;
        return nullptr;
    }
    return ctx;
 #else
    (void) mctx;
    (void) path;
    (void) params;
    LOG_ERR("%s: video is not supported in this build (MTMD_VIDEO is set to OFF)\n", __func__);
    return nullptr;
 #endif
 }
 // Same as mtmd_helper_video_init(), but decodes an in-memory copy of the video
 // file. The buffer is copied, so the caller may release it right after the call.
 // Returns NULL on failure (empty input, ffprobe/ffmpeg failure, or a build
 // without MTMD_VIDEO); on success the result must be released with
 // mtmd_helper_video_free().
 mtmd_helper_video * mtmd_helper_video_init_from_buf(
        mtmd_context * mctx,
        const unsigned char * buf, size_t len,
        mtmd_helper_video_init_params params) {
 #ifdef MTMD_VIDEO
    // an empty buffer would leave input_buf empty, which the context interprets
    // as "read from a file path" - and no path is set here
    if (!buf || len == 0) {
        LOG_ERR("%s: video buffer is null or empty\n", __func__);
        return nullptr;
    }

    auto * ctx = new mtmd_helper_video();
    ctx->mctx                  = mctx;
    ctx->input_buf.assign(buf, buf + len);
    ctx->ffmpeg_bin            = video_resolve_bin(params.ffmpeg_bin_dir, "ffmpeg");
    ctx->ffprobe_bin           = video_resolve_bin(params.ffmpeg_bin_dir, "ffprobe");
    ctx->timestamp_interval_ms = params.timestamp_interval_ms;

    if (!ctx->probe(params.fps_target)) {
        LOG_ERR("%s: ffprobe failed on buffer (is ffprobe in PATH?)\n", __func__);
        delete ctx;
        return nullptr;
    }
    if (!ctx->start_ffmpeg(0.0f)) {
        LOG_ERR("%s: failed to start ffmpeg on buffer (is ffmpeg in PATH?)\n", __func__);
        delete ctx;
        return nullptr;
    }
    return ctx;
 #else
    (void) mctx;
    (void) buf;
    (void) len;
    (void) params;
    LOG_ERR("%s: video is not supported in this build (MTMD_VIDEO is set to OFF)\n", __func__);
    return nullptr;
 #endif
 }
 // Stops the decoder and releases the context. Passing NULL is a no-op.
 void mtmd_helper_video_free(mtmd_helper_video * ctx) {
    // builds without MTMD_VIDEO only ever hand out NULL contexts, so freeing
    // NULL must stay silent there as well
    if (!ctx) return;
 #ifdef MTMD_VIDEO
    ctx->stop_ffmpeg();
    delete ctx;
 #else
    LOG_ERR("%s: video is not supported in this build (MTMD_VIDEO is set to OFF)\n", __func__);
 #endif
 }
 // Returns the stream information gathered when the context was created.
 // `ctx` must not be NULL. Aborts in builds without MTMD_VIDEO.
 mtmd_helper_video_info mtmd_helper_video_get_info(const mtmd_helper_video * ctx) {
 #ifdef MTMD_VIDEO
    // fail with a readable message instead of dereferencing a null pointer
    GGML_ASSERT(ctx != nullptr && "mtmd_helper_video_get_info: ctx must not be NULL");
    return ctx->info;
 #else
    GGML_ASSERT(false && "video is not supported in this build (MTMD_VIDEO is set to OFF)");
 #endif
 }
 // Reads the next item (text or frame) of the video stream.
 // Returns 0 on success, -1 when the stream ended, -2 on error (including NULL
 // arguments). Aborts in builds without MTMD_VIDEO.
 int32_t mtmd_helper_video_read_next(mtmd_helper_video * ctx,
        mtmd_bitmap ** out_bitmap, char ** out_text) {
 #ifdef MTMD_VIDEO
    // both output slots are written unconditionally further down, so all three
    // pointers have to be valid
    if (!ctx || !out_bitmap || !out_text) return -2;
    return ctx->read_next(out_bitmap, out_text);
 #else
    GGML_ASSERT(false && "video is not supported in this build (MTMD_VIDEO is set to OFF)");
 #endif
 }
@@ -20,25 +20,39 @@ extern "C" {
 // BREAKING CHANGES are expected.
 //
 struct mtmd_helper_video;
 typedef struct mtmd_helper_video mtmd_helper_video;
 // Set callback for all future logging events.
 // If this is not called, or NULL is supplied, everything is output on stderr.
 // Note: this also calls mtmd_log_set() internally
 MTMD_API void mtmd_helper_log_set(ggml_log_callback log_callback, void * user_data);
 // Returns true if this build includes video support (MTMD_VIDEO was ON at compile time)
 // and the given context supports vision input.
 MTMD_API bool mtmd_helper_support_video(mtmd_context * ctx);
 // Result of mtmd_helper_bitmap_init_from_file() / mtmd_helper_bitmap_init_from_buf().
 // Both members are NULL when loading failed.
 // NOTE(review): the CLI caller in this change wraps both pointers in owning
 // handles (mtmd::bitmap, mtmd_helper::video_ptr), so presumably the caller owns
 // both - confirm.
 struct mtmd_helper_bitmap_wrapper {
    // the loaded media, or NULL on failure
    mtmd_bitmap * bitmap;
    // non-NULL only when the input was loaded as a video
    mtmd_helper_video * video_ctx;
 };
 // helper function to construct a mtmd_bitmap from a file
 // it calls mtmd_helper_bitmap_init_from_buf() internally
 // on failure, the returned wrapper has bitmap == nullptr
 // this function is thread-safe
-MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname, bool placeholder);
+MTMD_API struct mtmd_helper_bitmap_wrapper mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname, bool placeholder);
 // helper function to construct a mtmd_bitmap from a buffer containing a file
 // supported formats:
 //     image: formats supported by stb_image: jpg, png, bmp, gif, etc.
 //     audio: formats supported by miniaudio: wav, mp3, flac
-// note: audio files will be auto-detected based on magic bytes
+// note:
 //   - for now, video input is only supported via C++ helper functions
 //   - audio files will be auto-detected based on magic bytes
 //   - output bitmap will have FNV hash as the ID
 // on failure, the returned wrapper has bitmap == nullptr
 // this function is thread-safe
-MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len, bool placeholder);
+MTMD_API struct mtmd_helper_bitmap_wrapper mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len, bool placeholder);
 // helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache
 MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks);
@@ -89,6 +103,56 @@ MTMD_API int32_t mtmd_helper_decode_image_chunk(mtmd_context * ctx,
                                                int32_t n_batch,
                                                llama_pos * new_n_past);
 //
 // video input helpers (requires ffmpeg/ffprobe installed on the system)
 // the notion of video only exists at the helper level, it is not visible to the core mtmd library
 //
 // NOTE: this implementation is model-agnostic, it can be used with any vision-capable model
 //       however, it may not be accurate for some specific models
 //       (this is expected for now, to keep the implementation simple)
 //
 // Stream information of an opened video, see mtmd_helper_video_get_info().
 struct mtmd_helper_video_info {
    uint32_t width;    // frame width reported by ffprobe for the first video stream
    uint32_t height;   // frame height reported by ffprobe for the first video stream
    float    fps;      // effective fps (fps_target if set, else original video fps)
    int32_t  n_frames; // estimated total frames at effective fps (-1 if unknown)
 };
 // Options for mtmd_helper_video_init() and mtmd_helper_video_init_from_buf().
 // Use mtmd_helper_video_init_params_default() to obtain the default values.
 struct mtmd_helper_video_init_params {
    float fps_target;            // desired output fps; <= 0 means use the video's native fps, defaulted to 4.0f
    const char * ffmpeg_bin_dir; // directory containing ffmpeg/ffprobe binaries; NULL or empty means search PATH
    int64_t timestamp_interval_ms; // interval for adding timestamp as text chunk (example: "[10m50.50s]"); <= 0 means no timestamp, defaulted to 5000ms
    // TODO @ngxson : allow "placeholder" bitmap output for counting tokens
 };
 MTMD_API struct mtmd_helper_video_init_params mtmd_helper_video_init_params_default(void);
 // returns NULL on failure (ffprobe not found, file unreadable, etc.)
 MTMD_API mtmd_helper_video * mtmd_helper_video_init(
                    struct mtmd_context * mctx,
                    const char * path,
                    struct mtmd_helper_video_init_params params);
 // Same as mtmd_helper_video_init(), but reads from an in-memory buffer.
 // The buffer is copied internally; the caller does not need to keep it alive.
 // Note: pipe input is not seekable, so seeking will use output-side seeking
 // (ffmpeg decodes and discards frames up to the target position).
 MTMD_API mtmd_helper_video * mtmd_helper_video_init_from_buf(
                    struct mtmd_context * mctx,
                    const unsigned char * buf, size_t len,
                    struct mtmd_helper_video_init_params params);
 MTMD_API void mtmd_helper_video_free(mtmd_helper_video * ctx);
 MTMD_API struct mtmd_helper_video_info mtmd_helper_video_get_info(const mtmd_helper_video * ctx);
 // Read the next item from the video stream; on success, exactly one of out_bitmap or out_text is set
 // (both are set to NULL on EOF or error).
 // *out_bitmap - heap-allocated; caller must free with mtmd_bitmap_free()
 // *out_text   - heap-allocated (always via strdup/malloc); caller must free with free()
 // returns 0 on success, -1 on EOF, -2 on error
 MTMD_API int32_t mtmd_helper_video_read_next(mtmd_helper_video * ctx,
            mtmd_bitmap ** out_bitmap,
            char ** out_text);
 #ifdef __cplusplus
 } // extern "C"
 #endif
@@ -97,4 +161,16 @@ MTMD_API int32_t mtmd_helper_decode_image_chunk(mtmd_context * ctx,
 // C++ wrappers
 //
 #ifdef __cplusplus
 namespace mtmd_helper {
 // video-related C++ wrappers
 struct mtmd_helper_video_deleter {
    // releases the context through the C API, so that it can be owned by video_ptr
    void operator()(mtmd_helper_video * val) {
        mtmd_helper_video_free(val);
    }
 };
 using video_ptr = std::unique_ptr<mtmd_helper_video, mtmd_helper_video_deleter>;
 } // namespace mtmd_helper
 #endif
 #endif
@@ -35,6 +35,10 @@ struct mtmd_bitmap {
    std::string id; // optional user-defined id, for ex: can be set to image hash, useful for KV cache tracking
    bool is_audio = false; // true if the bitmap is audio
    // lazy-loaded bitmap
    mtmd_bitmap_lazy_callback lazy_callback = nullptr;
    void * lazy_user_data = nullptr;
    mtmd_bitmap(const unsigned char * data, uint32_t nx, uint32_t ny)
        : nx(nx), ny(ny), is_audio(false) {
        if (data) {
@@ -732,30 +736,111 @@ void mtmd_free(mtmd_context * ctx) {
 struct mtmd_tokenizer {
    mtmd_context * ctx;
    std::vector<const mtmd_bitmap *> bitmaps;
    std::string input_text;
    bool add_special;
    bool parse_special;
    const llama_vocab * vocab;
    struct part {
        std::string text;
        const mtmd_bitmap * bitmap;
    };
    std::vector<part> parts;
    // these will be freed when mtmd_tokenizer finishes
    std::vector<mtmd::bitmap> bm_from_lazy; // TODO @ngxson : refactor, free bm_from_lazy progressively
    std::vector<const char *> text_from_lazy;
    mtmd_input_chunks cur;
    uint32_t n_images_added = 0; // 0-based index assigned to the next image chunk
    ~mtmd_tokenizer() {
        // bm_from_lazy holds mtmd::bitmap wrappers which clean up on their own;
        // only the C strings stored in text_from_lazy need an explicit free()
        for (const char * lazy_text : text_from_lazy) {
            free(const_cast<char *>(lazy_text));
        }
    }
    // Build the ordered list of `parts` (text or bitmap) for one prompt.
    // The prompt text is split on ctx->media_marker; every marker occurrence is
    // paired, in order, with the next entry of the caller-supplied bitmap array.
    // Throws std::runtime_error when the number of markers and the number of
    // bitmaps differ. Lazy bitmaps are expanded before the constructor returns.
    mtmd_tokenizer(mtmd_context * ctx,
            const mtmd_input_text * text,
-            const mtmd_bitmap ** bitmaps,
+            const mtmd_bitmap ** bmps,
-            size_t n_bitmaps) : ctx(ctx), bitmaps(bitmaps, bitmaps + n_bitmaps) {
+            size_t n_bitmaps) : ctx(ctx) {
        add_special   = text->add_special;
        parse_special = text->parse_special;
        input_text    = text->text;
        vocab         = ctx->vocab;
        std::vector<const mtmd_bitmap *> bitmaps(bmps, bmps + n_bitmaps);
        auto parts_str = split_text(input_text, ctx->media_marker);
        size_t i_bm = 0;
        for (const auto & part : parts_str) {
            if (part == ctx->media_marker) {
                // more markers than bitmaps: nothing left to pair this marker with
                if (i_bm >= bitmaps.size()) {
                    throw std::runtime_error(string_format("number of media markers in text (%zu) exceeds number of bitmaps (%zu)", i_bm + 1, bitmaps.size()));
                }
                parts.push_back({"", bitmaps[i_bm++]});
            } else {
                // NOTE(review): `part` is a const reference, so std::move() here still copies the string
                parts.push_back({std::move(part), nullptr});
            }
        }
        // fewer markers than bitmaps is also an error: every bitmap must be consumed by a marker
        size_t n_markers = 0;
        for (const auto & part : parts) {
            if (part.bitmap != nullptr) {
                n_markers++;
            }
        }
        if (n_markers != bitmaps.size()) {
            throw std::runtime_error(string_format("number of media markers in text (%zu) does not match number of bitmaps (%zu)", n_markers, bitmaps.size()));
        }
        // replace each lazy bitmap part by the chunks its callback produces
        expand_lazy_bitmaps();
    }
    // Replace every lazy bitmap in `parts` by the sequence of chunks produced by
    // its callback; non-lazy parts are kept unchanged and in order.
    //
    // The callback is invoked with chunk_idx = 0, 1, 2, ... until it reports EOF:
    //    0  -> the returned bitmap or text (at most one of them) is appended
    //   -1  -> EOF, the lazy part itself is dropped
    //   any other value is treated as an error
    //
    // Throws std::runtime_error when the callback reports an error, returns both a
    // bitmap and a text, or returns another lazy bitmap (nesting is not allowed).
    //
    // Ownership: everything the callback hands out is adopted immediately
    // (bm_from_lazy / text_from_lazy), so it is released on every exit path.
    void expand_lazy_bitmaps() {
        std::vector<part> expanded;
        expanded.reserve(parts.size());
        try {
            for (auto & p : parts) {
                if (p.bitmap == nullptr || !p.bitmap->lazy_callback) {
                    // plain text or regular bitmap: keep as-is
                    expanded.push_back(std::move(p));
                    continue;
                }
                LOG_DBG("%s: expanding lazy bitmap\n", __func__);
                for (size_t i = 0;; i++) {
                    char * out_str = nullptr;
                    mtmd_bitmap * out_bm = nullptr;
                    const int res = p.bitmap->lazy_callback(i,
                                    p.bitmap->lazy_user_data,
                                    &out_bm,
                                    &out_str);
                    // adopt the outputs before any check can throw, otherwise they would leak
                    if (out_bm) {
                        bm_from_lazy.emplace_back(out_bm);
                    }
                    if (out_str) {
                        text_from_lazy.emplace_back(out_str);
                    }
                    if (out_bm && out_str) {
                        throw std::runtime_error(string_format("lazy callback cannot return both bitmap and text"));
                    }
                    if (res == -1) {
                        // EOF: lazy part removes itself (not added to expanded)
                        break;
                    }
                    if (res == -2) {
                        throw std::runtime_error(string_format("lazy callback returned error"));
                    }
                    if (res != 0) {
                        // unknown code: abort instead of calling the callback forever
                        throw std::runtime_error(string_format("lazy callback returned unexpected value %d", res));
                    }
                    // OK, append the returned chunk; lazy part is not yet added
                    if (out_bm) {
                        if (out_bm->lazy_callback) {
                            throw std::runtime_error(string_format("lazy callback cannot return another lazy bitmap"));
                        }
                        expanded.push_back({"", out_bm});
                        LOG_DBG("%s: lazy callback returned bitmap with dimensions %d x %d\n", __func__, out_bm->nx, out_bm->ny);
                    } else if (out_str) {
                        expanded.push_back({out_str, nullptr});
                        LOG_DBG("%s: lazy callback returned text: %s\n", __func__, out_str);
                    }
                }
            }
        } catch (...) {
            // this runs from the constructor: if we throw, ~mtmd_tokenizer() is never
            // executed, so the adopted C strings must be released here
            for (auto & str : text_from_lazy) {
                free((void *)str);
            }
            text_from_lazy.clear();
            throw;
        }
        parts = std::move(expanded);
    }
    int32_t tokenize(mtmd_input_chunks * output) {
        cur.entries.clear();
        std::vector<std::string> parts = split_text(input_text, ctx->media_marker);
        size_t i_bm = 0; // index of the current bitmap
        // [QWEN_VIDEO] handle frame merging for models that support it (i.e. qwen-vl)
        int n_merge_frames = 1;
@@ -764,53 +849,50 @@ struct mtmd_tokenizer {
            GGML_ASSERT(n_merge_frames <= 2 && "we only support merging maximum 2 images for now; open an issue if this model supports merging more");
        }
        // Build merged_bitmaps: each entry is a group of 1 or 2 bitmaps.
        // For consecutive mergeable bitmap parts, merge them and collapse the second part out of this->parts.
        std::vector<std::vector<const mtmd_bitmap *>> merged_bitmaps;
        if (n_merge_frames > 1) {
            size_t i_bm_scan = 0;
            for (size_t i = 0; i < parts.size(); ++i) {
-                if (parts[i] != ctx->media_marker) {
+                if (parts[i].bitmap == nullptr) {
                    continue;
                }
-                if (i + 1 < parts.size()
+                if (i + 1 < parts.size() && parts[i + 1].bitmap != nullptr) {
-                        && parts[i + 1] == ctx->media_marker
+                    const mtmd_bitmap * bm_a = parts[i].bitmap;
-                        && i_bm_scan + 1 < bitmaps.size()) {
+                    const mtmd_bitmap * bm_b = parts[i + 1].bitmap;
                    const mtmd_bitmap * bm_a = bitmaps[i_bm_scan];
                    const mtmd_bitmap * bm_b = bitmaps[i_bm_scan + 1];
                    if (bm_a->can_batch_with(*bm_b)) {
-                        LOG_DBG("%s: merging 2 frames at bitmap index %zu and %zu\n", __func__, i_bm_scan, i_bm_scan + 1);
+                        LOG_DBG("%s: merging 2 frames at part index %zu and %zu\n", __func__, i, i + 1);
                        merged_bitmaps.push_back({bm_a, bm_b});
-                        parts.erase(parts.begin() + i + 1); // remove the second marker
+                        parts.erase(parts.begin() + i + 1); // collapse the second bitmap part
                        i_bm_scan += 2;
                        continue;
                    }
                }
-                LOG_DBG("%s: no merging for bitmap index %zu\n", __func__, i_bm_scan);
+                LOG_DBG("%s: no merging for part index %zu\n", __func__, i);
-                merged_bitmaps.push_back({bitmaps[i_bm_scan]});
+                merged_bitmaps.push_back({parts[i].bitmap});
                ++i_bm_scan;
            }
        } else {
-            for (size_t i = 0; i < bitmaps.size(); ++i) {
+            for (const auto & p : parts) {
-                merged_bitmaps.push_back({bitmaps[i]});
+                if (p.bitmap != nullptr) {
                    merged_bitmaps.push_back({p.bitmap});
                }
            }
        }
-        i_bm = 0;
+        size_t i_bm = 0;
-        for (auto & part : parts) {
+        for (const auto & p : parts) {
-            if (part == ctx->media_marker) {
+            if (p.bitmap != nullptr) {
                // this is a marker, we should add the next bitmap
                if (i_bm >= merged_bitmaps.size()) {
                    LOG_ERR("%s: error: number of bitmaps (%zu) does not match number of markers (%zu)\n",
                            __func__, merged_bitmaps.size(), parts.size() - 1);
                    return 1;
                }
-                auto & bmps = merged_bitmaps[i_bm++];
+                auto bmps = merged_bitmaps[i_bm++];
                int32_t res = add_media(bmps);
                if (res != 0) {
                    return res;
                }
            } else {
-                // this is a text part, we should add it as text
+                add_text(p.text, parse_special);
                add_text(part, parse_special);
            }
        }
@@ -1236,8 +1318,13 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
            const mtmd_input_text * text,
            const mtmd_bitmap ** bitmaps,
            size_t n_bitmaps) {
-    mtmd_tokenizer tokenizer(ctx, text, bitmaps, n_bitmaps);
+    try {
-    return tokenizer.tokenize(output);
+        mtmd_tokenizer tokenizer(ctx, text, bitmaps, n_bitmaps);
        return tokenizer.tokenize(output);
    } catch (const std::exception & e) {
        LOG_ERR("%s: error: %s\n", __func__, e.what());
        return 2;
    }
 }
 int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) {
@@ -1373,6 +1460,10 @@ int mtmd_get_audio_sample_rate(const mtmd_context * ctx) {
    return clip_get_hparams(ctx->ctx_a)->audio_sample_rate;
 }
 const char * mtmd_get_marker(const mtmd_context * ctx) {
    // the returned pointer refers to the marker string stored inside the context
    const auto & marker = ctx->media_marker;
    return marker.c_str();
 }
 //
 // public API functions
 //
@@ -1405,10 +1496,16 @@ uint32_t mtmd_bitmap_get_ny(const mtmd_bitmap * bitmap) {
 }
 const unsigned char * mtmd_bitmap_get_data(const mtmd_bitmap * bitmap) {
    // a placeholder bitmap reports nullptr instead of exposing its read-only buffer
    return bitmap->is_placeholder() ? nullptr : bitmap->get_ro_buf().data();
 }
 size_t mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap) {
    // a placeholder bitmap reports a size of 0 bytes
    return bitmap->is_placeholder() ? 0 : bitmap->get_ro_buf().size();
 }
@@ -1428,6 +1525,18 @@ void mtmd_bitmap_set_id(mtmd_bitmap * bitmap, const char * id) {
    }
 }
 mtmd_bitmap * mtmd_bitmap_init_lazy(mtmd_context * ctx,
                                    const char * id,
                                    void * user_data,
                                    mtmd_bitmap_lazy_callback callback) {
    GGML_UNUSED(ctx); // reserved for future use
    // constructed without data and with 0 x 0 dimensions; the callback and its
    // user data are stored on the bitmap for later use
    auto * lazy_bm = new mtmd_bitmap(nullptr, 0, 0);
    mtmd_bitmap_set_id(lazy_bm, id);
    lazy_bm->lazy_user_data = user_data;
    lazy_bm->lazy_callback  = callback;
    return lazy_bm;
 }
 void mtmd_bitmap_free(mtmd_bitmap * bitmap) {
    if (bitmap) {
        delete bitmap;
@@ -128,6 +128,9 @@ MTMD_API bool mtmd_support_audio(const mtmd_context * ctx);
 // return -1 if audio is not supported
 MTMD_API int mtmd_get_audio_sample_rate(const mtmd_context * ctx);
 // get the current marker string
 MTMD_API const char * mtmd_get_marker(const mtmd_context * ctx);
 // mtmd_bitmap
 //
 // if bitmap is image:
@@ -156,6 +159,34 @@ MTMD_API void                  mtmd_bitmap_free       (mtmd_bitmap * bitmap);
 MTMD_API const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap);
 MTMD_API void         mtmd_bitmap_set_id(mtmd_bitmap * bitmap, const char * id);
 // mtmd_bitmap lazy
 //
 // this is a special bitmap that:
 // - does not hold the actual data
 // - can be expanded into one or more chunks (either media or text chunks)
 // user must provide a callback to fill in the data when mtmd_tokenize() is called
 // this is useful for large video inputs:
 // - allow reading video frame by frame, without loading the entire video into memory
 // - allow tracking the whole video with a single ID (for example, the file hash)
 // set (*out_bitmap) to non-nullptr to emit a bitmap chunk; it will be freed automatically
 // set (*out_text) to non-nullptr to emit a text chunk; it must be heap-allocated, null-terminated and will be freed automatically
 // either out_bitmap or out_text can be set, but not both
 // out_bitmap cannot be another lazy bitmap (no nested lazy allowed)
 // return value:
 //    0 on success
 //   -1 on EOF (signal to mtmd_tokenize to move on)
 //   -2 on error (signal to mtmd_tokenize to abort)
 typedef int(* mtmd_bitmap_lazy_callback)(
    size_t chunk_idx,
    void * user_data,
    mtmd_bitmap ** out_bitmap,
    char ** out_text);
 MTMD_API mtmd_bitmap * mtmd_bitmap_init_lazy(mtmd_context * ctx,
                                             const char * id, // usually set to file hash
                                             void * user_data,
                                             mtmd_bitmap_lazy_callback callback);
 // mtmd_input_chunks
 //
@@ -1252,6 +1252,10 @@ The `response_format` parameter supports both plain JSON output (e.g. `{"type":
 `parallel_tool_calls` : Whether to enable parallel/multiple tool calls (only supported on some models, verification is based on jinja template).
 For multimodal input:
 - Content types `image_url` and `input_audio` are the same as in the OAI schema
 - Content type `input_video` is an extension of the OAI schema. For now, it only accepts base64-encoded input
 *Examples:*
 You can use either Python `openai` library with appropriate checkpoints:
@@ -701,29 +701,19 @@ size_t validate_utf8(const std::string& text) {
    return len;
 }
 // 64-bit FNV-1a digest of the `len` bytes starting at `data`,
 // returned as the decimal representation of the hash value
 static std::string fnv_hash(const uint8_t * data, size_t len) {
    constexpr uint64_t k_offset_basis = 0xcbf29ce484222325ULL;
    constexpr uint64_t k_prime        = 0x100000001b3ULL;

    uint64_t digest = k_offset_basis;
    const uint8_t * const end = data + len;
    for (const uint8_t * cur = data; cur != end; ++cur) {
        // FNV-1a order: xor the byte in first, then multiply by the prime
        digest = (digest ^ *cur) * k_prime;
    }
    return std::to_string(digest);
 }
 server_tokens process_mtmd_prompt(mtmd_context * mctx, const std::string & prompt, const std::vector<raw_buffer> & files, bool is_placeholder) {
    // these will be freed upon going out of scope
    mtmd::bitmaps bitmaps;
    std::vector<mtmd_helper::video_ptr> videos;
    for (auto & file : files) {
-        mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(mctx, file.data(), file.size(), is_placeholder));
+        auto out = mtmd_helper_bitmap_init_from_buf(mctx, file.data(), file.size(), is_placeholder);
-        if (!bmp.ptr) {
+        if (!out.bitmap) {
            throw std::runtime_error("Failed to load image or audio file");
        }
-        // calculate bitmap hash (for KV caching)
+        bitmaps.entries.emplace_back(out.bitmap);
-        std::string hash = fnv_hash(bmp.data(), bmp.n_bytes());
+        if (out.video_ctx) {
-        bmp.set_id(hash.c_str());
+            videos.emplace_back(out.video_ctx);
-        bitmaps.entries.push_back(std::move(bmp));
+        }
    }
    // process prompt
    std::vector<server_tokens> inputs;
@@ -1023,6 +1013,20 @@ json oaicompat_chat_params_parse(
                p["text"] = get_media_marker();
                p.erase("input_audio");
            } else if (type == "input_video") {
                if (!opt.allow_video) {
                    throw std::runtime_error("video input is not supported - hint: if this is unexpected, you may need to provide the mmproj");
                }
                json input_video  = json_value(p, "input_video", json::object());
                std::string data  = json_value(input_video, "data", std::string());
                auto decoded_data = base64_decode(data); // expected to be base64 encoded
                out_files.push_back(decoded_data);
                p["type"] = "media_marker";
                p["text"] = get_media_marker();
                p.erase("input_video");
            } else if (type != "text") {
                throw std::invalid_argument("unsupported content[].type");
            }
@@ -294,6 +294,7 @@ struct server_chat_params {
    common_chat_templates_ptr tmpls;
    bool allow_image;
    bool allow_audio;
    bool allow_video;
    bool enable_thinking = true;
    int  reasoning_budget = -1;
    std::string reasoning_budget_message;
@@ -1247,6 +1247,7 @@ private:
                /* tmpls                 */ std::move(chat_templates),
                /* allow_image           */ mctx ? mtmd_support_vision(mctx) : false,
                /* allow_audio           */ mctx ? mtmd_support_audio (mctx) : false,
                /* allow_video           */ mctx ? mtmd_helper_support_video(mctx) : false,
                /* enable_thinking       */ enable_thinking,
                /* reasoning_budget      */ params_base.sampling.reasoning_budget_tokens,
                /* reasoning_budget_msg  */ params_base.sampling.reasoning_budget_message,
@@ -3586,6 +3587,7 @@ server_context_meta server_context::get_meta() const {
        /* has_mtmd               */ impl->mctx != nullptr,
        /* has_inp_image          */ impl->chat_params.allow_image,
        /* has_inp_audio          */ impl->chat_params.allow_audio,
        /* has_inp_video          */ impl->chat_params.allow_video,
        /* json_ui_settings       */ impl->json_ui_settings,
        /* json_webui_settings    */ impl->json_webui_settings,  // Deprecated
        /* slot_n_ctx             */ impl->get_slot_n_ctx(),
@@ -4183,6 +4185,7 @@ void server_routes::init_routes() {
            { "model_path",                  meta->model_path },
            { "modalities",                  json {
                {"vision", meta->has_inp_image},
                {"video",  meta->has_inp_video},
                {"audio",  meta->has_inp_audio},
            } },
            { "media_marker",                get_media_marker() },
@@ -4976,7 +4979,7 @@ std::unique_ptr<server_res_generator> server_routes::handle_count_tokens(const l
        n_tokens = tokenize_mixed(vocab, prompt, true, true).size();
    }
-    json response = {{"input_tokens", static_cast<int>(n_tokens)}};
+    json response = {{"input_tokens", static_cast<int64_t>(n_tokens)}};
    if (is_oai) {
        response["object"] = "response.input_tokens";
    }
@@ -21,6 +21,7 @@ struct server_context_meta {
    bool has_mtmd;
    bool has_inp_image;
    bool has_inp_audio;
    bool has_inp_video;
    json json_ui_settings;            // Primary: new name
    json json_webui_settings;            // Deprecated: use json_ui_settings instead (kept for backward compat)
    int slot_n_ctx;