mtmd : add video input support (#24269)

* wip * ok: lazy bitmap API * remember to free lazy text * wip * add mtmd_helper_video * support video input on server (base64 input) * add MTMD_VIDEO config * add timestamp * update CLI * cli: allow auto-completion for video * add --video arg * fix build * update docs * rename as suggested
2026-06-09 07:16:44 +02:00 · 2026-06-08 13:40:12 +02:00
parent c2b1518fd4
commit 8f83d6c271
16 changed files with 807 additions and 77 deletions
@@ -2221,8 +2221,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        }
    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_OFFLOAD"));
    add_opt(common_arg(
-        {"--image", "--audio"}, "FILE",
-        "path to an image or audio file. use with multimodal models, use comma-separated values for multiple files\n",
+        {"--image", "--audio", "--video"}, "FILE",
+        "path to an image, audio, or video file. use with multimodal models, use comma-separated values for multiple files\n",
        [](common_params & params, const std::string & value) {
            for (const auto & item : parse_csv_row(value)) {
                params.image.emplace_back(item);
@@ -571,7 +571,7 @@ struct common_params {
    struct common_params_model mmproj;
    bool mmproj_use_gpu = true;     // use GPU for multimodal model
    bool no_mmproj = false;         // explicitly disable multimodal model
-    std::vector<std::string> image; // path to image file(s)
+    std::vector<std::string> image; // path to image file(s) ; TODO: change the name to "media"
    int image_min_tokens = -1;
    int image_max_tokens = -1;

@@ -2,6 +2,7 @@
 #include <assert.h>

 #include "mtmd.h"
+#include "mtmd-helper.h"

 int main(void) {
    printf("\n\nTesting libmtmd C API...\n");
@@ -17,6 +18,11 @@ int main(void) {
        return 1;
    }

+    // simple test for the helper
+    size_t n_tokens_total = mtmd_helper_get_n_tokens(chunks);
+    printf("Total tokens in chunks: %zu\n", n_tokens_total);
+    assert(n_tokens_total > 0);
+
    size_t n_chunks = mtmd_input_chunks_size(chunks);
    printf("Number of chunks: %zu\n", n_chunks);
    assert(n_chunks > 0);
@@ -235,7 +235,7 @@ struct cli_context {
 };

 // TODO?: Make this reusable, enums, docs
-static const std::array<std::string_view, 7> cmds = {
+static const std::array<std::string_view, 8> cmds = {
    "/audio ",
    "/clear",
    "/exit",
@@ -243,6 +243,7 @@ static const std::array<std::string_view, 7> cmds = {
    "/image ",
    "/read ",
    "/regen",
+    "/video ",
 };

 static std::vector<std::pair<std::string, size_t>> auto_completion_callback(std::string_view line, size_t cursor_byte_pos) {
@@ -457,6 +458,9 @@ int llama_cli(int argc, char ** argv) {
    if (inf.has_inp_audio) {
        console::log("  /audio <file>       add an audio file\n");
    }
+    if (inf.has_inp_video) {
+        console::log("  /video <file>       add a video file\n");
+    }
    console::log("\n");

    // interactive loop
@@ -553,7 +557,8 @@ int llama_cli(int argc, char ** argv) {
            continue;
        } else if (
                (string_starts_with(buffer, "/image ") && inf.has_inp_image) ||
-                (string_starts_with(buffer, "/audio ") && inf.has_inp_audio)) {
+                (string_starts_with(buffer, "/audio ") && inf.has_inp_audio) ||
+                (string_starts_with(buffer, "/video ") && inf.has_inp_video)) {
            // just in case (bad copy-paste for example), we strip all trailing/leading spaces
            std::string fname = string_strip(buffer.substr(7));
            std::string marker = ctx_cli.load_input_file(fname, true);
@@ -1,5 +1,8 @@
 # mtmd

+set(MTMD_VIDEO ON CACHE BOOL "enable video support in mtmd (requires ffmpeg binary in PATH)")
+# TODO: add MTMD_VIDEO_METHOD in the future to select between ffmpeg and other backends
+
 find_package(Threads REQUIRED)

 add_library(mtmd
@@ -63,6 +66,10 @@ target_include_directories(mtmd PRIVATE ../..)
 target_include_directories(mtmd PRIVATE ../../vendor)
 target_compile_features   (mtmd PRIVATE cxx_std_17)

+if (MTMD_VIDEO)
+    target_compile_definitions(mtmd PRIVATE MTMD_VIDEO)
+endif()
+
 if (BUILD_SHARED_LIBS)
    set_target_properties     (mtmd PROPERTIES POSITION_INDEPENDENT_CODE ON)
    target_compile_definitions(mtmd PRIVATE LLAMA_BUILD)
@@ -77,6 +77,7 @@ struct mtmd_cli_context {
    int                 n_batch;

    mtmd::bitmaps bitmaps;
+    std::vector<mtmd_helper::video_ptr> videos;

    // chat template
    common_chat_templates_ptr tmpls;
@@ -166,11 +167,14 @@ struct mtmd_cli_context {
    }

    bool load_media(const std::string & fname) {
-        mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(ctx_vision.get(), fname.c_str(), false));
-        if (!bmp.ptr) {
+        auto res = mtmd_helper_bitmap_init_from_file(ctx_vision.get(), fname.c_str(), false);
+        if (!res.bitmap) {
            return false;
        }
-        bitmaps.entries.push_back(std::move(bmp));
+        bitmaps.entries.emplace_back(res.bitmap);
+        if (res.video_ctx) {
+            videos.emplace_back(res.video_ctx);
+        }
        return true;
    }
 };
@@ -253,6 +257,7 @@ static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg) {
    }

    ctx.bitmaps.entries.clear();
+    ctx.videos.clear();

    llama_pos new_n_past;
    if (mtmd_helper_eval_chunks(ctx.ctx_vision.get(),
@@ -373,6 +378,9 @@ int main(int argc, char ** argv) {
        if (mtmd_support_audio(ctx.ctx_vision.get())) {
            LOG("\n   /audio <path>    load an audio");
        }
+        if (mtmd_helper_support_video(ctx.ctx_vision.get())) {
+            LOG("\n   /video <path>    load a video");
+        }
        LOG("\n   /clear           clear the chat history");
        LOG("\n   /quit or /exit   exit the program");
        LOG("\n");
@@ -407,14 +415,15 @@ int main(int argc, char ** argv) {
            g_is_generating = true;
            bool is_image = line == "/image" || line.find("/image ") == 0;
            bool is_audio = line == "/audio" || line.find("/audio ") == 0;
-            if (is_image || is_audio) {
+            bool is_video = line == "/video" || line.find("/video ") == 0;
+            if (is_image || is_audio || is_video) {
                if (line.size() < 8) {
                    LOG_ERR("ERR: Missing media filename\n");
                    continue;
                }
                std::string media_path = line.substr(7);
                if (ctx.load_media(media_path)) {
-                    LOG("%s %s loaded\n", media_path.c_str(), is_image ? "image" : "audio");
+                    LOG("%s %s loaded\n", media_path.c_str(), is_image ? "image" : is_audio ? "audio" : "video");
                    content += mtmd_default_marker();
                }
                // else, error is already printed by libmtmd
@@ -36,6 +36,11 @@
 #error "mtmd-helper is a public library outside of mtmd. it must not include internal headers"
 #endif

+#ifdef MTMD_VIDEO
+#include "sheredom/subprocess.h"
+#include <thread>
+#endif
+
 //
 // internal logging functions
 //
@@ -79,6 +84,7 @@ struct mtmd_helper_logger {
    }
 } g_logger;

+#define LOG_DBG(...) g_logger.log(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
 #define LOG_INF(...) g_logger.log(GGML_LOG_LEVEL_INFO,  __VA_ARGS__)
 #define LOG_WRN(...) g_logger.log(GGML_LOG_LEVEL_WARN,  __VA_ARGS__)
 #define LOG_ERR(...) g_logger.log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
@@ -478,42 +484,94 @@ static bool decode_audio_from_buf(const unsigned char * buf_in, size_t len, int

 } // namespace audio_helpers

-mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len, bool placeholder) {
+// 64-bit FNV-1a digest of `len` bytes starting at `data`, rendered as a decimal string.
+// An empty range yields the FNV offset basis.
+static std::string fnv_hash(const uint8_t * data, size_t len) {
+    constexpr uint64_t k_offset_basis = 0xcbf29ce484222325ULL;
+    constexpr uint64_t k_prime        = 0x100000001b3ULL;
+
+    uint64_t digest = k_offset_basis;
+    for (size_t remaining = len; remaining > 0; --remaining) {
+        // FNV-1a order: xor the byte in first, then multiply by the prime
+        digest ^= *data++;
+        digest *= k_prime;
+    }
+    return std::to_string(digest);
+}
+
+mtmd_helper_bitmap_wrapper mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len, bool placeholder) {
+    // calculate the hash if needed
+    std::string id;
+    mtmd_bitmap * result = nullptr;
+
+    if (!placeholder) {
+        id = fnv_hash(buf, len);
+    }
+
    if (audio_helpers::is_audio_file((const char *)buf, len)) {
        std::vector<float> pcmf32;
        const int sample_rate = mtmd_get_audio_sample_rate(ctx);
        if (sample_rate < 0) {
            LOG_ERR("This model does not support audio input\n");
-            return nullptr;
+            return {nullptr, nullptr};
        }
        if (!audio_helpers::decode_audio_from_buf(buf, len, sample_rate, pcmf32)) {
            LOG_ERR("Unable to read WAV audio file from buffer\n");
-            return nullptr;
+            return {nullptr, nullptr};
        }
-        return mtmd_bitmap_init_from_audio(pcmf32.size(), placeholder ? nullptr : pcmf32.data());
+        result = mtmd_bitmap_init_from_audio(pcmf32.size(), placeholder ? nullptr : pcmf32.data());
+        mtmd_bitmap_set_id(result, id.empty() ? nullptr : id.c_str());
+        return {result, nullptr};
    }

    // otherwise, we assume it's an image
-    mtmd_bitmap * result = nullptr;
-    {
+    if (!result) {
        int nx, ny, nc;
        auto * data = stbi_load_from_memory(buf, len, &nx, &ny, &nc, 3);
-        if (!data) {
-            LOG_ERR("%s: failed to decode image bytes\n", __func__);
-            return nullptr;
+        if (data) {
+            result = mtmd_bitmap_init(nx, ny, placeholder ? nullptr : data);
+            mtmd_bitmap_set_id(result, id.empty() ? nullptr : id.c_str());
+            stbi_image_free(data);
+            return {result, nullptr};
        }
-        result = mtmd_bitmap_init(nx, ny, placeholder ? nullptr : data);
-        stbi_image_free(data);
+        // otherwise, fallthrough to video decoding (if supported)
    }
-    return result;
+
+    // last try: load as video
+#ifdef MTMD_VIDEO
+    if (!result) {
+        auto params = mtmd_helper_video_init_params_default();
+        auto video_ctx = mtmd_helper_video_init_from_buf(ctx, buf, len, params);
+        if (!video_ctx) {
+            LOG_ERR("%s: failed to decode buffer as either image/audio/video\n", __func__);
+            return {nullptr, nullptr};
+        }
+        result = mtmd_bitmap_init_lazy(ctx,
+            id.empty() ? nullptr : id.c_str(),
+            video_ctx,
+            [](size_t, void * user_data, mtmd_bitmap ** out_bitmap, char ** out_text) -> int {
+                auto * vctx = static_cast<mtmd_helper_video *>(user_data);
+                char * text = nullptr;
+                int ret = mtmd_helper_video_read_next(vctx, out_bitmap, &text);
+                *out_text = text; // heap-allocated by read_next; freed automatically by mtmd
+                return ret;
+            });
+         return {result, video_ctx};
+    }
+#else
+    if (!result) {
+        LOG_ERR("%s: failed to decode buffer as either image or audio (video support not compiled in)\n", __func__);
+        return {nullptr, nullptr};
+    }
+#endif
+
+    // should not reach here
+    return {nullptr, nullptr};
 }

-mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname, bool placeholder) {
+mtmd_helper_bitmap_wrapper mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname, bool placeholder) {
    std::vector<unsigned char> buf;
    FILE * f = fopen(fname, "rb");
    if (!f) {
        LOG_ERR("Unable to open file %s: %s\n", fname, strerror(errno));
-        return nullptr;
+        return {nullptr, nullptr};
    }

    fseek(f, 0, SEEK_END);
@@ -522,7 +580,7 @@ mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char *
    if (file_size < 0) {
        LOG_ERR("Failed to get file size of %s\n", fname);
        fclose(f);
-        return nullptr;
+        return {nullptr, nullptr};
    }
    buf.resize(file_size);

@@ -530,9 +588,425 @@ mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char *
    fclose(f);
    if (n_read != (size_t)file_size) {
        LOG_ERR("Failed to read entire file %s", fname);
-        return nullptr;
+        return {nullptr, nullptr};
    }

    return mtmd_helper_bitmap_init_from_buf(ctx, buf.data(), buf.size(), placeholder);
 }

+// Video frames are fed to the model as images, so video is available only when the
+// build has MTMD_VIDEO enabled and the context reports vision support.
+bool mtmd_helper_support_video(mtmd_context * ctx) {
+#ifndef MTMD_VIDEO
+    return false;
+#else
+    const bool has_vision = mtmd_support_vision(ctx);
+    return has_vision;
+#endif
+}
+
+//
+// Video input helpers
+//
+
+#ifdef MTMD_VIDEO
+
+// State for one ffmpeg-backed video stream.
+//
+// ffprobe is run once (probe) to learn frame geometry and frame rate, then ffmpeg is
+// launched (start_ffmpeg) to write raw rgb24 frames to its stdout. read_next() hands
+// those out one item at a time, interleaved with optional text (start prompt, timestamps).
+// When the input is an in-memory buffer, it is streamed to the child's stdin from a
+// helper thread.
+struct mtmd_helper_video {
+    mtmd_context * mctx = nullptr;
+    std::string path;
+    std::vector<uint8_t> input_buf; // non-empty when initialized from buffer
+    std::string ffmpeg_bin;
+    std::string ffprobe_bin;
+    float fps_target = 0.0f;
+    mtmd_helper_video_info info = {};
+
+    struct subprocess_s proc = {};
+    bool proc_created = false; // proc holds OS resources that stop_ffmpeg() must release
+    bool proc_alive = false;   // more frames may still be readable from proc's stdout
+    int32_t current_frame = 0;
+    std::thread feeder_thread;
+
+    std::string prompt_start         = "Video:";
+    int64_t     timestamp_interval_ms = 5000; // emit a timestamp text every N ms (<= 0 = disabled)
+    float       next_timestamp_ms     = 0.0f; // next elapsed-ms threshold at which to emit
+
+    std::vector<uint8_t> frame_buf;
+    std::string pending_text; // text queued to be returned before the next frame
+    bool        start_emitted = false;
+
+    // a joinable std::thread member would call std::terminate() on destruction,
+    // so always shut the child and the feeder down first (stop_ffmpeg is idempotent)
+    ~mtmd_helper_video() {
+        stop_ffmpeg();
+    }
+
+    bool is_buf_input() const { return !input_buf.empty(); }
+
+    // Writes input_buf to the child's stdin, then closes the pipe so the child sees EOF.
+    // must run in a separate thread alongside stdout reading to avoid pipe deadlock
+    void feed_stdin(struct subprocess_s * sp) {
+        FILE * f = subprocess_stdin(sp);
+        if (!f) {
+            LOG_DBG("%s: subprocess has no stdin pipe\n", __func__);
+            return;
+        }
+        LOG_DBG("%s: feeding %zu bytes to stdin\n", __func__, input_buf.size());
+        size_t written = fwrite(input_buf.data(), 1, input_buf.size(), f);
+        LOG_DBG("%s: wrote %zu bytes, closing stdin\n", __func__, written);
+        fclose(f);
+        // subprocess_join()/subprocess_destroy() fclose() stdin_file when it is still set;
+        // clear it so the stream is not closed twice. The owning thread only calls those
+        // functions after joining this thread.
+        sp->stdin_file = nullptr;
+    }
+
+    // Runs ffprobe on the input and fills fps_target, info and frame_buf.
+    // fps_target_arg <= 0 keeps the video's native frame rate.
+    // Returns false if ffprobe cannot be launched or reports no usable video stream.
+    bool probe(float fps_target_arg) {
+        const char * input_arg = is_buf_input() ? "pipe:0" : path.c_str();
+        const char * cmd[] = {
+            ffprobe_bin.c_str(),
+            "-v", "quiet",
+            "-show_entries", "stream=width,height,r_frame_rate,nb_frames,duration",
+            "-select_streams", "v:0",
+            "-of", "default=noprint_wrappers=1",
+            input_arg,
+            nullptr,
+        };
+
+        LOG_DBG("%s: launching:", __func__);
+        for (size_t i = 0; cmd[i]; i++) { LOG_DBG(" %s", cmd[i]); }
+        LOG_DBG("\n");
+
+        struct subprocess_s fprobe = {};
+        if (subprocess_create(cmd,
+                subprocess_option_search_user_path | subprocess_option_inherit_environment,
+                &fprobe) != 0) {
+            LOG_ERR("%s: failed to launch ffprobe\n", __func__);
+            return false;
+        }
+
+        std::thread probe_feeder;
+        if (is_buf_input()) {
+            probe_feeder = std::thread([this, &fprobe]() { feed_stdin(&fprobe); });
+        }
+
+        uint32_t width  = 0;
+        uint32_t height = 0;
+        float orig_fps = 0.0f;
+        float duration = -1.0f;
+        int32_t n_frames_orig = -1;
+        char line[256];
+        FILE * fp = subprocess_stdout(&fprobe);
+
+        // output is one "key=value" pair per line
+        while (fp && fgets(line, sizeof(line), fp)) {
+            char * eq = strchr(line, '=');
+            if (!eq) continue;
+            *eq = '\0';
+            const char * key = line;
+            const char * val = eq + 1;
+            char * nl = (char *)strchr(val, '\n');
+            if (nl) *nl = '\0';
+
+            if (strcmp(key, "width") == 0) {
+                // reject non-positive values instead of wrapping them into a huge uint32_t
+                const int v = atoi(val);
+                if (v > 0) width = (uint32_t)v;
+            } else if (strcmp(key, "height") == 0) {
+                const int v = atoi(val);
+                if (v > 0) height = (uint32_t)v;
+            } else if (strcmp(key, "r_frame_rate") == 0) {
+                orig_fps = parse_rational(val);
+            } else if (strcmp(key, "nb_frames") == 0 && strcmp(val, "N/A") != 0) {
+                n_frames_orig = atoi(val);
+            } else if (strcmp(key, "duration") == 0 && strcmp(val, "N/A") != 0) {
+                duration = (float)atof(val);
+            }
+        }
+
+        if (probe_feeder.joinable()) {
+            probe_feeder.join();
+        }
+
+        int ret_code = 0;
+        subprocess_join(&fprobe, &ret_code);
+        subprocess_destroy(&fprobe);
+
+        if (width == 0 || height == 0 || orig_fps <= 0.0f) {
+            return false;
+        }
+
+        // fall back to frame count / fps when the container reports no duration
+        if (duration < 0.0f && n_frames_orig > 0) {
+            duration = (float)n_frames_orig / orig_fps;
+        }
+
+        fps_target = fps_target_arg > 0.0f ? fps_target_arg : orig_fps;
+        info.width    = width;
+        info.height   = height;
+        info.fps      = fps_target;
+        // computed before logging so the debug line reports the real estimate
+        info.n_frames = duration > 0.0f ? (int32_t)(duration * fps_target + 0.5f) : -1;
+        LOG_DBG("%s: %ux%u fps=%.2f duration=%.2fs n_frames=%d\n",
+                __func__, width, height, fps_target, duration, info.n_frames);
+        frame_buf.resize((size_t)width * height * 3);
+        return true;
+    }
+
+    // Launches ffmpeg so that it writes raw rgb24 frames to its stdout, optionally
+    // starting seek_seconds into the video. Any previously started child is stopped first.
+    // Returns true if the process was created.
+    bool start_ffmpeg(float seek_seconds) {
+        stop_ffmpeg(); // no-op unless a previous child is still held
+
+        char seek_buf[64];
+        char fps_buf[64];
+
+        std::vector<const char *> cmd;
+        cmd.push_back(ffmpeg_bin.c_str());
+
+        if (!is_buf_input() && seek_seconds > 0.0f) {
+            // input-side seek: fast, keyframe-accurate; only valid for seekable file inputs
+            snprintf(seek_buf, sizeof(seek_buf), "%.6f", seek_seconds);
+            cmd.push_back("-ss");
+            cmd.push_back(seek_buf);
+        }
+
+        cmd.push_back("-i");
+        // cache:pipe:0 wraps stdin with a seekable in-memory cache, letting ffmpeg seek
+        // backwards for container headers (e.g. MP4 moov atom at end of file)
+        cmd.push_back(is_buf_input() ? "cache:pipe:0" : path.c_str());
+
+        if (seek_seconds > 0.0f && is_buf_input()) {
+            // output-side seek: frame-accurate but decodes and discards frames up to seek point
+            snprintf(seek_buf, sizeof(seek_buf), "%.6f", seek_seconds);
+            cmd.push_back("-ss");
+            cmd.push_back(seek_buf);
+        }
+
+        if (fps_target > 0.0f) {
+            snprintf(fps_buf, sizeof(fps_buf), "fps=%.6f", fps_target);
+            cmd.push_back("-vf");
+            cmd.push_back(fps_buf);
+        }
+
+        cmd.push_back("-f");
+        cmd.push_back("rawvideo");
+        cmd.push_back("-pix_fmt");
+        cmd.push_back("rgb24");
+        cmd.push_back("pipe:1");
+        cmd.push_back("-loglevel");
+        cmd.push_back("error");
+        cmd.push_back(nullptr);
+
+        LOG_DBG("%s: launching:", __func__);
+        for (size_t i = 0; cmd[i]; i++) {
+            LOG_DBG(" %s", cmd[i]);
+        }
+        LOG_DBG("\n");
+
+        int ret = subprocess_create(
+            cmd.data(),
+            subprocess_option_search_user_path | subprocess_option_inherit_environment,
+            &proc);
+
+        proc_created = (ret == 0);
+        proc_alive   = proc_created;
+        LOG_DBG("%s: subprocess_create ret=%d proc_alive=%d\n", __func__, ret, (int)proc_alive);
+
+        if (proc_alive && is_buf_input()) {
+            LOG_DBG("%s: starting feeder thread for %zu-byte buffer\n", __func__, input_buf.size());
+            feeder_thread = std::thread([this]() { feed_stdin(&proc); });
+        }
+
+        return proc_alive;
+    }
+
+    // Kills the ffmpeg child (if any), waits for the feeder thread, and releases the
+    // process resources. Safe to call repeatedly and after EOF.
+    void stop_ffmpeg() {
+        if (proc_created) {
+            // the child may have exited already; terminating is still harmless and makes
+            // sure a feeder blocked on a full pipe gets unblocked
+            // NOTE(review): a write to a dead child's pipe may raise SIGPIPE on POSIX - confirm the host ignores it
+            subprocess_terminate(&proc);
+        }
+        // join before touching proc's streams: the feeder owns stdin until it returns
+        if (feeder_thread.joinable()) {
+            feeder_thread.join();
+        }
+        if (proc_created) {
+            int ret_code = 0;
+            subprocess_join(&proc, &ret_code); // reap the child so it does not linger as a zombie
+            subprocess_destroy(&proc);
+            proc_created = false;
+        }
+        proc_alive = false;
+    }
+
+    // Reads exactly one rgb24 frame from ffmpeg's stdout.
+    // Returns a new bitmap, or nullptr on EOF / short read (proc_alive is cleared; the
+    // process itself is released later by stop_ffmpeg()).
+    mtmd_bitmap * read_next_frame() {
+        if (!proc_alive) return nullptr;
+
+        FILE * fp = subprocess_stdout(&proc);
+        if (!fp) {
+            proc_alive = false;
+            return nullptr;
+        }
+        const size_t frame_size = (size_t)info.width * info.height * 3;
+        LOG_DBG("%s: reading frame %d, expecting %zu bytes (%ux%u)\n",
+                __func__, current_frame, frame_size, info.width, info.height);
+
+        size_t total_read = 0;
+        while (total_read < frame_size) {
+            size_t n = fread(frame_buf.data() + total_read, 1, frame_size - total_read, fp);
+            if (n == 0) {
+                // clean EOF only if no bytes read yet; partial frame is an error
+                LOG_DBG("%s: fread returned 0 after %zu/%zu bytes (ferror=%d)\n",
+                        __func__, total_read, frame_size, ferror(fp));
+                proc_alive = false;
+                return nullptr;
+            }
+            total_read += n;
+        }
+
+        LOG_DBG("%s: frame %d read OK\n", __func__, current_frame);
+        current_frame++;
+        return mtmd_bitmap_init(info.width, info.height, frame_buf.data());
+    }
+
+    // Produces the next item of the stream. On success (0) exactly one of *out_bitmap
+    // or *out_text is set; *out_text is strdup()-allocated.
+    // Returns 0 on success, -1 when no more frames are available, -2 on error
+    // (allocation failure, or the stream ended before any frame was read).
+    int32_t read_next(mtmd_bitmap ** out_bitmap, char ** out_text) {
+        *out_bitmap = nullptr;
+        *out_text   = nullptr;
+
+        // text queued by the previous call (a timestamp) goes out before the next frame
+        if (!pending_text.empty()) {
+            *out_text = strdup(pending_text.c_str());
+            pending_text.clear();
+            return *out_text ? 0 : -2;
+        }
+
+        LOG_DBG("%s: proc_alive=%d start_emitted=%d current_frame=%d\n",
+                __func__, (int)proc_alive, (int)start_emitted, current_frame);
+
+        if (!proc_alive) {
+            return (current_frame == 0) ? -2 : -1;
+        }
+
+        if (!start_emitted) {
+            start_emitted = true;
+            if (!prompt_start.empty()) {
+                *out_text = strdup(prompt_start.c_str());
+                return *out_text ? 0 : -2;
+            }
+        }
+
+        mtmd_bitmap * frame = read_next_frame();
+        if (!frame) return -1;
+        *out_bitmap = frame;
+
+        if (timestamp_interval_ms > 0) {
+            // current_frame was already incremented by read_next_frame(); undo for elapsed calc
+            float elapsed_ms = (float)(current_frame - 1) / info.fps * 1000.0f;
+            if (elapsed_ms >= next_timestamp_ms) {
+                char ts_buf[32];
+                float elapsed_s = elapsed_ms / 1000.0f;
+                int   minutes   = (int)(elapsed_s / 60);
+                float seconds   = elapsed_s - minutes * 60.0f;
+                snprintf(ts_buf, sizeof(ts_buf), "[%dm%.2fs]", minutes, seconds);
+                pending_text = ts_buf;
+                next_timestamp_ms += (float)timestamp_interval_ms;
+            }
+        }
+
+        return 0;
+    }
+
+    // Parses "num/den" (as printed for r_frame_rate) or a plain decimal number.
+    // Returns 0.0f when the text is not a usable rate, including a non-positive denominator.
+    static float parse_rational(const char * s) {
+        int num = 0, den = 1;
+        if (sscanf(s, "%d/%d", &num, &den) == 2) {
+            // "N/0" must not fall through to the float parse below, which would yield N
+            return den > 0 ? (float)num / (float)den : 0.0f;
+        }
+        float val;
+        if (sscanf(s, "%f", &val) == 1) {
+            return val;
+        }
+        return 0.0f;
+    }
+};
+#endif
+
+// Default video parameters: 4 frames per second, ffmpeg/ffprobe looked up via PATH,
+// and a timestamp text every 5000 ms.
+mtmd_helper_video_init_params mtmd_helper_video_init_params_default() {
+    mtmd_helper_video_init_params defaults = {};
+    defaults.fps_target            = 4.0f;
+    defaults.ffmpeg_bin_dir        = nullptr;
+    defaults.timestamp_interval_ms = 5000;
+    return defaults;
+}
+
+// Builds the executable path for `name` inside `bin_dir`.
+// A NULL or empty bin_dir returns the bare name so the launcher resolves it via PATH.
+// Otherwise a directory separator is appended when missing, and ".exe" is added on Windows.
+static std::string video_resolve_bin(const char * bin_dir, const char * name) {
+    const bool use_path_lookup = (bin_dir == nullptr) || (*bin_dir == '\0');
+    if (use_path_lookup) {
+        return name; // rely on PATH
+    }
+
+#ifdef _WIN32
+    const char   native_sep = '\\';
+    const char * exe_suffix = ".exe";
+#else
+    const char   native_sep = '/';
+    const char * exe_suffix = "";
+#endif
+
+    std::string full_path(bin_dir);
+    const char tail = full_path.back();
+    // either separator style is accepted as an existing trailing separator
+    const bool has_trailing_sep = (tail == '/') || (tail == '\\');
+    if (!has_trailing_sep) {
+        full_path.push_back(native_sep);
+    }
+    full_path.append(name);
+    full_path.append(exe_suffix);
+    return full_path;
+}
+
+// Opens the video file at `path`: probes it with ffprobe, then starts ffmpeg decoding
+// from the beginning.
+// Returns a context to be released with mtmd_helper_video_free(), or nullptr when the
+// path is NULL/empty, ffprobe/ffmpeg cannot be run, or the build has no video support.
+mtmd_helper_video * mtmd_helper_video_init(
+        mtmd_context * mctx,
+        const char * path,
+        mtmd_helper_video_init_params params) {
+#ifdef MTMD_VIDEO
+    // constructing std::string from a null pointer is undefined behavior, and an empty
+    // path would only be handed to ffprobe to fail there
+    if (!path || path[0] == '\0') {
+        LOG_ERR("%s: video path is null or empty\n", __func__);
+        return nullptr;
+    }
+
+    auto * ctx = new mtmd_helper_video();
+
+    ctx->mctx                 = mctx;
+    ctx->path                 = path;
+    ctx->ffmpeg_bin           = video_resolve_bin(params.ffmpeg_bin_dir, "ffmpeg");
+    ctx->ffprobe_bin          = video_resolve_bin(params.ffmpeg_bin_dir, "ffprobe");
+    ctx->timestamp_interval_ms = params.timestamp_interval_ms;
+
+    if (!ctx->probe(params.fps_target)) {
+        LOG_ERR("%s: ffprobe failed for '%s' (is ffprobe in PATH?)\n", __func__, path);
+        delete ctx;
+        return nullptr;
+    }
+
+    if (!ctx->start_ffmpeg(0.0f)) {
+        LOG_ERR("%s: failed to start ffmpeg for '%s' (is ffmpeg in PATH?)\n", __func__, path);
+        delete ctx;
+        return nullptr;
+    }
+
+    return ctx;
+#else
+    LOG_ERR("%s: video is not supported in this build (MTMD_VIDEO is set to OFF)\n", __func__);
+    return nullptr;
+#endif
+}
+
+// Same as mtmd_helper_video_init(), but the encoded video comes from memory.
+// The bytes are copied into the context and later piped to ffprobe/ffmpeg via stdin.
+// Returns nullptr when the buffer is NULL/empty, ffprobe/ffmpeg cannot be run, or the
+// build has no video support.
+mtmd_helper_video * mtmd_helper_video_init_from_buf(
+        mtmd_context * mctx,
+        const unsigned char * buf, size_t len,
+        mtmd_helper_video_init_params params) {
+#ifdef MTMD_VIDEO
+    // an empty input_buf makes is_buf_input() false, which would send ffprobe to an
+    // empty file path instead of the pipe; reject it up front
+    if (!buf || len == 0) {
+        LOG_ERR("%s: video buffer is null or empty\n", __func__);
+        return nullptr;
+    }
+
+    auto * ctx = new mtmd_helper_video();
+
+    ctx->mctx                  = mctx;
+    ctx->input_buf.assign(buf, buf + len);
+    ctx->ffmpeg_bin            = video_resolve_bin(params.ffmpeg_bin_dir, "ffmpeg");
+    ctx->ffprobe_bin           = video_resolve_bin(params.ffmpeg_bin_dir, "ffprobe");
+    ctx->timestamp_interval_ms = params.timestamp_interval_ms;
+
+    if (!ctx->probe(params.fps_target)) {
+        LOG_ERR("%s: ffprobe failed on buffer (is ffprobe in PATH?)\n", __func__);
+        delete ctx;
+        return nullptr;
+    }
+
+    if (!ctx->start_ffmpeg(0.0f)) {
+        LOG_ERR("%s: failed to start ffmpeg on buffer (is ffmpeg in PATH?)\n", __func__);
+        delete ctx;
+        return nullptr;
+    }
+
+    return ctx;
+#else
+    LOG_ERR("%s: video is not supported in this build (MTMD_VIDEO is set to OFF)\n", __func__);
+    return nullptr;
+#endif
+}
+
+// Stops the ffmpeg child and releases the context. A null ctx is ignored in video builds.
+void mtmd_helper_video_free(mtmd_helper_video * ctx) {
+#ifndef MTMD_VIDEO
+    LOG_ERR("%s: video is not supported in this build (MTMD_VIDEO is set to OFF)\n", __func__);
+#else
+    if (ctx != nullptr) {
+        ctx->stop_ffmpeg();
+        delete ctx;
+    }
+#endif
+}
+
+// Returns the stream properties gathered by the ffprobe step.
+// A null ctx yields a zeroed info with n_frames = -1 (the "unknown" value) instead of
+// dereferencing it, in line with the null handling of the free/read_next helpers.
+mtmd_helper_video_info mtmd_helper_video_get_info(const mtmd_helper_video * ctx) {
+#ifdef MTMD_VIDEO
+    if (!ctx) {
+        LOG_ERR("%s: video context is null\n", __func__);
+        mtmd_helper_video_info empty = {};
+        empty.n_frames = -1;
+        return empty;
+    }
+    return ctx->info;
+#else
+    GGML_ASSERT(false && "video is not supported in this build (MTMD_VIDEO is set to OFF)");
+#endif
+}
+
+// C entry point for reading the next frame or text item of a video stream.
+// Returns 0 on success, -1 when the stream is exhausted, -2 on error (including a null
+// ctx or null output pointer). Whenever the output pointers are valid they are written:
+// both are NULL on any non-zero return.
+int32_t mtmd_helper_video_read_next(mtmd_helper_video * ctx,
+        mtmd_bitmap ** out_bitmap, char ** out_text) {
+#ifdef MTMD_VIDEO
+    if (!out_bitmap || !out_text) return -2;
+    if (!ctx) {
+        // do not leave the caller's outputs uninitialized on the error path
+        *out_bitmap = nullptr;
+        *out_text   = nullptr;
+        return -2;
+    }
+    return ctx->read_next(out_bitmap, out_text);
+#else
+    GGML_ASSERT(false && "video is not supported in this build (MTMD_VIDEO is set to OFF)");
+#endif
+}
@@ -20,25 +20,39 @@ extern "C" {
 // BREAKING CHANGES are expected.
 //

+struct mtmd_helper_video;
+typedef struct mtmd_helper_video mtmd_helper_video;
+
 // Set callback for all future logging events.
 // If this is not called, or NULL is supplied, everything is output on stderr.
 // Note: this also call mtmd_log_set() internally
 MTMD_API void mtmd_helper_log_set(ggml_log_callback log_callback, void * user_data);

+// Returns true if this build includes video support (MTMD_VIDEO was ON at compile time) and ctx supports vision input; false otherwise.
+MTMD_API bool mtmd_helper_support_video(mtmd_context * ctx);
+
+// Result of mtmd_helper_bitmap_init_from_file() / mtmd_helper_bitmap_init_from_buf().
+// On failure both members are NULL.
+struct mtmd_helper_bitmap_wrapper {
+    // the loaded image/audio bitmap, or a lazy bitmap when the input was decoded as video
+    mtmd_bitmap * bitmap;
+    // non-NULL only for video input: the decoder context the lazy bitmap reads frames from.
+    // The caller releases it with mtmd_helper_video_free().
+    // NOTE(review): presumably it must outlive every use of the lazy bitmap - confirm
+    mtmd_helper_video * video_ctx;
+};
+
 // helper function to construct a mtmd_bitmap from a file
 // it calls mtmd_helper_bitmap_init_from_buf() internally
 // returns nullptr on failure
 // this function is thread-safe
-MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname, bool placeholder);
+MTMD_API struct mtmd_helper_bitmap_wrapper mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname, bool placeholder);

 // helper function to construct a mtmd_bitmap from a buffer containing a file
 // supported formats:
 //     image: formats supported by stb_image: jpg, png, bmp, gif, etc.
 //     audio: formats supported by miniaudio: wav, mp3, flac
-// note: audio files will be auto-detected based on magic bytes
+// note:
+//   - for now, video input is only supported via C++ helper functions
+//   - audio files will be auto-detected based on magic bytes
+//   - output bitmap will have FNV hash as the ID
 // returns nullptr on failure
 // this function is thread-safe
-MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len, bool placeholder);
+MTMD_API struct mtmd_helper_bitmap_wrapper mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len, bool placeholder);

 // helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache
 MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks);
@@ -89,6 +103,56 @@ MTMD_API int32_t mtmd_helper_decode_image_chunk(mtmd_context * ctx,
                                                int32_t n_batch,
                                                llama_pos * new_n_past);

+//
+// video input helpers (requires ffmpeg/ffprobe installed on the system)
+// the notion of video only exists at the helper level, it is not visible to the core mtmd library
+//
+// NOTE: this implementation is model-agnostic, it can be used with any vision-capable model
+//       however, it may not be accurate for some specific models
+//       (this is expected for now, to keep the implementation simple)
+//
+
+// Properties of an opened video stream, as returned by mtmd_helper_video_get_info().
+struct mtmd_helper_video_info {
+    uint32_t width;    // frame width in pixels
+    uint32_t height;   // frame height in pixels
+    float    fps;      // effective fps (fps_target if set, else original video fps)
+    int32_t  n_frames; // estimated total frames at effective fps (-1 if unknown)
+};
+
+// Options for mtmd_helper_video_init() / mtmd_helper_video_init_from_buf();
+// obtain the defaults from mtmd_helper_video_init_params_default().
+struct mtmd_helper_video_init_params {
+    float fps_target;            // desired output fps; <= 0 means use the video's native fps, defaulted to 4.0f
+    const char * ffmpeg_bin_dir; // directory containing ffmpeg/ffprobe binaries; NULL means search PATH
+    int64_t timestamp_interval_ms; // interval for adding timestamp as text chunk (example: "[10m50.50s]"); <= 0 means no timestamp, defaulted to 5000ms
+    // TODO @ngxson : allow "placeholder" bitmap output for counting tokens
+};
+
+MTMD_API struct mtmd_helper_video_init_params mtmd_helper_video_init_params_default(void);
+
+// returns NULL on failure (ffprobe not found, file unreadable, etc.)
+MTMD_API mtmd_helper_video * mtmd_helper_video_init(
+                    struct mtmd_context * mctx,
+                    const char * path,
+                    struct mtmd_helper_video_init_params params);
+
+// Same as mtmd_helper_video_init(), but reads from an in-memory buffer.
+// The buffer is copied internally; the caller does not need to keep it alive.
+// Note: pipe input is not seekable, so seeking will use output-side seeking
+// (ffmpeg decodes and discards frames up to the target position).
+MTMD_API mtmd_helper_video * mtmd_helper_video_init_from_buf(
+                    struct mtmd_context * mctx,
+                    const unsigned char * buf, size_t len,
+                    struct mtmd_helper_video_init_params params);
+MTMD_API void mtmd_helper_video_free(mtmd_helper_video * ctx);
+MTMD_API struct mtmd_helper_video_info mtmd_helper_video_get_info(const mtmd_helper_video * ctx);
+
+// Read the next item from the video stream; on success (return value 0) exactly one of *out_bitmap or *out_text is set.
+// *out_bitmap - heap-allocated; caller must free with mtmd_bitmap_free()
+// *out_text   - heap-allocated (always via strdup/malloc); caller must free with free()
+// returns 0 on success, -1 on EOF, -2 on error
+MTMD_API int32_t mtmd_helper_video_read_next(mtmd_helper_video * ctx,
+            mtmd_bitmap ** out_bitmap,
+            char ** out_text);
+
 #ifdef __cplusplus
 } // extern "C"
 #endif
@@ -97,4 +161,16 @@ MTMD_API int32_t mtmd_helper_decode_image_chunk(mtmd_context * ctx,
 // C++ wrappers
 //

+#ifdef __cplusplus
+namespace mtmd_helper {
+
+// video-related C++ wrappers
+// Deleter for video_ptr: releases the context through mtmd_helper_video_free()
+struct mtmd_helper_video_deleter {
+    void operator()(mtmd_helper_video * val) { mtmd_helper_video_free(val); }
+};
+using video_ptr = std::unique_ptr<mtmd_helper_video, mtmd_helper_video_deleter>;
+
+} // namespace mtmd_helper
+#endif
+
 #endif
@@ -35,6 +35,10 @@ struct mtmd_bitmap {
    std::string id; // optional user-defined id, for ex: can be set to image hash, useful for KV cache tracking
    bool is_audio = false; // true if the bitmap is audio

+    // lazy-loaded bitmap
+    mtmd_bitmap_lazy_callback lazy_callback = nullptr;
+    void * lazy_user_data = nullptr;
+
    mtmd_bitmap(const unsigned char * data, uint32_t nx, uint32_t ny)
        : nx(nx), ny(ny), is_audio(false) {
        if (data) {
@@ -732,30 +736,111 @@ void mtmd_free(mtmd_context * ctx) {

 struct mtmd_tokenizer {
    mtmd_context * ctx;
-    std::vector<const mtmd_bitmap *> bitmaps;

    std::string input_text;
    bool add_special;
    bool parse_special;
    const llama_vocab * vocab;

+    struct part {
+        std::string text;
+        const mtmd_bitmap * bitmap;
+    };
+    std::vector<part> parts;
+    // these will be freed when mtmd_tokenizer finishes
+    std::vector<mtmd::bitmap> bm_from_lazy; // TODO @ngxson : refactor, free bm_from_lazy progressively
+    std::vector<const char *> text_from_lazy;
+
    mtmd_input_chunks cur;
    uint32_t n_images_added = 0; // 0-based index assigned to the next image chunk

+    ~mtmd_tokenizer() {
+        // bm_from_lazy needs no work here (mtmd::bitmap is already RAII);
+        // the lazy text buffers are raw C strings and must be released with free()
+        for (const char * lazy_text : text_from_lazy) {
+            free(const_cast<char *>(lazy_text));
+        }
+    }
+
    mtmd_tokenizer(mtmd_context * ctx,
            const mtmd_input_text * text,
-            const mtmd_bitmap ** bitmaps,
-            size_t n_bitmaps) : ctx(ctx), bitmaps(bitmaps, bitmaps + n_bitmaps) {
+            const mtmd_bitmap ** bmps,
+            size_t n_bitmaps) : ctx(ctx) {
+        // Splits the prompt on the media marker and pairs each marker with the next
+        // entry of bmps (in order), producing `parts`; lazy bitmaps are expanded last.
+        // Throws std::runtime_error if the number of markers and bitmaps differ.
        add_special   = text->add_special;
        parse_special = text->parse_special;
        input_text    = text->text;
        vocab         = ctx->vocab;
+
+        std::vector<const mtmd_bitmap *> bitmaps(bmps, bmps + n_bitmaps);
+        auto parts_str = split_text(input_text, ctx->media_marker);
+        size_t i_bm = 0;
+        // iterate by non-const reference: std::move() on a const reference would silently copy
+        for (auto & piece : parts_str) {
+            if (piece == ctx->media_marker) {
+                if (i_bm >= bitmaps.size()) {
+                    throw std::runtime_error(string_format("number of media markers in text (%zu) exceeds number of bitmaps (%zu)", i_bm + 1, bitmaps.size()));
+                }
+                parts.push_back({"", bitmaps[i_bm++]});
+            } else {
+                parts.push_back({std::move(piece), nullptr});
+            }
+        }
+
+        // count the parts that actually carry a bitmap; this also catches unused bitmaps
+        size_t n_markers = 0;
+        for (const auto & p : parts) {
+            if (p.bitmap != nullptr) {
+                n_markers++;
+            }
+        }
+        if (n_markers != bitmaps.size()) {
+            throw std::runtime_error(string_format("number of media markers in text (%zu) does not match number of bitmaps (%zu)", n_markers, bitmaps.size()));
+        }
+
+        expand_lazy_bitmaps();
+    }
+
+    // Replace every lazy bitmap in `parts` by the chunks its callback produces.
+    //
+    // The callback is invoked with chunk_idx = 0, 1, 2, ... until it returns -1 (EOF).
+    // Each successful call may emit one bitmap or one text chunk, appended in order;
+    // the lazy bitmap itself never ends up in the expanded list.
+    //
+    // Throws std::runtime_error if the callback:
+    //   - sets both outputs in the same call
+    //   - returns anything other than 0 or -1
+    //   - emits another lazy bitmap (nesting is not allowed)
+    void expand_lazy_bitmaps() {
+        std::vector<part> expanded;
+        expanded.reserve(parts.size());
+        for (auto & p : parts) {
+            if (p.bitmap == nullptr || !p.bitmap->lazy_callback) {
+                // text part or regular bitmap: keep unchanged
+                expanded.push_back(std::move(p));
+                continue;
+            }
+            LOG_DBG("%s: expanding lazy bitmap\n", __func__);
+            for (size_t i = 0;; i++) {
+                char * out_str = nullptr;
+                mtmd_bitmap * out_bm = nullptr;
+                const int res = p.bitmap->lazy_callback(i,
+                                    p.bitmap->lazy_user_data,
+                                    &out_bm,
+                                    &out_str);
+
+                // Take ownership of the outputs right away so that they are released on
+                // every path below. This function is called from the constructor: if it
+                // throws, the destructor body does not run and only members clean up.
+                const bool has_bm   = out_bm  != nullptr;
+                const bool has_text = out_str != nullptr;
+                if (has_bm) {
+                    bm_from_lazy.emplace_back(out_bm); // freed together with the tokenizer
+                }
+                std::string text;
+                if (has_text) {
+                    text = out_str; // part::text stores its own copy
+                    free(out_str);
+                }
+
+                if (has_bm && has_text) {
+                    throw std::runtime_error(string_format("lazy callback cannot return both bitmap and text"));
+                }
+                if (res == -1) {
+                    // EOF: lazy part removes itself (not added to expanded)
+                    break;
+                }
+                if (res != 0) {
+                    // -2 is the documented error code; unknown codes are rejected as well,
+                    // otherwise a misbehaving callback would keep this loop running forever
+                    throw std::runtime_error(string_format("lazy callback returned error (code %d)", res));
+                }
+
+                // OK, append the returned chunk (a call that emits nothing is simply skipped)
+                if (has_bm) {
+                    if (out_bm->lazy_callback) {
+                        throw std::runtime_error(string_format("lazy callback cannot return another lazy bitmap"));
+                    }
+                    expanded.push_back({"", out_bm});
+                    LOG_DBG("%s: lazy callback returned bitmap with dimensions %d x %d\n", __func__, out_bm->nx, out_bm->ny);
+                } else if (has_text) {
+                    LOG_DBG("%s: lazy callback returned text: %s\n", __func__, text.c_str());
+                    expanded.push_back({std::move(text), nullptr});
+                }
+            }
+        }
+        parts = std::move(expanded);
    }

    int32_t tokenize(mtmd_input_chunks * output) {
        cur.entries.clear();
-        std::vector<std::string> parts = split_text(input_text, ctx->media_marker);
-        size_t i_bm = 0; // index of the current bitmap

        // [QWEN_VIDEO] handle frame merging for models that support it (i.e. qwen-vl)
        int n_merge_frames = 1;
@@ -764,53 +849,50 @@ struct mtmd_tokenizer {
            GGML_ASSERT(n_merge_frames <= 2 && "we only support merging maximum 2 images for now; open an issue if this model supports merging more");
        }

+        // Build merged_bitmaps: each entry is a group of 1 or 2 bitmaps.
+        // For consecutive mergeable bitmap parts, merge them and collapse the second part out of this->parts.
        std::vector<std::vector<const mtmd_bitmap *>> merged_bitmaps;
        if (n_merge_frames > 1) {
-            size_t i_bm_scan = 0;
            for (size_t i = 0; i < parts.size(); ++i) {
-                if (parts[i] != ctx->media_marker) {
+                if (parts[i].bitmap == nullptr) {
                    continue;
                }
-                if (i + 1 < parts.size()
-                        && parts[i + 1] == ctx->media_marker
-                        && i_bm_scan + 1 < bitmaps.size()) {
-                    const mtmd_bitmap * bm_a = bitmaps[i_bm_scan];
-                    const mtmd_bitmap * bm_b = bitmaps[i_bm_scan + 1];
+                if (i + 1 < parts.size() && parts[i + 1].bitmap != nullptr) {
+                    const mtmd_bitmap * bm_a = parts[i].bitmap;
+                    const mtmd_bitmap * bm_b = parts[i + 1].bitmap;
                    if (bm_a->can_batch_with(*bm_b)) {
-                        LOG_DBG("%s: merging 2 frames at bitmap index %zu and %zu\n", __func__, i_bm_scan, i_bm_scan + 1);
+                        LOG_DBG("%s: merging 2 frames at part index %zu and %zu\n", __func__, i, i + 1);
                        merged_bitmaps.push_back({bm_a, bm_b});
-                        parts.erase(parts.begin() + i + 1); // remove the second marker
-                        i_bm_scan += 2;
+                        parts.erase(parts.begin() + i + 1); // collapse the second bitmap part
                        continue;
                    }
                }
-                LOG_DBG("%s: no merging for bitmap index %zu\n", __func__, i_bm_scan);
-                merged_bitmaps.push_back({bitmaps[i_bm_scan]});
-                ++i_bm_scan;
+                LOG_DBG("%s: no merging for part index %zu\n", __func__, i);
+                merged_bitmaps.push_back({parts[i].bitmap});
            }
        } else {
-            for (size_t i = 0; i < bitmaps.size(); ++i) {
-                merged_bitmaps.push_back({bitmaps[i]});
+            for (const auto & p : parts) {
+                if (p.bitmap != nullptr) {
+                    merged_bitmaps.push_back({p.bitmap});
+                }
            }
        }

-        i_bm = 0;
-        for (auto & part : parts) {
-            if (part == ctx->media_marker) {
-                // this is a marker, we should add the next bitmap
+        size_t i_bm = 0;
+        for (const auto & p : parts) {
+            if (p.bitmap != nullptr) {
                if (i_bm >= merged_bitmaps.size()) {
                    LOG_ERR("%s: error: number of bitmaps (%zu) does not match number of markers (%zu)\n",
                            __func__, merged_bitmaps.size(), parts.size() - 1);
                    return 1;
                }
-                auto & bmps = merged_bitmaps[i_bm++];
+                auto bmps = merged_bitmaps[i_bm++];
                int32_t res = add_media(bmps);
                if (res != 0) {
                    return res;
                }
            } else {
-                // this is a text part, we should add it as text
-                add_text(part, parse_special);
+                add_text(p.text, parse_special);
            }
        }

@@ -1236,8 +1318,13 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
            const mtmd_input_text * text,
            const mtmd_bitmap ** bitmaps,
            size_t n_bitmaps) {
-    mtmd_tokenizer tokenizer(ctx, text, bitmaps, n_bitmaps);
-    return tokenizer.tokenize(output);
+    try {
+        mtmd_tokenizer tokenizer(ctx, text, bitmaps, n_bitmaps);
+        return tokenizer.tokenize(output);
+    } catch (const std::exception & e) {
+        LOG_ERR("%s: error: %s\n", __func__, e.what());
+        return 2;
+    }
 }

 int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) {
@@ -1373,6 +1460,10 @@ int mtmd_get_audio_sample_rate(const mtmd_context * ctx) {
    return clip_get_hparams(ctx->ctx_a)->audio_sample_rate;
 }

+const char * mtmd_get_marker(const mtmd_context * ctx) {
+    // the returned pointer refers to the string stored in ctx->media_marker
+    const auto & marker = ctx->media_marker;
+    return marker.c_str();
+}
+
 //
 // public API functions
 //
@@ -1405,10 +1496,16 @@ uint32_t mtmd_bitmap_get_ny(const mtmd_bitmap * bitmap) {
 }

 const unsigned char * mtmd_bitmap_get_data(const mtmd_bitmap * bitmap) {
+    if (bitmap->is_placeholder()) { // placeholder bitmap: report "no data" without touching the buffer
+        return nullptr;
+    }
    return bitmap->get_ro_buf().data();
 }

 size_t mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap) {
+    if (bitmap->is_placeholder()) { // placeholder bitmap: size is reported as 0, matching the nullptr from mtmd_bitmap_get_data()
+        return 0;
+    }
    return bitmap->get_ro_buf().size();
 }

@@ -1428,6 +1525,18 @@ void mtmd_bitmap_set_id(mtmd_bitmap * bitmap, const char * id) {
    }
 }

+mtmd_bitmap * mtmd_bitmap_init_lazy(mtmd_context * ctx,
+                                    const char * id,
+                                    void * user_data,
+                                    mtmd_bitmap_lazy_callback callback) {
+    GGML_UNUSED(ctx); // reserved for future use
+
+    // created without any data (null buffer, 0 x 0); only the id and the callback pair are stored
+    auto * lazy_bm = new mtmd_bitmap(nullptr, 0, 0);
+    mtmd_bitmap_set_id(lazy_bm, id);
+    lazy_bm->lazy_user_data = user_data;
+    lazy_bm->lazy_callback  = callback;
+    return lazy_bm;
+}
+
 void mtmd_bitmap_free(mtmd_bitmap * bitmap) {
    if (bitmap) {
        delete bitmap;
@@ -128,6 +128,9 @@ MTMD_API bool mtmd_support_audio(const mtmd_context * ctx);
 // return -1 if audio is not supported
 MTMD_API int mtmd_get_audio_sample_rate(const mtmd_context * ctx);

+// get the current marker string
+MTMD_API const char * mtmd_get_marker(const mtmd_context * ctx);
+
 // mtmd_bitmap
 //
 // if bitmap is image:
@@ -156,6 +159,34 @@ MTMD_API void                  mtmd_bitmap_free       (mtmd_bitmap * bitmap);
 MTMD_API const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap);
 MTMD_API void         mtmd_bitmap_set_id(mtmd_bitmap * bitmap, const char * id);

+// mtmd_bitmap lazy
+//
+// this is a special bitmap that:
+// - does not hold the actual data
+// - can be expanded into one or more chunks (either media or text chunks)
+// user must provide a callback to fill in the data when mtmd_tokenize() is called
+// this is useful for large video inputs:
+// - allow reading video frame by frame, without loading the entire video into memory
+// - allow tracking the whole video with a single ID (for example, the file hash)
+
+// set (*out_bitmap) to non-nullptr to emit a bitmap chunk; it will be freed automatically
+// set (*out_text) to non-nullptr to emit a text chunk; it must be heap-allocated, null-terminated and will be freed automatically
+// either out_bitmap or out_text can be set, but not both
+// out_bitmap cannot be another lazy bitmap (no nested lazy allowed)
+// return value:
+//    0 on success
+//   -1 on EOF (signal to mtmd_tokenize to move on)
+//   -2 on error (signal to mtmd_tokenize to abort)
+typedef int(* mtmd_bitmap_lazy_callback)(
+    size_t chunk_idx,
+    void * user_data,
+    mtmd_bitmap ** out_bitmap,
+    char ** out_text);
+
+MTMD_API mtmd_bitmap * mtmd_bitmap_init_lazy(mtmd_context * ctx,
+                                             const char * id, // usually set to file hash
+                                             void * user_data,
+                                             mtmd_bitmap_lazy_callback callback);

 // mtmd_input_chunks
 //
@@ -1252,6 +1252,10 @@ The `response_format` parameter supports both plain JSON output (e.g. `{"type":

 `parallel_tool_calls` : Whether to enable parallel/multiple tool calls (only supported on some models, verification is based on jinja template).

+For multimodal input:
+- Content types `image_url` and `input_audio` follow the OAI schema
+- Content type `input_video` is an extension to the OAI schema. For now, it only accepts base64-encoded data, passed in `input_video.data`
+
 *Examples:*

 You can use either Python `openai` library with appropriate checkpoints:
@@ -701,29 +701,19 @@ size_t validate_utf8(const std::string& text) {
    return len;
 }

-// Computes FNV-1a hash of the data
-static std::string fnv_hash(const uint8_t * data, size_t len) {
-    const uint64_t fnv_prime = 0x100000001b3ULL;
-    uint64_t hash = 0xcbf29ce484222325ULL;
-
-    for (size_t i = 0; i < len; ++i) {
-        hash ^= data[i];
-        hash *= fnv_prime;
-    }
-    return std::to_string(hash);
-}
-
 server_tokens process_mtmd_prompt(mtmd_context * mctx, const std::string & prompt, const std::vector<raw_buffer> & files, bool is_placeholder) {
+    // these will be freed upon going out of scope
    mtmd::bitmaps bitmaps;
+    std::vector<mtmd_helper::video_ptr> videos;
    for (auto & file : files) {
-        mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(mctx, file.data(), file.size(), is_placeholder));
-        if (!bmp.ptr) {
+        auto out = mtmd_helper_bitmap_init_from_buf(mctx, file.data(), file.size(), is_placeholder);
+        if (!out.bitmap) {
            throw std::runtime_error("Failed to load image or audio file");
        }
-        // calculate bitmap hash (for KV caching)
-        std::string hash = fnv_hash(bmp.data(), bmp.n_bytes());
-        bmp.set_id(hash.c_str());
-        bitmaps.entries.push_back(std::move(bmp));
+        bitmaps.entries.emplace_back(out.bitmap);
+        if (out.video_ctx) {
+            videos.emplace_back(out.video_ctx);
+        }
    }
    // process prompt
    std::vector<server_tokens> inputs;
@@ -1023,6 +1013,20 @@ json oaicompat_chat_params_parse(
                p["text"] = get_media_marker();
                p.erase("input_audio");

+            } else if (type == "input_video") {
+                if (!opt.allow_video) {
+                    throw std::runtime_error("video input is not supported - hint: if this is unexpected, you may need to provide the mmproj");
+                }
+
+                json input_video  = json_value(p, "input_video", json::object());
+                std::string data  = json_value(input_video, "data", std::string());
+                auto decoded_data = base64_decode(data); // expected to be base64 encoded
+                out_files.push_back(decoded_data);
+
+                p["type"] = "media_marker";
+                p["text"] = get_media_marker();
+                p.erase("input_video");
+
            } else if (type != "text") {
                throw std::invalid_argument("unsupported content[].type");
            }
@@ -294,6 +294,7 @@ struct server_chat_params {
    common_chat_templates_ptr tmpls;
    bool allow_image;
    bool allow_audio;
+    bool allow_video;
    bool enable_thinking = true;
    int  reasoning_budget = -1;
    std::string reasoning_budget_message;
@@ -1247,6 +1247,7 @@ private:
                /* tmpls                 */ std::move(chat_templates),
                /* allow_image           */ mctx ? mtmd_support_vision(mctx) : false,
                /* allow_audio           */ mctx ? mtmd_support_audio (mctx) : false,
+                /* allow_video           */ mctx ? mtmd_helper_support_video(mctx) : false,
                /* enable_thinking       */ enable_thinking,
                /* reasoning_budget      */ params_base.sampling.reasoning_budget_tokens,
                /* reasoning_budget_msg  */ params_base.sampling.reasoning_budget_message,
@@ -3586,6 +3587,7 @@ server_context_meta server_context::get_meta() const {
        /* has_mtmd               */ impl->mctx != nullptr,
        /* has_inp_image          */ impl->chat_params.allow_image,
        /* has_inp_audio          */ impl->chat_params.allow_audio,
+        /* has_inp_video          */ impl->chat_params.allow_video,
        /* json_ui_settings       */ impl->json_ui_settings,
        /* json_webui_settings    */ impl->json_webui_settings,  // Deprecated
        /* slot_n_ctx             */ impl->get_slot_n_ctx(),
@@ -4183,6 +4185,7 @@ void server_routes::init_routes() {
            { "model_path",                  meta->model_path },
            { "modalities",                  json {
                {"vision", meta->has_inp_image},
+                {"video",  meta->has_inp_video},
                {"audio",  meta->has_inp_audio},
            } },
            { "media_marker",                get_media_marker() },
@@ -4976,7 +4979,7 @@ std::unique_ptr<server_res_generator> server_routes::handle_count_tokens(const l
        n_tokens = tokenize_mixed(vocab, prompt, true, true).size();
    }

-    json response = {{"input_tokens", static_cast<int>(n_tokens)}};
+    json response = {{"input_tokens", static_cast<int64_t>(n_tokens)}};
    if (is_oai) {
        response["object"] = "response.input_tokens";
    }
@@ -21,6 +21,7 @@ struct server_context_meta {
    bool has_mtmd;
    bool has_inp_image;
    bool has_inp_audio;
+    bool has_inp_video;
    json json_ui_settings;            // Primary: new name
    json json_webui_settings;            // Deprecated: use json_ui_settings instead (kept for backward compat)
    int slot_n_ctx;