opencl: flush profiling batch at shutdown for incomplete batches (#25016 )

xcframework : disable mtmd video on i/tv/visionos (#25018 )
model : Add label for LFM2.5-230M (#25008 )
2026-06-26 07:37:40 +02:00 · 2026-06-25 18:48:24 -07:00 · 2026-06-26 00:13:59 +02:00 · 2026-06-25 18:58:52 +02:00 · 2026-06-25 17:29:23 +02:00 · 2026-06-25 17:23:37 +02:00
167 changed files with 11847 additions and 11156 deletions
@@ -35,8 +35,20 @@ AMD ZenDNN:
 documentation:
    - changed-files:
        - any-glob-to-any-file:
+            - "**/*.md"
            - docs/**
            - media/**
+examples:
+    - all:
+        - changed-files:
+            - any-glob-to-any-file:
+                - app/**
+                - examples/**
+                - tools/**
+            - all-globs-to-all-files:
+                - '!tools/server/**'
+                - '!tools/mtmd/**'
+                - '!tools/ui/**'
 testing:
    - changed-files:
        - any-glob-to-any-file:
@@ -47,28 +59,12 @@ build:
            - cmake/**
            - CMakeLists.txt
            - CMakePresets.json
-examples:
-    - changed-files:
-        - any-glob-to-any-file:
-            - examples/**
-            - tools/**
 devops:
    - changed-files:
        - any-glob-to-any-file:
            - .devops/**
            - .github/**
            - ci/**
-python:
-    - changed-files:
-        - any-glob-to-any-file:
-            - "**/*.py"
-            - requirements/**
-            - gguf-py/**
-            - .flake8
-script:
-    - changed-files:
-        - any-glob-to-any-file:
-            - scripts/**
 android:
    - changed-files:
        - any-glob-to-any-file:
@@ -81,9 +77,20 @@ server:
    - changed-files:
        - any-glob-to-any-file:
            - tools/server/**
-
-
-
+mtmd:
+    - changed-files:
+        - any-glob-to-any-file:
+            - tools/mtmd/**
+conversion:
+    - changed-files:
+        - any-glob-to-any-file:
+            - conversion/**
+            - convert_*.py
+            - gguf-py/**
+vendor:
+    - changed-files:
+        - any-glob-to-any-file:
+            - vendor/**
 ggml:
    - changed-files:
        - any-glob-to-any-file:
@@ -222,6 +222,16 @@ if (LLAMA_BUILD_APP)
    add_subdirectory(app)
 endif()

+# Standalone libmtmd build without pulling in the rest of the tools/ tree.
+# Useful when packaging just the mtmd library for language bindings (e.g. an
+# Apple XCFramework, or a WASM build). When the full tools build is enabled,
+# mtmd is already built by the tools/ subdirectory above; this hook only fires
+# when LLAMA_BUILD_TOOLS is OFF to avoid double-adding the target.
+option(LLAMA_BUILD_MTMD "llama: build tools/mtmd library standalone" OFF)
+if (LLAMA_BUILD_MTMD AND NOT (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TOOLS))
+    add_subdirectory(tools/mtmd)
+endif()
+
 #
 # install
 #
@@ -1,6 +1,6 @@
 set(TARGET llama-app)

-add_executable(${TARGET} llama.cpp)
+add_executable(${TARGET} llama.cpp download.cpp)
 set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama)

 target_link_libraries(${TARGET} PRIVATE
@@ -0,0 +1,71 @@
+#include "arg.h"
+#include "common.h"
+#include "download.h"
+#include "log.h"
+
+#include <cstdio>
+#include <filesystem>
+
+static void print_usage(int /*argc*/, char ** argv) {
+    printf(
+        "\nexamples:\n"
+        "  %s -hf ggml-org/gemma-3-4b-it-qat-GGUF\n"
+        "  %s -hf ggml-org/gemma-3-4b-it-qat-GGUF:Q4_K_M\n"
+        "  %s -hf ggml-org/models -hff model.gguf\n"
+        "  %s -mu https://example.com/model.gguf -m model.gguf\n"
+        "\n",
+        argv[0], argv[0], argv[0], argv[0]
+    );
+}
+
+int llama_download(int argc, char ** argv);
+
+int llama_download(int argc, char ** argv) {
+    common_init();
+
+    common_params params;
+    params.verbosity = LOG_LEVEL_ERROR;
+
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_DOWNLOAD, print_usage)) {
+        return 1;
+    }
+
+    const bool has_source = !params.model.hf_repo.empty() || !params.model.url.empty() ||
+                            !params.model.path.empty()    || !params.model.docker_repo.empty();
+    if (!has_source) {
+        fprintf(stderr, "error: no model source specified (use --hf-repo, --model-url, --model or --docker-repo)\n");
+        return 1;
+    }
+
+    try {
+        common_models_handler handler = common_models_handler_init(params, LLAMA_EXAMPLE_DOWNLOAD);
+        common_models_handler_apply(handler, params);
+    } catch (const std::exception & e) {
+        fprintf(stderr, "error: %s\n", e.what());
+        return 1;
+    }
+
+    if (!params.models_preset.empty()) {
+        // -hf pointed at a preset repo: print the preset path and stop
+        printf("%s\n", params.models_preset.c_str());
+        return 0;
+    }
+    if (params.model.path.empty()) {
+        fprintf(stderr, "error: model download failed\n");
+        return 1;
+    }
+    if (!std::filesystem::exists(params.model.path)) {
+        fprintf(stderr, "error: model file does not exist: %s\n", params.model.path.c_str());
+        return 1;
+    }
+
+    printf("%s\n", params.model.path.c_str());
+    if (!params.mmproj.path.empty()) {
+        printf("%s\n", params.mmproj.path.c_str());
+    }
+    if (!params.speculative.draft.mparams.path.empty()) {
+        printf("%s\n", params.speculative.draft.mparams.path.c_str());
+    }
+
+    return 0;
+}
@@ -19,6 +19,7 @@ int llama_batched_bench(int argc, char ** argv);
 int llama_fit_params(int argc, char ** argv);
 int llama_quantize(int argc, char ** argv);
 int llama_perplexity(int argc, char ** argv);
+int llama_download(int argc, char ** argv);

 // Self-update is only supported for binaries built with llama-install.sh
 static int llama_update(int argc, char ** argv) {
@@ -61,6 +62,7 @@ static const command cmds[] = {
    {"serve",         "HTTP API server",                                    {"server"},   false,         llama_server       },
    {"cli",           "Command-line interactive interface",                 {"client"},   false,         llama_cli          },
    {"update",        "Update llama to the latest release",                 {},           UPDATE_HIDDEN, llama_update       },
+    {"download",      "Download a model",                                   {"get"},      false,         llama_download     },
    {"completion",    "Text completion",                                    {"complete"}, true,          llama_completion   },
    {"bench",         "Benchmark prompt processing and text generation",    {},           true,          llama_bench        },
    {"batched-bench", "Benchmark batched decoding performance",             {},           true,          llama_batched_bench},
@@ -13,6 +13,7 @@ LLAMA_BUILD_EXAMPLES=OFF
 LLAMA_BUILD_TOOLS=OFF
 LLAMA_BUILD_TESTS=OFF
 LLAMA_BUILD_SERVER=OFF
+LLAMA_BUILD_MTMD=ON
 GGML_METAL=ON
 GGML_METAL_EMBED_LIBRARY=ON
 GGML_BLAS_DEFAULT=ON
@@ -39,6 +40,7 @@ COMMON_CMAKE_ARGS=(
    -DLLAMA_BUILD_TOOLS=${LLAMA_BUILD_TOOLS}
    -DLLAMA_BUILD_TESTS=${LLAMA_BUILD_TESTS}
    -DLLAMA_BUILD_SERVER=${LLAMA_BUILD_SERVER}
+    -DLLAMA_BUILD_MTMD=${LLAMA_BUILD_MTMD}
    -DGGML_METAL_EMBED_LIBRARY=${GGML_METAL_EMBED_LIBRARY}
    -DGGML_BLAS_DEFAULT=${GGML_BLAS_DEFAULT}
    -DGGML_METAL=${GGML_METAL}
@@ -126,6 +128,8 @@ setup_framework_structure() {
    cp ggml/include/ggml-cpu.h     ${header_path}
    cp ggml/include/ggml-blas.h    ${header_path}
    cp ggml/include/gguf.h         ${header_path}
+    cp tools/mtmd/mtmd.h           ${header_path}
+    cp tools/mtmd/mtmd-helper.h    ${header_path}

    # Create module map (common for all platforms)
    cat > ${module_path}module.modulemap << EOF
@@ -247,6 +251,7 @@ combine_static_libraries() {
        "${base_dir}/${build_dir}/ggml/src/${release_dir}/libggml-cpu.a"
        "${base_dir}/${build_dir}/ggml/src/ggml-metal/${release_dir}/libggml-metal.a"
        "${base_dir}/${build_dir}/ggml/src/ggml-blas/${release_dir}/libggml-blas.a"
+        "${base_dir}/${build_dir}/tools/mtmd/${release_dir}/libmtmd.a"
    )

    # Create temporary directory for processing
@@ -410,6 +415,7 @@ cmake -B build-ios-sim -G Xcode \
    -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
    -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
    -DLLAMA_OPENSSL=OFF \
+    -DMTMD_VIDEO=OFF \
    -S .
 cmake --build build-ios-sim --config Release -j $(sysctl -n hw.logicalcpu) -- -quiet

@@ -424,6 +430,7 @@ cmake -B build-ios-device -G Xcode \
    -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
    -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
    -DLLAMA_OPENSSL=OFF \
+    -DMTMD_VIDEO=OFF \
    -S .
 cmake --build build-ios-device --config Release -j $(sysctl -n hw.logicalcpu) -- -quiet

@@ -450,6 +457,7 @@ cmake -B build-visionos -G Xcode \
    -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
    -DLLAMA_OPENSSL=OFF \
    -DLLAMA_BUILD_SERVER=OFF \
+    -DMTMD_VIDEO=OFF \
    -S .
 cmake --build build-visionos --config Release -j $(sysctl -n hw.logicalcpu) -- -quiet

@@ -465,6 +473,7 @@ cmake -B build-visionos-sim -G Xcode \
    -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
    -DLLAMA_OPENSSL=OFF \
    -DLLAMA_BUILD_SERVER=OFF \
+    -DMTMD_VIDEO=OFF \
    -S .
 cmake --build build-visionos-sim --config Release -j $(sysctl -n hw.logicalcpu) -- -quiet

@@ -481,6 +490,7 @@ cmake -B build-tvos-sim -G Xcode \
    -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
    -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
    -DLLAMA_OPENSSL=OFF \
+    -DMTMD_VIDEO=OFF \
    -S .
 cmake --build build-tvos-sim --config Release -j $(sysctl -n hw.logicalcpu) -- -quiet

@@ -496,6 +506,7 @@ cmake -B build-tvos-device -G Xcode \
    -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
    -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
    -DLLAMA_OPENSSL=OFF \
+    -DMTMD_VIDEO=OFF \
    -S .
 cmake --build build-tvos-device --config Release -j $(sysctl -n hw.logicalcpu) -- -quiet

@@ -80,8 +80,6 @@ add_library(${TARGET}
    http.h
    imatrix-loader.cpp
    imatrix-loader.h
-    json-partial.cpp
-    json-partial.h
    json-schema-to-grammar.cpp
    llguidance.cpp
    log.cpp
@@ -297,60 +297,6 @@ struct handle_model_result {
    std::string preset_path;
 };

-static handle_model_result common_params_handle_model(struct common_params_model & model,
-                                                      const common_download_opts & opts) {
-    handle_model_result result;
-
-    // TODO @ngxson : refactor this into a new common_model_download_context
-
-    if (!model.docker_repo.empty()) {
-        model.path = common_docker_resolve_model(model.docker_repo);
-    } else if (!model.hf_repo.empty()) {
-        // If -m was used with -hf, treat the model "path" as the hf_file to download
-        if (model.hf_file.empty() && !model.path.empty()) {
-            model.hf_file = model.path;
-            model.path = "";
-        }
-        common_download_opts hf_opts = opts;
-        auto download_result = common_download_model(model, hf_opts);
-
-        if (!download_result.preset_path.empty()) {
-            result.found_preset = true;
-            result.preset_path = download_result.preset_path;
-            return result; // skip everything else if preset.ini is used
-        }
-
-        if (download_result.model_path.empty()) {
-            throw std::runtime_error("failed to download model from Hugging Face");
-        }
-
-        model.path = download_result.model_path;
-
-        if (!download_result.mmproj_path.empty()) {
-            result.found_mmproj = true;
-            result.mmproj.path  = download_result.mmproj_path;
-        }
-
-        if (!download_result.mtp_path.empty()) {
-            result.found_mtp = true;
-            result.mtp.path  = download_result.mtp_path;
-        }
-    } else if (!model.url.empty()) {
-        if (model.path.empty()) {
-            auto f = string_split<std::string>(model.url, '#').front();
-            f = string_split<std::string>(f, '?').front();
-            model.path = fs_get_cache_file(string_split<std::string>(f, '/').back());
-        }
-
-        auto download_result = common_download_model(model, opts);
-        if (download_result.model_path.empty()) {
-            throw std::runtime_error("failed to download model from " + model.url);
-        }
-    }
-
-    return result;
-}
-
 const std::vector<ggml_type> kv_cache_types = {
    GGML_TYPE_F32,
    GGML_TYPE_F16,
@@ -395,77 +341,204 @@ static bool parse_bool_value(const std::string & value) {
 }

 //
-// CLI argument parsing functions
+// common_models_handler
 //

-bool common_params_handle_models(common_params & params, llama_example curr_ex, const common_params_handle_models_params & handle_params) {
-    const bool spec_type_draft_mtp = std::find(params.speculative.types.begin(),
-                                         params.speculative.types.end(),
-                                         COMMON_SPECULATIVE_TYPE_DRAFT_MTP) != params.speculative.types.end();
+static std::string get_default_local_path(const std::string & url) {
+    auto f = string_split<std::string>(url, '#').front();
+    f = string_split<std::string>(f, '?').front();
+    return fs_get_cache_file(string_split<std::string>(f, '/').back());
+}

+common_models_handler common_models_handler_init(const common_params & params, llama_example curr_ex) {
+    common_download_hf_plan plan;
    common_download_opts opts;
+
+    const bool spec_type_draft_mtp = std::find(params.speculative.types.begin(),
+                                        params.speculative.types.end(),
+                                        COMMON_SPECULATIVE_TYPE_DRAFT_MTP) != params.speculative.types.end();
+
+    // only download mmproj if the current example is using it
+    bool use_mmproj = false;
+    for (const auto & ex : mmproj_examples) {
+        if (curr_ex == ex) {
+            use_mmproj = true;
+            break;
+        }
+    }
+
    opts.bearer_token    = params.hf_token;
    opts.offline         = params.offline;
-    opts.skip_download   = params.skip_download;
    opts.download_mtp    = spec_type_draft_mtp;
-    opts.download_mmproj = !params.no_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty();
-    opts.preset_only     = handle_params.preset_only;
+    opts.download_mmproj = use_mmproj && !params.no_mmproj
+                        && params.mmproj.path.empty() && params.mmproj.url.empty();

-    if (handle_params.callback) {
-        opts.callback = handle_params.callback;
+    if (!params.model.hf_repo.empty()) {
+        plan = common_download_get_hf_plan(params.model, opts);
    }

-    // sub-models (draft, mmproj, vocoder) are explicitly specified by the user,
-    // so we should not auto-discover mtp/mmproj siblings for them
-    common_download_opts sub_opts = opts;
-    sub_opts.download_mtp    = false;
-    sub_opts.download_mmproj = false;
+    return common_models_handler{plan, opts};
+}

-    try {
-        auto res = common_params_handle_model(params.model, opts);
-        if (res.found_preset) {
-            if (!params.models_preset.empty()) {
-                throw std::invalid_argument("cannot use both --models-preset and -hf with a preset.ini file");
+bool common_models_handler_is_preset_repo(const common_models_handler & handler) {
+    return !handler.plan.preset.url.empty();
+}
+
+static std::vector<common_download_task> build_url_tasks(const common_params_model & model, common_download_opts opts) {
+    auto parts = common_download_get_all_parts(model.url);
+    std::vector<common_download_task> tasks;
+
+    // single-part: download straight to model.path if the user gave one (-m), else the cache default
+    if (parts.size() == 1) {
+        common_download_task task;
+        task.url        = parts[0];
+        task.local_path = model.path.empty() ? get_default_local_path(parts[0]) : model.path;
+        task.opts       = opts;
+        tasks.push_back(std::move(task));
+        return tasks;
+    }
+
+    // multi-part: place each part under the user's -m directory (if given), else the cache default
+    std::string base_dir;
+    if (!model.path.empty()) {
+        auto pos = model.path.rfind('/');
+        base_dir = pos == std::string::npos ? std::string(".") : model.path.substr(0, pos);
+    }
+
+    for (const auto & part : parts) {
+        common_download_task task;
+        task.url  = part;
+        task.opts = opts;
+
+        std::string local = get_default_local_path(part);
+        if (!base_dir.empty()) {
+            auto pos = local.rfind('/');
+            std::string name = pos == std::string::npos ? local : local.substr(pos + 1);
+            local = base_dir + "/" + name;
+        }
+        task.local_path = local;
+        tasks.push_back(std::move(task));
+    }
+    return tasks;
+}
+
+void common_models_handler_apply(common_models_handler & handler, common_params & params, common_download_callback * callback) {
+    std::vector<common_download_task> tasks;
+
+    auto & plan = handler.plan;
+
+    auto opts = handler.opts; // copy
+    opts.callback = callback;
+
+    // handle plain "url" if needed
+    auto handle_url = [&](common_params_model & model) {
+        if (!model.url.empty()) {
+            if (model.path.empty()) {
+                model.path = get_default_local_path(model.url);
            }
+        }
+    };
+    handle_url(params.model);
+    handle_url(params.mmproj);
+    handle_url(params.vocoder.model);
+    handle_url(params.speculative.draft.mparams);
+
+    // optionally, if docker repo is set, resolve it
+    if (!params.model.docker_repo.empty()) {
+        params.model.url  = common_docker_resolve_model(params.model.docker_repo);
+        params.model.path = get_default_local_path(params.model.url);
+    }
+
+    // handle plain "url" tasks (non-hf)
+    if (!params.model.url.empty()) {
+        auto url_tasks = build_url_tasks(params.model, opts);
+        // the first part is what gets loaded, so point params.model.path at it
+        if (!url_tasks.empty()) {
+            std::string first_path = url_tasks.front().local_path;
+            url_tasks.front().on_done = [&]() { params.model.path = first_path; };
+        }
+        for (auto & task : url_tasks) {
+            tasks.push_back(std::move(task));
+        }
+    }
+    if (!params.mmproj.url.empty()) {
+        common_download_task task;
+        task.url        = params.mmproj.url;
+        task.local_path = params.mmproj.path;
+        task.opts       = opts;
+        tasks.push_back(task);
+    }
+    if (!params.vocoder.model.url.empty()) {
+        common_download_task task;
+        task.url        = params.vocoder.model.url;
+        task.local_path = params.vocoder.model.path;
+        task.opts       = opts;
+        tasks.push_back(task);
+    }
+    if (!params.speculative.draft.mparams.url.empty()) {
+        common_download_task task;
+        task.url        = params.speculative.draft.mparams.url;
+        task.local_path = params.speculative.draft.mparams.path;
+        task.opts       = opts;
+        tasks.push_back(task);
+    }
+
+    // handle hf_plan tasks
+    if (!plan.model_files.empty()) {
+        for (size_t i = 0; i < plan.model_files.size(); ++i) {
+            auto & model_file = plan.model_files[i];
+            bool is_first = (i == 0);
+            tasks.emplace_back(model_file, opts, [&, is_first]() {
+                if (is_first) {
+                    // only use first part as model path
+                    params.model.path = hf_cache::finalize_file(model_file);
+                } else {
+                    hf_cache::finalize_file(model_file);
+                }
+            });
+        }
+    }
+    if (!plan.mmproj.local_path.empty()) {
+        tasks.emplace_back(plan.mmproj, opts, [&]() {
+            params.mmproj.path = hf_cache::finalize_file(plan.mmproj);
+        });
+    }
+    if (!plan.mtp.local_path.empty()) {
+        tasks.emplace_back(plan.mtp, opts, [&]() {
+            // only fall back to the discovered MTP head when no draft was explicitly provided
+            if (params.speculative.draft.mparams.empty()) {
+                params.speculative.draft.mparams.path = hf_cache::finalize_file(plan.mtp);
+            } else {
+                hf_cache::finalize_file(plan.mtp);
+            }
+        });
+    }
+    if (!plan.preset.local_path.empty()) {
+        tasks.emplace_back(plan.preset, opts, [&]() {
            // if HF repo is a preset repo, we simply run server in router mode with the preset.ini file
            params.models_preset_hf = params.model.hf_repo; // only for showing a warning
-            params.models_preset    = res.preset_path;
+            params.models_preset    = hf_cache::finalize_file(plan.preset);
            params.model = common_params_model{}; // make sure to clear model, so server starts in router mode
-            return true;
-        }
+        });
+    }

-        if (params.no_mmproj) {
-            params.mmproj = {};
-        } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
-            // optionally, handle mmproj model when -hf is specified
-            params.mmproj = res.mmproj;
-        }
-        // only download mmproj if the current example is using it
-        for (const auto & ex : mmproj_examples) {
-            if (curr_ex == ex) {
-                common_params_handle_model(params.mmproj, sub_opts);
-                break;
-            }
-        }
+    // run all tasks in parallel
+    if (!params.offline) {
+        common_download_run_tasks(tasks);
+    }

-        // when --spec-type mtp is set and no draft model was provided explicitly,
-        // fall back to the MTP head discovered alongside the -hf model
-        if (spec_type_draft_mtp && res.found_mtp &&
-            params.speculative.draft.mparams.path.empty() &&
-            params.speculative.draft.mparams.hf_repo.empty() &&
-            params.speculative.draft.mparams.url.empty()) {
-            params.speculative.draft.mparams.path = res.mtp.path;
+    // download successful, update params with the downloaded paths
+    for (const auto & task : tasks) {
+        if (task.on_done) {
+            task.on_done();
        }
-        common_params_handle_model(params.speculative.draft.mparams, sub_opts);
-        common_params_handle_model(params.vocoder.model,             sub_opts);
-        return true;
-    } catch (const common_skip_download_exception &) {
-        return false;
-    } catch (const std::exception &) {
-        throw;
    }
 }

+//
+// CLI argument parsing functions
+//
+
 static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) {
    common_params & params = ctx_arg.params;

@@ -594,12 +667,15 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
    const bool skip_model_download =
        // server will call common_params_handle_models() later, so we skip it here
        ctx_arg.ex == LLAMA_EXAMPLE_SERVER ||
+        // download calls common_params_handle_models() itself and prints the paths
+        ctx_arg.ex == LLAMA_EXAMPLE_DOWNLOAD ||
        // export_graph_ops loads only metadata
        ctx_arg.ex == LLAMA_EXAMPLE_EXPORT_GRAPH_OPS;

    if (!skip_model_download) {
        // handle model and download
-        common_params_handle_models(params, ctx_arg.ex, {});
+        common_models_handler handler = common_models_handler_init(params, ctx_arg.ex);
+        common_models_handler_apply(handler, params);

        // model is required (except for server)
        // TODO @ngxson : maybe show a list of available models in CLI in this case
@@ -671,15 +747,19 @@ static void common_params_print_usage(common_params_context & ctx_arg) {
            common_options.push_back(&opt);
        }
    }
-    printf("----- common params -----\n\n");
-    print_options(common_options);
-    printf("\n\n----- sampling params -----\n\n");
-    print_options(sampling_options);
-    printf("\n\n----- speculative params -----\n\n");
-    print_options(spec_options);
-    // TODO: maybe convert enum llama_example to string
-    printf("\n\n----- example-specific params -----\n\n");
-    print_options(specific_options);
+    bool first = true;
+    auto print_section = [&](const char * header, std::vector<common_arg *> & options) {
+        if (options.empty()) {
+            return;
+        }
+        printf("%s----- %s -----\n\n", first ? "" : "\n\n", header);
+        first = false;
+        print_options(options);
+    };
+    print_section("common params",           common_options);
+    print_section("sampling params",         sampling_options);
+    print_section("speculative params",      spec_options);
+    print_section("example-specific params", specific_options);
 }

 static void common_params_print_completion(common_params_context & ctx_arg) {
@@ -1079,7 +1159,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     * - if both {LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_*,} are set, we will prioritize the LLAMA_EXAMPLE_* matching current example
     */
    auto add_opt = [&](common_arg arg) {
-        if ((arg.in_example(ex) || arg.in_example(LLAMA_EXAMPLE_COMMON)) && !arg.is_exclude(ex)) {
+        // download only exposes the handful of args explicitly tagged for it
+        const bool inherit_common = ex != LLAMA_EXAMPLE_DOWNLOAD;
+        if ((arg.in_example(ex) || (inherit_common && arg.in_example(LLAMA_EXAMPLE_COMMON))) && !arg.is_exclude(ex)) {
            ctx_arg.options.push_back(std::move(arg));
        }
    };
@@ -1090,7 +1172,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params) {
            params.usage = true;
        }
-    ));
+    ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_DOWNLOAD}));
    add_opt(common_arg(
        {"--version"},
        "show version and build info",
@@ -2212,7 +2294,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params, bool value) {
            params.no_mmproj = !value;
        }
-    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_AUTO"));
+    ).set_examples({LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DOWNLOAD}).set_env("LLAMA_ARG_MMPROJ_AUTO"));
    add_opt(common_arg(
        {"--mmproj-offload"},
        {"--no-mmproj-offload"},
@@ -2611,14 +2693,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params, const std::string & value) {
            params.model.path = value;
        }
-    ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL"));
+    ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_DOWNLOAD}).set_env("LLAMA_ARG_MODEL"));
    add_opt(common_arg(
        {"-mu", "--model-url"}, "MODEL_URL",
        "model download url (default: unused)",
        [](common_params & params, const std::string & value) {
            params.model.url = value;
        }
-    ).set_env("LLAMA_ARG_MODEL_URL"));
+    ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_DOWNLOAD}).set_env("LLAMA_ARG_MODEL_URL"));
    add_opt(common_arg(
        { "-dr", "--docker-repo" }, "[<repo>/]<model>[:quant]",
        "Docker Hub model repository. repo is optional, default to ai/. quant is optional, default to :latest.\n"
@@ -2627,7 +2709,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params, const std::string & value) {
            params.model.docker_repo = value;
        }
-    ).set_env("LLAMA_ARG_DOCKER_REPO"));
+    ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_DOWNLOAD}).set_env("LLAMA_ARG_DOCKER_REPO"));
    add_opt(common_arg(
        {"-hf", "-hfr", "--hf-repo"}, "<user>/<model>[:quant]",
        "Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n"
@@ -2637,14 +2719,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params, const std::string & value) {
            params.model.hf_repo = value;
        }
-    ).set_env("LLAMA_ARG_HF_REPO"));
+    ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_DOWNLOAD}).set_env("LLAMA_ARG_HF_REPO"));
    add_opt(common_arg(
        {"-hff", "--hf-file"}, "FILE",
        "Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)",
        [](common_params & params, const std::string & value) {
            params.model.hf_file = value;
        }
-    ).set_env("LLAMA_ARG_HF_FILE"));
+    ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_DOWNLOAD}).set_env("LLAMA_ARG_HF_FILE"));
    add_opt(common_arg(
        {"-hfv", "-hfrv", "--hf-repo-v"}, "<user>/<model>[:quant]",
        "Hugging Face model repository for the vocoder model (default: unused)",
@@ -2665,7 +2747,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params, const std::string & value) {
            params.hf_token = value;
        }
-    ).set_env("HF_TOKEN"));
+    ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_DOWNLOAD}).set_env("HF_TOKEN"));
+    add_opt(common_arg(
+        {"--mtp"},
+        "also download the multi-token prediction (MTP) head, if available (default: unused)",
+        [](common_params & params) {
+            params.speculative.types.push_back(COMMON_SPECULATIVE_TYPE_DRAFT_MTP);
+        }
+    ).set_examples({LLAMA_EXAMPLE_DOWNLOAD}));
    add_opt(common_arg(
        {"--context-file"}, "FNAME",
        "file to load context from (use comma-separated values to specify multiple files)",
@@ -8,6 +8,7 @@
 #include <string>
 #include <vector>
 #include <cstring>
+#include <memory>

 // pseudo-env variable to identify preset-only arguments
 #define COMMON_ARG_PRESET_LOAD_ON_STARTUP "__PRESET_LOAD_ON_STARTUP"
@@ -130,19 +131,19 @@ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<com
 // see: https://github.com/ggml-org/llama.cpp/issues/18163
 void common_params_add_preset_options(std::vector<common_arg> & args);

-struct common_params_handle_models_params {
-    common_download_callback * callback = nullptr;
-    bool preset_only = false; // if true, only check & download remote preset (for router mode)
+struct common_models_handler {
+    common_download_hf_plan plan;
+    common_download_opts opts;
 };

-// populate model paths (main model, mmproj, etc) from -hf if necessary
-// return true if the model is ready to use
-// throw an exception if there is an error that prevents the model from being used (e.g. network error, model not found, etc)
-// if params.skip_download is true, no downloads will be attempted. return false if the model is invalid or missing (e.g. ETag check failed)
-bool common_params_handle_models(
-    common_params & params,
-    llama_example curr_ex,
-    const common_params_handle_models_params & handle_params);
+// initialize downloading opts and hf_plan if needed, but does not download anything yet
+common_models_handler common_models_handler_init(const common_params & params, llama_example curr_ex);
+
+// check if the model is a preset repo (i.e. has a preset file)
+bool common_models_handler_is_preset_repo(const common_models_handler & handler);
+
+// download and update params with the downloaded model path
+void common_models_handler_apply(common_models_handler & handler, common_params & params, common_download_callback * callback = nullptr);

 // initialize argument parser context - used by test-arg-parser and preset
 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
@@ -2758,5 +2758,9 @@ common_chat_msg common_chat_peg_parse(const common_peg_arena &          src_pars
 std::map<std::string, bool> common_chat_templates_get_caps(const common_chat_templates * chat_templates) {
    GGML_ASSERT(chat_templates != nullptr);
    GGML_ASSERT(chat_templates->template_default != nullptr);
+    if (chat_templates->template_tool_use != nullptr) {
+        // take the more expressive template when available
+        return chat_templates->template_tool_use->caps.to_map();
+    }
    return chat_templates->template_default->caps.to_map();
 }
@@ -96,6 +96,7 @@ enum llama_example {
    LLAMA_EXAMPLE_FIT_PARAMS,
    LLAMA_EXAMPLE_RESULTS,
    LLAMA_EXAMPLE_EXPORT_GRAPH_OPS,
+    LLAMA_EXAMPLE_DOWNLOAD,

    LLAMA_EXAMPLE_COUNT,
 };
@@ -290,13 +291,13 @@ struct common_params_sampling {
 };

 struct common_params_model {
-    std::string path        = ""; // model local path                                       // NOLINT
-    std::string url         = ""; // model url to download                                  // NOLINT
-    std::string hf_repo     = ""; // HF repo                                                // NOLINT
-    std::string hf_file     = ""; // HF file                                                // NOLINT
-    std::string docker_repo = ""; // Docker repo                                            // NOLINT
+    std::string path        = ""; // model local path
+    std::string url         = ""; // model url to download
+    std::string hf_repo     = ""; // HF repo
+    std::string hf_file     = ""; // HF file
+    std::string docker_repo = ""; // Docker repo

-    std::string get_name() {
+    std::string get_name() const {
        if (!hf_repo.empty()) {
            return hf_repo;
        }
@@ -305,6 +306,10 @@ struct common_params_model {
        }
        return path;
    }
+
+    bool empty() const {
+        return get_name().empty();
+    }
 };

 // draft-model-based speculative decoding parameters
@@ -367,7 +372,7 @@ struct common_params_speculative {
    common_params_speculative_ngram_cache ngram_cache;

    bool has_dft() const {
-        return !draft.mparams.path.empty() || !draft.mparams.hf_repo.empty();
+        return !draft.mparams.empty();
    }

    uint32_t need_n_rs_seq() const {
@@ -519,7 +524,6 @@ struct common_params {
    int32_t control_vector_layer_start = -1; // layer range for control vector
    int32_t control_vector_layer_end   = -1; // layer range for control vector
    bool    offline                    = false;
-    bool    skip_download              = false; // skip model file downloading

    int32_t ppl_stride      = 0;     // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
    int32_t ppl_output_type = 0;     // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
@@ -292,10 +292,6 @@ static int common_download_file_single_online(const std::string & url,

    const bool file_exists = std::filesystem::exists(path);

-    if (!file_exists && opts.skip_download) {
-        return -2; // file is missing and download is disabled
-    }
-
    if (file_exists && skip_etag) {
        LOG_DBG("%s: using cached file: %s\n", __func__, path.c_str());
        return 304; // 304 Not Modified - fake cached response
@@ -362,9 +358,6 @@ static int common_download_file_single_online(const std::string & url,
            return 304; // 304 Not Modified - fake cached response
        }
        // pass this point, the file exists but is different from the server version, so we need to redownload it
-        if (opts.skip_download) {
-            return -2; // special code to indicate that the download was skipped due to etag mismatch
-        }
        if (remove(path.c_str()) != 0) {
            LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
            return -1;
@@ -691,19 +684,8 @@ static void list_available_gguf_files(const hf_cache::hf_files & files) {
    }
 }

-struct hf_plan {
-    hf_cache::hf_file primary;
-    hf_cache::hf_files model_files;
-    hf_cache::hf_file mmproj;
-    hf_cache::hf_file mtp;
-    hf_cache::hf_file preset; // if set, only this file is downloaded
-};
-
-static hf_plan get_hf_plan(const common_params_model  & model,
-                           const common_download_opts & opts,
-                           bool download_mmproj,
-                           bool download_mtp) {
-    hf_plan plan;
+common_download_hf_plan common_download_get_hf_plan(const common_params_model & model, const common_download_opts & opts) {
+    common_download_hf_plan plan;
    hf_cache::hf_files all;

    auto [repo, tag] = common_download_split_repo_tag(model.hf_repo);
@@ -752,127 +734,49 @@ static hf_plan get_hf_plan(const common_params_model  & model,
    plan.primary = primary;
    plan.model_files = get_split_files(all, primary);

-    if (download_mmproj) {
+    if (opts.download_mmproj) {
        plan.mmproj = find_best_mmproj(all, primary.path);
    }
-
-    if (download_mtp) {
+    if (opts.download_mtp) {
        plan.mtp = find_best_mtp(all, primary.path);
    }

    return plan;
 }

-struct download_task {
-    std::string url;
-    std::string path;
-};
-
-static std::vector<download_task> get_url_tasks(const common_params_model & model) {
-    auto split = get_gguf_split_info(model.url);
-
-    if (split.count <= 1) {
-        return {{model.url, model.path}};
-    }
-
-    auto filename = split.prefix;
-    if (auto pos = split.prefix.rfind('/'); pos != std::string::npos) {
-        filename = split.prefix.substr(pos + 1);
-    }
-
-    auto parent_path = std::filesystem::path(model.path).parent_path();
-    auto prefix_path = (parent_path / filename).string();
-
-    std::vector<download_task> tasks;
-    for (int i = 1; i <= split.count; i++) {
-        auto suffix = string_format("-%05d-of-%05d.gguf", i, split.count);
-        tasks.push_back({split.prefix + suffix, prefix_path + suffix});
-    }
-    return tasks;
-}
-
-common_download_model_result common_download_model(const common_params_model  & model,
-                                                   const common_download_opts & opts) {
-    common_download_model_result result;
-    std::vector<download_task> tasks;
-    hf_plan hf;
-
-    bool download_mmproj = opts.download_mmproj;
-    bool download_mtp = opts.download_mtp;
-    bool preset_only = opts.preset_only;
-    bool is_hf = !model.hf_repo.empty();
-
-    if (is_hf) {
-        hf = get_hf_plan(model, opts, download_mmproj, download_mtp);
-        if (!hf.preset.path.empty()) {
-            // if preset.ini exists, only download that file alone
-            tasks.push_back({hf.preset.url, hf.preset.local_path});
-        } else if (!preset_only) {
-            // only add other files if we're NOT in preset-only mode (normal run, non-router)
-            for (const auto & f : hf.model_files) {
-                tasks.push_back({f.url, f.local_path});
-            }
-            if (!hf.mmproj.path.empty()) {
-                tasks.push_back({hf.mmproj.url, hf.mmproj.local_path});
-            }
-            if (!hf.mtp.path.empty()) {
-                tasks.push_back({hf.mtp.url, hf.mtp.local_path});
-            }
-        }
-    } else if (!model.url.empty()) {
-        tasks = get_url_tasks(model);
-    } else {
-        result.model_path = model.path;
-        return result;
-    }
-
-    if (tasks.empty()) {
-        return result;
-    }
-
+void common_download_run_tasks(const std::vector<common_download_task> & tasks) {
    std::vector<std::future<int>> futures;
    for (const auto & task : tasks) {
        futures.push_back(std::async(std::launch::async,
-            [&task, &opts, is_hf]() {
-                return common_download_file_single(task.url, task.path, opts, is_hf);
+            [&task]() {
+                return common_download_file_single(task.url, task.local_path, task.opts, task.is_hf);
            }
        ));
    }

-    for (auto & f : futures) {
-        int status = f.get();
-        if (status == -2 && opts.skip_download) {
-            throw common_skip_download_exception();
-        }
+    for (size_t i = 0; i < futures.size(); ++i) {
+        std::string url = tasks[i].url;
+        int status = futures[i].get();
        bool is_ok = is_http_status_ok(status);
        if (!is_ok) {
-            return {};
+            throw std::runtime_error(string_format("Download '%s' failed with status code: %d", url.c_str(), status));
        }
    }
+}

-    if (is_hf) {
-        if (!hf.preset.path.empty()) {
-            // if preset.ini is used, do not set other paths
-            result.preset_path = hf_cache::finalize_file(hf.preset);
-        } else {
-            for (const auto & f : hf.model_files) {
-                hf_cache::finalize_file(f);
-            }
-            result.model_path = hf.primary.final_path;
+std::vector<std::string> common_download_get_all_parts(const std::string & url) {
+    auto split = get_gguf_split_info(url);

-            if (!hf.mmproj.path.empty()) {
-                result.mmproj_path = hf_cache::finalize_file(hf.mmproj);
-            }
-
-            if (!hf.mtp.path.empty()) {
-                result.mtp_path = hf_cache::finalize_file(hf.mtp);
-            }
-        }
-    } else {
-        result.model_path = model.path;
+    if (split.count <= 1) {
+        return {url};
    }

-    return result;
+    std::vector<std::string> parts;
+    for (int i = 1; i <= split.count; i++) {
+        auto suffix = string_format("-%05d-of-%05d.gguf", i, split.count);
+        parts.push_back(split.prefix + suffix);
+    }
+    return parts;
 }

 //
@@ -1,7 +1,10 @@
 #pragma once

+#include "hf-cache.h"
+
 #include <string>
 #include <vector>
+#include <functional>

 struct common_params_model;

@@ -47,67 +50,40 @@ struct common_cached_model_info {
    }
 };

-// Options for common_download_model and common_download_file_single
+// Options for common_download_file_single
 struct common_download_opts {
    std::string bearer_token;
    common_header_list headers;
    bool offline = false;
-    bool skip_download = false; // if true, only validation is performed, common_skip_download_exception may be thrown if the file is missing or invalid
    bool download_mmproj = false;
    bool download_mtp = false;
-    bool preset_only = false; // if true, only check & download remote preset (for router mode)
    common_download_callback * callback = nullptr;
 };

-// Result of common_download_model
-struct common_download_model_result {
-    std::string model_path;
-    std::string mmproj_path;
-    std::string mtp_path;
-    std::string preset_path;
+struct common_download_task {
+    common_download_opts opts;
+    std::string url;
+    std::string local_path;
+    std::function<void()> on_done;
+    bool is_hf = false;
+
+    common_download_task() = default;
+    common_download_task(hf_cache::hf_file f,
+            const common_download_opts & opts,
+            std::function<void()> on_done = nullptr)
+        : opts(opts), url(f.url), local_path(f.local_path), on_done(on_done), is_hf(true) {}
 };

-// throw if the file is missing or invalid (e.g. ETag check failed)
-struct common_skip_download_exception : public std::runtime_error {
-    common_skip_download_exception() : std::runtime_error("skip download") {}
-};
+void common_download_run_tasks(const std::vector<common_download_task> & tasks);

-// Download model from HuggingFace repo or URL
-//
-// input (via model struct):
-// - model.hf_repo: HF repo with optional tag, see common_download_split_repo_tag
-// - model.hf_file: specific file in the repo (requires hf_repo)
-// - model.url: simple download (used if hf_repo is empty)
-// - model.path: local file path
-//
-// tag matching (for HF repos without model.hf_file):
-// - if tag is specified, searches for GGUF matching that quantization
-// - if no tag, searches for Q4_K_M, then Q4_0, then first available GGUF
-//
-// split GGUF: multi-part files like "model-00001-of-00003.gguf" are automatically
-// detected and all parts are downloaded
-//
-// caching:
-// - HF repos: uses HuggingFace cache
-// - URLs: uses ETag-based caching
-//
-// when opts.offline=true, no network requests are made
-// when download_mmproj=true, searches for mmproj in same directory as model or any parent directory
-// then with the closest quantization bits
-// when download_mtp=true, applies the same sibling search for an MTP-head GGUF
-//
-// returns result with model_path, mmproj_path and mtp_path (empty when not found / on failure)
-common_download_model_result common_download_model(
-    const common_params_model & model,
-    const common_download_opts & opts = {}
-);
+// if url is a multi-part GGUF file, returns all parts, otherwise returns the single file
+std::vector<std::string> common_download_get_all_parts(const std::string & url);

 // returns list of cached models
 std::vector<common_cached_model_info> common_list_cached_models();

 // download single file from url to local path
 // returns status code or -1 on error
-// returns -2 if the download was skipped due to ETag mismatch (file outdated, skip_download=true)
 // skip_etag: if true, don't read/write .etag files (for HF cache where filename is the hash)
 int common_download_file_single(const std::string & url,
                                const std::string & path,
@@ -124,3 +100,12 @@ std::string common_docker_resolve_model(const std::string & docker);
 // - if tag is present, removes only files matching that tag (and orphaned blobs)
 // returns true if anything was removed
 bool common_download_remove(const std::string & hf_repo_with_tag);
+
+struct common_download_hf_plan {
+    hf_cache::hf_file primary;
+    hf_cache::hf_files model_files;
+    hf_cache::hf_file mmproj;
+    hf_cache::hf_file mtp;
+    hf_cache::hf_file preset; // if set, only this file is downloaded
+};
+common_download_hf_plan common_download_get_hf_plan(const common_params_model & model, const common_download_opts & opts);
@@ -1,324 +0,0 @@
-#include "json-partial.h"
-
-#include "log.h"
-
-#include <nlohmann/json.hpp>
-
-#include <string>
-#include <regex>
-
-using json = nlohmann::ordered_json;
-
-enum common_json_stack_element_type {
-    COMMON_JSON_STACK_ELEMENT_OBJECT,
-    COMMON_JSON_STACK_ELEMENT_KEY,
-    COMMON_JSON_STACK_ELEMENT_ARRAY,
-};
-
-struct common_json_stack_element {
-    common_json_stack_element_type type;
-    std::string key;
-};
-
-bool common_json_parse(
-    const std::string & input,
-    const std::string & healing_marker,
-    common_json & out)
-{
-    std::string::const_iterator it = input.begin();
-    const auto end = input.end();
-    return common_json_parse(it, end, healing_marker, out);
-}
-
-bool common_json_parse(
-    std::string::const_iterator & it,
-    const std::string::const_iterator & end,
-    const std::string & healing_marker,
-    common_json & out)
-{
-    // // https://json.nlohmann.me/features/parsing/sax_interface/
-    struct json_error_locator : public nlohmann::json_sax<json> {
-        std::size_t position;
-        bool found_error;
-        std::string last_token;
-        std::string exception_message;
-        std::vector<common_json_stack_element> stack;
-
-        json_error_locator() : position(0), found_error(false) {}
-
-        bool parse_error(std::size_t position, const std::string & last_token, const json::exception & ex) override { // NOLINT
-            this->position = position - 1;
-            this->found_error = true;
-            this->last_token = last_token;
-            this->exception_message = ex.what();
-            return false;
-        }
-        void close_value() {
-            if (!stack.empty() && (stack.back().type == COMMON_JSON_STACK_ELEMENT_KEY)) {
-                stack.pop_back();
-            }
-        }
-        bool null() override { // NOLINT
-            close_value();
-            return true;
-        }
-        bool boolean(bool) override { // NOLINT
-            close_value();
-            return true;
-        }
-        bool number_integer(number_integer_t) override { // NOLINT
-            close_value();
-            return true;
-        }
-        bool number_unsigned(number_unsigned_t) override { // NOLINT
-            close_value();
-            return true;
-        }
-        bool number_float(number_float_t, const string_t &) override { // NOLINT
-            close_value();
-            return true;
-        }
-        bool string(string_t &) override { // NOLINT
-            close_value();
-            return true;
-        }
-        bool binary(binary_t &) override { // NOLINT
-            close_value();
-            return true;
-        }
-        bool start_object(std::size_t) override { // NOLINT
-            stack.push_back({COMMON_JSON_STACK_ELEMENT_OBJECT, ""});
-            return true;
-        }
-        bool end_object() override {
-            GGML_ASSERT(!stack.empty() && stack.back().type == COMMON_JSON_STACK_ELEMENT_OBJECT);
-            stack.pop_back();
-            close_value();
-            return true;
-        }
-        bool key(string_t & key) override { // NOLINT
-            stack.push_back({COMMON_JSON_STACK_ELEMENT_KEY, key});
-            return true;
-        }
-        bool start_array(std::size_t) override { // NOLINT
-            stack.push_back({COMMON_JSON_STACK_ELEMENT_ARRAY, ""});
-            return true;
-        }
-        bool end_array() override {
-            GGML_ASSERT(!stack.empty() && stack.back().type == COMMON_JSON_STACK_ELEMENT_ARRAY);
-            stack.pop_back();
-            close_value();
-            return true;
-        }
-    };
-    json_error_locator err_loc;
-    auto start = it;
-    json::sax_parse(it, end, &err_loc);
-
-    if (err_loc.found_error) {
-        it = start;
-        auto temptative_end = it + err_loc.position;
-        // LOG_DBG("Error at position %zu (is_end = %s): %s\n", err_loc.position, temptative_end == end ? "true" : "false", err_loc.exception_message.c_str());
-
-        auto input = std::string(it, temptative_end);
-        try {
-            out.json = json::parse(input);
-            // out.json = json::parse(it, temptative_end);
-            it = temptative_end;
-            return true;
-        } catch (const std::exception & ex) {
-            // No, needs healing.
-            LOG_DBG("Failed to parse up to error: %s: <<<%s>>>\n", ex.what(), std::string(it, temptative_end).c_str());
-        }
-        auto can_parse = [](const std::string & str) {
-            try {
-                auto _ = json::parse(str); // NOLINT
-                return true;
-            } catch (const std::exception &) {
-                return false;
-            }
-        };
-        if (!healing_marker.empty() && !err_loc.stack.empty()) {
-            std::string str(it, temptative_end);
-            auto last_non_sp_pos = str.find_last_not_of(" \n\r\t");
-            if (last_non_sp_pos == std::string::npos) {
-                throw std::runtime_error("Cannot heal a truncated JSON that stopped in an unknown location");
-            }
-            auto last_non_sp_char = str[last_non_sp_pos];
-            // Used to detect stops on a number, which may not be complete.
-            auto was_maybe_number = [&]() {
-                if (!str.empty() && std::isspace(str.back())) {
-                    return false;
-                }
-                return std::isdigit(last_non_sp_char) ||
-                    last_non_sp_char == '.' ||
-                    last_non_sp_char == 'e' ||
-                    last_non_sp_char == 'E' ||
-                    last_non_sp_char == '-';
-            };
-
-            std::string closing;
-            for (size_t i = err_loc.stack.size(); i > 0; i--) {
-                auto & el = err_loc.stack[i - 1];
-                if (el.type == COMMON_JSON_STACK_ELEMENT_OBJECT) {
-                    closing += "}";
-                } else if (el.type == COMMON_JSON_STACK_ELEMENT_ARRAY) {
-                    closing += "]";
-                } else if (el.type != COMMON_JSON_STACK_ELEMENT_KEY) {
-                    throw std::runtime_error("Unexpected stack element type");
-                }
-            }
-
-            // Matches a potentially partial unicode escape sequence, e.g. \u, \uX, \uXX, \uXXX, \uXXXX
-            static const std::regex partial_unicode_regex(R"(\\u(?:[0-9a-fA-F](?:[0-9a-fA-F](?:[0-9a-fA-F](?:[0-9a-fA-F])?)?)?)?$)");
-
-            auto is_high_surrogate = [&](const std::string & s) {
-                // Check if a partial of a high surrogate (U+D800-U+DBFF)
-                return s.length() >= 4 &&
-                    s[0] == '\\' && s[1] == 'u' &&
-                    std::tolower(s[2]) == 'd' &&
-                    (s[3] == '8' || s[3] == '9' || std::tolower(s[3]) == 'a' || std::tolower(s[3]) == 'b');
-            };
-
-            // Initialize the unicode marker to a low surrogate to handle the edge case
-            // where a high surrogate (U+D800-U+DBFF) is immediately followed by a
-            // backslash (\)
-            std::string unicode_marker_padding = "udc00";
-            std::smatch last_unicode_seq;
-
-            if (std::regex_search(str, last_unicode_seq, partial_unicode_regex)) {
-                std::smatch second_last_seq;
-                std::string prelude = str.substr(0, last_unicode_seq.position());
-
-                // Pad the escape sequence with 0s until it forms a complete sequence of 6 characters
-                unicode_marker_padding = std::string(6 - last_unicode_seq.length(), '0');
-
-                if (is_high_surrogate(last_unicode_seq.str())) {
-                    // If the sequence is a partial match for a high surrogate, add a low surrogate (U+DC00-U+UDFF)
-                    unicode_marker_padding += "\\udc00";
-                } else if (std::regex_search(prelude, second_last_seq, partial_unicode_regex)) {
-                    if (is_high_surrogate(second_last_seq.str())) {
-                        // If this follows a high surrogate, pad it to be a low surrogate
-                        if (last_unicode_seq.length() == 2) {
-                            unicode_marker_padding = "dc00";
-                        } else if (last_unicode_seq.length() == 3) {
-                            unicode_marker_padding = "c00";
-                        } else {
-                            // The original unicode_marker_padding is already padded with 0s
-                        }
-                    }
-                }
-            }
-
-            const auto & magic_seed = out.healing_marker.marker = healing_marker;//"$llama.cpp.json$";
-
-            if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_KEY) {
-                // We're inside an object value
-                if (last_non_sp_char == ':' && can_parse(str + "1" + closing)) {
-                    // Was about to create an object value
-                    str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
-                } else if (can_parse(str + ": 1" + closing)) {
-                    str += (out.healing_marker.json_dump_marker = ":\"" + magic_seed) + "\"" + closing;
-                } else if (last_non_sp_char == '{' && can_parse(str + closing)) {
-                    // Was about to create an object
-                    str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\": 1" + closing;
-                } else if (can_parse(str + "\"" + closing)) {
-                    // Was inside an object value string
-                    str += (out.healing_marker.json_dump_marker = magic_seed) + "\"" + closing;
-                } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"" + closing)) {
-                    // Was inside an object value string after an escape
-                    str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"" + closing;
-                } else if (can_parse(str + unicode_marker_padding + "\"" + closing)) {
-                    // Was inside an object value string after a partial unicode escape
-                    str += (out.healing_marker.json_dump_marker = unicode_marker_padding + magic_seed) + "\"" + closing;
-                } else {
-                    // find last :
-                    auto last_pos = str.find_last_of(':');
-                    if (last_pos == std::string::npos) {
-                        throw std::runtime_error("Cannot heal a truncated JSON that stopped in an unknown location");
-                    }
-                    // Cutting back to opening : for object value
-                    str = str.substr(0, last_pos + 1) + (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
-                }
-            } else if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_ARRAY) {
-                if ((last_non_sp_char == ',' || last_non_sp_char == '[') && can_parse(str + "1" + closing)) {
-                    // Was about to create an array value
-                    str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
-                } else if (can_parse(str + "\"" + closing)) {
-                    // Was inside an array value string
-                    str += (out.healing_marker.json_dump_marker = magic_seed) + "\"" + closing;
-                } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"" + closing)) {
-                    // Was inside an array value string after an escape
-                    str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"" + closing;
-                } else if (can_parse(str + unicode_marker_padding + "\"" + closing)) {
-                    // Was inside an array value string after a partial unicode escape
-                    str += (out.healing_marker.json_dump_marker = unicode_marker_padding + magic_seed) + "\"" + closing;
-                } else if (!was_maybe_number() && can_parse(str + ", 1" + closing)) {
-                    // Had just finished a value
-                    str += (out.healing_marker.json_dump_marker = ",\"" + magic_seed) + "\"" + closing;
-                } else {
-                    auto last_pos = str.find_last_of("[,");
-                    if (last_pos == std::string::npos) {
-                        throw std::runtime_error("Cannot heal a truncated JSON array stopped in an unknown location");
-                    }
-                    // Cutting back to last [ or , for array value
-                    str = str.substr(0, last_pos + 1) + (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
-                }
-            } else if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_OBJECT) {
-                if ((last_non_sp_char == '{' && can_parse(str + closing)) ||
-                        (last_non_sp_char == ',' && can_parse(str + "\"\": 1" + closing))) {
-                    // Was about to create an object key+value
-                    str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\": 1" + closing;
-                } else if (!was_maybe_number() && can_parse(str + ",\"\": 1" + closing)) {
-                    // Was about to create an object key+value
-                    str += (out.healing_marker.json_dump_marker = ",\"" + magic_seed) + "\": 1" + closing;
-                } else if (can_parse(str + "\": 1" + closing)) {
-                    // Was inside an object key string
-                    str += (out.healing_marker.json_dump_marker = magic_seed) + "\": 1" + closing;
-                } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\": 1" + closing)) {
-                    // Was inside an object key string after an escape
-                    str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\": 1" + closing;
-                } else if (can_parse(str + unicode_marker_padding + "\": 1" + closing)) {
-                    // Was inside an object key string after a partial unicode escape
-                    str += (out.healing_marker.json_dump_marker = unicode_marker_padding + magic_seed) + "\": 1" + closing;
-                } else {
-                    auto last_pos = str.find_last_of(':');
-                    if (last_pos == std::string::npos) {
-                        throw std::runtime_error("Cannot heal a truncated JSON object stopped in an unknown location");
-                    }
-                    // fprintf(stderr, "Cutting back to last : for object key+value\n");
-                    str = str.substr(0, last_pos + 1) + (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
-                }
-            } else {
-                throw std::runtime_error("Cannot heal a truncated JSON object stopped in an unknown location");
-            }
-            // fprintf(stderr, "HEALED:\nSTRING <<<\n%s\n>>>\n\nmagic_cut: <<<\n%s\n>>>\n\n", str.c_str(), out.healing_marker.json_dump_marker.c_str());
-            out.json = json::parse(str);
-            it = temptative_end;
-            return true;
-        }
-        // handle unclosed top-level primitive
-        if (err_loc.position != 0 && !healing_marker.empty() && err_loc.stack.empty()) {
-            std::string str(it, temptative_end);
-            const auto & magic_seed = out.healing_marker.marker = healing_marker;
-            if (can_parse(str + "\"")) {
-                // Was inside an string
-                str += (out.healing_marker.json_dump_marker = magic_seed) + "\"";
-            } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"")) {
-                // Was inside an string after an escape
-                str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"";
-            } else {
-                // TODO: handle more unclosed top-level primitive if the stack was empty but we got an error (e.g. "tru", "\"", etc...)
-                // fprintf(stderr, "Closing: TODO\n");
-                return false;
-            }
-            out.json = json::parse(str);
-            it = temptative_end;
-            return true;
-        }
-        return false;
-    }
-    out.json = json::parse(it, end);
-    it = end;
-    return true;
-}
@@ -1,39 +0,0 @@
-#pragma once
-
-// TODO: use json_fwd.hpp when possible
-#include <nlohmann/json.hpp>
-
-// Healing marker (empty if the JSON was fully parsed / wasn't healed).
-struct common_healing_marker {
-    // Raw marker.
-    std::string marker;
-
-    // Cutting the `common_json.json.dump()` string at the (only) occurrence of this marker should yield the original partial JSON string (modulo spaces / if it had the same dump format).
-    std::string json_dump_marker;
-};
-
-// Represents a parsed JSON object, with its optional healing marker (a JSON dump fragment that can be used to find the position of healing in the JSON dump string)
-struct common_json {
-    nlohmann::ordered_json json;
-
-    common_healing_marker healing_marker;
-};
-
-// Parse the JSON string, healing (closing) any partial JSON if `healing_marker` is not empty.
-//
-// Healing completes partial JSON strings by adding a (possibly modified) healing marker, then whatever is needed to close the JSON.
-// This allows to parse the resulting healed JSON string, yet be able to cut it again if needed at the healing marker.
-// (this is used when parsing JSON outputs from the models, then crafting partial JSONs for the partial tool calls in OAI format).
-//
-// For instance, parsing `{` with a healing marker `foo` will produce a healed JSON `{"foo":1}`, w/ json_dump_marker = `"foo"` (which can be used to break the JSON again).
-bool common_json_parse(
-    const std::string & input,
-    const std::string & healing_marker,
-    common_json & out);
-
-// Parse the JSON string (see overload above), but advancing an iterator to the end of the input when the (potentially partial) parsing succeeds.
-bool common_json_parse(
-    std::string::const_iterator & it,
-    const std::string::const_iterator & end,
-    const std::string & healing_marker,
-    common_json & out);
@@ -46,6 +46,7 @@ TEXT_MODEL_MAP: dict[str, str] = {
    "DbrxForCausalLM": "dbrx",
    "DeciLMForCausalLM": "deci",
    "DeepseekForCausalLM": "deepseek",
+    "DeepseekOCRForCausalLM": "deepseek",
    "DeepseekV2ForCausalLM": "deepseek",
    "DeepseekV3ForCausalLM": "deepseek",
    "DeepseekV32ForCausalLM": "deepseek",
@@ -135,6 +136,7 @@ TEXT_MODEL_MAP: dict[str, str] = {
    "LlamaModel": "llama",
    "Eagle3DraftModel": "llama",
    "Eagle3Speculator": "llama",
+    "Eagle3LlamaForCausalLM": "llama",
    "LlamaForCausalLMEagle3": "llama",
    "LlavaForConditionalGeneration": "llama",
    "LlavaStableLMEpochForCausalLM": "stablelm",
@@ -233,6 +235,7 @@ TEXT_MODEL_MAP: dict[str, str] = {
    "UMT5ForConditionalGeneration": "t5",
    "UMT5Model": "t5",
    "UltravoxModel": "ultravox",
+    "UnlimitedOCRForCausalLM": "deepseek",
    "VLlama3ForCausalLM": "llama",
    "VoxtralForConditionalGeneration": "llama",
    "WavTokenizerDec": "wavtokenizer",
@@ -299,6 +302,7 @@ MMPROJ_MODEL_MAP: dict[str, str] = {
    "StepVLForConditionalGeneration": "step3",
    "Step3p7ForConditionalGeneration": "step3",
    "UltravoxModel": "ultravox",
+    "UnlimitedOCRForCausalLM": "deepseek",
    "VoxtralForConditionalGeneration": "ultravox",
    "YoutuVLForConditionalGeneration": "youtuvl",
 }
@@ -14,7 +14,7 @@ from .base import MmprojModel, ModelBase, TextModel, gguf, logger
 from .qwen import QwenModel


-@ModelBase.register("DeepseekOCRForCausalLM")
+@ModelBase.register("DeepseekOCRForCausalLM", "UnlimitedOCRForCausalLM")
 class DeepseekOCRVisionModel(MmprojModel):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
@@ -205,6 +205,8 @@ class DeepseekModel(TextModel):
@ModelBase.register(
    "DeepseekV2ForCausalLM",
    "DeepseekV3ForCausalLM",
+    "DeepseekOCRForCausalLM",
+    "UnlimitedOCRForCausalLM",
    "KimiVLForConditionalGeneration",
    "KimiK25ForConditionalGeneration",
    "YoutuForCausalLM",
@@ -224,7 +226,7 @@ class DeepseekV2Model(TextModel):
        self.origin_hf_arch = hparams.get('architectures', [None])[0]

        # special handling for Deepseek OCR
-        if self.origin_hf_arch in ("DeepseekOCRForCausalLM", "DeepseekOCR2ForCausalLM"):
+        if self.origin_hf_arch in ("DeepseekOCRForCausalLM", "DeepseekOCR2ForCausalLM", "UnlimitedOCRForCausalLM"):
            self.model_arch = gguf.MODEL_ARCH.DEEPSEEK2OCR
            self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch]
            self.gguf_writer.add_architecture()
@@ -350,6 +352,12 @@ class DeepseekV2Model(TextModel):

        self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])

+        # Unlimited-OCR sliding window; written for metadata, the decoder ignores it (full MHA)
+        if is_ocr:
+            sliding_window = hparams.get("sliding_window_size") or hparams.get("sliding_window")
+            if sliding_window:
+                self.gguf_writer.add_sliding_window(sliding_window)
+
        if (rope_mscale_all := self.rope_parameters.get("mscale_all_dim")) is not None:
            # [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
            # note: for legacy reasons, this is not consistent with the other usages of self.gguf_writer.add_rope_scaling_yarn_log_mul
@@ -23,6 +23,7 @@ from .base import ModelBase, TextModel, gguf, logger
    "LlavaForConditionalGeneration",
    "VoxtralForConditionalGeneration",
    "LlamaForCausalLMEagle3",
+    "Eagle3LlamaForCausalLM",
    "Eagle3Speculator",
    "Eagle3DraftModel",
    "IQuestCoderForCausalLM",
@@ -413,6 +413,15 @@ In two device selection modes, the default SYCL backend is level_zero, you can c
 |------------------|----------------------------------------|
 | Single device    | --split-mode none --main-gpu DEVICE_ID |
 | Multiple devices | --split-mode layer (default)           |
+| Multiple devices | --split-mode tensor (tensor parallelism) |
+
+`--split-mode tensor` (tensor parallelism) shards each layer across the selected
+GPUs. It requires flash attention, which is auto-enabled when `--flash-attn` is
+left at its default `auto`, so `--split-mode tensor` works out of the box.
+Passing `--flash-attn off` together with `--split-mode tensor` is rejected at
+context creation. The default `f16` KV cache is recommended. Tensor parallelism
+is currently optimized for 2 GPUs; other device counts fall back to a generic
+all-reduce.

 Examples:

@@ -715,6 +724,15 @@ In two device selection modes, the default SYCL backend is level_zero, you can c
 |------------------|----------------------------------------|
 | Single device    | --split-mode none --main-gpu DEVICE_ID |
 | Multiple devices | --split-mode layer (default)           |
+| Multiple devices | --split-mode tensor (tensor parallelism) |
+
+`--split-mode tensor` (tensor parallelism) shards each layer across the selected
+GPUs. It requires flash attention, which is auto-enabled when `--flash-attn` is
+left at its default `auto`, so `--split-mode tensor` works out of the box.
+Passing `--flash-attn off` together with `--split-mode tensor` is rejected at
+context creation. The default `f16` KV cache is recommended. Tensor parallelism
+is currently optimized for 2 GPUs; other device counts fall back to a generic
+all-reduce.

 Examples:

@@ -24,7 +24,6 @@
            "GGML_LLAMAFILE":   "OFF",
            "GGML_OPENCL":      "ON",
            "GGML_HEXAGON":     "ON",
-            "GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE": "128",
            "LLAMA_OPENSSL":    "OFF"
        }
    },
@@ -47,7 +46,6 @@
            "GGML_LLAMAFILE":   "OFF",
            "GGML_OPENCL":      "ON",
            "GGML_HEXAGON":     "ON",
-            "GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE": "128",
            "LLAMA_OPENSSL":    "OFF"
        }
    },
@@ -73,7 +71,6 @@
            "GGML_LLAMAFILE":   "OFF",
            "GGML_OPENCL":      "OFF",
            "GGML_HEXAGON":     "ON",
-            "GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE": "128",
            "LLAMA_OPENSSL":    "OFF"
        }
    },
@@ -13,6 +13,45 @@ The `llama-server` application supports several implementations of speculative d
 A much smaller model (called the _draft model_) generates drafts.
 A draft model is the most used approach in speculative decoding.

+### EAGLE-3 (`draft-eagle3`)
+
+EAGLE-3 uses a small draft model that reads the target model's hidden states to predict the next tokens, so it
+reaches higher acceptance than a standalone draft model of the same size. The draft is a one-layer transformer
+trained for a specific target model; it shares the target model's tokenizer and, optionally, uses a reduced draft
+vocabulary with its own `lm_head`, which is mapped back using a `d2t` table.
+
+Convert the EAGLE-3 checkpoint with `--target-model-dir` so it inherits the target's tokenizer and the layer
+indices to read. Both the SpecForge `LlamaForCausalLMEagle3` and the vLLM/AngelSlim `Eagle3LlamaForCausalLM`
+checkpoint formats are supported (for example [`AngelSlim/Qwen3-4B_eagle3`](https://huggingface.co/AngelSlim/Qwen3-4B_eagle3)
+for `Qwen/Qwen3-4B`):
+
+```bash
+python convert_hf_to_gguf.py AngelSlim/Qwen3-4B_eagle3 \
+    --target-model-dir Qwen/Qwen3-4B --outtype bf16 --outfile Qwen3-4B-eagle3.gguf
+
+llama-server -m Qwen3-4B.gguf -md Qwen3-4B-eagle3.gguf --spec-type draft-eagle3
+```
+
+Supported EAGLE-3 draft models include:
+
+- [yuhuili/EAGLE3-LLaMA3.1-Instruct-8B](https://huggingface.co/yuhuili/EAGLE3-LLaMA3.1-Instruct-8B)
+- [yuhuili/EAGLE3-LLaMA3.3-Instruct-70B](https://huggingface.co/yuhuili/EAGLE3-LLaMA3.3-Instruct-70B)
+- [RedHatAI/gemma-4-31B-it-speculator.eagle3](https://huggingface.co/RedHatAI/gemma-4-31B-it-speculator.eagle3)
+- [RedHatAI/gemma-4-26B-A4B-it-speculator.eagle3](https://huggingface.co/RedHatAI/gemma-4-26B-A4B-it-speculator.eagle3)
+- [Tengyunw/qwen3_8b_eagle3](https://huggingface.co/Tengyunw/qwen3_8b_eagle3)
+- [Tengyunw/qwen3_30b_moe_eagle3](https://huggingface.co/Tengyunw/qwen3_30b_moe_eagle3)
+- [AngelSlim/Qwen3-1.7B_eagle3](https://huggingface.co/AngelSlim/Qwen3-1.7B_eagle3)
+- [AngelSlim/Qwen3-4B_eagle3](https://huggingface.co/AngelSlim/Qwen3-4B_eagle3)
+- [AngelSlim/Qwen3-8B_eagle3](https://huggingface.co/AngelSlim/Qwen3-8B_eagle3)
+- [AngelSlim/Qwen3-14B_eagle3](https://huggingface.co/AngelSlim/Qwen3-14B_eagle3)
+- [AngelSlim/Qwen3-32B_eagle3](https://huggingface.co/AngelSlim/Qwen3-32B_eagle3)
+- [AngelSlim/Qwen3-a3B_eagle3](https://huggingface.co/AngelSlim/Qwen3-a3B_eagle3)
+- [RedHatAI/gpt-oss-20b-speculator.eagle3](https://huggingface.co/RedHatAI/gpt-oss-20b-speculator.eagle3)
+- [lmsys/EAGLE3-gpt-oss-120b-bf16](https://huggingface.co/lmsys/EAGLE3-gpt-oss-120b-bf16)
+- [nvidia/gpt-oss-120b-Eagle3-long-context](https://huggingface.co/nvidia/gpt-oss-120b-Eagle3-long-context)
+
+For the full and up-to-date list of supported models, see #18039.
+
 ### n-gram Cache (`ngram-cache`)

 An n-gram is a sequence of n tokens. The n-gram cache implementation maintains statistics about short n-gram sequences.
@@ -108,7 +147,7 @@ If a draft model is combined with a draftless decoding the draftless decoding ha
 ### General Speculative Parameters

 ```
--spec-type [none|draft-simple|draft-mtp|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod]
+--spec-type [none|draft-simple|draft-eagle3|draft-mtp|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod]
                                        comma-separated list of types of speculative decoding to use
                                        (default: none)
                                        (env: LLAMA_ARG_SPEC_TYPE)
@@ -247,6 +286,7 @@ Specifies a comma-separated list of speculative decoding types to use.
 |------|-------------|
 | `none` | No speculative decoding (default) |
 | `draft-simple` | Use a simple draft model for speculation |
+| `draft-eagle3` | Use an EAGLE-3 draft model that reads the target's hidden states |
 | `draft-mtp` | Use Multi Token Prediction (MTP) heads from the main model |
 | `ngram-cache` | Use n-gram cache lookup |
 | `ngram-simple` | Use simple n-gram pattern matching |
@@ -266,7 +266,6 @@ set   (GGML_OPENCL_TARGET_VERSION "300" CACHE STRING
                                            "ggml: OpenCL API version to target")

 option(GGML_HEXAGON                         "ggml: enable Hexagon backend"                    OFF)
-set(GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE 128 CACHE STRING "ggml: quantize group size (32, 64, or 128)")

 # toolchain for vulkan-shaders-gen
 set   (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN "" CACHE FILEPATH "ggml: toolchain file for vulkan-shaders-gen")
@@ -27,6 +27,14 @@ GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int de
 // split tensor buffer that splits matrices by rows across multiple devices
 GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);

+// Tensor parallelism (--split-mode tensor): comm_init/free/allreduce_tensor
+// trio queried by the meta-backend via ggml_backend_reg_get_proc_address.
+// See typedefs in ggml/include/ggml-backend.h. Mirrors the CUDA backend's
+// pattern (ggml_backend_cuda_comm_*).
+GGML_BACKEND_API void * ggml_backend_sycl_comm_init(ggml_backend_t * backends, size_t n_backends);
+GGML_BACKEND_API void   ggml_backend_sycl_comm_free(void * comm_ctx);
+GGML_BACKEND_API bool   ggml_backend_sycl_comm_allreduce_tensor(void * comm_ctx, struct ggml_tensor ** tensors);
+
 // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
 GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);

@@ -34,26 +34,26 @@ template <float (*bin_op)(const float, const float),
 static __global__ void k_bin_bcast(const src0_t *         src0,
                                   const src1_t *         src1,
                                   dst_t *                dst,
-                                   const int              ne0,
-                                   const int              ne1,
-                                   const int              ne2,
+                                   const uint32_t         ne0,
+                                   const uint32_t         ne1,
+                                   const uint32_t         ne2,
                                   const uint3            ne3,
                                   const uint3            ne10,
                                   const uint3            ne11,
                                   const uint3            ne12,
                                   const uint3            ne13,
-                                 /*const int              s0,*/
-                                   const int              s1,
-                                   const int              s2,
-                                   const int              s3,
-                                   const int              s00,
-                                   const int              s01,
-                                   const int              s02,
-                                   const int              s03,
-                                   const int              s10,
-                                   const int              s11,
-                                   const int              s12,
-                                   const int              s13,
+                                 /*const uint32_t         s0,*/
+                                   const uint32_t         s1,
+                                   const uint32_t         s2,
+                                   const uint32_t         s3,
+                                   const uint32_t         s00,
+                                   const uint32_t         s01,
+                                   const uint32_t         s02,
+                                   const uint32_t         s03,
+                                   const uint32_t         s10,
+                                   const uint32_t         s11,
+                                   const uint32_t         s12,
+                                   const uint32_t         s13,
                                   src1_ptrs... src1s) {
    ggml_cuda_pdl_lc();
    const uint32_t i0s = blockDim.x * blockIdx.x + threadIdx.x;
@@ -61,7 +61,7 @@ static __global__ void k_bin_bcast(const src0_t *         src0,
    const uint32_t i2  = fastdiv((blockDim.z * blockIdx.z + threadIdx.z), ne3);
    const uint32_t i3  = (blockDim.z * blockIdx.z + threadIdx.z) - (i2 * ne3.z);

-    if (i0s >= (uint32_t)ne0 || i1 >= (uint32_t)ne1 || i2 >= (uint32_t)ne2 || i3 >= ne3.z) {
+    if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3.z) {
        return;
    }

@@ -69,25 +69,32 @@ static __global__ void k_bin_bcast(const src0_t *         src0,
    const uint32_t i12 = fastmodulo(i2, ne12);
    const uint32_t i13 = fastmodulo(i3, ne13);

-    const size_t i_src0 =  i3*s03 +  i2*s02 +  i1*s01;
-    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
-    const size_t i_dst  =  i3*s3  +  i2*s2  +  i1*s1;
+    const size_t i_src0 = size_t( i3)*s03 + size_t( i2)*s02 + size_t( i1)*s01;
+    const size_t i_src1 = size_t(i13)*s13 + size_t(i12)*s12 + size_t(i11)*s11;
+    const size_t i_dst  = size_t( i3)*s3  + size_t( i2)*s2  + size_t( i1)*s1;

    const src0_t * src0_row = src0 ? (src0 + i_src0) : nullptr;
    dst_t * dst_row = dst + i_dst;

+    const uint32_t s0 = blockDim.x * gridDim.x;
+
    ggml_cuda_pdl_sync();
-    for (int i0 = i0s; i0 < ne0; i0 += blockDim.x * gridDim.x) {
+    for (uint32_t i0 = i0s; i0 < ne0; i0 += s0) {
        const uint32_t i10 = fastmodulo(i0, ne10);

-        float result = src0_row ? (float) src0_row[i0*s00] : 0.0f;
+        float result = src0_row ? (float) src0_row[size_t(i0)*s00] : 0.0f;
        if constexpr (sizeof...(src1_ptrs) > 0) {
-            result = (..., (result = bin_op(result, (float)src1s[i_src1 + i10*s10])));
+            result = (..., (result = bin_op(result, (float)src1s[i_src1 + size_t(i10)*s10])));
        } else {
-            result = bin_op(result, (float)src1[i_src1 + i10*s10]);
+            result = bin_op(result, (float)src1[i_src1 + size_t(i10)*s10]);
        }

        dst_row[i0] = (dst_t) result;
+
+        // protect i0 from overflow
+        if (ne0 - i0 <= s0) {
+           break;
+        }
    }
 }

@@ -110,19 +117,19 @@ static __global__ void k_bin_bcast_unravel(const src0_t *         src0,
                                           const uint3            ne12,
                                           const uint3            ne13,
                                         /*const int              s0,*/
-                                           const int              s1,
-                                           const int              s2,
-                                           const int              s3,
-                                           const int              s00,
-                                           const int              s01,
-                                           const int              s02,
-                                           const int              s03,
-                                           const int              s10,
-                                           const int              s11,
-                                           const int              s12,
-                                           const int              s13,
+                                           const uint32_t         s1,
+                                           const uint32_t         s2,
+                                           const uint32_t         s3,
+                                           const uint32_t         s00,
+                                           const uint32_t         s01,
+                                           const uint32_t         s02,
+                                           const uint32_t         s03,
+                                           const uint32_t         s10,
+                                           const uint32_t         s11,
+                                           const uint32_t         s12,
+                                           const uint32_t         s13,
                                           src1_ptrs... src1s) {
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+    const uint32_t i  = blockDim.x*blockIdx.x + threadIdx.x;

    const uint32_t i3 = fastdiv(i, prod_012);
    const uint32_t i2 = fastdiv(i - i3 * prod_012.z, prod_01);
@@ -133,25 +140,25 @@ static __global__ void k_bin_bcast_unravel(const src0_t *         src0,
        return;
    }

-    const int i11 = fastmodulo(i1, ne11);
-    const int i12 = fastmodulo(i2, ne12);
-    const int i13 = fastmodulo(i3, ne13);
+    const uint32_t i11 = fastmodulo(i1, ne11);
+    const uint32_t i12 = fastmodulo(i2, ne12);
+    const uint32_t i13 = fastmodulo(i3, ne13);

-    const size_t i_src0 =  i3*s03 +  i2*s02 +  i1*s01;
-    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
-    const size_t i_dst  =  i3*s3  +  i2*s2  +  i1*s1;
+    const size_t i_src0 = size_t( i3)*s03 + size_t( i2)*s02 + size_t( i1)*s01;
+    const size_t i_src1 = size_t(i13)*s13 + size_t(i12)*s12 + size_t(i11)*s11;
+    const size_t i_dst  = size_t( i3)*s3  + size_t( i2)*s2  + size_t( i1)*s1;

    const src0_t * src0_row = src0 ? (src0 + i_src0) : nullptr;
    dst_t * dst_row = dst + i_dst;

-    const int i10 = fastmodulo(i0, ne10);
+    const uint32_t i10 = fastmodulo(i0, ne10);

    ggml_cuda_pdl_sync();
-    float result = src0_row ? (float) src0_row[i0*s00] : 0.0f;
+    float result = src0_row ? (float) src0_row[size_t(i0)*s00] : 0.0f;
    if constexpr (sizeof...(src1_ptrs) > 0) {
-        result = (..., (result = bin_op(result, (float)src1s[i_src1 + i10*s10])));
+        result = (..., (result = bin_op(result, (float)src1s[i_src1 + size_t(i10)*s10])));
    } else {
-        result = bin_op(result, (float)src1[i_src1 + i10*s10]);
+        result = bin_op(result, (float)src1[i_src1 + size_t(i10)*s10]);
    }

    dst_row[i0] = (dst_t) result;
@@ -248,6 +255,31 @@ static void launch_bin_bcast_pack(const ggml_tensor * src0, const ggml_tensor *
        size_t s02 = nb02 / sizeof(src0_t);
        size_t s03 = nb03 / sizeof(src0_t);

+        GGML_ASSERT(ne0 <= std::numeric_limits<uint32_t>::max());
+        GGML_ASSERT(ne1 <= std::numeric_limits<uint32_t>::max());
+        GGML_ASSERT(ne2 <= std::numeric_limits<uint32_t>::max());
+        GGML_ASSERT(ne3 <= std::numeric_limits<uint32_t>::max());
+
+      //GGML_ASSERT(s0  <= std::numeric_limits<uint32_t>::max());
+        GGML_ASSERT(s1  <= std::numeric_limits<uint32_t>::max());
+        GGML_ASSERT(s2  <= std::numeric_limits<uint32_t>::max());
+        GGML_ASSERT(s3  <= std::numeric_limits<uint32_t>::max());
+
+        GGML_ASSERT(s00 <= std::numeric_limits<uint32_t>::max());
+        GGML_ASSERT(s01 <= std::numeric_limits<uint32_t>::max());
+        GGML_ASSERT(s02 <= std::numeric_limits<uint32_t>::max());
+        GGML_ASSERT(s03 <= std::numeric_limits<uint32_t>::max());
+
+        GGML_ASSERT(s10 <= std::numeric_limits<uint32_t>::max());
+        GGML_ASSERT(s11 <= std::numeric_limits<uint32_t>::max());
+        GGML_ASSERT(s12 <= std::numeric_limits<uint32_t>::max());
+        GGML_ASSERT(s13 <= std::numeric_limits<uint32_t>::max());
+
+        GGML_ASSERT(cne1[0] <= std::numeric_limits<uint32_t>::max());
+        GGML_ASSERT(cne1[1] <= std::numeric_limits<uint32_t>::max());
+        GGML_ASSERT(cne1[2] <= std::numeric_limits<uint32_t>::max());
+        GGML_ASSERT(cne1[3] <= std::numeric_limits<uint32_t>::max());
+
        GGML_ASSERT(nb0 % sizeof(dst_t) == 0);
        GGML_ASSERT(nb1 % sizeof(dst_t) == 0);
        GGML_ASSERT(nb2 % sizeof(dst_t) == 0);
@@ -263,6 +295,8 @@ static void launch_bin_bcast_pack(const ggml_tensor * src0, const ggml_tensor *
        GGML_ASSERT(nb12 % sizeof(src1_t) == 0);
        GGML_ASSERT(nb13 % sizeof(src1_t) == 0);

+        GGML_ASSERT(ne2 * ne3 <= std::numeric_limits<unsigned int>::max());
+
        const int block_size = 128;

        int64_t hne0 = std::max(ne0 / 2LL, 1LL);
@@ -281,7 +315,13 @@ static void launch_bin_bcast_pack(const ggml_tensor * src0, const ggml_tensor *
        const uint3 ne13 = init_fastdiv_values((uint32_t) cne1[3]);

        if (block_nums.z > 65535 || block_nums.y > 65535) {
-            int         block_num  = (ne0 * ne1 * ne2 * ne3 + block_size - 1) / block_size;
+            int64_t     block_num   = (ne0 * ne1 * ne2 * ne3 + block_size - 1) / block_size;
+
+            GGML_ASSERT(block_num              <= std::numeric_limits<uint32_t>::max());
+            GGML_ASSERT(block_num * block_size <= std::numeric_limits<uint32_t>::max());
+            GGML_ASSERT(ne0 * ne1              <= std::numeric_limits<uint32_t>::max());
+            GGML_ASSERT(ne0 * ne1 * ne2        <= std::numeric_limits<uint32_t>::max());
+
            const uint3 prod_012    = init_fastdiv_values((uint32_t) (ne0 * ne1 * ne2));
            const uint3 prod_01     = init_fastdiv_values((uint32_t) (ne0 * ne1));
            const uint3 ne0_fastdiv = init_fastdiv_values((uint32_t) ne0);
@@ -298,6 +338,10 @@ static void launch_bin_bcast_pack(const ggml_tensor * src0, const ggml_tensor *
                    s10, s11, s12, s13, (const src1_t *) dst->src[I + 1]->data...);
            }
        } else {
+            GGML_ASSERT(int64_t(block_nums.x) * block_dims.x <= std::numeric_limits<uint32_t>::max());
+            GGML_ASSERT(int64_t(block_nums.y) * block_dims.y <= std::numeric_limits<uint32_t>::max());
+            GGML_ASSERT(int64_t(block_nums.z) * block_dims.z <= std::numeric_limits<uint32_t>::max());
+
            const uint3 ne3_fastdiv = init_fastdiv_values((uint32_t) ne3);
            {
                const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, 0, stream);
@@ -53,10 +53,10 @@ static __global__ void cpy_scalar_transpose(const char * cx, char * cdst, const
    const int64_t nmat = ne / (ne00 * ne01);
    const int64_t n = ne00 * ne01;

-    const int x = blockIdx.x * CUDA_CPY_TILE_DIM_2D + threadIdx.x;
-    const int y = blockIdx.y * CUDA_CPY_TILE_DIM_2D + threadIdx.y;
-    const int tx = blockIdx.y * CUDA_CPY_TILE_DIM_2D + threadIdx.x;  // transpose block offset
-    const int ty = blockIdx.x * CUDA_CPY_TILE_DIM_2D + threadIdx.y;
+    const int64_t x  = (int64_t) blockIdx.x * CUDA_CPY_TILE_DIM_2D + threadIdx.x;
+    const int64_t y  = (int64_t) blockIdx.y * CUDA_CPY_TILE_DIM_2D + threadIdx.y;
+    const int64_t tx = (int64_t) blockIdx.y * CUDA_CPY_TILE_DIM_2D + threadIdx.x;  // transpose block offset
+    const int64_t ty = (int64_t) blockIdx.x * CUDA_CPY_TILE_DIM_2D + threadIdx.y;

    __shared__ float tile[2][CUDA_CPY_TILE_DIM_2D][CUDA_CPY_TILE_DIM_2D+1];
    int cur_tile_buf = 0;
@@ -197,7 +197,7 @@ static void ggml_cpy_scalar_contiguous_cuda(
 cudaStream_t stream) {

    const int64_t num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
-    GGML_ASSERT(num_blocks < UINT_MAX);
+    GGML_ASSERT(num_blocks <= INT_MAX);
    const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params((dim3)num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream);
    ggml_cuda_kernel_launch(cpy_scalar_contiguous<src_t, dst_t>, launch_params, cx, cdst, ne);
 }
@@ -208,6 +208,14 @@ static void ggml_cpy_scalar_cuda(
    const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
    const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream) {

+    const auto launch_scalar_generic = [&]() {
+        const int64_t num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+        GGML_ASSERT(num_blocks <= INT_MAX);
+        const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params((dim3)num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream);
+        ggml_cuda_kernel_launch(cpy_scalar<cpy_1_scalar<src_t, dst_t>>, launch_params,
+            cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+    };
+
    if (transposed) {
        GGML_ASSERT(ne == ne00*ne01*ne02);  // ne[3] is 1 assumed
        int64_t ne00n, ne01n, ne02n;
@@ -224,20 +232,18 @@ static void ggml_cpy_scalar_cuda(
        int64_t grid_x = (ne01n + CUDA_CPY_TILE_DIM_2D - 1) / CUDA_CPY_TILE_DIM_2D;
        int64_t grid_y = (ne00n + CUDA_CPY_TILE_DIM_2D - 1) / CUDA_CPY_TILE_DIM_2D;
        int64_t grid_z = (ne/(ne01n*ne00n) + CUDA_CPY_BLOCK_NM - 1) / CUDA_CPY_BLOCK_NM;
-        GGML_ASSERT(grid_x < UINT_MAX);
-        GGML_ASSERT(grid_y < USHRT_MAX);
-        GGML_ASSERT(grid_z < USHRT_MAX);
-        dim3 dimGrid(grid_x, grid_y, grid_z);
-        dim3 dimBlock(CUDA_CPY_TILE_DIM_2D, CUDA_CPY_BLOCK_ROWS, 1);
-        const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(dimGrid, dimBlock, 0, stream);
-        ggml_cuda_kernel_launch(cpy_scalar_transpose<dst_t>, launch_params,
-            cx, cdst, ne, ne00n, ne01n, ne02n, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+        GGML_ASSERT(grid_x <= INT_MAX);
+        if (grid_y > USHRT_MAX || grid_z > USHRT_MAX) {
+            launch_scalar_generic();
+        } else {
+            dim3 dimGrid(grid_x, grid_y, grid_z);
+            dim3 dimBlock(CUDA_CPY_TILE_DIM_2D, CUDA_CPY_BLOCK_ROWS, 1);
+            const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(dimGrid, dimBlock, 0, stream);
+            ggml_cuda_kernel_launch(cpy_scalar_transpose<dst_t>, launch_params,
+                cx, cdst, ne, ne00n, ne01n, ne02n, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+        }
    } else {
-        const int64_t num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
-        GGML_ASSERT(num_blocks < UINT_MAX);
-        const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params((dim3)num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream);
-        ggml_cuda_kernel_launch(cpy_scalar<cpy_1_scalar<src_t, dst_t>>, launch_params,
-            cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+        launch_scalar_generic();
    }
 }

@@ -248,7 +254,7 @@ static void ggml_cpy_f32_q8_0_cuda(

    GGML_ASSERT(ne % QK8_0 == 0);
    const int64_t num_blocks = ne / QK8_0;
-    GGML_ASSERT(num_blocks < UINT_MAX);
+    GGML_ASSERT(num_blocks <= INT_MAX);
    cpy_f32_q<cpy_blck_f32_q8_0, QK8_0><<<num_blocks, 1, 0, stream>>>
        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }
@@ -259,7 +265,7 @@ static void ggml_cpy_q8_0_f32_cuda(
    const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream) {

    const int64_t num_blocks = ne;
-    GGML_ASSERT(num_blocks < UINT_MAX);
+    GGML_ASSERT(num_blocks <= INT_MAX);
    cpy_q_f32<cpy_blck_q8_0_f32, QK8_0><<<num_blocks, 1, 0, stream>>>
        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }
@@ -271,7 +277,7 @@ static void ggml_cpy_f32_q4_0_cuda(

    GGML_ASSERT(ne % QK4_0 == 0);
    const int64_t num_blocks = ne / QK4_0;
-    GGML_ASSERT(num_blocks < UINT_MAX);
+    GGML_ASSERT(num_blocks <= INT_MAX);
    cpy_f32_q<cpy_blck_f32_q4_0, QK4_0><<<num_blocks, 1, 0, stream>>>
        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }
@@ -284,7 +290,7 @@ static void ggml_cpy_q4_0_f32_cuda(
    const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13,
    cudaStream_t stream) {
    const int64_t num_blocks = ne;
-    GGML_ASSERT(num_blocks < UINT_MAX);
+    GGML_ASSERT(num_blocks <= INT_MAX);
    cpy_q_f32<cpy_blck_q_f32<dequantize_q4_0, QK4_0>, QK4_0><<<num_blocks, 1, 0, stream>>>(
        cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
         ne10, ne11, ne12, nb10, nb11, nb12, nb13);
@@ -297,7 +303,7 @@ static void ggml_cpy_f32_q4_1_cuda(

    GGML_ASSERT(ne % QK4_1 == 0);
    const int64_t num_blocks = ne / QK4_1;
-    GGML_ASSERT(num_blocks < UINT_MAX);
+    GGML_ASSERT(num_blocks <= INT_MAX);
    cpy_f32_q<cpy_blck_f32_q4_1, QK4_1><<<num_blocks, 1, 0, stream>>>
        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }
@@ -310,7 +316,7 @@ static void ggml_cpy_q4_1_f32_cuda(
    const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13,
    cudaStream_t stream) {
    const int64_t num_blocks = ne;
-    GGML_ASSERT(num_blocks < UINT_MAX);
+    GGML_ASSERT(num_blocks <= INT_MAX);
    cpy_q_f32<cpy_blck_q_f32<dequantize_q4_1, QK4_1>, QK4_1><<<num_blocks, 1, 0, stream>>>(
        cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
         ne10, ne11, ne12, nb10, nb11, nb12, nb13);
@@ -323,7 +329,7 @@ static void ggml_cpy_f32_q5_0_cuda(

    GGML_ASSERT(ne % QK5_0 == 0);
    const int64_t num_blocks = ne / QK5_0;
-    GGML_ASSERT(num_blocks < UINT_MAX);
+    GGML_ASSERT(num_blocks <= INT_MAX);
    cpy_f32_q<cpy_blck_f32_q5_0, QK5_0><<<num_blocks, 1, 0, stream>>>
        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }
@@ -336,7 +342,7 @@ static void ggml_cpy_q5_0_f32_cuda(
    const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13,
    cudaStream_t stream) {
    const int64_t num_blocks = ne;
-    GGML_ASSERT(num_blocks < UINT_MAX);
+    GGML_ASSERT(num_blocks <= INT_MAX);
    cpy_q_f32<cpy_blck_q_f32<dequantize_q5_0, QK5_0>, QK5_0><<<num_blocks, 1, 0, stream>>>(
        cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
        ne10, ne11, ne12, nb10, nb11, nb12, nb13);
@@ -349,7 +355,7 @@ static void ggml_cpy_f32_q5_1_cuda(

    GGML_ASSERT(ne % QK5_1 == 0);
    const int64_t num_blocks = ne / QK5_1;
-    GGML_ASSERT(num_blocks < UINT_MAX);
+    GGML_ASSERT(num_blocks <= INT_MAX);
    cpy_f32_q<cpy_blck_f32_q5_1, QK5_1><<<num_blocks, 1, 0, stream>>>
        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }
@@ -362,7 +368,7 @@ static void ggml_cpy_q5_1_f32_cuda(
    const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13,
    cudaStream_t stream) {
    const int64_t num_blocks = ne;
-    GGML_ASSERT(num_blocks < UINT_MAX);
+    GGML_ASSERT(num_blocks <= INT_MAX);
    cpy_q_f32<cpy_blck_q_f32<dequantize_q5_1, QK5_1>, QK5_1><<<num_blocks, 1, 0, stream>>>(
        cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
        ne10, ne11, ne12, nb10, nb11, nb12, nb13);
@@ -375,7 +381,7 @@ static void ggml_cpy_f32_iq4_nl_cuda(

    GGML_ASSERT(ne % QK4_NL == 0);
    const int64_t num_blocks = ne / QK4_NL;
-    GGML_ASSERT(num_blocks < UINT_MAX);
+    GGML_ASSERT(num_blocks <= INT_MAX);
    cpy_f32_q<cpy_blck_f32_iq4_nl, QK4_NL><<<num_blocks, 1, 0, stream>>>
        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }
@@ -25,7 +25,6 @@ include(ExternalProject)
 option(GGML_HEXAGON_HTP_DEBUG  "ggml-hexagon: enable HTP debug output" OFF)
 option(GGML_HEXAGON_FA_EXP2_HF "ggml-hexagon: use FP16 exp2 polynomial in FA softmax instead of F32 exp round-trip" OFF)
 set(GGML_HEXAGON_HTP_CERT  "$ENV{HEXAGON_HTP_CERT}" CACHE PATH "ggml-hexagon: enable HTP library signing using certificate")
-set(GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE 128 CACHE STRING "ggml-hexagon: quantize group size (32, 64, or 128)")

 add_library(htp_iface OBJECT
    ${CMAKE_CURRENT_BINARY_DIR}/htp_iface_stub.c)
@@ -72,15 +71,12 @@ function(build_htp_skel V)
            -DHEXAGON_SDK_ROOT=${HEXAGON_SDK_ROOT}
            -DHEXAGON_TOOLS_ROOT=${HEXAGON_TOOLS_ROOT}
            -DHEXAGON_HTP_DEBUG=${GGML_HEXAGON_HTP_DEBUG}
-            -DGGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE=${GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE}
            -DDSP_VERSION=${V}
            -DPREBUILT_LIB_DIR="toolv19_${V}")
    list(APPEND HTP_SKELS ${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-${V}.so)
    set(HTP_SKELS ${HTP_SKELS} PARENT_SCOPE)
 endfunction()

-build_htp_skel(v68)
-build_htp_skel(v69)
 build_htp_skel(v73)
 build_htp_skel(v75)
 build_htp_skel(v79)
@@ -5,10 +5,12 @@
 #include "ggml-backend-impl.h"
 #include "ggml-common.h"

+#include <algorithm>
 #include <string>
 #include <vector>
 #include <stdio.h>
 #include "htp-ops.h"
+#include "htp/matmul-ops.h"

 struct htp_opnode {
    ggml_tensor * node = nullptr;
@@ -17,6 +19,13 @@ struct htp_opnode {

    htp_op_code opcode = HTP_OP_INVALID;

+    std::vector<ggml_tensor *> extra_dsts;
+
+    int32_t kernel_params[HTP_OP_MAX_KERN_PARAMS] = {0};
+
+    htp_opnode(ggml_tensor * node = nullptr, std::vector<ggml_tensor *> fused = {}, htp_op_code opcode = HTP_OP_INVALID, std::vector<ggml_tensor *> extra_dsts = {})
+        : node(node), fused(std::move(fused)), opcode(opcode), extra_dsts(std::move(extra_dsts)) {}
+
    ggml_op op() const {
        return node->op;
    }
@@ -25,6 +34,26 @@ struct htp_opnode {
        return fused.empty() ? node : fused.back();
    }

+    void add_fused(ggml_tensor * t, bool extra_dst = false) {
+        fused.push_back(t);
+        if (extra_dst) {
+            extra_dsts.push_back(t);
+        }
+    }
+
+    std::vector<const ggml_tensor *> get_outputs() const {
+        std::vector<const ggml_tensor *> res;
+        if (extra_dsts.empty()) {
+            res.push_back(dst());
+        } else {
+            res.push_back(node);
+            for (const auto * x : extra_dsts) {
+                res.push_back(x);
+            }
+        }
+        return res;
+    }
+
    const ggml_tensor * src0() const {
        return node->src[0];
    }
@@ -37,10 +66,6 @@ struct htp_opnode {
        return ggml_op_is_empty(node->op);
    }

-    void add_fused(ggml_tensor * t) {
-        fused.push_back(t);
-    }
-
    bool stackable() const {
        switch (this->op()) {
            case GGML_OP_MUL_MAT:
@@ -131,87 +156,117 @@ struct htp_opformat {
    char types[16 * GGML_MAX_SRC];
    char buffs[64 * GGML_MAX_SRC];
    char names[64 * GGML_MAX_SRC];
+    char kparams[128];

-    int format_tensor_dims(char * str, const struct ggml_tensor * t) {
+    int format_tensor_dims(char * str, size_t max_size, const struct ggml_tensor * t) {
        if (!t) {
-            return sprintf(str, "NONE");
+            return snprintf(str, max_size, "NONE");
        }
        if (t->ne[2] == 1 && t->ne[3] == 1) {
-            return sprintf(str, "%d:%d", (int) t->ne[0], (int) t->ne[1]);
+            return snprintf(str, max_size, "%d:%d", (int) t->ne[0], (int) t->ne[1]);
        } else {
-            return sprintf(str, "%d:%d:%d:%d", (int) t->ne[0], (int) t->ne[1], (int) t->ne[2], (int) t->ne[3]);
+            return snprintf(str, max_size, "%d:%d:%d:%d", (int) t->ne[0], (int) t->ne[1], (int) t->ne[2], (int) t->ne[3]);
        }
    }

-    void format_op_dims(char * str, const htp_opnode & node) {
+    void format_op_dims(char * str, size_t max_size, const htp_opnode & node) {
        char * p = str;
+        char * p_end = str + max_size;
        auto inputs = node.get_inputs();

        if (!inputs.empty()) {
-            p += format_tensor_dims(p, inputs[0]);
+            p += std::min((size_t)format_tensor_dims(p, p_end - p, inputs[0]), (size_t)(p_end - p));

            for (size_t i = 1; i < inputs.size(); i++) {
-                p += sprintf(p, " x ");
-                p += format_tensor_dims(p, inputs[i]);
+                if (p < p_end) {
+                    p += std::min((size_t)snprintf(p, p_end - p, " x "), (size_t)(p_end - p));
+                }
+                if (p < p_end) {
+                    p += std::min((size_t)format_tensor_dims(p, p_end - p, inputs[i]), (size_t)(p_end - p));
+                }
            }

-            p += sprintf(p, " -> ");
+            if (p < p_end) {
+                p += std::min((size_t)snprintf(p, p_end - p, " -> "), (size_t)(p_end - p));
+            }
        }

        char self[64];
-        format_tensor_dims(self, node.dst());
-        p += sprintf(p, "%s", self);
+        format_tensor_dims(self, sizeof(self), node.dst());
+        if (p < p_end) {
+            p += std::min((size_t)snprintf(p, p_end - p, "%s", self), (size_t)(p_end - p));
+        }
    }

-    int format_tensor_strides(char * str, const struct ggml_tensor * t) {
+    int format_tensor_strides(char * str, size_t max_size, const struct ggml_tensor * t) {
        if (!t) {
-            return sprintf(str, "NONE");
+            return snprintf(str, max_size, "NONE");
        }
        const char * c = ggml_is_contiguous(t) ? "" : "!";

        if (t->ne[2] == 1 && t->ne[3] == 1) {
-            return sprintf(str, "%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], c);
+            return snprintf(str, max_size, "%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], c);
        } else {
-            return sprintf(str, "%zu:%zu:%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], (size_t) t->nb[2], (size_t) t->nb[3], c);
+            return snprintf(str, max_size, "%zu:%zu:%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], (size_t) t->nb[2], (size_t) t->nb[3], c);
        }
    }

-    void format_op_strides(char * str, const htp_opnode & node) {
+    void format_op_strides(char * str, size_t max_size, const htp_opnode & node) {
        char * p = str;
+        char * p_end = str + max_size;
        auto inputs = node.get_inputs();

        if (!inputs.empty()) {
-            p += format_tensor_strides(p, inputs[0]);
+            p += std::min((size_t)format_tensor_strides(p, p_end - p, inputs[0]), (size_t)(p_end - p));

            for (size_t i = 1; i < inputs.size(); i++) {
-                p += sprintf(p, " x ");
-                p += format_tensor_strides(p, inputs[i]);
+                if (p < p_end) {
+                    p += std::min((size_t)snprintf(p, p_end - p, " x "), (size_t)(p_end - p));
+                }
+                if (p < p_end) {
+                    p += std::min((size_t)format_tensor_strides(p, p_end - p, inputs[i]), (size_t)(p_end - p));
+                }
            }

-            p += sprintf(p, " -> ");
+            if (p < p_end) {
+                p += std::min((size_t)snprintf(p, p_end - p, " -> "), (size_t)(p_end - p));
+            }
        }

        char self[64];
-        format_tensor_strides(self, node.dst());
-        p += sprintf(p, "%s", self);
+        format_tensor_strides(self, sizeof(self), node.dst());
+        if (p < p_end) {
+            p += std::min((size_t)snprintf(p, p_end - p, "%s", self), (size_t)(p_end - p));
+        }
    }

-    void format_op_types(char * str, const htp_opnode & node) {
+    void format_op_types(char * str, size_t max_size, const htp_opnode & node) {
        char * p = str;
+        char * p_end = str + max_size;
        auto inputs = node.get_inputs();

        if (!inputs.empty()) {
-            p += sprintf(p, "%s", inputs[0] ? ggml_type_name(inputs[0]->type) : "NONE");
-
-            for (size_t i = 1; i < inputs.size(); i++) {
-                p += sprintf(p, " x ");
-                p += sprintf(p, "%s", inputs[i] ? ggml_type_name(inputs[i]->type) : "NONE");
+            if (p < p_end) {
+                p += std::min((size_t)snprintf(p, p_end - p, "%s", inputs[0] ? ggml_type_name(inputs[0]->type) : "NONE"), (size_t)(p_end - p));
            }

-            p += sprintf(p, " -> ");
+            for (size_t i = 1; i < inputs.size(); i++) {
+                if (p < p_end) {
+                    p += std::min((size_t)snprintf(p, p_end - p, " x "), (size_t)(p_end - p));
+                }
+                if (p < p_end) {
+                    p += std::min((size_t)snprintf(p, p_end - p, "%s", inputs[i] ? ggml_type_name(inputs[i]->type) : "NONE"), (size_t)(p_end - p));
+                }
+            }
+
+            if (p < p_end) {
+                p += std::min((size_t)snprintf(p, p_end - p, " -> "), (size_t)(p_end - p));
+            }
        }

-        p += sprintf(p, "%s", ggml_type_name(node.dst()->type));
+        if (p < p_end) {
+            p += std::min((size_t)snprintf(p, p_end - p, "%s", ggml_type_name(node.dst()->type)), (size_t)(p_end - p));
+        }
    }

    const char * tensor_buff_name(const struct ggml_tensor * t) {
@@ -221,51 +276,102 @@ struct htp_opformat {
        return "NONE";
    }

-    void format_op_buffs(char * str, const htp_opnode & node) {
+    void format_op_buffs(char * str, size_t max_size, const htp_opnode & node) {
        char * p = str;
+        char * p_end = str + max_size;
        auto inputs = node.get_inputs();

        if (!inputs.empty()) {
-            p += sprintf(p, "%s", tensor_buff_name(inputs[0]));
-
-            for (size_t i = 1; i < inputs.size(); i++) {
-                p += sprintf(p, " x ");
-                p += sprintf(p, "%s", tensor_buff_name(inputs[i]));
+            if (p < p_end) {
+                p += std::min((size_t)snprintf(p, p_end - p, "%s", tensor_buff_name(inputs[0])), (size_t)(p_end - p));
            }

-            p += sprintf(p, " -> ");
+            for (size_t i = 1; i < inputs.size(); i++) {
+                if (p < p_end) {
+                    p += std::min((size_t)snprintf(p, p_end - p, " x "), (size_t)(p_end - p));
+                }
+                if (p < p_end) {
+                    p += std::min((size_t)snprintf(p, p_end - p, "%s", tensor_buff_name(inputs[i])), (size_t)(p_end - p));
+                }
+            }
+
+            if (p < p_end) {
+                p += std::min((size_t)snprintf(p, p_end - p, " -> "), (size_t)(p_end - p));
+            }
        }

-        p += sprintf(p, "%s", tensor_buff_name(node.dst()));
+        if (p < p_end) {
+            p += std::min((size_t)snprintf(p, p_end - p, "%s", tensor_buff_name(node.dst())), (size_t)(p_end - p));
+        }
    }

-    void format_op_names(char * str, const htp_opnode & node) {
+    void format_op_names(char * str, size_t max_size, const htp_opnode & node) {
        char * p = str;
+        char * p_end = str + max_size;
        auto inputs = node.get_inputs();

        if (!inputs.empty()) {
-            p += sprintf(p, "%s", inputs[0] ? inputs[0]->name : "NONE");
-
-            for (size_t i = 1; i < inputs.size(); i++) {
-                p += sprintf(p, " x ");
-                p += sprintf(p, "%s", inputs[i] ? inputs[i]->name : "NONE");
+            if (p < p_end) {
+                p += std::min((size_t)snprintf(p, p_end - p, "%s", inputs[0] ? inputs[0]->name : "NONE"), (size_t)(p_end - p));
            }

-            p += sprintf(p, " -> ");
+            for (size_t i = 1; i < inputs.size(); i++) {
+                if (p < p_end) {
+                    p += std::min((size_t)snprintf(p, p_end - p, " x "), (size_t)(p_end - p));
+                }
+                if (p < p_end) {
+                    p += std::min((size_t)snprintf(p, p_end - p, "%s", inputs[i] ? inputs[i]->name : "NONE"), (size_t)(p_end - p));
+                }
+            }
+
+            if (p < p_end) {
+                p += std::min((size_t)snprintf(p, p_end - p, " -> "), (size_t)(p_end - p));
+            }
        }

-        p += sprintf(p, "%s", node.dst()->name);
+        if (p < p_end) {
+            p += std::min((size_t)snprintf(p, p_end - p, "%s", node.dst()->name), (size_t)(p_end - p));
+        }
+    }
+    void format_kernel_params(char * str, size_t max_size, const htp_opnode & node) {
+        if (node.opcode == HTP_OP_MUL_MAT || node.opcode == HTP_OP_MUL_MAT_ID ||
+            node.opcode == HTP_OP_MUL_MAT_QKV || node.opcode == HTP_OP_MUL_MAT_FFN) {
+            const auto * kparams = (const struct htp_mm_kernel_params *) node.kernel_params;
+            const char * path = "unknown";
+            int32_t type = kparams->kernel_type;
+            if (type == HTP_MM_KERNEL_HMX_2D || type == HTP_MM_KERNEL_HMX_F16_BATCHED) {
+                path = "hmx-tiled";
+            } else if (type == HTP_MM_KERNEL_HVX_F16_F16_VTCM || type == HTP_MM_KERNEL_HVX_F32_F32_VTCM ||
+                       type == HTP_MM_KERNEL_HVX_QUANT_ROW    || type == HTP_MM_KERNEL_HVX_QUANT_BLOCK) {
+                path = "hvx-tiled";
+            } else if (type == HTP_MM_KERNEL_HVX_F16_F16_DDR  || type == HTP_MM_KERNEL_HVX_F16_F32_DDR ||
+                       type == HTP_MM_KERNEL_HVX_F32_F32_DDR  || type == HTP_MM_KERNEL_HVX_F32_F16_DDR ||
+                       type == HTP_MM_KERNEL_HVX_QUANT_ROW_FLAT) {
+                path = "hvx-flat";
+            }
+            snprintf(str, max_size, "%s vtcm %d", path, (int) kparams->vtcm_size);
+        } else {
+            snprintf(str, max_size, "----");
+        }
    }

    void format(const htp_opnode & node) {
-        format_op_dims(dims, node);
-        format_op_strides(strides, node);
-        format_op_types(types, node);
-        format_op_buffs(buffs, node);
-        format_op_names(names, node);
+        format_op_dims(dims, sizeof(dims), node);
+        format_op_strides(strides, sizeof(strides), node);
+        format_op_types(types, sizeof(types), node);
+        format_op_buffs(buffs, sizeof(buffs), node);
+        format_op_names(names, sizeof(names), node);
+        format_kernel_params(kparams, sizeof(kparams), node);
    }

-    htp_opformat() {}
+    htp_opformat() {
+        strides[0] = '\0';
+        dims[0]    = '\0';
+        types[0]   = '\0';
+        buffs[0]   = '\0';
+        names[0]   = '\0';
+        kparams[0] = '\0';
+    }
    htp_opformat(const htp_opnode & node) { format(node); }
 };

@@ -19,43 +19,9 @@ add_library(${HTP_LIB} SHARED
    htp_iface_skel.c
    worker-pool.c
    hex-dma.c
-)
-
-target_compile_definitions(${HTP_LIB} PRIVATE
-    $<IF:$<BOOL:${HEXAGON_HTP_DEBUG}>,HTP_DEBUG=1,NDEBUG=1>
-    $<IF:$<BOOL:${HEXAGON_HTP_DEBUG}>,FARF_HIGH=1,>
-    FP32_QUANTIZE_GROUP_SIZE=${GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE})
-
-if (GGML_HEXAGON_FA_EXP2_HF)
-    message(STATUS "ggml-htp: HMX_FA_USE_EXP2_HF=1 (use FP16 exp2 polynomial in FA softmax)")
-    target_compile_definitions(${HTP_LIB} PRIVATE HMX_FA_USE_EXP2_HF=1)
-endif()
-
-# HMX acceleration: available on v73+ architectures
-set(HTP_HMX_VERSIONS v73 v75 v79 v81)
-list(FIND HTP_HMX_VERSIONS ${DSP_VERSION} _hmx_idx)
-
-if (_hmx_idx GREATER_EQUAL 0)
-    target_sources(${HTP_LIB} PRIVATE
-        hmx-flash-attn-ops.c
-        hmx-matmul-ops.c
-        hmx-queue.c
-    )
-
-    # -mhmx enables HMX instruction set (needed by files that include hmx-utils.h)
-    set_source_files_properties(
-        hmx-flash-attn-ops.c
-        hmx-matmul-ops.c
-        hmx-queue.c
-        PROPERTIES COMPILE_OPTIONS "-mhmx"
-    )
-
-    target_compile_definitions(${HTP_LIB} PRIVATE HTP_HAS_HMX=1)
-endif()
-
-build_idl(htp_iface.idl ${HTP_LIB})
-
-target_sources(${HTP_LIB} PRIVATE
+    hmx-queue.c
+    flash-attn-ops.c
+    hmx-flash-attn-ops.c
    matmul-ops.c
    binary-ops.c
    unary-ops.c
@@ -63,7 +29,6 @@ target_sources(${HTP_LIB} PRIVATE
    softmax-ops.c
    act-ops.c
    rope-ops.c
-    flash-attn-ops.c
    set-rows-ops.c
    get-rows-ops.c
    cpy-ops.c
@@ -79,6 +44,17 @@ target_sources(${HTP_LIB} PRIVATE
    pad-ops.c
 )

+target_compile_definitions(${HTP_LIB} PRIVATE
+    $<IF:$<BOOL:${HEXAGON_HTP_DEBUG}>,HTP_DEBUG=1,NDEBUG=1>
+    $<IF:$<BOOL:${HEXAGON_HTP_DEBUG}>,FARF_HIGH=1,>)
+
+if (GGML_HEXAGON_FA_EXP2_HF)
+    message(STATUS "ggml-htp: HMX_FA_USE_EXP2_HF=1 (use FP16 exp2 polynomial in FA softmax)")
+    target_compile_definitions(${HTP_LIB} PRIVATE HMX_FA_USE_EXP2_HF=1)
+endif()
+
+build_idl(htp_iface.idl ${HTP_LIB})
+
 set_target_properties(${HTP_LIB} PROPERTIES EXPORT_COMPILE_COMMANDS ON)

 install(TARGETS ${HTP_LIB})
@@ -3,7 +3,7 @@ if (HEXAGON_TOOLCHAIN_INCLUDED)
 endif()
 set(HEXAGON_TOOLCHAIN_INCLUDED true)

-#Cross Compiling for Hexagon
+# Cross Compiling for Hexagon
 set(HEXAGON TRUE)
 set(CMAKE_SYSTEM_NAME QURT)
 set(CMAKE_SYSTEM_PROCESSOR Hexagon)
@@ -14,7 +14,6 @@ set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
 set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)
 set(CUSTOM_RUNELF_PATH "")

-#To fix backward compatibility with EAI addon.
 if (NOT HEXAGON_SDK_ROOT)
    set(HEXAGON_SDK_ROOT $ENV{HEXAGON_SDK_ROOT})
 endif()
@@ -31,7 +30,6 @@ endif()
 file(TO_CMAKE_PATH "${HEXAGON_TOOLS_ROOT}" HEXAGON_TOOLS_ROOT)
 file(TO_CMAKE_PATH "${HEXAGON_SDK_ROOT}"   HEXAGON_SDK_ROOT)

-#Get the Binary extension of the Hexagon Toolchain
 if(CMAKE_HOST_SYSTEM_NAME STREQUAL Windows)
    set(HEXAGON_TOOLCHAIN_SUFFIX .exe)
 endif()
@@ -48,12 +46,12 @@ set(CMAKE_TRY_COMPILE_PLATFORM_VARIABLES
    HEXAGON_TOOLS_ROOT
 )

-#QURT Related includes and linker flags
+# QURT Related includes and linker flags
 set(V_ARCH ${HEXAGON_ARCH})
 set(_QURT_INSTALL_DIR "${HEXAGON_SDK_ROOT}/rtos/qurt/ADSP${V_ARCH}MP${V_ARCH_EXTN}")
 set(_QURT_INSTALL_DIR "${HEXAGON_SDK_ROOT}/rtos/qurt/compute${V_ARCH}${V_ARCH_EXTN}")

-if( ${TREE} MATCHES PAKMAN )
+if (${TREE} MATCHES PAKMAN)
    set(_QURT_INSTALL_DIR "${QURT_IMAGE_DIR}/compute${V_ARCH}${V_ARCH_EXTN}")
 endif()
 message(DEBUG "_QURT_INSTALL_DIR:${_QURT_INSTALL_DIR}")
@@ -83,11 +81,9 @@ set(QURT_START_LINK_LIBS
    )
 STRING(REPLACE ";" " " QURT_START_LINK_LIBS "${QURT_START_LINK_LIBS}")

-set(QURT_END_LINK_LIBS
-    ${TARGET_DIR}/fini.o
-    )
+set(QURT_END_LINK_LIBS ${TARGET_DIR}/fini.o)

-#Non QURT related includes and linker flags
+# Non QURT related includes and linker flags

 set(TARGET_DIR_NOOS "${HEXAGON_TOOLCHAIN}/Tools/target/hexagon/lib/${HEXAGON_ARCH}")

@@ -99,8 +95,10 @@ if (NOT NO_WRAP_MEM_API)
    set(WRAP_MEMALIGN -Wl,--wrap=memalign)
 endif()

+set(ARCH_FLAGS "-mcpu=${V_ARCH} -m${V_ARCH} -mhvx=${V_ARCH} -mhmx")
+
 set(PIC_SHARED_LD_FLAGS
-    -mcpu=${V_ARCH} -m${V_ARCH} -mhvx=${V_ARCH}
+    ${ARCH_FLAGS}
    -G0
    -fpic
    -Wl,-Bsymbolic
@@ -120,13 +118,13 @@ STRING(REPLACE ";" " " PIC_SHARED_LD_FLAGS "${PIC_SHARED_LD_FLAGS}")

 set(HEXAGON_PIC_SHARED_LINK_OPTIONS "${PIC_SHARED_LD_FLAGS}")

-#System include paths
+# System include paths
 include_directories(SYSTEM ${HEXAGON_SDK_ROOT}/incs)
 include_directories(SYSTEM ${HEXAGON_SDK_ROOT}/incs/stddef)
 include_directories(SYSTEM ${HEXAGON_SDK_ROOT}/ipc/fastrpc/incs)

-#LLVM toolchain setup
-#Compiler paths, options and architecture
+# LLVM toolchain setup
+# Compiler paths, options and architecture
 set(CMAKE_C_COMPILER ${HEXAGON_TOOLCHAIN}/Tools/bin/hexagon-clang${HEXAGON_TOOLCHAIN_SUFFIX})
 set(CMAKE_CXX_COMPILER ${HEXAGON_TOOLCHAIN}/Tools/bin/hexagon-clang++${HEXAGON_TOOLCHAIN_SUFFIX})
 set(CMAKE_AR ${HEXAGON_TOOLCHAIN}/Tools/bin/hexagon-ar${HEXAGON_TOOLCHAIN_SUFFIX})
@@ -137,8 +135,8 @@ set(CMAKE_PREFIX_PATH ${HEXAGON_TOOLCHAIN}/Tools/target/hexagon)
 set(CMAKE_SHARED_LIBRARY_SONAME_C_FLAG   "-Wl,-soname,")
 set(CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG "-Wl,-soname,")

-#Compiler Options
-set(COMMON_FLAGS "-mcpu=hexagon${V_ARCH} -m${V_ARCH} -mhvx=${V_ARCH} -fvectorize -flto -Wall -Werror -fno-zero-initialized-in-bss -G0 -fdata-sections -fpic ${XQF_ARGS}")
+# Compiler Options
+set(COMMON_FLAGS "${ARCH_FLAGS} -fvectorize -flto -Wall -Werror -fno-zero-initialized-in-bss -G0 -fdata-sections -fpic ${XQF_ARGS}")

 set(CMAKE_CXX_FLAGS_DEBUG          "${COMMON_FLAGS} -O0 -D_DEBUG -g")
 set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${COMMON_FLAGS} -O2 -g")
@@ -18,7 +18,8 @@
 #include "htp-ctx.h"
 #include "htp-ops.h"
 #include "htp-ops.h"
-#include "hmx-ops.h"
+
+int hmx_flash_attn_ext(struct htp_ops_context * octx);

 // Must be multiple of 32
 #define FLASH_ATTN_BLOCK_SIZE (32 * 2)
@@ -633,7 +634,6 @@ int op_flash_attn_ext(struct htp_ops_context * octx) {
        return HTP_STATUS_NO_SUPPORT;
    }

-#ifdef HTP_HAS_HMX
    // HMX path: head_dim multiple of 64, F16 KV, and no sinks
    if (k->type == HTP_TYPE_F16 && v->type == HTP_TYPE_F16 && k->ne[0] % 64 == 0 && v->ne[0] % 64 == 0 && octx->src[4] == NULL) {
        int ret = hmx_flash_attn_ext(octx);
@@ -642,7 +642,6 @@ int op_flash_attn_ext(struct htp_ops_context * octx) {
        }
        // VTCM too small or other failure -> fall through to HVX path
    }
-#endif

    struct htp_fa_context factx;
    factx.octx = octx;
@@ -0,0 +1,80 @@
+#ifndef HEX_COMMON_H
+#define HEX_COMMON_H
+
+#include <stdint.h>
+#include <stddef.h>
+#include <stdbool.h>
+
+#ifndef SIZE_MAX
+#define SIZE_MAX ((size_t)-1)
+#endif
+
+#ifndef MAX
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+#endif
+
+#ifndef MIN
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#endif
+
+static inline uint32_t hex_ceil_pow2(uint32_t x) {
+    if (x <= 1) { return 1; }
+    int p = 2;
+    x--;
+    while (x >>= 1) { p <<= 1; }
+    return p;
+}
+
+static inline size_t hmx_ceil_div(size_t num, size_t den) {
+    return (num + den - 1) / den;
+}
+
+static inline int32_t hex_is_aligned(const void * addr, uint32_t align) {
+    return ((size_t) addr & (align - 1)) == 0;
+}
+
+static inline size_t hex_align_up(size_t v, size_t align) {
+    return hmx_ceil_div(v, align) * align;
+}
+
+static inline size_t hex_align_down(size_t v, size_t align) {
+    return (v / align) * align;
+}
+
+static inline int32_t hex_is_one_chunk(void * addr, uint32_t n, uint32_t chunk_size) {
+    uint32_t left_off  = (size_t) addr & (chunk_size - 1);
+    uint32_t right_off = left_off + n;
+    return right_off <= chunk_size;
+}
+
+static inline uint32_t hex_round_up(uint32_t n, uint32_t m) {
+    return m * ((n + m - 1) / m);
+}
+
+static inline size_t hex_smin(size_t a, size_t b) {
+    return a < b ? a : b;
+}
+
+static inline size_t hex_smax(size_t a, size_t b) {
+    return a > b ? a : b;
+}
+
+static inline void hex_swap_ptr(void ** p1, void ** p2) {
+    void * t = *p1;
+    *p1      = *p2;
+    *p2      = t;
+}
+
+static inline bool hex_mul_overflow(size_t a, size_t b, size_t *out) {
+    if (a != 0 && b > SIZE_MAX / a) return true;
+    *out = a * b;
+    return false;
+}
+
+static inline bool hex_add_overflow(size_t a, size_t b, size_t *out) {
+    if (a > SIZE_MAX - b) return true;
+    *out = a + b;
+    return false;
+}
+
+#endif // HEX_COMMON_H
@@ -5,6 +5,7 @@
 #include <hexagon_types.h>
 #include <stdbool.h>
 #include <stdint.h>
+#include "hex-utils.h"

 #include "hex-profile.h"

@@ -127,13 +128,8 @@ static inline dma_ptr dma_make_ptr(void *dst, const void *src)
    return p;
 }

-#if __HVX_ARCH__ < 73
-static const uint32_t dma_src_l2_bypass_on = 1;
-static const uint32_t dma_dst_l2_bypass_on = 0;
-#else
 static const uint32_t dma_src_l2_bypass_on = 1;
 static const uint32_t dma_dst_l2_bypass_on = 1;
-#endif

 static inline bool dma_queue_push_single_1d(dma_queue * q, dma_ptr dptr, size_t size) {
    if (((q->push_idx + 1) & q->idx_mask) == q->pop_idx) {
@@ -11,14 +11,7 @@

 #include "hex-fastdiv.h"
 #include "hex-dump.h"
-
-#ifndef MAX
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
-#endif
-
-#ifndef MIN
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
-#endif
+#include "hex-common.h"

 static inline uint64_t hex_get_cycles() {
    uint64_t cycles = 0;
@@ -32,54 +25,6 @@ static inline uint64_t hex_get_pktcnt() {
    return pktcnt;
 }

-static inline uint32_t hex_ceil_pow2(uint32_t x) {
-    if (x <= 1) { return 1; }
-    int p = 2;
-    x--;
-    while (x >>= 1) { p <<= 1; }
-    return p;
-}
-
-static inline size_t hmx_ceil_div(size_t num, size_t den) {
-    return (num + den - 1) / den;
-}
-
-static inline int32_t hex_is_aligned(const void * addr, uint32_t align) {
-    return ((size_t) addr & (align - 1)) == 0;
-}
-
-static inline size_t hex_align_up(size_t v, size_t align) {
-    return hmx_ceil_div(v, align) * align;
-}
-
-static inline size_t hex_align_down(size_t v, size_t align) {
-    return (v / align) * align;
-}
-
-static inline int32_t hex_is_one_chunk(void * addr, uint32_t n, uint32_t chunk_size) {
-    uint32_t left_off  = (size_t) addr & (chunk_size - 1);
-    uint32_t right_off = left_off + n;
-    return right_off <= chunk_size;
-}
-
-static inline uint32_t hex_round_up(uint32_t n, uint32_t m) {
-    return m * ((n + m - 1) / m);
-}
-
-static inline size_t hex_smin(size_t a, size_t b) {
-    return a < b ? a : b;
-}
-
-static inline size_t hex_smax(size_t a, size_t b) {
-    return a > b ? a : b;
-}
-
-static inline void hex_swap_ptr(void ** p1, void ** p2) {
-    void * t = *p1;
-    *p1      = *p2;
-    *p2      = t;
-}
-
 static inline void hex_l2fetch(const void * p, uint32_t width, uint32_t stride, uint32_t height) {
    const uint64_t control = Q6_P_combine_RR(stride, Q6_R_combine_RlRl(width, height));
    Q6_l2fetch_AP((void *) p, control);
@@ -49,7 +49,7 @@
 // g_br = hex_align_up(gqa_factor * Br, 32) replaces Br for all Q/O/S/P/D dimensions.
 // Layout: Q + O_ping + O_pong + K_dma*2 + V_dma*2 + K_tile + V_tile + S + P + D + vectors + scales
 // Mask is DMA'd into a VTCM buffer (Br rows per KV block) to avoid DDR reads in softmax.
-static size_t hmx_fa_compute_vtcm_usage(size_t gqa_factor, size_t DK, size_t DV, size_t Br, size_t Bc, size_t n_threads, bool use_pipeline) {
+static size_t hmx_fa_compute_vtcm_usage(size_t gqa_factor, size_t DK, size_t DV, size_t Br, size_t Bc, size_t n_threads, bool pipeline) {
    const size_t g_br         = hex_align_up(gqa_factor * Br, HMX_FP16_TILE_N_ROWS);
    const size_t q_tile_size  = hex_align_up(g_br * DK * sizeof(__fp16), 4096);    // Q:  [g_br, DK]
    const size_t o_tile_size  = hex_align_up(g_br * DV * sizeof(__fp16), 4096);    // O:  [g_br, DV] x2 ping-pong
@@ -70,7 +70,7 @@ static size_t hmx_fa_compute_vtcm_usage(size_t gqa_factor, size_t DK, size_t DV,
           + k_dma_size  * 2               // K DMA x2
           + v_dma_size  * 2               // V DMA x2
           + k_tile_size * 1               // K tiles
-           + v_tile_size * (use_pipeline ? 2 : 1) // V tiles (double-buffered if pipelining)
+           + v_tile_size * (pipeline ? 2 : 1) // V tiles (double-buffered if pipelining)
           + s_tile_size * 2               // S + P
           + d_tile_size * 1               // D (diagonal matrix)
           + col_vec_size * 4              // m_vec, l_vec, s_rowmax, p_rowsum
@@ -290,7 +290,7 @@ static const int16_t d_tile_scatter_offsets[64] __attribute__((aligned(128))) =

 struct hmx_fa_context {
    const struct htp_ops_context * octx;
-    bool         use_pipeline;  // true when n_kv_blocks >= FA_MIN_KV_BLOCKS && n_threads >= 2
+    bool         pipeline;  // true when n_kv_blocks >= FA_MIN_KV_BLOCKS && n_threads >= 2
    uint32_t     n_threads;

    // Op parameters
@@ -409,7 +409,7 @@ static void fa_v_interleave_thread(unsigned int n, unsigned int i, void * data)
        return;
    }

-    __fp16 * v_tiles_dest = factx->use_pipeline ? factx->vtcm_v_tiles[args->buf_idx] : factx->vtcm_v_tiles[0];
+    __fp16 * v_tiles_dest = factx->pipeline ? factx->vtcm_v_tiles[args->buf_idx] : factx->vtcm_v_tiles[0];

    struct htp_thread_trace * tr = factx->octx->ctx ? &factx->octx->ctx->trace[i] : NULL;
    htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, start);
@@ -1312,13 +1312,13 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
    const size_t g_br = hex_align_up(G * Br, HMX_FP16_TILE_N_ROWS);

    const uint32_t n_kv_blocks  = (nek1 + Bc - 1) / Bc;
-    const bool     use_pipeline = (n_kv_blocks >= FA_MIN_KV_BLOCKS && n_threads_init >= 2);
+    const bool     pipeline = (n_kv_blocks >= FA_MIN_KV_BLOCKS && n_threads_init >= 2);

    // Bypass thread pool dispatch for small prompts/non-pipelined prefill by setting n_threads = 1
-    const uint32_t n_threads = use_pipeline ? n_threads_init : 1;
+    const uint32_t n_threads = pipeline ? n_threads_init : 1;

    FARF(HIGH, "hmx-fa: neq1=%u nek1=%u DK=%u DV=%u G=%u Br=%zu Bc=%zu g_br=%zu n_kv_blocks=%u pipeline=%d vtcm=%zu",
-         neq1, nek1, DK, DV, G, Br, Bc, g_br, n_kv_blocks, use_pipeline, vtcm_budget);
+         neq1, nek1, DK, DV, G, Br, Bc, g_br, n_kv_blocks, pipeline, vtcm_budget);

    // ======== Build context ========
    struct hmx_fa_context factx;
@@ -1339,7 +1339,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
    factx.n_kv_blocks    = n_kv_blocks;
    factx.is_q_fp32      = (q->type == HTP_TYPE_F32);
    factx.is_dst_fp32    = (dst->type == HTP_TYPE_F32);
-    factx.use_pipeline   = use_pipeline;
+    factx.pipeline   = pipeline;
    factx.mask_broadcast = (mask != NULL && mask->ne[2] == 1);

    // Extract op parameters (mutable during softcap adjustment, then stored as const in factx)
@@ -1405,7 +1405,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
    factx.vtcm_v_fp16[1]      = (__fp16 *) vtcm_seq_alloc(&vtcm_cur, v_dma_bytes);
    factx.vtcm_k_tiles        = (__fp16 *) vtcm_seq_alloc(&vtcm_cur, k_tile_bytes);
    factx.vtcm_v_tiles[0]     = (__fp16 *) vtcm_seq_alloc(&vtcm_cur, v_tile_bytes);
-    if (use_pipeline) {
+    if (pipeline) {
        factx.vtcm_v_tiles[1] = (__fp16 *) vtcm_seq_alloc(&vtcm_cur, v_tile_bytes);
    } else {
        factx.vtcm_v_tiles[1] = NULL;
@@ -1456,7 +1456,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
    // ======== HMX lock strategy ========
    // Pipeline: queue thread auto-acquires HMX lock on first push; released by suspend.
    // Fallback: main thread holds the lock (original behavior).
-    if (!factx.use_pipeline) {
+    if (!factx.pipeline) {
        HAP_compute_res_hmx_lock(ctx->vtcm_rctx);
    }

@@ -1550,7 +1550,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
                const size_t k_src_stride = size_k_row_padded / sizeof(__fp16);
                const size_t v_src_stride = size_v_row_padded / sizeof(__fp16);

-                if (factx.use_pipeline) {
+                if (factx.pipeline) {
                    // ==================================================================
                    // Pipeline path: HVX phases ‖ HMX queue worker
                    // ==================================================================
@@ -1780,7 +1780,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
                    fa_build_d_diag_inv_l(&factx, n_row_tiles, n_row_tiles_g_br);

                    // HMX: O_final = diag(1/l) @ O_prev
-                    if (factx.use_pipeline) {
+                    if (factx.pipeline) {
                        on_job.o_curr           = o_tile_curr;
                        on_job.o_prev           = o_tile_prev;
                        on_job.d_tiles          = factx.vtcm_d_tiles;
@@ -1826,7 +1826,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
        }  // end KV head loop
    }  // end batch loop

-    if (factx.use_pipeline) {
+    if (factx.pipeline) {
        hmx_queue_suspend(ctx->hmx_queue);
    } else {
        HAP_compute_res_hmx_unlock(ctx->vtcm_rctx);
@@ -1,6 +0,0 @@
-// HMX operations compiled as a single translation unit.
-// This allows interprocedural optimizations within HMX ops without requiring global HTP LTO.
-
-#include "hmx-queue.c"
-#include "hmx-matmul-ops.c"
-#include "hmx-flash-attn-ops.c"
@@ -1,88 +0,0 @@
-// HMX operation entry-point declarations.
-// Ported from htp-ops-lib/include/dsp/ops.h (renamed, benchmark kernels removed). (https://github.com/haozixu/htp-ops-lib)
-
-#ifndef HMX_OPS_H
-#define HMX_OPS_H
-
-#include <stddef.h>
-#include <stdint.h>
-
-#include "htp-ops.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct {
-    float        *dst;
-    const float  *activation;
-    const __fp16 *permuted_weight;
-    int           m;
-    int           k;
-    int           n;
-    int           act_stride;
-    int           weight_stride;
-    int           dst_stride;
-    int           ne02;
-    int           ne03;
-    int           ne12;
-    int           ne13;
-    size_t        src0_nb2;
-    size_t        src0_nb3;
-    size_t        src1_nb2;
-    size_t        src1_nb3;
-    size_t        dst_nb2;
-    size_t        dst_nb3;
-} hmx_matmul_f16_f32_batched_params_t;
-
-// HMX matrix multiplication — tile-permuted FP16 weights, FP32 activation/output
-// act_stride: activation row stride in elements (= k for contiguous, or
-//             nb[1]/sizeof(float) for permuted tensors like attention Q).
-// weight_stride: weight row stride in elements (= k for compact weights, or
-//                nb[1]/sizeof(__fp16) for permuted KV-cache views used by QK).
-int hmx_matmul_f16_f32(struct htp_context *ctx,
-                                float *restrict dst,
-                                const float *activation,
-                                const __fp16 *permuted_weight,
-                                int m, int k, int n,
-                                int act_stride,
-                                int weight_stride);
-
-// Batched F16 wrapper over hmx_mat_mul_f16_f32.
-// Batch semantics match ggml_mul_mat(): src0 broadcasts to src1 in dims 2/3.
-int hmx_matmul_f16_f32_batched(struct htp_context *ctx, const hmx_matmul_f16_f32_batched_params_t *params);
-
-// HMX matrix multiplication — all supported weight types (F16/F32/Q4_0/Q4_1/Q8_0/IQ4_NL/MXFP4)
-int hmx_matmul_2d_f32(struct htp_context *ctx,
-                                      float *restrict dst,
-                                      const float *activation,
-                                      const uint8_t *permuted_weight,
-                                      int m, int k, int n,
-                                      int act_stride,
-                                      int weight_stride,
-                                      int weight_type);
-
-struct mmid_row_mapping;
-
-int hmx_matmul_id_2d_f32(struct htp_context *ctx,
-                                         float *restrict dst,
-                                         const float *activation,
-                                         const uint8_t *permuted_weight,
-                                         int m, int k, int n,
-                                         int ne11,
-                                         size_t act_nb1, size_t act_nb2,
-                                         size_t dst_nb1, size_t dst_nb2,
-                                         int weight_stride,
-                                         int weight_type,
-                                         const struct mmid_row_mapping *matrix_rows,
-                                         int cur_a,
-                                         int mapping_stride);
-
-// HMX flash attention
-int hmx_flash_attn_ext(struct htp_ops_context * octx);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif // HMX_OPS_H
@@ -13,7 +13,9 @@
 #include <stdint.h>
 #include <stdbool.h>

+#ifndef HTP_MAX_NTHREADS
 #define HTP_MAX_NTHREADS 10
+#endif
 #define HTP_MAX_MMAPS    16

 // Memory mapping
@@ -42,9 +44,13 @@ struct htp_ops_context {

    enum htp_op_code    op; // FIXME: rename to opcode
    int32_t             op_params[HTP_OP_MAX_PARAMS];
+    int32_t             kernel_params[HTP_OP_MAX_KERN_PARAMS];

    const struct htp_tensor * src[HTP_OP_MAX_INPUTS];
-    const struct htp_tensor * dst;
+    union {
+        const struct htp_tensor * dst;
+        const struct htp_tensor * dsts[HTP_OP_MAX_OUTPUTS];
+    };

    // TODO convert these to an array
    struct htp_spad src0_spad;
@@ -87,13 +93,13 @@ struct htp_context {

    struct htp_ops_context octx;

-#ifdef HTP_HAS_HMX
    struct hmx_queue *     hmx_queue; // Async HMX queue for pipeline overlap
-#endif
 };

 int op_matmul(struct htp_ops_context * octx);
 int op_matmul_id(struct htp_ops_context * octx);
+int op_matmul_qkv(struct htp_ops_context * octx);
+int op_matmul_ffn(struct htp_ops_context * octx);
 int op_binary(struct htp_ops_context * octx);
 int op_unary(struct htp_ops_context * octx);
 int op_sum_rows(struct htp_ops_context * octx);
@@ -28,18 +28,19 @@ enum htp_data_type {
    HTP_TYPE_MXFP4  = 39,

    // types used internally for repack, dyn.quant, etc
-    HTP_TYPE_Q4_0x4x2 = 200,
-    HTP_TYPE_Q4_1x4x2,
-    HTP_TYPE_Q8_0x4x2,
-    HTP_TYPE_MXFP4x4x2,
+    HTP_TYPE_Q4_0_TILED = 200,
+    HTP_TYPE_Q4_1_TILED,
+    HTP_TYPE_Q8_0_TILED,
+    HTP_TYPE_MXFP4_TILED,

    HTP_TYPE_INVALID
 };

 // Constats for internal types
-#define QK_Q4_0x4x2  256  // 4x Q4_0  blocks packed with next 4x Q4_0 blocks (size in bytes 128)
-#define QK_Q8_0x4x2  256  // 4x Q8_0  blocks concat with next 4x Q8_0 blocks
-#define QK_MXFP4x4x2 256  // 4x MXFP4 blocks concat with next 4x MXFP4 blocks
+#define QK_Q4_0_TILED  256  // 32x32 Q4_0 tiled layout
+#define QK_Q8_0_TILED  128  // 32x32 Q8_0 tiled layout
+#define QK_MXFP4_TILED 256  // 32x32 MXFP4 tiled layout
+


 // Mask to enable various stages of the Ops.
@@ -57,6 +58,8 @@ enum htp_op_code {
    HTP_OP_DIV = 3,
    HTP_OP_MUL_MAT,
    HTP_OP_MUL_MAT_ID,
+    HTP_OP_MUL_MAT_QKV,
+    HTP_OP_MUL_MAT_FFN,
    HTP_OP_RMS_NORM,
    HTP_OP_RMS_NORM_MUL,
    HTP_OP_UNARY_SILU,
@@ -99,7 +102,9 @@ enum htp_op_code {

 #define HTP_OP_MAX_DIMS    4    // aka GGML_MAX_DIMS
 #define HTP_OP_MAX_INPUTS  6    // aka GGML_MAX_SRCS
+#define HTP_OP_MAX_OUTPUTS 4
 #define HTP_OP_MAX_PARAMS  16   // aka GGML_MAX_OP_PARAMS
+#define HTP_OP_MAX_KERN_PARAMS 32

 #define HTP_OP_MAX_BUFS    16
 #define HTP_OP_MAX_REQS    256
@@ -142,8 +147,10 @@ struct htp_op_desc {
    uint32_t opcode;                    // GGML/HTP Op
    uint32_t flags;                     // Op flags
    int32_t  params[HTP_OP_MAX_PARAMS]; // Params for the op, e.g. epsilon of RMS norm
+    int32_t  kernel_params[HTP_OP_MAX_KERN_PARAMS]; // generic blob for host-precomputed parameters
    uint16_t src[HTP_OP_MAX_INPUTS];    // Input tensors indices
-    uint16_t dst;                       // Output tensor index
+    uint16_t dst[HTP_OP_MAX_OUTPUTS];   // Output tensor indices
+    uint16_t pad[2];                    // padding to align to 64 bits
 };

 #ifndef HTP_MAX_NTHREADS
@@ -11,12 +11,13 @@ struct htp_iface_pmu_conf {
 };

 interface htp_iface : remote_handle64 {
-    AEEResult start(in uint32 sess_id, in uint64 dsp_queue_id, in uint32 n_hvx, in uint32 use_hmx, in uint64 max_vmem);
+    AEEResult start(in uint32 sess_id, in uint64 dsp_queue_id, in uint32 n_hvx, in uint32 n_hmx, in uint64 max_vmem);
    AEEResult stop();
    AEEResult mmap(in uint32 fd, in uint32 size);
    AEEResult munmap(in uint32 fd);
    AEEResult profiler(in uint32 mode, in htp_iface_pmu_conf pmu);
    AEEResult etm(in uint32 enable);
+    AEEResult hwinfo(rout uint32 n_threads, rout uint32 n_hvx, rout uint32 n_hmx, rout uint64 vtcm_size);
 };

 #endif /* HTP_IDL */
@@ -170,25 +170,7 @@ static inline HVX_VectorPair hvx_vec_f16_to_f32(HVX_Vector v) {
 }
 #endif

-/* Q6_Vsf_equals_Vw is only available on v73+.*/
-#if __HVX_ARCH__ < 73
-static inline HVX_Vector hvx_vec_i32_to_qf32(HVX_Vector const in)
-{
-    HVX_Vector const vzero = Q6_V_vzero();
-    HVX_VectorPred is_zero = Q6_Q_vcmp_eq_VwVw(in, vzero);
-    HVX_Vector lshift = Q6_Vw_vnormamt_Vw(in);
-    HVX_Vector normalized = Q6_Vw_vasl_VwVw(in, lshift);
-    HVX_Vector vexp = Q6_Vw_vsub_VwVw(Q6_V_vsplat_R(0x7f + 30), lshift);
-    HVX_Vector mant = Q6_V_vand_VV(Q6_V_vsplat_R(0xFFFFFF00), normalized);
-    HVX_Vector ret = Q6_V_vmux_QVV(is_zero, vzero, Q6_Vw_vadd_VwVw(mant, vexp));
-    return ret;
-}

-static inline HVX_Vector Q6_Vsf_equals_Vw(HVX_Vector const in)
-{
-    return Q6_Vsf_equals_Vqf32(hvx_vec_i32_to_qf32(in));
-}
-#endif

 static inline HVX_Vector hvx_vec_i16_from_hf_rnd_sat(HVX_Vector vin) {
    // This looks complicated.
@@ -305,4 +287,17 @@ static inline HVX_Vector hvx_vec_mul_f32_f32(HVX_Vector a, HVX_Vector b) {

 #endif // __HVX_ARCH__ < 79

+static inline HVX_Vector hvx_vec_load_act_tile(const uint8_t * y_q, uint32_t kt, HVX_Vector * v_act_all) {
+    if (kt % 4 == 0) {
+        *v_act_all = hvx_vmem(y_q + kt * 32);
+        return *v_act_all;
+    } else if (kt % 4 == 1) {
+        return Q6_V_vror_VR(*v_act_all, 32);
+    } else if (kt % 4 == 2) {
+        return Q6_V_vror_VR(*v_act_all, 64);
+    } else {
+        return Q6_V_vror_VR(*v_act_all, 96);
+    }
+}
+
 #endif /* HVX_BASE_H */
@@ -361,7 +361,7 @@ static void vtcm_free(struct htp_context * ctx) {
 static void htp_packet_callback(dspqueue_t queue, int error, void * context);
 static void htp_error_callback(dspqueue_t queue, int error, void * context);

-AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_queue_id, uint32 n_hvx, uint32 use_hmx, uint64_t max_vmem) {
+AEEResult htp_iface_start(remote_handle64 handle, uint32_t sess_id, uint64_t dsp_queue_id, uint32_t n_hvx, uint32_t n_hmx, uint64_t max_vmem) {
    struct htp_context * ctx = (struct htp_context *) handle;

    if (!ctx) {
@@ -395,10 +395,9 @@ AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_que
        return AEE_ENOMEMORY;
    }

-#ifdef HTP_HAS_HMX
-    ctx->hmx_enabled = use_hmx;
+    ctx->hmx_enabled = n_hmx;
    ctx->hmx_queue   = NULL;
-    if (use_hmx) {
+    if (n_hmx) {
        ctx->hmx_queue = hmx_queue_create(16, ctx->vtcm_rctx);
        if (ctx->hmx_queue) {
            ctx->hmx_queue->trace = &ctx->trace[HTP_MAX_NTHREADS];
@@ -407,8 +406,7 @@ AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_que
            ctx->hmx_enabled = false;
        }
    }
-    FARF(HIGH, "HMX %s (use_hmx=%d)", ctx->hmx_enabled ? "enabled" : "disabled", use_hmx);
-#endif
+    FARF(HIGH, "HMX %s (n_hmx=%d)", ctx->hmx_enabled ? "enabled" : "disabled", n_hmx);

    qurt_sysenv_max_hthreads_t hw_threads;
    qurt_sysenv_get_max_hw_threads(&hw_threads);
@@ -481,13 +479,11 @@ AEEResult htp_iface_stop(remote_handle64 handle) {
        dma_queue_delete(ctx->dma[i]);
    }

-#ifdef HTP_HAS_HMX
    if (ctx->hmx_queue) {
        hmx_queue_delete(ctx->hmx_queue);
        ctx->hmx_queue = NULL;
    }
    ctx->hmx_enabled = false;
-#endif

    vtcm_free(ctx);

@@ -500,6 +496,36 @@ AEEResult htp_iface_stop(remote_handle64 handle) {
    return AEE_SUCCESS;
 }

+AEEResult htp_iface_hwinfo(remote_handle64 handle, uint32_t * n_threads, uint32_t * n_hvx, uint32_t * n_hmx, uint64_t * vtcm_size) {
+    (void)handle;
+    if (!n_threads || !n_hvx || !n_hmx || !vtcm_size) {
+        return AEE_EBADPARM;
+    }
+
+    qurt_sysenv_max_hthreads_t hw_threads;
+    qurt_sysenv_get_max_hw_threads(&hw_threads);
+    uint32_t hw_nhvx = (qurt_hvx_get_units() >> 8) & 0xFF;
+
+    uint32_t n_hvx_val = hw_nhvx;
+    if (n_hvx_val > hw_threads.max_hthreads) {
+        n_hvx_val = hw_threads.max_hthreads;
+    }
+    if (n_hvx_val > HTP_MAX_NTHREADS) {
+        n_hvx_val = HTP_MAX_NTHREADS;
+    }
+
+    // for now we force n_threads == n_hvx
+    *n_threads = n_hvx_val;
+    *n_hvx     = n_hvx_val;
+    *n_hmx     = 1;
+
+    uint32_t vtcm_sz = 8 * 1024 * 1024; // 8MB default fallback
+    HAP_compute_res_query_VTCM(0, (unsigned int *)&vtcm_sz, NULL, NULL, NULL);
+    *vtcm_size = vtcm_sz;
+
+    return AEE_SUCCESS;
+}
+
 static void htp_error_callback(dspqueue_t queue, int error, void * context) {
    // No errors expected on the DSP.
    FARF(ERROR, "Error callback: 0x%08x", (unsigned) error);
@@ -554,6 +580,12 @@ static int execute_op(struct htp_ops_context * octx) {
        case HTP_OP_MUL_MAT_ID:
            return op_matmul_id(octx);

+        case HTP_OP_MUL_MAT_QKV:
+            return op_matmul_qkv(octx);
+
+        case HTP_OP_MUL_MAT_FFN:
+            return op_matmul_ffn(octx);
+
        case HTP_OP_MUL:
        case HTP_OP_ADD:
        case HTP_OP_SUB:
@@ -762,8 +794,9 @@ static void prep_tensors(struct htp_context *ctx, struct htp_buf_desc *bufs, str
    }
 }

-static void proc_op_req(struct htp_ops_context * octx, struct htp_tensor *tens, uint32_t idx, struct htp_op_desc * op) {
+static int proc_op_req(struct htp_ops_context * octx, struct htp_tensor *tens, uint32_t idx, struct htp_op_desc * op) {
    memcpy(octx->op_params, op->params, sizeof(octx->op_params));
+    memcpy(octx->kernel_params, op->kernel_params, sizeof(octx->kernel_params));
    octx->flags = op->flags;
    octx->op    = op->opcode;

@@ -785,22 +818,41 @@ static void proc_op_req(struct htp_ops_context * octx, struct htp_tensor *tens,
            src->ne[0], src->ne[1], src->ne[3], src->ne[3]);
    }

-    // Prep output tensor
-    struct htp_tensor *dst = tens + op->dst;
+    // Prep output tensors
+    for (uint32_t i = 0; i < HTP_OP_MAX_OUTPUTS; i++) {
+        uint16_t dst_idx = op->dst[i];
+        if (dst_idx == 0xffff) {
+            octx->dsts[i] = NULL;
+            continue;
+        }
+        struct htp_tensor *dst = tens + dst_idx;
+        octx->dsts[i] = dst;

-    octx->dst = dst;
+        FARF(HIGH, "prep-dst[%u] #%u: data %p size %u : %u:%u:%u:%u", i, dst_idx, (void*) dst->data, dst->size,
+            dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]);
+    }

-    FARF(HIGH, "prep-dst #%u: data %p size %u : %u:%u:%u:%u", op->dst, (void*) dst->data, dst->size,
-        dst->ne[0], dst->ne[1], dst->ne[3], dst->ne[3]);
+    int status = execute_op(octx);

-    (void) execute_op(octx);
+    octx->src0_spad.src = NULL;
+    octx->src1_spad.src = NULL;
+    octx->src2_spad.src = NULL;
+    octx->src3_spad.src = NULL;
+    octx->dst_spad.src  = NULL;

    // flush buffers on output
-    hex_l2flush((void *) dst->data, dst->size);
-    dst->flags |= HTP_TENSOR_FLUSHED;
+    for (uint32_t i = 0; i < HTP_OP_MAX_OUTPUTS; i++) {
+        if (octx->dsts[i]) {
+            struct htp_tensor *dst = (struct htp_tensor *)octx->dsts[i];
+            hex_l2flush((void *) dst->data, dst->size);
+            dst->flags |= HTP_TENSOR_FLUSHED;

-    FARF(HIGH, "post-dst #%u: data %p size %u : %u:%u:%u:%u", op->dst, (void*) dst->data, dst->size,
-        dst->ne[0], dst->ne[1], dst->ne[3], dst->ne[3]);
+            FARF(HIGH, "post-dst[%u] #%u: data %p size %u : %u:%u:%u:%u", i, op->dst[i], (void*) dst->data, dst->size,
+                dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]);
+        }
+    }
+
+    return status;
 }

 #define DSPQUEUE_POLL_TIMEOUT_USEC 100
@@ -892,20 +944,26 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) {
            }
        }

+        int      op_status = HTP_STATUS_OK;
+        uint32_t op_wakeup = n_ops / 2; // half-way throgh the batch
+
        for (uint32_t i=0; i < n_ops; i++) {
            struct profile_data prof;

-            if (i == (n_ops-1)) {
-                // wake up the host before starting the last op
+            if (i == op_wakeup) {
                dspqueue_write_early_wakeup_noblock(queue, 0, 0);
            }

            profile_start(ctx->profiler, &prof);

-            proc_op_req(octx, tens, i, &ops[i]);
+            op_status = proc_op_req(octx, tens, i, &ops[i]);

            profile_stop(ctx->profiler, &prof);

+            if (op_status != HTP_STATUS_OK) {
+                break;
+            }
+
            if (ctx->profiler) {
                pds[i].opcode = ops[i].opcode;
                pds[i].usecs  = prof.usecs;
@@ -919,7 +977,7 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) {

        struct htp_opbatch_rsp rsp;
        rsp.id        = req.id;
-        rsp.status    = HTP_STATUS_OK;
+        rsp.status    = op_status;
        rsp.n_bufs    = n_bufs;
        rsp.n_tensors = n_tens;
        rsp.n_ops     = n_ops;
@@ -0,0 +1,508 @@
+#ifndef HTP_MATMUL_OPS_H
+#define HTP_MATMUL_OPS_H
+
+#include <stdint.h>
+#include <stddef.h>
+#include "htp-ops.h"
+#include "hex-fastdiv.h"
+#include "hex-common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// --- HMX Tile Constraints ---
+#define HTP_MM_HMX_TILE_N_COLS 32
+#define HTP_MM_HMX_TILE_N_ROWS 32
+#define HTP_MM_HMX_TILE_SIZE   (32 * 32 * sizeof(__fp16)) // 2048 bytes
+#define HTP_MM_HMX_TILE_N_ELMS 1024
+#define HTP_MM_HMX_MIN_NROWS   4
+
+// --- Weight Repacked Tile Sizes ---
+#define HTP_MM_WEIGHT_TILE_SIZE_Q4_0   576
+#define HTP_MM_WEIGHT_TILE_SIZE_Q4_1   640
+#define HTP_MM_WEIGHT_TILE_SIZE_Q8_0   1088
+#define HTP_MM_WEIGHT_TILE_SIZE_IQ4_NL 576
+#define HTP_MM_WEIGHT_TILE_SIZE_MXFP4  544
+
+// --- Weight Repacked Aligned Tile Sizes ---
+#define HTP_MM_WEIGHT_ALIGNED_TILE_SIZE_Q4_0   640
+#define HTP_MM_WEIGHT_ALIGNED_TILE_SIZE_Q4_1   640
+#define HTP_MM_WEIGHT_ALIGNED_TILE_SIZE_Q8_0   1152
+#define HTP_MM_WEIGHT_ALIGNED_TILE_SIZE_IQ4_NL 640
+#define HTP_MM_WEIGHT_ALIGNED_TILE_SIZE_MXFP4  640
+
+// --- Activation Tiled Block Sizes (including padding) ---
+#define HTP_MM_ACT_TILE_SIZE_Q8_0      1152
+#define HTP_MM_ACT_TILE_SIZE_Q8_1      1280
+
+#define HTP_MM_MAX_PREFETCH 16
+
+// --- Solver Cost Model Penalty Weights (HMX-specific) ---
+#define HTP_MM_HMX_COST_W_DEQUANT 3 // cost penalty for quantized weight loading/dequantization
+#define HTP_MM_HMX_COST_A_CONVERT 2 // cost penalty for activation loading/conversion
+
+// --- DMA Activation Transfer Configuration ---
+#define HTP_MM_DMA_ACT_ROWS_PER_STEP 2
+#define HTP_MM_DMA_ACT_MULTIPLIER    4
+
+enum htp_mm_kernel_type {
+    HTP_MM_KERNEL_UNSUPPORTED = 0,
+
+    // HMX paths
+    HTP_MM_KERNEL_HMX_2D,
+    HTP_MM_KERNEL_HMX_F16_BATCHED,
+
+    // HVX floating-point paths
+    HTP_MM_KERNEL_HVX_F16_F16_VTCM,
+    HTP_MM_KERNEL_HVX_F16_F16_DDR,
+    HTP_MM_KERNEL_HVX_F16_F32_DDR,
+
+    HTP_MM_KERNEL_HVX_F32_F32_VTCM,
+    HTP_MM_KERNEL_HVX_F32_F32_DDR,
+    HTP_MM_KERNEL_HVX_F32_F16_DDR,
+
+    // HVX quantized paths
+    HTP_MM_KERNEL_HVX_QUANT_ROW,      // standard row-wise parallel quantization
+    HTP_MM_KERNEL_HVX_QUANT_BLOCK,    // parallel block-wise quantization
+    HTP_MM_KERNEL_HVX_QUANT_ROW_FLAT, // row-wise fallback flat quantization
+};
+
+// Op-specific struct for precomputed matmul params
+struct htp_mm_kernel_params {
+    int32_t  kernel_type;        // enum htp_mm_kernel_type
+    int32_t  pipeline;           // 1 = pipelined execution, 0 = standard
+    int32_t  m_chunk;            // Row chunk size (M chunk)
+    int32_t  n_chunk;            // Col chunk size (N chunk)
+    int32_t  n_threads;          // Number of threads to spawn
+    int32_t  n_act_threads;      // Number of threads for activation preparation
+    int32_t  n_hmx;              // 1 = use HMX, 0 = use HVX
+    int32_t  n_prefetch;         // Prefetch lookahead buffers/rows in VTCM
+    int32_t  tile_size;          // Weight tile size
+    int32_t  aligned_tile_size;  // Aligned weight tile size (padded to 128)
+    int32_t  src1_row_size;      // Row size for quantized activation
+    int32_t  vtcm_size;          // Total required scratchpad size in VTCM
+    int32_t  vtcm_src0_size;     // src0 scratchpad size in VTCM
+    int32_t  vtcm_src1_size;     // src1 scratchpad size in VTCM
+    int32_t  vtcm_src2_size;     // src2 scratchpad size in VTCM (fused only)
+    int32_t  vtcm_src3_size;     // src3 scratchpad size in VTCM (fused only)
+    int32_t  vtcm_dst_size;      // dst scratchpad size in VTCM
+
+    // Precomputed division values
+    struct fastdiv_values div_ne12_ne1;
+    struct fastdiv_values div_ne1;
+    struct fastdiv_values div_r2;
+    struct fastdiv_values div_r3;
+    struct fastdiv_values div_ne11;
+};
+
+#if defined(__cplusplus)
+static_assert(sizeof(struct htp_mm_kernel_params) <= 128, "htp_matmul_kernel_params is too large for kernel_params blob");
+#else
+_Static_assert(sizeof(struct htp_mm_kernel_params) <= 128, "htp_matmul_kernel_params is too large for kernel_params blob");
+#endif
+
+struct mmid_row_mapping {
+    uint32_t i1;
+    uint32_t i2;
+};
+
+// Search for optimal (mc, nc) chunk sizes within VTCM budget.
+static inline int htp_mm_hmx_compute_chunks(size_t   vtcm_total,
+                              size_t   overhead,
+                              size_t   per_n_cost,
+                              size_t   per_m_cost,
+                              size_t   per_mn_cost,
+                              size_t   m,
+                              size_t   n,
+                              size_t   m_block_cost,
+                              size_t   n_block_cost,
+                              size_t * m_chunk_out,
+                              size_t * n_chunk_out,
+                              size_t * total_out) {
+    if (m == 0 || n == 0) return -1;
+    if (vtcm_total <= overhead) return -1;
+    if (per_n_cost == 0 || per_m_cost == 0 || per_mn_cost == 0) return -1;
+
+    const size_t usable = vtcm_total - overhead;
+
+    size_t best_cost = SIZE_MAX;
+    size_t best_mn   = 0;
+    size_t best_m = 0, best_n = 0;
+
+    const size_t n_max = hex_align_down((size_t)n, HTP_MM_HMX_TILE_N_COLS);
+    for (size_t nc = n_max; nc >= HTP_MM_HMX_TILE_N_COLS; nc -= HTP_MM_HMX_TILE_N_COLS) {
+        size_t n_fixed = 0, ncmn = 0, mc_denom = 0;
+        if (hex_mul_overflow(nc, per_n_cost, &n_fixed)) continue;
+        if (n_fixed >= usable) goto next_nc;
+
+        if (hex_mul_overflow(nc, per_mn_cost, &ncmn)) goto next_nc;
+        if (hex_add_overflow(per_m_cost, ncmn, &mc_denom) || mc_denom == 0) goto next_nc;
+
+        {
+            size_t remain = usable - n_fixed;
+            size_t mc = remain / mc_denom;
+            mc = hex_align_down(mc, HTP_MM_HMX_TILE_N_ROWS);
+            mc = hex_smin(mc, m);
+
+            if (mc == 0) {
+                goto next_nc;
+            }
+
+            size_t mblocks = ((size_t) m + mc - 1) / mc;
+            size_t nblocks = ((size_t) n + nc - 1) / nc;
+            size_t cost    = mblocks * m_block_cost + nblocks * n_block_cost;
+            size_t mn      = mc * nc;
+            if (cost < best_cost || (cost == best_cost && mn > best_mn)) {
+                best_cost = cost;
+                best_mn   = mn;
+                best_m    = mc;
+                best_n    = nc;
+            }
+        }
+
+next_nc:
+        if (nc == HTP_MM_HMX_TILE_N_COLS) break;  // avoid size_t underflow
+    }
+
+    if (best_m == 0 || best_n == 0) return -1;
+
+    // Compute exact total (with overflow checks)
+    size_t t0 = 0, t1 = 0, t2 = 0, mn = 0, total = 0;
+    if (hex_mul_overflow(best_n, per_n_cost, &t0)) return -1;
+    if (hex_mul_overflow(best_m, per_m_cost, &t1)) return -1;
+    if (hex_mul_overflow(best_m, best_n, &mn))     return -1;
+    if (hex_mul_overflow(mn, per_mn_cost, &t2))    return -1;
+    if (hex_add_overflow(t0, t1, &total))          return -1;
+    if (hex_add_overflow(total, t2, &total))       return -1;
+    if (hex_add_overflow(total, overhead, &total)) return -1;
+
+    *m_chunk_out = best_m;
+    *n_chunk_out = best_n;
+    *total_out   = total;
+    return 0;
+}
+
+// --- Tile Size Helpers ---
+static inline uint32_t htp_mm_get_weight_tile_size(int weight_type) {
+    switch (weight_type) {
+        case HTP_TYPE_Q4_0:
+        case HTP_TYPE_IQ4_NL:
+            return HTP_MM_WEIGHT_TILE_SIZE_Q4_0;
+        case HTP_TYPE_Q4_1:
+            return HTP_MM_WEIGHT_TILE_SIZE_Q4_1;
+        case HTP_TYPE_Q8_0:
+            return HTP_MM_WEIGHT_TILE_SIZE_Q8_0;
+        case HTP_TYPE_MXFP4:
+            return HTP_MM_WEIGHT_TILE_SIZE_MXFP4;
+        default:
+            return 0;
+    }
+}
+
+static inline uint32_t htp_mm_get_weight_aligned_tile_size(int weight_type) {
+    switch (weight_type) {
+        case HTP_TYPE_Q4_0:
+        case HTP_TYPE_IQ4_NL:
+            return HTP_MM_WEIGHT_ALIGNED_TILE_SIZE_Q4_0;
+        case HTP_TYPE_Q4_1:
+            return HTP_MM_WEIGHT_ALIGNED_TILE_SIZE_Q4_1;
+        case HTP_TYPE_Q8_0:
+            return HTP_MM_WEIGHT_ALIGNED_TILE_SIZE_Q8_0;
+        case HTP_TYPE_MXFP4:
+            return HTP_MM_WEIGHT_ALIGNED_TILE_SIZE_MXFP4;
+        default:
+            return 0;
+    }
+}
+
+// --- Activation/Row Size Helpers ---
+static inline size_t htp_mm_q8_0_tiled_row_size(uint32_t ne) {
+    const uint32_t ne_padded = ((ne + 127) / 128) * 128;
+    const uint32_t nb_32 = ne_padded / 32;
+    return nb_32 * HTP_MM_ACT_TILE_SIZE_Q8_0;
+}
+
+static inline size_t htp_mm_q8_1_tiled_row_size(uint32_t ne) {
+    const uint32_t ne_padded = ((ne + 127) / 128) * 128;
+    const uint32_t nb_32 = ne_padded / 32;
+    return nb_32 * HTP_MM_ACT_TILE_SIZE_Q8_1;
+}
+
+static inline size_t htp_mm_q8_0_flat_row_size(uint32_t ne) {
+    const uint32_t quants_size = hex_align_up(ne, 128);
+    const uint32_t num_scales = (ne + 31) / 32;
+    const uint32_t scales_size = hex_align_up(num_scales * 2, 128);
+    return quants_size + scales_size;
+}
+
+static inline size_t htp_mm_q8_1_flat_row_size(uint32_t ne) {
+    const uint32_t quants_size = hex_align_up(ne, 128);
+    const uint32_t num_scales = (ne + 31) / 32;
+    const uint32_t scales_size = hex_align_up(num_scales * 4, 128);
+    return quants_size + scales_size;
+}
+
+static inline size_t htp_mm_get_tiled_row_stride(int weight_type, uint32_t k) {
+    uint32_t nb = (k + QK_Q4_0_TILED - 1) / QK_Q4_0_TILED;
+    switch (weight_type) {
+        case HTP_TYPE_Q4_0:
+        case HTP_TYPE_IQ4_NL:
+        case HTP_TYPE_Q4_1:
+        case HTP_TYPE_Q8_0:
+        case HTP_TYPE_MXFP4:
+            return (size_t) nb * htp_mm_get_weight_tile_size(weight_type);
+        case HTP_TYPE_F16:
+            return (size_t) k * sizeof(__fp16);
+        case HTP_TYPE_F32:
+            return (size_t) k * sizeof(float);
+        default:
+            return 0;
+    }
+}
+
+static inline size_t htp_mm_round_up(size_t n, size_t m) {
+    return ((n + m - 1) / m) * m;
+}
+
+static inline bool htp_mm_hmx_pipeline(uint32_t m) {
+    return m > 32;
+}
+
+static inline void htp_mm_hmx_get_2d_chunk_costs(
+    int wtype, uint32_t k, bool pipeline, uint32_t aligned_tile_size,
+    size_t * size_per_n_out, size_t * size_per_m_out, size_t * size_per_mn_out
+) {
+    const bool is_quant = (wtype != HTP_TYPE_F16 && wtype != HTP_TYPE_F32);
+    const size_t row_stride = htp_mm_get_tiled_row_stride(wtype, k);
+    const size_t vec_dot_size = k * sizeof(uint16_t);
+    const uint32_t n_k_tiles = k / HTP_MM_HMX_TILE_N_COLS;
+    const size_t qweight_row_stride = is_quant ? (size_t)(n_k_tiles * aligned_tile_size) / 32 : 0;
+
+    *size_per_n_out = (pipeline ? 2 : 1) * (is_quant ? qweight_row_stride : row_stride) +
+                      (pipeline ? 2 * vec_dot_size : vec_dot_size);
+    *size_per_m_out = vec_dot_size;
+    *size_per_mn_out = (pipeline ? 2 : 1) * sizeof(uint16_t);
+}
+
+static inline void htp_mm_hmx_get_batched_chunk_costs(
+    uint32_t k, uint32_t group_size,
+    size_t * size_per_n_out, size_t * size_per_m_out, size_t * size_per_mn_out
+) {
+    const size_t vec_dot_size = k * sizeof(uint16_t);
+    *size_per_n_out = 3 * vec_dot_size;
+    *size_per_m_out = group_size * vec_dot_size;
+    *size_per_mn_out = sizeof(uint16_t);
+}
+
+static inline size_t htp_mm_hmx_get_2d_vtcm_size(
+    int wtype, uint32_t k, size_t mc, size_t nc, bool pipeline, uint32_t act_threads, uint32_t aligned_tile_size
+) {
+    const uint32_t n_k_tiles = k / HTP_MM_HMX_TILE_N_COLS;
+    const bool is_quant = (wtype != HTP_TYPE_F16 && wtype != HTP_TYPE_F32);
+    const size_t row_stride = htp_mm_get_tiled_row_stride(wtype, k);
+    const size_t vec_dot_size = k * sizeof(uint16_t);
+
+    const size_t act_f32_size = htp_mm_round_up(act_threads * 4 * k * sizeof(float), HTP_MM_HMX_TILE_SIZE);
+    size_t weight_area_size = is_quant
+        ? htp_mm_round_up((nc / 32) * n_k_tiles * aligned_tile_size, HTP_MM_HMX_TILE_SIZE)
+        : htp_mm_round_up(nc * row_stride, HTP_MM_HMX_TILE_SIZE);
+    if (pipeline) {
+        weight_area_size *= 2;
+    }
+    const size_t act_area_size    = htp_mm_round_up(mc * vec_dot_size, HTP_MM_HMX_TILE_SIZE);
+    const size_t output_area_size = htp_mm_round_up(mc * nc * sizeof(uint16_t), HTP_MM_HMX_TILE_SIZE);
+
+    size_t scratch0_size = htp_mm_round_up(nc * vec_dot_size, HTP_MM_HMX_TILE_SIZE);
+    size_t scratch1_size = pipeline ? scratch0_size : 0;
+    size_t scratch2_size = pipeline ? output_area_size : 0;
+
+    return weight_area_size + act_area_size + act_f32_size + output_area_size +
+           scratch0_size + scratch1_size + scratch2_size + 256;
+}
+
+static inline size_t htp_mm_hmx_get_batched_vtcm_size(
+    int wtype, uint32_t k, size_t mc, size_t nc, uint32_t group_size, bool use_dma_activation, bool pipeline, uint32_t act_threads) {
+    (void)wtype;
+    (void)pipeline;
+    const size_t vec_dot_size     = k * sizeof(uint16_t);
+    const size_t f32_scratch_size = use_dma_activation
+        ? htp_mm_round_up(act_threads * 4 * k * sizeof(float), HTP_MM_HMX_TILE_SIZE) : 0;
+
+    const size_t act_head_stride   = mc * k;
+    const size_t weight_area_size  = htp_mm_round_up(nc * vec_dot_size, HTP_MM_HMX_TILE_SIZE);
+    const size_t act_area_size     = htp_mm_round_up(group_size * act_head_stride * sizeof(uint16_t), HTP_MM_HMX_TILE_SIZE);
+    const size_t output_area_size  = htp_mm_round_up(group_size * mc * nc * sizeof(uint16_t), HTP_MM_HMX_TILE_SIZE);
+    const size_t scratch_area_size = htp_mm_round_up(nc * vec_dot_size, HTP_MM_HMX_TILE_SIZE);
+
+    return weight_area_size + act_area_size + output_area_size +
+           2 * scratch_area_size + 256 + f32_scratch_size;
+}
+
+static inline size_t htp_mm_hvx_get_vtcm_sizes(
+    int kernel_type,
+    int wtype,
+    uint32_t ne10,       // k
+    uint32_t src1_nrows, // m_total (or act_nrows)
+    uint32_t n_threads,
+    size_t dst_row_size,
+    size_t src0_row_size,
+    size_t src1_row_size,
+    uint32_t n_prefetch,
+    size_t * vtcm_src0_size_out,
+    size_t * vtcm_src1_size_out,
+    size_t * vtcm_dst_size_out
+) {
+    size_t vtcm_src0_size = 0;
+    size_t vtcm_src1_size = 0;
+    size_t vtcm_dst_size  = 0;
+
+    const bool is_repack = (wtype == HTP_TYPE_Q4_0 || wtype == HTP_TYPE_Q4_1 ||
+                            wtype == HTP_TYPE_Q8_0 || wtype == HTP_TYPE_IQ4_NL ||
+                            wtype == HTP_TYPE_MXFP4);
+
+    const size_t src0_row_size_padded = htp_mm_round_up(src0_row_size, 128);
+    const size_t dst_nrows = (src1_nrows > 1) ? 0 : 1;
+
+    switch (kernel_type) {
+        case HTP_MM_KERNEL_HVX_F16_F16_VTCM: {
+            size_t f16_src1_row_size = htp_mm_round_up(ne10 * 2, 128);
+            vtcm_src1_size = htp_mm_round_up(f16_src1_row_size * src1_nrows, 256);
+            vtcm_src0_size = htp_mm_round_up(n_prefetch * src0_row_size_padded, 256) * n_threads;
+            vtcm_dst_size  = dst_nrows > 0 ? htp_mm_round_up(dst_row_size, 128) * n_threads : 0;
+            break;
+        }
+        case HTP_MM_KERNEL_HVX_F16_F32_DDR:
+        case HTP_MM_KERNEL_HVX_F16_F16_DDR:
+        case HTP_MM_KERNEL_HVX_F32_F32_DDR:
+        case HTP_MM_KERNEL_HVX_F32_F16_DDR: {
+            vtcm_src0_size = htp_mm_round_up(n_prefetch * src0_row_size, 256) * n_threads;
+            vtcm_src1_size = htp_mm_round_up(n_prefetch * src1_row_size, 256) * n_threads;
+            vtcm_dst_size  = dst_nrows > 0 ? htp_mm_round_up(dst_row_size, 128) * n_threads : 0;
+            break;
+        }
+        case HTP_MM_KERNEL_HVX_F32_F32_VTCM: {
+            size_t f32_src1_row_size = htp_mm_round_up(ne10 * 4, 128);
+            vtcm_src1_size = htp_mm_round_up(f32_src1_row_size * src1_nrows, 256);
+            vtcm_src0_size = htp_mm_round_up(n_prefetch * src0_row_size_padded, 256) * n_threads;
+            vtcm_dst_size  = dst_nrows > 0 ? htp_mm_round_up(dst_row_size, 128) * n_threads : 0;
+            break;
+        }
+        case HTP_MM_KERNEL_HVX_QUANT_BLOCK:
+        case HTP_MM_KERNEL_HVX_QUANT_ROW: {
+            size_t q_src1_row_size = (wtype == HTP_TYPE_Q4_1) ? htp_mm_q8_1_tiled_row_size(ne10) : htp_mm_q8_0_tiled_row_size(ne10);
+
+            vtcm_dst_size  = dst_nrows > 0 ? htp_mm_round_up(dst_row_size, 128) : 0;
+            vtcm_src0_size = htp_mm_round_up(n_prefetch * src0_row_size_padded, 256);
+            vtcm_src1_size = htp_mm_round_up(q_src1_row_size * src1_nrows, 256);
+
+            // src0 spad is also used in dynamic quantizer to store padded src1 rows
+            size_t src1_row_size_padded = htp_mm_round_up(q_src1_row_size, QK_Q8_0_TILED * sizeof(float));
+            if (vtcm_src0_size < src1_row_size_padded) {
+                vtcm_src0_size = src1_row_size_padded;
+            }
+
+            vtcm_src0_size = vtcm_src0_size * n_threads;
+            vtcm_dst_size  = vtcm_dst_size * n_threads;
+
+            if (is_repack) {
+                uint32_t aligned_tile_size = htp_mm_get_weight_aligned_tile_size(wtype);
+                uint32_t n_k_tiles = ne10 / 32;
+                uint32_t tile_row_size = n_k_tiles * aligned_tile_size;
+                size_t repacked_vtcm_size = htp_mm_round_up(n_prefetch * tile_row_size, 256);
+                if (repacked_vtcm_size < src1_row_size_padded) {
+                    repacked_vtcm_size = src1_row_size_padded;
+                }
+                vtcm_src0_size = repacked_vtcm_size * n_threads;
+            }
+            break;
+        }
+        case HTP_MM_KERNEL_HVX_QUANT_ROW_FLAT: {
+            size_t q_src1_row_size = (wtype == HTP_TYPE_Q4_1) ? htp_mm_q8_1_flat_row_size(ne10) : htp_mm_q8_0_flat_row_size(ne10);
+
+            vtcm_dst_size  = dst_nrows > 0 ? htp_mm_round_up(dst_row_size, 128) : 0;
+            vtcm_src0_size = htp_mm_round_up(n_prefetch * src0_row_size_padded, 256);
+            vtcm_src1_size = htp_mm_round_up(q_src1_row_size * src1_nrows, 256);
+
+            size_t src1_row_size_padded = htp_mm_round_up(q_src1_row_size, 256);
+            if (vtcm_src0_size < src1_row_size_padded) {
+                vtcm_src0_size = src1_row_size_padded;
+            }
+
+            vtcm_src0_size = vtcm_src0_size * n_threads;
+            vtcm_dst_size  = vtcm_dst_size * n_threads;
+
+            if (is_repack) {
+                uint32_t aligned_tile_size = htp_mm_get_weight_aligned_tile_size(wtype);
+                uint32_t n_k_tiles = ne10 / 32;
+                uint32_t tile_row_size = n_k_tiles * aligned_tile_size;
+                size_t repacked_vtcm_size = htp_mm_round_up(n_prefetch * tile_row_size, 256);
+                if (repacked_vtcm_size < src1_row_size_padded) {
+                    repacked_vtcm_size = src1_row_size_padded;
+                }
+                vtcm_src0_size = repacked_vtcm_size * n_threads;
+            }
+            break;
+        }
+        default:
+            break;
+    }
+
+    *vtcm_src0_size_out = vtcm_src0_size;
+    *vtcm_src1_size_out = vtcm_src1_size;
+    *vtcm_dst_size_out  = vtcm_dst_size;
+
+    return vtcm_src0_size + vtcm_src1_size + vtcm_dst_size;
+}
+
+static inline size_t htp_mm_hvx_id_get_vtcm_sizes(
+    int wtype,
+    uint32_t ne10,                // k
+    uint32_t src1_nrows,
+    uint32_t n_threads,
+    size_t src0_row_size,    // nb01
+    uint32_t n_prefetch,
+    size_t * vtcm_src0_size_out,
+    size_t * vtcm_src1_size_out
+) {
+    const bool is_repack = (wtype == HTP_TYPE_Q4_0 || wtype == HTP_TYPE_Q4_1 ||
+                            wtype == HTP_TYPE_Q8_0 || wtype == HTP_TYPE_IQ4_NL ||
+                            wtype == HTP_TYPE_MXFP4);
+
+    const size_t src0_row_size_padded = htp_mm_round_up(src0_row_size, 128);
+    const size_t src1_row_size = (wtype == HTP_TYPE_Q4_1) ? htp_mm_q8_1_tiled_row_size(ne10)
+                                                          : htp_mm_q8_0_tiled_row_size(ne10);
+
+    size_t src0_sz_per_thread = htp_mm_round_up(n_prefetch * src0_row_size_padded, 256);
+    size_t src1_sz            = htp_mm_round_up(src1_row_size * src1_nrows, 256);
+
+    // src0 spad also holds temporary transposed src1 columns during dynamic quantization.
+    const size_t src1_row_size_padded = htp_mm_round_up(src1_row_size, QK_Q8_0_TILED * sizeof(float));
+    if (src0_sz_per_thread < src1_row_size_padded) {
+        src0_sz_per_thread = src1_row_size_padded;
+    }
+
+    if (is_repack) {
+        const uint32_t aligned_tile_size = htp_mm_get_weight_aligned_tile_size(wtype);
+        const uint32_t n_k_tiles    = ne10 / 32;
+        const uint32_t tile_row_size = n_k_tiles * aligned_tile_size;
+        size_t repacked_vtcm_size = htp_mm_round_up(n_prefetch * tile_row_size, 256);
+        if (repacked_vtcm_size < src1_row_size_padded) {
+            repacked_vtcm_size = src1_row_size_padded;
+        }
+        src0_sz_per_thread = repacked_vtcm_size;
+    }
+
+    const size_t vtcm_src0_size = src0_sz_per_thread * n_threads;
+
+    *vtcm_src0_size_out = vtcm_src0_size;
+    *vtcm_src1_size_out = src1_sz;
+
+    return vtcm_src0_size + src1_sz;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // HTP_MATMUL_OPS_H
@@ -14,8 +14,6 @@ Drivers_Dir = 13
 1 = %DiskId%

 [SourceDisksFiles]
-libggml-htp-v68.so = 1
-libggml-htp-v69.so = 1
 libggml-htp-v73.so = 1
 libggml-htp-v75.so = 1
 libggml-htp-v79.so = 1
@@ -28,8 +26,6 @@ ExcludeFromSelect = *
 CopyFiles=Drivers_Dir

 [Drivers_Dir]
-libggml-htp-v68.so,,,0x10 ;COPYFLG_NO_OVERWRITE
-libggml-htp-v69.so,,,0x10 ;COPYFLG_NO_OVERWRITE
 libggml-htp-v73.so,,,0x10 ;COPYFLG_NO_OVERWRITE
 libggml-htp-v75.so,,,0x10 ;COPYFLG_NO_OVERWRITE
 libggml-htp-v79.so,,,0x10 ;COPYFLG_NO_OVERWRITE
@@ -850,6 +850,7 @@ struct ggml_backend_opencl_context {
        ref_count--;
        if (ref_count == 0) {
 #ifdef GGML_OPENCL_PROFILING
+            flush_profiling_batch();
            write_profiling_info();
            profiling_results.clear();
 #endif
@@ -10152,14 +10153,8 @@ static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const
    float eps;
    memcpy(&eps, dst->op_params, sizeof(float));

-    const int ne00 = src0 ? src0->ne[0] : 0;
-    const int ne01 = src0 ? src0->ne[1] : 0;
-    const int ne02 = src0 ? src0->ne[2] : 0;
-    const int ne03 = src0 ? src0->ne[3] : 0;
-
-    const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
-    const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
-    const cl_ulong nb03 = src0 ? src0->nb[3] : 0;
+    GGML_TENSOR_LOCALS(int,      ne0, src0, ne);
+    GGML_TENSOR_LOCALS(cl_ulong, nb0, src0, nb);

    const int nth = MIN(64, ne00);

@@ -10173,11 +10168,12 @@ static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const
    CL_CHECK(clSetKernelArg(kernel,  5, sizeof(int),       &ne01));
    CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),       &ne02));
    CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),       &ne03));
-    CL_CHECK(clSetKernelArg(kernel,  8, sizeof(cl_ulong),  &nb01));
-    CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_ulong),  &nb02));
-    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong),  &nb03));
-    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(float),     &eps));
-    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float)*nth, NULL));
+    CL_CHECK(clSetKernelArg(kernel,  8, sizeof(cl_ulong),  &nb00));
+    CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_ulong),  &nb01));
+    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong),  &nb02));
+    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong),  &nb03));
+    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float),     &eps));
+    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(float)*nth, NULL));

    size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
    size_t local_work_size[] = {(size_t)nth, 1, 1};
@@ -24,6 +24,7 @@ kernel void kernel_norm(
        int ne01,
        int ne02,
        int ne03,
+        ulong nb00,
        ulong nb01,
        ulong nb02,
        ulong nb03,
@@ -43,7 +44,8 @@ kernel void kernel_norm(
    // parallel sum
    sum[get_local_id(0)] = 0.0f;
    for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
-        sum[get_local_id(0)] += x[i00];
+        // this kernel handles float, nb00/4 translates byte offset to element offset
+        sum[get_local_id(0)] += x[i00*nb00/4];
    }
    // reduce
    barrier(CLK_LOCAL_MEM_FENCE);
@@ -60,7 +62,8 @@ kernel void kernel_norm(
    global float * y = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
    sum[get_local_id(0)] = 0.0f;
    for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
-        y[i00] = x[i00] - mean;
+        // this kernel handles float, nb00/4 translates byte offset to element offset
+        y[i00] = x[i00*nb00/4] - mean;
        sum[get_local_id(0)] += y[i00] * y[i00];
    }

@@ -103,8 +103,8 @@ void ggml_sycl_op_conv_3d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    // allocate packed arrays: A_packed (k x m), B_packed (k x n)
    ggml_sycl_pool_alloc<float> A_packed_alloc(ctx.pool());
    ggml_sycl_pool_alloc<float> B_packed_alloc(ctx.pool());
-    A_packed_alloc.alloc((size_t) knl_n_total * patch_total * sizeof(float));
-    B_packed_alloc.alloc((size_t) knl_n_total * oc * sizeof(float));
+    A_packed_alloc.alloc((size_t) knl_n_total * patch_total);
+    B_packed_alloc.alloc((size_t) knl_n_total * oc);

    float * A_packed = A_packed_alloc.get();
    float * B_packed = B_packed_alloc.get();
@@ -115,10 +115,16 @@ void ggml_sycl_op_conv_3d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {

    // Combined kernel: im2col -> pack A, and pack B simultaneously
    const char * src1_base = (const char *) src1->data;
+    const char * src0_base = (const char *) src0->data;
    const int64_t src1_nb0 = src1->nb[0];
    const int64_t src1_nb1 = src1->nb[1];
    const int64_t src1_nb2 = src1->nb[2];
    const int64_t src1_nb3 = src1->nb[3];
+    const int64_t src1_w = src1->ne[0];
+    const int64_t src1_h = src1->ne[1];
+    const int64_t src1_d = src1->ne[2];
+
+    const bool src0_is_f32 = (src0->type == GGML_TYPE_F32);

    // Compute correct strides for src0 as (knl_n_total, oc) matrix
    const int64_t src0_packed_nb0 = kernel_type_size;
@@ -165,7 +171,7 @@ void ggml_sycl_op_conv_3d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
        const int64_t sz = dst_z * s2 + kz * d2 - p2;

        float val = 0.0f;
-        if (sx >= 0 && sx < src1->ne[0] && sy >= 0 && sy < src1->ne[1] && sz >= 0 && sz < src1->ne[2]) {
+        if (sx >= 0 && sx < src1_w && sy >= 0 && sy < src1_h && sz >= 0 && sz < src1_d) {
            const int64_t channel_idx = batch_idx * c + ic;
            const char * ptr = src1_base + sx * src1_nb0 + sy * src1_nb1 + sz * src1_nb2 + channel_idx * src1_nb3;
            val = *(const float *) ptr;
@@ -184,9 +190,9 @@ void ggml_sycl_op_conv_3d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {

        const int64_t row = t % k;
        const int64_t col = t / k;
-        const char * src_ptr = (const char *) src0->data + row * src0_packed_nb0 + col * src0_packed_nb1;
+        const char * src_ptr = src0_base + row * src0_packed_nb0 + col * src0_packed_nb1;
        float v;
-        if (src0->type == GGML_TYPE_F32) {
+        if (src0_is_f32) {
            v = *(const float *) src_ptr;
        } else {
            v = sycl::vec<sycl::half, 1>(*(const sycl::half *) src_ptr).convert<float, sycl::rounding_mode::automatic>()[0];
@@ -5859,6 +5859,250 @@ static ggml_backend_dev_t ggml_backend_sycl_reg_get_device(ggml_backend_reg_t re
    return ctx->devices[index];
 }

+// ==========================================================================
+// Tensor parallelism (--split-mode tensor) for the SYCL backend.
+//
+// The meta-backend invokes these three entry points via get_proc_address:
+//   * ggml_backend_sycl_comm_init             - one-time per-graph setup
+//   * ggml_backend_sycl_comm_allreduce_tensor - per-allreduce step
+//   * ggml_backend_sycl_comm_free             - tear-down
+//
+// For N=2 (dual-GPU), this is a degenerate ring allreduce with dual paths
+// chosen by tensor size:
+//
+//   * Small (nelem < 32K): FP32 direct memcpy + per-device ADD
+//     kernel. The kernel depends_on() its corresponding memcpy event
+//     so it doesn't read partial data. Both devices run in parallel.
+//
+//   * Large (nelem >= 32K): BF16-compressed. Each device compresses
+//     its FP32 partial to BF16 locally, cross-device memcpys
+//     to the peer (half the PCI bandwidth), where it is decompressed
+//     and added into the local FP32 partial. 6 SYCL submissions per
+//     allreduce (2 compress + 2 memcpy + 2 decompress-add) vs the
+//     4 for the small path, but the bandwidth saving > 6 GB/s PCIe x 2
+//     dominates for larger tensors.
+//
+// Storage: A persistent uint8_t buffer per device, sized to
+// 4 * nelem bytes. Both paths reinterpret the same bytes (small path
+// as nelem floats; large path as outbox + inbox = 2*nelem uint16_t
+// each, using the full 4*nelem byte budget either way). Single
+// alloc+free per device keeps the SYCL pool's strict-LIFO invariant
+// trivial.
+//
+// For non-(N=2 FP32 contiguous) cases, comm_init or comm_allreduce_tensor
+// returns null/false, causing the meta-backend to use its generic
+// butterfly all-reduce fallback.
+// ==========================================================================
+
+struct ggml_backend_sycl_comm_context {
+    std::vector<ggml_backend_t> backends;
+    // ONE persistent per-device byte buffer, 4*nelem bytes.  Both the
+    // FP32 small-tensor path and the BF16 large-tensor path share it
+    // by reinterpreting.
+    std::unique_ptr<ggml_sycl_pool_alloc<uint8_t>> buf0;
+    std::unique_ptr<ggml_sycl_pool_alloc<uint8_t>> buf1;
+    int64_t buf_nelem = 0;
+};
+
+void * ggml_backend_sycl_comm_init(ggml_backend_t * backends, size_t n_backends) try {
+    for (size_t i = 0; i < n_backends; ++i) {
+        if (!ggml_backend_is_sycl(backends[i])) {
+            return nullptr;
+        }
+    }
+
+    // Initial version: N=2 only. For N!=2, returning null makes the
+    // meta-backend skip this backend-specific allreduce entirely.
+    if (n_backends != 2) {
+        return nullptr;
+    }
+
+    auto * ctx = new ggml_backend_sycl_comm_context;
+    ctx->backends.assign(backends, backends + n_backends);
+    auto * sctx0 = (ggml_backend_sycl_context *) backends[0]->context;
+    auto * sctx1 = (ggml_backend_sycl_context *) backends[1]->context;
+    ctx->buf0 = std::make_unique<ggml_sycl_pool_alloc<uint8_t>>(sctx0->pool());
+    ctx->buf1 = std::make_unique<ggml_sycl_pool_alloc<uint8_t>>(sctx1->pool());
+    return ctx;
+}
+catch (const sycl::exception &) { return nullptr; }
+catch (...)                     { return nullptr; }
+
+void ggml_backend_sycl_comm_free(void * comm_ctx_v) {
+    auto * comm_ctx = static_cast<ggml_backend_sycl_comm_context *>(comm_ctx_v);
+    if (comm_ctx == nullptr) {
+        return;
+    }
+
+    // Sync both per-device queues so the pool_alloc destructors don't
+    // return memory still in use by the last kernel.
+    if (comm_ctx->backends.size() == 2) {
+        auto * sctx0 = (ggml_backend_sycl_context *) comm_ctx->backends[0]->context;
+        auto * sctx1 = (ggml_backend_sycl_context *) comm_ctx->backends[1]->context;
+        try {
+            sctx0->stream()->wait();
+            sctx1->stream()->wait();
+        } catch (...) { /* best effort during shutdown */ }
+    }
+
+    delete comm_ctx;
+}
+
+bool ggml_backend_sycl_comm_allreduce_tensor(void * comm_ctx_v, struct ggml_tensor ** tensors) try {
+    if (comm_ctx_v == nullptr) {
+        return false;
+    }
+
+    auto * comm_ctx = static_cast<ggml_backend_sycl_comm_context *>(comm_ctx_v);
+    const size_t n_backends = comm_ctx->backends.size();
+
+    // Fast path: N=2, F32/F16, contiguous, matching shapes.
+    if (n_backends != 2) {
+        return false;
+    }
+    // Accept F32 or F16 inputs natively (types must match). F16 takes the
+    // direct 2-byte memcpy + add path below; other types return false so the
+    // meta-backend uses its generic all-reduce.
+    if (tensors[0]->type != tensors[1]->type) {
+        return false;
+    }
+    if (tensors[0]->type != GGML_TYPE_F32 && tensors[0]->type != GGML_TYPE_F16) {
+        return false;
+    }
+    if (!ggml_is_contiguous(tensors[0]) || !ggml_is_contiguous(tensors[1])) {
+        return false;
+    }
+    if (ggml_nelements(tensors[0]) != ggml_nelements(tensors[1])) {
+        return false;
+    }
+
+    const int64_t nelem  = ggml_nelements(tensors[0]);
+    const size_t  nbytes = ggml_nbytes(tensors[0]);
+    if (nelem == 0) {
+        return true;
+    }
+
+    auto * ctx0 = (ggml_backend_sycl_context *) comm_ctx->backends[0]->context;
+    auto * ctx1 = (ggml_backend_sycl_context *) comm_ctx->backends[1]->context;
+    queue_ptr q0 = ctx0->stream();
+    queue_ptr q1 = ctx1->stream();
+
+    // Grow per-device byte buffers if needed (4 * nelem bytes each).
+    if (comm_ctx->buf_nelem < nelem) {
+        comm_ctx->buf0->realloc(nelem * 4);
+        comm_ctx->buf1->realloc(nelem * 4);
+        comm_ctx->buf_nelem = nelem;
+    }
+    uint8_t * buf0 = comm_ctx->buf0->get();
+    uint8_t * buf1 = comm_ctx->buf1->get();
+
+    // F16 native path: direct 2-byte cross-device copy + add, skipping the
+    // F32 round-trip the meta-backend fallback would force. Cross-device copies
+    // go through dev2dev_memcpy because the two devices are in separate SYCL
+    // contexts (a raw peer-USM q->memcpy would be a silent no-op).
+    if (tensors[0]->type == GGML_TYPE_F16) {
+        sycl::half * f16_out0 = (sycl::half *) tensors[0]->data;
+        sycl::half * f16_out1 = (sycl::half *) tensors[1]->data;
+        sycl::half * f16_tmp0 = (sycl::half *) buf0;
+        sycl::half * f16_tmp1 = (sycl::half *) buf1;
+
+        q0->wait();
+        q1->wait();
+        dev2dev_memcpy(ctx0->device, *q0, ctx1->device, *q1, f16_tmp0, tensors[1]->data, nbytes);
+        dev2dev_memcpy(ctx1->device, *q1, ctx0->device, *q0, f16_tmp1, tensors[0]->data, nbytes);
+
+        q0->submit([&](sycl::handler & h) {
+            h.parallel_for(sycl::range<1>(nelem), [=](sycl::id<1> i) {
+                f16_out0[i] = (sycl::half) ((float) f16_out0[i] + (float) f16_tmp0[i]);
+            });
+        });
+        q1->submit([&](sycl::handler & h) {
+            h.parallel_for(sycl::range<1>(nelem), [=](sycl::id<1> i) {
+                f16_out1[i] = (sycl::half) ((float) f16_out1[i] + (float) f16_tmp1[i]);
+            });
+        });
+        return true;
+    }
+
+    float * out0 = (float *) tensors[0]->data;
+    float * out1 = (float *) tensors[1]->data;
+
+    // BF16 threshold: above this, the PCIe savings from halving the
+    // cross-device bytes outweigh the 2 extra compress kernels.
+    // Below: stay on the FP32 fast path.  Threshold mirrors the CUDA
+    // NCCL allreduce pattern for n_backends=2.
+    static constexpr int64_t BF16_THRESHOLD = 32768;
+
+    if (nelem < BF16_THRESHOLD) {
+        // FP32 small path: 4 SYCL submissions per allreduce.
+        float * tmp0 = (float *) buf0;
+        float * tmp1 = (float *) buf1;
+
+        // COMM-D2D-FIX: the two devices are in SEPARATE SYCL contexts, so a raw
+        // q->memcpy of a peer USM pointer is a silent no-op. Route cross-device
+        // copies through dev2dev_memcpy (L0 direct copy / host staging). It is
+        // synchronous, so wait for the local partials to be produced first.
+        q0->wait();
+        q1->wait();
+        dev2dev_memcpy(ctx0->device, *q0, ctx1->device, *q1, tmp0, tensors[1]->data, nbytes);
+        dev2dev_memcpy(ctx1->device, *q1, ctx0->device, *q0, tmp1, tensors[0]->data, nbytes);
+
+        q0->submit([&](sycl::handler & h) {
+            h.parallel_for(sycl::range<1>(nelem), [=](sycl::id<1> i) {
+                out0[i] += tmp0[i];
+            });
+        });
+        q1->submit([&](sycl::handler & h) {
+            h.parallel_for(sycl::range<1>(nelem), [=](sycl::id<1> i) {
+                out1[i] += tmp1[i];
+            });
+        });
+        return true;
+    }
+
+    // BF16 large path: 6 SYCL submissions per allreduce, but the
+    // cross-device memcpy is HALF the bytes. Pure bit-shift
+    // conversion (no rounding) — matches ggml's truncating fp32->bf16.
+    uint16_t * outbox0 = (uint16_t *) buf0;
+    uint16_t * inbox0  = outbox0 + nelem;
+    uint16_t * outbox1 = (uint16_t *) buf1;
+    uint16_t * inbox1  = outbox1 + nelem;
+
+    // Phase A: compress each device's local partial in parallel.
+    sycl::event c0 = q0->parallel_for(sycl::range<1>(nelem), [=](sycl::id<1> i) {
+        outbox0[i] = (uint16_t) (sycl::bit_cast<uint32_t>(out0[i]) >> 16);
+    });
+
+    sycl::event c1 = q1->parallel_for(sycl::range<1>(nelem), [=](sycl::id<1> i) {
+        outbox1[i] = (uint16_t) (sycl::bit_cast<uint32_t>(out1[i]) >> 16);
+    });
+
+    // Phase B: COMM-D2D-FIX-BF16 cross-device copy of compressed bytes via
+    // dev2dev_memcpy (separate SYCL contexts; sync copy after compress).
+    const size_t bf16_bytes = nelem * sizeof(uint16_t);
+    c0.wait();
+    c1.wait();
+    dev2dev_memcpy(ctx0->device, *q0, ctx1->device, *q1, inbox0, outbox1, bf16_bytes);
+    dev2dev_memcpy(ctx1->device, *q1, ctx0->device, *q0, inbox1, outbox0, bf16_bytes);
+
+    // Phase C: decompress + add into local FP32 partial.
+    q0->submit([&](sycl::handler & h) {
+        h.parallel_for(sycl::range<1>(nelem), [=](sycl::id<1> i) {
+            out0[i] += sycl::bit_cast<float>(((uint32_t) inbox0[i]) << 16);
+        });
+    });
+
+    q1->submit([&](sycl::handler & h) {
+        h.parallel_for(sycl::range<1>(nelem), [=](sycl::id<1> i) {
+            out1[i] += sycl::bit_cast<float>(((uint32_t) inbox1[i]) << 16);
+        });
+    });
+
+    return true;
+}
+catch (const sycl::exception &) { return false; }
+catch (...)                     { return false; }
+
 static void *ggml_backend_sycl_reg_get_proc_address(ggml_backend_reg_t reg, const char *name) {
    GGML_UNUSED(reg);

@@ -5866,6 +6110,17 @@ static void *ggml_backend_sycl_reg_get_proc_address(ggml_backend_reg_t reg, cons
        return (void *)ggml_backend_sycl_split_buffer_type;
    }

+    // Tensor parallelism (--split-mode tensor) entry points.
+    if (strcmp(name, "ggml_backend_comm_init") == 0) {
+        return (void *)ggml_backend_sycl_comm_init;
+    }
+    if (strcmp(name, "ggml_backend_comm_free") == 0) {
+        return (void *)ggml_backend_sycl_comm_free;
+    }
+    if (strcmp(name, "ggml_backend_comm_allreduce_tensor") == 0) {
+        return (void *)ggml_backend_sycl_comm_allreduce_tensor;
+    }
+
    // SYCL doesn't support registering host memory, left here for reference
    // "ggml_backend_register_host_buffer"
    // "ggml_backend_unregister_host_buffer"
@@ -699,6 +699,7 @@ struct vk_device_struct {

    bool add_rms_fusion;
    uint32_t partials_binding_alignment;
+    uint32_t max_nodes_per_submit;

    bool shader_64b_indexing;

@@ -5878,6 +5879,14 @@ static vk_device ggml_vk_get_device(size_t idx) {
        device->subgroup_vote = (vk11_props.subgroupSupportedStages & vk::ShaderStageFlagBits::eCompute) &&
                                (vk11_props.subgroupSupportedOperations & vk::SubgroupFeatureFlagBits::eVote);

+        // Submit at least every 100 nodes, in case there are workloads without as much matmul.
+        device->max_nodes_per_submit = 100;
+        const char* GGML_VK_MAX_NODES_PER_SUBMIT = getenv("GGML_VK_MAX_NODES_PER_SUBMIT");
+        if (GGML_VK_MAX_NODES_PER_SUBMIT != nullptr) {
+            uint32_t max_nodes_per_submit = std::stoul(GGML_VK_MAX_NODES_PER_SUBMIT);
+            device->max_nodes_per_submit = std::max(max_nodes_per_submit, 1u);
+        }
+
        const bool force_disable_f16 = getenv("GGML_VK_DISABLE_F16") != nullptr;

        device->fp16 = !force_disable_f16 && fp16_storage && fp16_compute;
@@ -16173,8 +16182,6 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
    // Submit after enough work has accumulated, to overlap CPU cmdbuffer generation with GPU execution.
    // Estimate the amount of matmul work by looking at the weight matrix size, and submit every 100MB
    // (and scaled down based on model size, so smaller models submit earlier).
-    // Also submit at least every 100 nodes, in case there are workloads without as much matmul.
-    int nodes_per_submit = 100;
    int submitted_nodes = 0;
    int submit_count = 0;
    uint64_t mul_mat_bytes = 0;
@@ -16400,7 +16407,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg

        // Signal the almost_ready fence when the graph is mostly complete (< 20% remaining)
        bool almost_ready = (cgraph->n_nodes - i) < cgraph->n_nodes / 5;
-        bool submit = (submitted_nodes >= nodes_per_submit) ||
+        bool submit = ((uint32_t)submitted_nodes >= ctx->device->max_nodes_per_submit) ||
                      (mul_mat_bytes_per_submit != 0 && mul_mat_bytes >= mul_mat_bytes_per_submit) ||
                      (i + ctx->num_additional_fused_ops >= last_node) ||
                      (almost_ready && !ctx->almost_ready_fence_pending);
@@ -11,6 +11,7 @@
 #include <future>
 #include <queue>
 #include <condition_variable>
+#include <atomic>
 #include <cstdio>
 #include <cstring>
 #include <cstdlib>
@@ -34,6 +35,9 @@

 std::mutex lock;
 std::vector<std::pair<std::string, std::string>> shader_fnames;
+// Set when any shader subprocess fails (non-zero exit / stderr / launch failure) so the
+// build is stopped instead of silently producing a broken libggml-vulkan. (issue #24393)
+static std::atomic<bool> compile_failed{false};
 std::locale c_locale("C");

 std::string GLSLC = "glslc";
@@ -78,7 +82,7 @@ enum MatMulIdType {

 namespace {

-void execute_command(std::vector<std::string>& command, std::string& stdout_str, std::string& stderr_str) {
+int execute_command(std::vector<std::string>& command, std::string& stdout_str, std::string& stderr_str) {
 #ifdef _WIN32
    HANDLE stdout_read, stdout_write;
    HANDLE stderr_read, stderr_write;
@@ -127,8 +131,11 @@ void execute_command(std::vector<std::string>& command, std::string& stdout_str,
    CloseHandle(stdout_read);
    CloseHandle(stderr_read);
    WaitForSingleObject(pi.hProcess, INFINITE);
+    DWORD exit_code = 1;
+    GetExitCodeProcess(pi.hProcess, &exit_code);
    CloseHandle(pi.hProcess);
    CloseHandle(pi.hThread);
+    return (int)exit_code;
 #else
    int stdout_pipe[2];
    int stderr_pipe[2];
@@ -175,7 +182,9 @@ void execute_command(std::vector<std::string>& command, std::string& stdout_str,

        close(stdout_pipe[0]);
        close(stderr_pipe[0]);
-        waitpid(pid, nullptr, 0);
+        int status = 0;
+        waitpid(pid, &status, 0);
+        return WIFEXITED(status) ? WEXITSTATUS(status) : -1;
    }
 #endif
 }
@@ -372,13 +381,14 @@ void string_to_spv_func(std::string name, std::string in_path, std::string out_p
        // }
        // std::cout << std::endl;

-        execute_command(cmd, stdout_str, stderr_str);
-        if (!stderr_str.empty()) {
-            std::cerr << "cannot compile " << name << "\n\n";
+        int exit_code = execute_command(cmd, stdout_str, stderr_str);
+        if (exit_code != 0 || !stderr_str.empty()) {
+            std::cerr << "cannot compile " << name << " (exit code " << exit_code << ")\n\n";
            for (const auto& part : cmd) {
                std::cerr << part << " ";
            }
            std::cerr << "\n\n" << stderr_str << std::endl;
+            compile_failed = true;
            return;
        }

@@ -398,6 +408,7 @@ void string_to_spv_func(std::string name, std::string in_path, std::string out_p
        shader_fnames.push_back(std::make_pair(name, out_path));
    } catch (const std::exception& e) {
        std::cerr << "Error executing command for " << name << ": " << e.what() << std::endl;
+        compile_failed = true;
    }
 }

@@ -1271,6 +1282,11 @@ int main(int argc, char** argv) {

    process_shaders();

+    if (compile_failed) {
+        std::cerr << "vulkan-shaders-gen: one or more shaders failed to compile" << std::endl;
+        return EXIT_FAILURE;
+    }
+
    write_output_files();

    return EXIT_SUCCESS;
@@ -57,19 +57,25 @@ oppoll=
 opflt=
 [ "$OF" != "" ] && opflt="GGML_HEXAGON_OPFILTER=$OF"

+opfuse=
+[ "$OC" != "" ] && opfuse="GGML_HEXAGON_OPFUSION=$OC"
+
 vmem=
 [ "$VM" != "" ] && vmem="GGML_HEXAGON_VMEM=$VM"

 mbuf=
 [ "$MB" != "" ] && mbuf="GGML_HEXAGON_MBUF=$MB"

+mmsel=
+[ "$MM" != "" ] && mmsel="GGML_HEXAGON_MM_SELECT=$MM"
+
 set -x

 adb $adbserial $adbhost shell " \
  cd $basedir; ulimit -c unlimited;        \
    LD_LIBRARY_PATH=$basedir/$branch/lib   \
    ADSP_LIBRARY_PATH=$basedir/$branch/lib \
-    $verbose $sched $opmask $profile $nhvx $hmx $ndev $hb $opbatch $opqueue $oppoll $opflt $vmem $mbuf \
+    $verbose $sched $opmask $profile $nhvx $hmx $ndev $hb $opbatch $opqueue $oppoll $opflt $opfuse $vmem $mbuf $mmsel \
      ./$branch/bin/llama-completion --no-mmap -m $basedir/../gguf/$model \
         --poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1                  \
         --ctx-size 8192 --ubatch-size 1024 -fa on                        \
@@ -51,6 +51,12 @@ opqueue=
 oppoll=
 [ "$OP" != "" ] && oppoll="GGML_HEXAGON_OPPOLL=$OP"

+opfuse=
+[ "$OC" != "" ] && opfuse="GGML_HEXAGON_OPFUSION=$OC"
+
+mmsel=
+[ "$MM" != "" ] && mmsel="GGML_HEXAGON_MM_SELECT=$MM"
+
 set -x

 tool=$1; shift
@@ -59,5 +65,5 @@ adb $adbserial $adbhost shell " \
  cd $basedir; ulimit -c unlimited;        \
    LD_LIBRARY_PATH=$basedir/$branch/lib   \
    ADSP_LIBRARY_PATH=$basedir/$branch/lib \
-    $verbose $sched $opmask $profile $nhvx $hmx $ndev $hb $opbatch $opqueue $oppoll ./$branch/bin/$tool $@ \
+    $verbose $sched $opmask $profile $nhvx $hmx $ndev $hb $opbatch $opqueue $oppoll $opfuse $mmsel ./$branch/bin/$tool $@ \
 "
@@ -26,7 +26,7 @@ COL_MAP = {
 }

 op_pattern = re.compile(
-    r"profile-op\s+(?P<op_name>[A-Z_0-9+]+):\s+.*?\s+:\s+(?P<dims>[\d:x\s\->!]+)\s+:\s+(?P<types>[a-z\d_\s\->x]+)\s+:\s+.*?\s+(?:op-)?usec\s+(?P<usec>\d+)\s+(?:op-)?cycles\s+(?P<cycles>\d+)(?:\s+start\s+(?P<start>\d+))?(?:\s+mhz\s+(?P<mhz>[\d.]+))?(?:\s+pmu\s+\[(?P<pmu>[\d,\s]+)\])?(?:\s+evt\s+\[(?P<evt>[\d,\s]+)\])?"
+    r"profile-op\s+(?P<op_name>[A-Z_0-9+]+):\s+.*?\s+:\s+(?P<dims>[\d:x\s\->!]+)\s+:\s+(?P<types>[a-z\d_\s\->x]+)\s+:\s+.*?\s+:\s+(?:op-)?usec\s+(?P<usec>\d+)\s+(?:op-)?cycles\s+(?P<cycles>\d+)(?:\s+start\s+(?P<start>\d+))?(?:\s+mhz\s+(?P<mhz>[\d.]+))?(?:\s+pmu\s+\[(?P<pmu>[\d,\s]+)\])?(?:\s+evt\s+\[(?P<evt>[\d,\s]+)\])?"
 )

 trace_pattern = re.compile(
@@ -93,9 +93,40 @@ def parse_log(file_path, pmu_index=None):
                + int(ts_match.group('us'))
            )

-        op_match = op_pattern.search(line)
+        if "|" in line and "profile-op" in line:
+            parts = [p.strip() for p in line.split("|")]
+            prefix = parts[0]
+            prefix_match = re.search(r"profile-op\s+(?P<op_name>[A-Z_0-9+]+)", prefix)
+            if not prefix_match:
+                continue
+
+            if len(parts) == 7:
+                dims, types, timings = parts[2], parts[3], parts[6]
+            elif len(parts) == 6:
+                dims, types, timings = parts[2], parts[3], parts[5]
+            else:
+                continue
+
+            timing_match = re.search(
+                r"(?:op-)?usec\s+(?P<usec>\d+)\s+(?:op-)?cycles\s+(?P<cycles>\d+)(?:\s+start\s+(?P<start>\d+))?(?:\s+mhz\s+(?P<mhz>[\d.]+))?(?:\s+pmu\s+\[(?P<pmu>[\d,\s]+)\])?(?:\s+evt\s+\[(?P<evt>[\d,\s]+)\])?",
+                timings
+            )
+            if not timing_match:
+                continue
+
+            op_match = timing_match
+            op_name = prefix_match.group("op_name")
+        else:
+            op_match = op_pattern.search(line)
+            if op_match:
+                op_name = op_match.group('op_name')
+                dims = op_match.group('dims').strip()
+                types = op_match.group('types').strip()
+            else:
+                op_match = None
+
        if op_match:
-            pmu_raw = op_match.group('pmu')
+            pmu_raw = op_match.group('pmu') if 'pmu' in op_match.groupdict() else None
            pmu_val = None
            if pmu_raw and pmu_index is not None:
                try:
@@ -105,7 +136,7 @@ def parse_log(file_path, pmu_index=None):
                except (ValueError, IndexError):
                    pmu_val = None

-            evt_raw = op_match.group('evt')
+            evt_raw = op_match.group('evt') if 'evt' in op_match.groupdict() else None
            evt_val = None
            if evt_raw:
                try:
@@ -122,9 +153,9 @@ def parse_log(file_path, pmu_index=None):
            op_text = line[idx + 11:].strip() if idx != -1 else line.strip()

            current_op = {
-                'name':         op_match.group('op_name'),
-                'dims':         op_match.group('dims').strip(),
-                'types':        op_match.group('types').strip(),
+                'name':         op_name,
+                'dims':         dims,
+                'types':        types,
                'op_text':      op_text,
                'usec':         int(op_match.group('usec')),
                'cycles':       int(op_match.group('cycles')),
@@ -12,7 +12,7 @@ from collections import defaultdict
 logger = logging.getLogger("ggml-hexagon-trace")

 op_pattern = re.compile(
-    r"profile-op\s+(?P<op_name>[A-Z_0-9+]+):\s+.*?\s+:\s+(?P<dims>[\d:x\s\->!]+)\s+:\s+(?P<types>[a-z\d_\s\->x]+)\s+:\s+(?P<strides>[\d:x\s\->!]+)\s+:\s+(?:op-)?usec\s+(?P<usec>\d+)\s+(?:op-)?cycles\s+(?P<cycles>\d+)(?:\s+start\s+(?P<start>\d+))?(?:\s+mhz\s+(?P<mhz>[\d.]+))?(?:\s+pmu\s+\[(?P<pmu>[\d,\s]+)\])?(?:\s+evt\s+\[(?P<evt>[\d,\s]+)\])?"
+    r"profile-op\s+(?P<op_name>[A-Z_0-9+]+):\s+.*?\s+:\s+(?P<dims>[\d:x\s\->!]+)\s+:\s+(?P<types>[a-z\d_\s\->x]+)\s+:\s+(?P<strides>[\d:x\s\->!]+?)\s+:\s+(?:(?P<params>.*?)\s+:\s+)?(?:op-)?usec\s+(?P<usec>\d+)\s+(?:op-)?cycles\s+(?P<cycles>\d+)(?:\s+start\s+(?P<start>\d+))?(?:\s+mhz\s+(?P<mhz>[\d.]+))?(?:\s+pmu\s+\[(?P<pmu>[\d,\s]+)\])?(?:\s+evt\s+\[(?P<evt>[\d,\s]+)\])?"
 )

 trace_pattern = re.compile(
@@ -66,7 +66,40 @@ def parse_log(file_path):

    for line in f:
        line_idx += 1
-        op_match = op_pattern.search(line)
+        if "|" in line and "profile-op" in line:
+            parts = [p.strip() for p in line.split("|")]
+            prefix = parts[0]
+            prefix_match = re.search(r"profile-op\s+(?P<op_name>[A-Z_0-9+]+)", prefix)
+            if not prefix_match:
+                continue
+
+            if len(parts) == 7:
+                dims, types, strides, params, timings = parts[2], parts[3], parts[4], parts[5], parts[6]
+            elif len(parts) == 6:
+                dims, types, strides, params, timings = parts[2], parts[3], parts[4], "", parts[5]
+            else:
+                continue
+
+            timing_match = re.search(
+                r"(?:op-)?usec\s+(?P<usec>\d+)\s+(?:op-)?cycles\s+(?P<cycles>\d+)(?:\s+start\s+(?P<start>\d+))?(?:\s+mhz\s+(?P<mhz>[\d.]+))?(?:\s+pmu\s+\[(?P<pmu>[\d,\s]+)\])?(?:\s+evt\s+\[(?P<evt>[\d,\s]+)\])?",
+                timings
+            )
+            if not timing_match:
+                continue
+
+            op_match = timing_match
+            op_name = prefix_match.group("op_name")
+        else:
+            op_match = op_pattern.search(line)
+            if op_match:
+                op_name = op_match.group('op_name')
+                dims = op_match.group('dims').strip() if op_match.group('dims') else ''
+                types = op_match.group('types').strip() if op_match.group('types') else ''
+                strides = op_match.group('strides').strip() if op_match.group('strides') else ''
+                params = op_match.group('params').strip() if ('params' in op_match.groupdict() and op_match.group('params')) else ''
+            else:
+                op_match = None
+
        if op_match:
            cycles_start_raw = op_match.group('start')
            unwrapped_cycles_start = None
@@ -77,10 +110,11 @@ def parse_log(file_path):
            op_text = line[idx + 11:].strip() if idx != -1 else line.strip()

            current_op = {
-                'name':         op_match.group('op_name'),
-                'dims':         op_match.group('dims').strip() if op_match.group('dims') else '',
-                'types':        op_match.group('types').strip() if op_match.group('types') else '',
-                'strides':      op_match.group('strides').strip() if op_match.group('strides') else '',
+                'name':         op_name,
+                'dims':         dims,
+                'types':        types,
+                'strides':      strides,
+                'params':       params,
                'op_text':      op_text,
                'usec':         int(op_match.group('usec')),
                'cycles':       int(op_match.group('cycles')),
@@ -397,6 +431,8 @@ def generate_perfetto_trace(filtered_ops, output_path):
                debug_annots.append(make_debug_annotation("line", int_val=op['line_num']))
            if 'strides' in op and op['strides']:
                debug_annots.append(make_debug_annotation("strides", string_val=op['strides']))
+            if 'params' in op and op['params'] and op['params'] != '----':
+                debug_annots.append(make_debug_annotation("params", string_val=op['params']))

            # Slice Begin
            evt_begin = make_track_event(1, 2, name=f"{op['name']} ({op['dims']})", category="operator", debug_annotations=debug_annots)
@@ -700,6 +700,7 @@ const char * llm_type_name(llm_type type) {
        case LLM_TYPE_160M:          return "160M";
        case LLM_TYPE_190M:          return "190M";
        case LLM_TYPE_220M:          return "220M";
+        case LLM_TYPE_230M:          return "230M";
        case LLM_TYPE_250M:          return "250M";
        case LLM_TYPE_256M:          return "256M";
        case LLM_TYPE_270M:          return "270M";
@@ -36,6 +36,7 @@ enum llm_type {
    LLM_TYPE_160M,
    LLM_TYPE_190M,
    LLM_TYPE_220M,
+    LLM_TYPE_230M,
    LLM_TYPE_250M,
    LLM_TYPE_256M,
    LLM_TYPE_270M,
@@ -847,7 +847,7 @@ static void init_quantize_state_counters(quantize_state_impl & qs, std::vector<t
            qs.has_tied_embeddings = false;
        }
    }
-    qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)qs.model.hparams.n_layer();
+    qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)qs.model.hparams.n_layer_all;
 }

 //
@@ -13,6 +13,7 @@ void llama_model_lfm2::load_arch_hparams(llama_model_loader & ml) {
    hparams.n_layer_dense_lead = hparams.n_layer();

    switch (hparams.n_ff()) {
+        case  2560: type = LLM_TYPE_230M; break;
        case  4608: type = LLM_TYPE_350M; break;
        case  6912: type = LLM_TYPE_700M; break;
        case  8192: type = LLM_TYPE_1_2B; break;
@@ -199,7 +199,6 @@ llama_build_and_test(test-jinja.cpp)
 llama_test(test-jinja NAME test-jinja-py ARGS -py LABEL python)
 llama_build_and_test(test-chat-auto-parser.cpp WORKING_DIRECTORY ${PROJECT_SOURCE_DIR})
 llama_build_and_test(test-chat-template.cpp)
-llama_build_and_test(test-json-partial.cpp)
 llama_build_and_test(test-log.cpp)
 llama_build_and_test(
    test-peg-parser.cpp
@@ -8176,6 +8176,8 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
    test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_I32, {256, 4, 1, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
    test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_I32, {256, 1, 4, 1}, {-1,-1,-1,-1}, {1, 2, 0, 3}, {0, 0, 0, 0}));
    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 1, 4, 1}, {-1,-1,-1,-1}, {1, 2, 0, 3}, {0, 0, 0, 0}));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {2, 2097121, 1, 1}, {-1,-1,-1,-1}, {1, 0, 2, 3}));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {2, 2, 524281, 1}, {-1,-1,-1,-1}, {1, 0, 2, 3}));

    // CPY - different src/dst shapes (reshaping via CPY)
    // Use permutations of {3, 5, 7, 32}. Total elements: 3*5*7*32 = 3360.
@@ -8420,6 +8422,11 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
        }
    }

+    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_Q4_0, GGML_TYPE_F32, 2880, 32, 2880, {1, 1}, {1, 1}));
+    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_Q8_0, GGML_TYPE_F32, 2880, 32, 2880, {1, 1}, {1, 1}));
+    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_MXFP4, GGML_TYPE_F32, 2880, 32, 2880, {1, 1}, {1, 1}));
+
+
 #if 0
    {
        // Test paths in OpenCL
@@ -8594,6 +8601,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {

    // gpt-oss issue with Vulkan mmq_id
    test_cases.emplace_back(new test_mul_mat_id(GGML_TYPE_MXFP4, GGML_TYPE_F32, 32, 2, false, 2880, 32, 2880));
+    test_cases.emplace_back(new test_mul_mat_id(GGML_TYPE_Q4_0, GGML_TYPE_F32, 32, 2, false, 2880, 32, 2880));

    for (ggml_type type_a : all_types) {
        test_cases.emplace_back(new test_mul_mat_id(type_a, GGML_TYPE_F32, 4, 2, false, 64, 16, 3*ggml_blck_size(type_a)));
@@ -1,287 +0,0 @@
-#include "common.h"
-#include "json-partial.h"
-#include <exception>
-#include <iostream>
-#include <stdexcept>
-
-template <class T> static void assert_equals(const T & expected, const T & actual) {
-  if (expected != actual) {
-      std::cerr << "Expected: " << expected << std::endl;
-      std::cerr << "Actual: " << actual << std::endl;
-      std::cerr << std::flush;
-      throw std::runtime_error("Test failed");
-  }
-}
-
-static void test_json_healing() {
-  auto parse = [](const std::string & str) {
-      std::cerr << "# Parsing: " << str << '\n';
-      std::string::const_iterator it = str.begin();
-      const auto end = str.end();
-      common_json out;
-      std::string healing_marker = "$llama.cpp.json$";
-      if (common_json_parse(it, end, healing_marker, out)) {
-          auto dump = out.json.dump();
-          std::cerr << "Parsed: " << dump << '\n';
-          std::cerr << "Magic: " << out.healing_marker.json_dump_marker << '\n';
-          std::string result;
-          if (!out.healing_marker.json_dump_marker.empty()) {
-              auto i = dump.find(out.healing_marker.json_dump_marker);
-              if (i == std::string::npos) {
-                  throw std::runtime_error("Failed to find magic in dump " + dump + " (magic: " + out.healing_marker.json_dump_marker + ")");
-              }
-              result = dump.substr(0, i);
-          } else {
-            result = dump;
-          }
-          std::cerr << "Result: " << result << '\n';
-          if (string_starts_with(str, result)) {
-            std::cerr << "Failure!\n";
-          }
-        //   return dump;
-      } else {
-        throw std::runtime_error("Failed to parse: " + str);
-      }
-
-  };
-  auto parse_all = [&](const std::string & str) {
-      for (size_t i = 1; i < str.size(); i++) {
-          parse(str.substr(0, i));
-      }
-  };
-  parse_all("{\"a\": \"b\"}");
-  parse_all("{\"hey\": 1, \"ho\\\"ha\": [1]}");
-
-  parse_all("[{\"a\": \"b\"}]");
-
-  auto test = [&](const std::vector<std::string> & inputs, const std::string & expected, const std::string & expected_marker) {
-      for (const auto & input : inputs) {
-        common_json out;
-        assert_equals(true, common_json_parse(input, "$foo", out));
-        assert_equals<std::string>(expected, out.json.dump(/* indent */ -1, /* indent_char */ ' ', /* ensure_ascii */ true));
-        assert_equals<std::string>(expected_marker, out.healing_marker.json_dump_marker);
-      }
-  };
-  // No healing needed:
-  test(
-    {
-      R"([{"a":"b"}, "y"])",
-    },
-    R"([{"a":"b"},"y"])",
-    ""
-  );
-  // Partial literals can't be healed:
-  test(
-    {
-      R"([1)",
-      R"([tru)",
-      R"([n)",
-      R"([nul)",
-      R"([23.2)",
-    },
-    R"(["$foo"])",
-    R"("$foo)"
-  );
-  test(
-    {
-      R"({"a": 1)",
-      R"({"a": tru)",
-      R"({"a": n)",
-      R"({"a": nul)",
-      R"({"a": 23.2)",
-    },
-    R"({"a":"$foo"})",
-    R"("$foo)"
-  );
-  test(
-    {
-      R"({)",
-    },
-    R"({"$foo":1})",
-    R"("$foo)"
-  );
-  test(
-    {
-      R"([)",
-    },
-    R"(["$foo"])",
-    R"("$foo)"
-  );
-  // Healing right after a full literal
-  test(
-    {
-      R"(1 )",
-    },
-    R"(1)",
-    ""
-  );
-  test(
-    {
-      R"(true)",
-      R"(true )",
-    },
-    R"(true)",
-    ""
-  );
-  test(
-    {
-      R"(null)",
-      R"(null )",
-    },
-    R"(null)",
-    ""
-  );
-  test(
-    {
-      R"([1 )",
-    },
-    R"([1,"$foo"])",
-    R"(,"$foo)"
-  );
-  test(
-    {
-      R"([{})",
-      R"([{} )",
-    },
-    R"([{},"$foo"])",
-    R"(,"$foo)"
-  );
-  test(
-    {
-      R"([true)",
-    },
-    // TODO: detect the true/false/null literal was complete
-    R"(["$foo"])",
-    R"("$foo)"
-  );
-  test(
-    {
-      R"([true )",
-    },
-    R"([true,"$foo"])",
-    R"(,"$foo)"
-  );
-  test(
-    {
-      R"([true,)",
-    },
-    R"([true,"$foo"])",
-    R"("$foo)"
-  );
-  // Test nesting
-  test(
-    {
-      R"([{"a": [{"b": [{)",
-    },
-    R"([{"a":[{"b":[{"$foo":1}]}]}])",
-    R"("$foo)"
-  );
-  test(
-    {
-      R"([{"a": [{"b": [)",
-    },
-    R"([{"a":[{"b":["$foo"]}]}])",
-    R"("$foo)"
-  );
-
-  test(
-    {
-      R"([{"a": "b"})",
-      R"([{"a": "b"} )",
-    },
-    R"([{"a":"b"},"$foo"])",
-    R"(,"$foo)"
-  );
-  test(
-    {
-      R"([{"a": "b"},)",
-      R"([{"a": "b"}, )",
-    },
-    R"([{"a":"b"},"$foo"])",
-    R"("$foo)"
-  );
-  test(
-    {
-      R"({ "code)",
-    },
-    R"({"code$foo":1})",
-    R"($foo)"
-  );
-  test(
-    {
-      R"({ "code\)",
-    },
-    R"({"code\\$foo":1})",
-    R"(\$foo)"
-  );
-  test(
-    {
-      R"({ "code")",
-    },
-    R"({"code":"$foo"})",
-    R"(:"$foo)"
-  );
-  test(
-    {
-      R"({ "key")",
-    },
-    R"({"key":"$foo"})",
-    R"(:"$foo)"
-  );
-  // Test unicode escape sequences
-  test(
-    {
-      R"({"a":"\u)",
-    },
-    R"({"a":"\u0000$foo"})",
-    R"(0000$foo)"
-  );
-  test(
-    {
-      R"({"a":"\u00)",
-    },
-    R"({"a":"\u0000$foo"})",
-    R"(00$foo)"
-  );
-  test(
-    {
-      R"({"a":"\ud300)",
-    },
-    R"({"a":"\ud300$foo"})",
-    R"($foo)"
-  );
-  test(
-    {
-      R"({"a":"\ud800)",
-    },
-    R"({"a":"\ud800\udc00$foo"})",
-    R"(\udc00$foo)"
-  );
-  test(
-    {
-      R"({"a":"\ud800\)",
-    },
-    R"({"a":"\ud800\udc00$foo"})",
-    R"(udc00$foo)"
-  );
-  test(
-    {
-      R"({"a":"\ud800\u)",
-    },
-    R"({"a":"\ud800\udc00$foo"})",
-    R"(dc00$foo)"
-  );
-  test(
-    {
-      R"({"a":"\ud800\udc00)",
-    },
-    R"({"a":"\ud800\udc00$foo"})",
-    R"($foo)"
-  );
-}
-
-int main() {
-    test_json_healing();
-    std::cerr << "All tests passed.\n";
-    return 0;
-}
@@ -146,6 +146,8 @@ int main(int argc, char ** argv) {
                }

                LOG_INF("Model %d/%d, Context %d/%d: %s\n\n", m + 1, num_models, c + 1, num_contexts, result.c_str());
+
+                llama_synchronize(ctx.get());
            });
        }
    }
@@ -1035,25 +1035,23 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {

    if (!params.hf_repo.empty()) {
        for (size_t i = 0; i < params.hf_repo.size(); i++) {
-            common_params_model model;
-
-            if (params.hf_file.empty() || params.hf_file[i].empty()) {
-                model.hf_repo = params.hf_repo[i];
-            } else {
-                model.hf_repo = params.hf_repo[i];
-                model.hf_file = params.hf_file[i];
+            common_params p;
+            p.hf_token      = params.hf_token;
+            p.offline       = params.offline;
+            p.model.hf_repo = params.hf_repo[i];
+            if (!params.hf_file.empty() && !params.hf_file[i].empty()) {
+                p.model.hf_file = params.hf_file[i];
            }

-            common_download_opts opts;
-            opts.bearer_token = params.hf_token;
-            opts.offline         = params.offline;
-            auto download_result = common_download_model(model, opts);
-            if (download_result.model_path.empty()) {
+            // only the text model file is needed
+            common_models_handler models_handler = common_models_handler_init(p, LLAMA_EXAMPLE_BENCH);
+            common_models_handler_apply(models_handler, p);
+            if (p.model.path.empty()) {
                fprintf(stderr, "error: failed to download model from HuggingFace\n");
                exit(1);
            }

-            params.model.push_back(download_result.model_path);
+            params.model.push_back(p.model.path);
        }
    }

@@ -115,22 +115,28 @@ if (TARGET mtmd)
    endif()
 endif()

-add_executable(llama-llava-cli    deprecation-warning.cpp)
-add_executable(llama-gemma3-cli   deprecation-warning.cpp)
-add_executable(llama-minicpmv-cli deprecation-warning.cpp)
-add_executable(llama-qwen2vl-cli  deprecation-warning.cpp)
+# Gate CLI binaries on LLAMA_BUILD_TOOLS so that standalone library-only
+# builds (LLAMA_BUILD_MTMD=ON with LLAMA_BUILD_TOOLS=OFF — e.g. Apple
+# XCFramework packaging) skip the executables entirely. LLAMA_BUILD_COMMON
+# defaults to ON in standalone builds, so we cannot rely on it for gating.
+if (LLAMA_BUILD_TOOLS)
+    add_executable(llama-llava-cli    deprecation-warning.cpp)
+    add_executable(llama-gemma3-cli   deprecation-warning.cpp)
+    add_executable(llama-minicpmv-cli deprecation-warning.cpp)
+    add_executable(llama-qwen2vl-cli  deprecation-warning.cpp)

-set(TARGET llama-mtmd-cli)
-add_executable         (${TARGET} mtmd-cli.cpp)
-set_target_properties  (${TARGET} PROPERTIES OUTPUT_NAME llama-mtmd-cli)
-if(LLAMA_TOOLS_INSTALL)
-    install(TARGETS ${TARGET} RUNTIME)
+    set(TARGET llama-mtmd-cli)
+    add_executable         (${TARGET} mtmd-cli.cpp)
+    set_target_properties  (${TARGET} PROPERTIES OUTPUT_NAME llama-mtmd-cli)
+    if(LLAMA_TOOLS_INSTALL)
+        install(TARGETS ${TARGET} RUNTIME)
+    endif()
+    target_link_libraries  (${TARGET} PRIVATE llama-common mtmd Threads::Threads)
+    target_compile_features(${TARGET} PRIVATE cxx_std_17)
+
+    # mtmd-debug tool
+    add_executable(llama-mtmd-debug debug/mtmd-debug.cpp)
+    set_target_properties(llama-mtmd-debug PROPERTIES OUTPUT_NAME llama-mtmd-debug)
+    target_link_libraries(llama-mtmd-debug PRIVATE llama-common mtmd Threads::Threads)
+    target_compile_features(llama-mtmd-debug PRIVATE cxx_std_17)
 endif()
-target_link_libraries  (${TARGET} PRIVATE llama-common mtmd Threads::Threads)
-target_compile_features(${TARGET} PRIVATE cxx_std_17)
-
-# mtmd-debug tool
-add_executable(llama-mtmd-debug debug/mtmd-debug.cpp)
-set_target_properties(llama-mtmd-debug PROPERTIES OUTPUT_NAME llama-mtmd-debug)
-target_link_libraries(llama-mtmd-debug PRIVATE llama-common mtmd Threads::Threads)
-target_compile_features(llama-mtmd-debug PRIVATE cxx_std_17)
@@ -9,6 +9,7 @@ its output, and holds them against the HF model's scores.

 import argparse
 import logging
+import re
 import subprocess
 import sys
 import unicodedata
@@ -28,6 +29,12 @@ class ModelSpec:
    mmproj_arg: str
    model_default: str
    mmproj_default: str
+    prompt: str = "Free OCR. "
+    n_predict: int = 512
+    n_ctx: int | None = None
+    # Unlimited-OCR's "document parsing" prompt emits <|det|> grounding markup that
+    # the HF reference strips in result.md; drop it before scoring to match.
+    strip_grounding: bool = False


@dataclass
@@ -63,6 +70,20 @@ MODELS = {
        model_default="gguf_models/deepseek-ai/deepseek-ocr-2-bf16.gguf",
        mmproj_default="gguf_models/deepseek-ai/mmproj-deepseek-ocr-2-bf16.gguf",
    ),
+    "unlimited": ModelSpec(
+        key="unlimited", label="Unlimited-OCR",
+        model_arg="--llama-model-unlimited", mmproj_arg="--mmproj-unlimited",
+        model_default="gguf_models/baidu/unlimited-ocr-bf16.gguf",
+        mmproj_default="gguf_models/baidu/mmproj-unlimited-ocr-bf16.gguf",
+        # "Free OCR." immediately emits EOS on this checkpoint; the HF reference
+        # (demo/unlimited_ocr_scores.py) uses "document parsing.", which grounds.
+        prompt="document parsing.",
+        # Grounding emits ~3x the tokens of plain OCR, so it needs a larger budget
+        # and context to reach the article body the ground truth covers.
+        n_predict=4096,
+        n_ctx=16384,
+        strip_grounding=True,
+    ),
 }

 CASES = [
@@ -82,9 +103,26 @@ CASES = [
        # is one pixel off and lands at ~0.69 instead.
        hf_cer=0.7761, hf_chrf=28.70, cer_tol=0.12, chrf_tol=8.0,
    ),
+    TestCase(
+        model_key="unlimited", label="single-view scan",
+        image="tools/mtmd/test-1.jpeg",
+        ground_truth="tools/mtmd/tests/test-1-ground-truth.txt",
+        # HF reference: Unlimited-OCR scoring (gundam, bf16) on this image/ground-truth.
+        # Decoder runs full MHA, not R-SWA; the band absorbs that gap + bf16 variance.
+        hf_cer=0.1869, hf_chrf=75.23, cer_tol=0.06, chrf_tol=6.0,
+    ),
 ]


+GROUNDING_TAG_RE = re.compile(r"<\|(ref|det)\|>.*?<\|/\1\|>", re.DOTALL)
+
+
+def strip_grounding(text: str) -> str:
+    """Drop <|ref|>..<|/ref|> / <|det|>..<|/det|> grounding markup, matching the
+    cleaned result.md the HF reference scores against."""
+    return GROUNDING_TAG_RE.sub("", text)
+
+
 def arg_dest(flag: str) -> str:
    return flag.lstrip("-").replace("-", "_")

@@ -129,19 +167,19 @@ def compute_chrf(expected: str, ocr_out: str) -> float:
    return CHRF().sentence_score(ocr_out, [expected]).score


-def run_mtmd_cli(model_path, mmproj_path, image_path, bin_path) -> str:
+def run_mtmd_cli(spec: "ModelSpec", model_path, mmproj_path, image_path, bin_path) -> str:
    """Run mtmd-cli on the image and return its output."""
    cmd = [
        str(bin_path),
        "-m", str(model_path),
        "--mmproj", str(mmproj_path),
        "--image", str(image_path),
-        "-p", "Free OCR. ",
+        "-p", spec.prompt,
        "--chat-template", "deepseek-ocr",
        "--temp", "0",
        "--flash-attn", "off",  # match the HF "eager" attention reference
        "--no-warmup",
-        "-n", "512",  # cap loops on hard images (KV would otherwise fill)
+        "-n", str(spec.n_predict),  # cap loops on hard images (KV would otherwise fill)
        # HF decodes with no_repeat_ngram_size; llama.cpp's analog is DRY.
        # Default DRY breakers include "\n", so they are cleared below.
        "--dry-multiplier", "0.8",
@@ -150,6 +188,8 @@ def run_mtmd_cli(model_path, mmproj_path, image_path, bin_path) -> str:
        "--dry-penalty-last-n", "-1",
        "--dry-sequence-breaker", "none",
    ]
+    if spec.n_ctx is not None:
+        cmd += ["-c", str(spec.n_ctx)]
    logger.debug(f"  command: {' '.join(cmd)}")

    try:
@@ -164,6 +204,8 @@ def run_mtmd_cli(model_path, mmproj_path, image_path, bin_path) -> str:
        raise RuntimeError(f"llama-mtmd-cli failed with code {result.returncode}")

    output = result.stdout.decode("utf-8", errors="replace").strip()
+    if spec.strip_grounding:
+        output = strip_grounding(output)
    if not output:
        raise RuntimeError("llama-mtmd-cli produced no output on stdout")
    logger.info(f"  output: {len(output)} chars")
@@ -193,7 +235,7 @@ def evaluate(case: "TestCase", expected: str, ocr_out: str) -> bool:

    logger.info("")
    logger.info("=" * 60)
-    logger.info("Free OCR evaluation:")
+    logger.info("OCR evaluation:")
    logger.info("=" * 60)
    logger.info(f"  CER               {cer:>7.4f}    (HF {case.hf_cer:.4f}, <= {case.cer_max:>7.4f}  -> {verdict(cer_pass)})")
    logger.info(f"  chrF (0-100)      {chrf:>7.2f}    (HF {case.hf_chrf:.2f}, >= {case.chrf_min:>7.2f}  -> {verdict(chrf_pass)})")
@@ -269,9 +311,9 @@ def main() -> int:
        expected = read_expected_text(ground_truth)
        logger.info(f"  Image: {case.image}")
        logger.info(f"  Expected text: {len(expected)} chars")
-        logger.info("  Running llama.cpp 'Free OCR'")
+        logger.info(f"  Running llama.cpp prompt {model_spec.prompt!r}")
        try:
-            ocr_out = run_mtmd_cli(model, mmproj, image, binary)
+            ocr_out = run_mtmd_cli(model_spec, model, mmproj, image, binary)
        except RuntimeError as e:
            logger.error(f"  Error: {e}")
            results[title] = False
@@ -40,6 +40,7 @@ struct debug_options {
    bool               enable_reasoning  = true;
    bool               debug_jinja       = false;
    bool               force_tool_call   = false;
+    bool               parallel_tool_calls = true;
    output_mode        mode              = output_mode::BOTH;
    input_message_type input_message     = input_message_type::NONE;
 };
@@ -87,6 +88,7 @@ static void print_usage(const char * program_name) {
    LOG_ERR("\nOptions:\n");
    LOG_ERR("  --no-tools              Disable tool definitions\n");
    LOG_ERR("  --force-tool-call       Set tool calls to forced\n");
+    LOG_ERR("  --parallel-tool-calls=0|1 Set parallel_tool_calls (default: 1)\n");
    LOG_ERR("  --generation-prompt=0|1 Set add_generation_prompt (default: 1)\n");
    LOG_ERR("  --enable-reasoning=0|1  Enable reasoning parsing (default: 1)\n");
    LOG_ERR("  --output=MODE           Output mode: analysis, template, both (default: both)\n");
@@ -121,6 +123,8 @@ static bool parse_options(int argc, char ** argv, debug_options & opts) {
            opts.debug_jinja = true;
        } else if (arg == "--no-tools") {
            opts.with_tools = false;
+        } else if (arg.rfind("--parallel-tool-calls=", 0) == 0) {
+            opts.parallel_tool_calls = parse_bool_option(arg.substr(22));
        } else if (arg.rfind("--generation-prompt=", 0) == 0) {
            opts.generation_prompt = parse_bool_option(arg.substr(20));
        } else if (arg.rfind("--enable-reasoning=", 0) == 0) {
@@ -349,7 +353,7 @@ static autoparser::generation_params prepare_params(const debug_options & opts,
        params.tools       = json();
        params.tool_choice = COMMON_CHAT_TOOL_CHOICE_NONE;
    }
-    params.parallel_tool_calls = false;
+    params.parallel_tool_calls = opts.parallel_tool_calls;
    return params;
 }

@@ -223,8 +223,8 @@ void server_model_meta::update_caps() {
            "LLAMA_ARG_HF_REPO_FILE",
        });
        params.offline = true;
-        // params.skip_download = true; // TODO: ideally, we should validate the model here, but it takes too much time
-        common_params_handle_models(params, LLAMA_EXAMPLE_SERVER, {});
+        common_models_handler handler = common_models_handler_init(params, LLAMA_EXAMPLE_SERVER);
+        common_models_handler_apply(handler, params); // note: this won't download the model because offline=true
        if (params.mmproj.path.empty()) {
            multimodal = { false, false };
        } else {
@@ -1393,9 +1393,8 @@ struct server_download_state : public common_download_callback {

    bool run(common_params & params) {
        try {
-            common_params_handle_models_params p;
-            p.callback = this;
-            common_params_handle_models(params, LLAMA_EXAMPLE_SERVER, p);
+            common_models_handler handler = common_models_handler_init(params, LLAMA_EXAMPLE_SERVER);
+            common_models_handler_apply(handler, params, this);
            is_ok = true;
        } catch (const std::exception & e) {
            auto model_name = params.model.get_name();
@@ -1768,23 +1767,14 @@ void server_models_routes::init_routes() {
            throw std::invalid_argument("model must be a non-empty string");
        }

-        common_params_model model;
-        common_download_opts opts;
+        common_params p;
+        p.model.hf_repo  = name;
+        p.hf_token       = params.hf_token;

-        model.hf_repo        = name;
-        opts.bearer_token    = params.hf_token;
-        // note: we only check main model, no need sidecar here
-        opts.download_mmproj = false;
-        opts.download_mtp    = false;
-
-        // first, only check if the model is valid and can be downloaded
-        opts.skip_download = true;
+        // validate by fetching metadata
        bool ok = false;
        try {
-            auto validation = common_download_model(model, opts);
-            ok = !validation.model_path.empty();
-        } catch (const common_skip_download_exception &) {
-            // model is valid and will be downloaded
+            common_models_handler_init(p, LLAMA_EXAMPLE_SERVER);
            ok = true;
        } catch (...) {
            SRV_ERR("unknown error while validating model '%s'\n", name.c_str());
@@ -89,15 +89,16 @@ int llama_server(int argc, char ** argv) {
    llama_backend_init();
    llama_numa_init(params.numa);

-    // note: router mode also accepts -hf remote-preset, so we need to check that first
-    if (!params.model.hf_repo.empty()) {
-        try {
-            common_params_handle_models_params handle_params;
-            handle_params.preset_only = true;
-            common_params_handle_models(params, LLAMA_EXAMPLE_SERVER, handle_params);
-        } catch (const std::exception & e) {
-            // ignored for now
+    common_models_handler models_handler;
+    try {
+        models_handler = common_models_handler_init(params, LLAMA_EXAMPLE_SERVER);
+        if (common_models_handler_is_preset_repo(models_handler)) {
+            // apply the preset and start the server in router mode
+            common_models_handler_apply(models_handler, params);
        }
+    } catch (const std::exception & e) {
+        SRV_ERR("failed to fetch model metadata: %s\n", e.what());
+        return 1;
    }

    // router server never loads a model and must not touch the GPU
@@ -241,6 +242,19 @@ int llama_server(int argc, char ** argv) {
    // Google Cloud Platform (Vertex AI) compat
    ctx_http.register_gcp_compat();

+    // return 403 for disabled features
+    server_http_context::handler_t res_403 = [](const server_http_req &) {
+        auto res = std::make_unique<server_http_res>();
+        res->status = 403;
+        res->data = safe_json_to_str({
+            {"error", {
+                {"message", "this feature is disabled"},
+                {"type", "feature_disabled"},
+            }}
+        });
+        return res;
+    };
+
    // CORS proxy (EXPERIMENTAL, only used by the Web UI for MCP)
    if (params.ui_mcp_proxy) {
        SRV_WRN("%s", "-----------------\n");
@@ -249,7 +263,11 @@ int llama_server(int argc, char ** argv) {
        SRV_WRN("%s", "-----------------\n");
        ctx_http.get ("/cors-proxy",      ex_wrapper(proxy_handler_get));
        ctx_http.post("/cors-proxy",      ex_wrapper(proxy_handler_post));
+    } else {
+        ctx_http.get ("/cors-proxy",      ex_wrapper(res_403));
+        ctx_http.post("/cors-proxy",      ex_wrapper(res_403));
    }
+
    // EXPERIMENTAL built-in tools
    if (!params.server_tools.empty()) {
        try {
@@ -264,6 +282,9 @@ int llama_server(int argc, char ** argv) {
        SRV_WRN("%s", "-----------------\n");
        ctx_http.get ("/tools",           ex_wrapper(tools.handle_get));
        ctx_http.post("/tools",           ex_wrapper(tools.handle_post));
+    } else {
+        ctx_http.get ("/tools",           ex_wrapper(res_403));
+        ctx_http.post("/tools",           ex_wrapper(res_403));
    }

    //
@@ -274,7 +295,12 @@ int llama_server(int argc, char ** argv) {
        return child.run_download(params);
    } else if (!is_router_server) {
        // single-model mode (NOT spawned by router)
-        common_params_handle_models(params, LLAMA_EXAMPLE_SERVER, {});
+        try {
+            common_models_handler_apply(models_handler, params);
+        } catch (const std::exception & e) {
+            SRV_ERR("failed to download model: %s\n", e.what());
+            return 1;
+        }
    }

    //
@@ -16,7 +16,7 @@ def test_mcp_no_proxy():
    server.start()

    res = server.make_request("GET", "/cors-proxy")
-    assert res.status_code == 404
+    assert res.status_code == 403


 def test_mcp_proxy():
@@ -28,10 +28,9 @@ vite.config.ts.timestamp-*
 # PWA Artifacts
 apple-splash-*.png
 apple-touch-icon-*.png
-favicon.ico
-favicon-dark.ico
 maskable-icon-*.png
 pwa-*.png
+static/favicon*

 # Storybook
 *storybook.log
@@ -35,7 +35,7 @@
 				"bits-ui": "2.18.1",
 				"clsx": "2.1.1",
 				"dexie": "4.4.3",
-				"dompurify": "3.4.5",
+				"dompurify": "3.4.11",
 				"eslint": "9.39.4",
 				"eslint-config-prettier": "10.1.8",
 				"eslint-plugin-storybook": "10.4.2",
@@ -8653,9 +8653,9 @@
 			"peer": true
 		},
 		"node_modules/dompurify": {
-			"version": "3.4.5",
-			"resolved": "https://registry.npmjs.org/dompurify/-/dompurify-3.4.5.tgz",
-			"integrity": "sha512-OrwIBKsdNSVEeubdJ1HBv/wNENRM9ytAVCv7YXt//A3vPdVMNuACRqK9mXCGCBW2ln7BT/A4X0jXHo2Gu89miA==",
+			"version": "3.4.11",
+			"resolved": "https://registry.npmjs.org/dompurify/-/dompurify-3.4.11.tgz",
+			"integrity": "sha512-zhlUV12GsaRzMsf9q5M254YhA4+VuF0fG+QFqu6aYpoGlKtz+w8//jBcGVYBgQkR5GHjUomejY84AV+/uPbWdw==",
 			"dev": true,
 			"license": "(MPL-2.0 OR Apache-2.0)",
 			"optionalDependencies": {
@@ -10226,9 +10226,9 @@
 			}
 		},
 		"node_modules/hono": {
-			"version": "4.12.23",
-			"resolved": "https://registry.npmjs.org/hono/-/hono-4.12.23.tgz",
-			"integrity": "sha512-eIaZ9qDgu7XV0pxOCrg7/WhnQ6Ivm22UcxhXx/A3dcbqbbYgBEkc6e/J/s7j2tS96zoB0S9VBdLwQNCWwUo4LA==",
+			"version": "4.12.26",
+			"resolved": "https://registry.npmjs.org/hono/-/hono-4.12.26.tgz",
+			"integrity": "sha512-uyZtpnYxM9CmQ7QsQknM4zN8EftNqhON1qYeIKM0Se67CCEe2c44xyGURwB0axX2fBDu1dqHrHAc1hmNT8ITkw==",
 			"dev": true,
 			"license": "MIT",
 			"engines": {
@@ -54,7 +54,7 @@
 		"bits-ui": "2.18.1",
 		"clsx": "2.1.1",
 		"dexie": "4.4.3",
-		"dompurify": "3.4.5",
+		"dompurify": "3.4.11",
 		"eslint": "9.39.4",
 		"eslint-config-prettier": "10.1.8",
 		"eslint-plugin-storybook": "10.4.2",
@@ -1,4 +1,10 @@
 import { defineConfig } from '@vite-pwa/assets-generator/config';
+import { FAVICON_COLORS, PWA_ASSET_GENERATOR } from './src/lib/constants/pwa';
+import { writeThemeFavicons } from './scripts/favicon-colorize';
+
+writeThemeFavicons(FAVICON_COLORS.LIGHT, FAVICON_COLORS.DARK, {
+	padding: PWA_ASSET_GENERATOR.FAVICON_PADDING
+});

 export default defineConfig({
 	headLinkOptions: {
@@ -7,7 +13,8 @@ export default defineConfig({
 	preset: {
 		transparent: {
 			sizes: [],
-			favicons: [[48, 'favicon-dark.ico']]
+			favicons: [[48, 'favicon-dark.ico']],
+			padding: PWA_ASSET_GENERATOR.FAVICON_PADDING
 		},
 		maskable: {
 			sizes: []
@@ -5,15 +5,32 @@ import {
 } from '@vite-pwa/assets-generator/config';
 import { readFileSync } from 'node:fs';
 import { resolve } from 'node:path';
-import { THEME_COLORS, PWA_GENERATOR_DEVICES, PWA_ASSET_GENERATOR } from './src/lib/constants/pwa';
+import {
+	THEME_COLORS,
+	PWA_GENERATOR_DEVICES,
+	PWA_ASSET_GENERATOR,
+	FAVICON_COLORS
+} from './src/lib/constants/pwa';
 import { SplashOrientation } from './src/lib/enums/splash.enums';
+import { writeThemeFavicons } from './scripts/favicon-colorize';
+
+writeThemeFavicons(FAVICON_COLORS.LIGHT, FAVICON_COLORS.DARK, {
+	padding: PWA_ASSET_GENERATOR.FAVICON_PADDING
+});

 export default defineConfig({
 	headLinkOptions: {
 		preset: PWA_ASSET_GENERATOR.LINK_PRESET
 	},
 	preset: combinePresetAndAppleSplashScreens(
-		minimal2023Preset,
+		{
+			...minimal2023Preset,
+			// tiny margin so favicon.ico / pwa-*.png breathe inside the canvas
+			transparent: {
+				...minimal2023Preset.transparent,
+				padding: PWA_ASSET_GENERATOR.FAVICON_PADDING
+			}
+		},
 		{
 			padding: PWA_ASSET_GENERATOR.SPLASH_PADDING,
 			resizeOptions: {
@@ -0,0 +1,107 @@
+import { mkdirSync, readFileSync, writeFileSync } from 'node:fs';
+import { dirname, resolve } from 'node:path';
+import { fileURLToPath } from 'node:url';
+
+const HERE = dirname(fileURLToPath(import.meta.url));
+const PROJECT_ROOT = resolve(HERE, '..');
+
+const DEFAULT_LOGO = resolve(PROJECT_ROOT, 'src/lib/assets/logo.svg');
+const DEFAULT_OUT_DIR = resolve(PROJECT_ROOT, 'static');
+const DEFAULT_OUT_LIGHT = resolve(DEFAULT_OUT_DIR, 'favicon.svg');
+const DEFAULT_OUT_DARK = resolve(DEFAULT_OUT_DIR, 'favicon-dark.svg');
+
+const CURRENT_COLOR = 'currentColor';
+
+export interface ColorizedFavicon {
+	light: string;
+	dark: string;
+}
+
+export interface WriteThemeFaviconsOptions {
+	sourcePath?: string;
+	lightOutPath?: string;
+	darkOutPath?: string;
+	/**
+	 * Fraction of the icon (0..1) to leave as an even margin on each side.
+	 * Applied by wrapping the inner content in a `<g transform="...">` so the
+	 * source `src/lib/assets/logo.svg` is not modified. Pass 0 to disable.
+	 */
+	padding?: number;
+}
+
+/**
+ * Replace every `currentColor` occurrence in the SVG with the given color.
+ * Pure: no filesystem access, so it is straightforward to unit-test.
+ */
+export function colorizeFaviconSvg(
+	svg: string,
+	lightColor: string,
+	darkColor: string
+): ColorizedFavicon {
+	return {
+		light: svg.replaceAll(CURRENT_COLOR, lightColor),
+		dark: svg.replaceAll(CURRENT_COLOR, darkColor)
+	};
+}
+
+/**
+ * Shrink the inner SVG content uniformly and re-center it so `padding` (a
+ * 0..1 fraction) is reserved as equal margin on each side. Returns the input
+ * unchanged for non-positive padding, missing/invalid `viewBox`, or unexpected
+ * markup so the caller always gets a renderable SVG.
+ */
+export function padFaviconSvg(svg: string, padding: number): string {
+	if (!(padding > 0) || padding >= 1) return svg;
+
+	const viewBoxMatch = svg.match(/viewBox\s*=\s*["']([^"']+)["']/i);
+	if (!viewBoxMatch) return svg;
+
+	const parts = viewBoxMatch[1]
+		.trim()
+		.split(/[\s,]+/)
+		.map(Number);
+	if (parts.length !== 4 || parts.some((n) => !Number.isFinite(n))) return svg;
+
+	const [, , width, height] = parts;
+	if (width <= 0 || height <= 0) return svg;
+
+	const scale = 1 - padding;
+	const translateX = (padding * width) / 2;
+	const translateY = (padding * height) / 2;
+
+	const openTagStart = svg.search(/<svg\b/i);
+	if (openTagStart === -1) return svg;
+	const openTagEnd = svg.indexOf('>', openTagStart);
+	if (openTagEnd === -1) return svg;
+	const closeStart = svg.lastIndexOf('</svg');
+	if (closeStart === -1 || closeStart <= openTagEnd) return svg;
+
+	const openTag = svg.slice(0, openTagEnd + 1);
+	const inner = svg.slice(openTagEnd + 1, closeStart);
+	const closeTag = svg.slice(closeStart);
+
+	const group = `<g transform="translate(${translateX} ${translateY}) scale(${scale})">`;
+	return `${openTag}${group}${inner}</g>${closeTag}`;
+}
+
+/**
+ * Read `src/lib/assets/logo.svg`, colorize it for both themes, and write
+ * the results to the static directory so the PWA asset generator can consume
+ * them. Paths can be overridden for tests.
+ */
+export function writeThemeFavicons(
+	lightColor: string,
+	darkColor: string,
+	{
+		sourcePath = DEFAULT_LOGO,
+		lightOutPath = DEFAULT_OUT_LIGHT,
+		darkOutPath = DEFAULT_OUT_DARK,
+		padding = 0
+	}: WriteThemeFaviconsOptions = {}
+): void {
+	const source = readFileSync(sourcePath, 'utf-8');
+	const { light, dark } = colorizeFaviconSvg(source, lightColor, darkColor);
+	mkdirSync(dirname(lightOutPath), { recursive: true });
+	writeFileSync(lightOutPath, padFaviconSvg(light, padding));
+	writeFileSync(darkOutPath, padFaviconSvg(dark, padding));
+}
@@ -48,6 +48,7 @@

 	--chat-form-area-height: 8rem;
 	--chat-form-area-offset: 2rem;
+	--chat-form-padding-top: 6rem;
 	--max-message-height: max(24rem, min(80dvh, calc(100dvh - var(--chat-form-area-height) - 12rem)));
 }

@@ -55,6 +56,7 @@
 	:root {
 		--chat-form-area-height: 24rem;
 		--chat-form-area-offset: 12rem;
+		--chat-form-padding-top: 6rem;
 	}
 }

@@ -141,7 +143,6 @@
 		@apply bg-background text-foreground;
 		scrollbar-width: thin;
 		scrollbar-gutter: stable;
-		overflow: hidden; /* Added due to Mermaid rendering somehow causing the double scrollbar */
 	}

 	/* Global scrollbar styling - visible only on hover */
@@ -193,3 +194,7 @@
 		scrollbar-width: none;
 	}
 }
+
+.mermaidTooltip {
+	display: none !important;
+}
@@ -10,9 +10,9 @@ import { isElementInViewport } from '$lib/utils/viewport';
 */
 export function fadeInView(
 	node: HTMLElement,
-	options: { duration?: number; y?: number; skipIfVisible?: boolean } = {}
+	options: { duration?: number; y?: number; delay?: number; skipIfVisible?: boolean } = {}
 ) {
-	const { duration = 300, y = 0, skipIfVisible = false } = options;
+	const { duration = 300, y = 0, delay = 0, skipIfVisible = false } = options;

 	if (skipIfVisible && isElementInViewport(node)) {
 		return;
@@ -27,10 +27,12 @@ export function fadeInView(
 			(entries) => {
 				for (const entry of entries) {
 					if (entry.isIntersecting) {
-						requestAnimationFrame(() => {
-							node.style.opacity = '1';
-							node.style.transform = 'translateY(0)';
-						});
+						setTimeout(() => {
+							requestAnimationFrame(() => {
+								node.style.opacity = '1';
+								node.style.transform = 'translateY(0)';
+							});
+						}, delay);
 						observer.disconnect();
 					}
 				}
@@ -0,0 +1,7 @@
+<svg width="512" height="512" viewBox="0 0 512 512" fill="none" xmlns="http://www.w3.org/2000/svg">
+    <path d="M244.95 8C215.233 8 187.774 23.8591 172.923 49.5999L95.6009 183.625C60.2162 244.959 104.481 321.6 175.29 321.6H208L316.977 132.708C348.959 77.2719 308.95 8 244.95 8ZM208 321.6H351.947C415.982 321.6 456.013 390.91 424.013 446.377C409.155 472.132 381.681 488 351.947 488H271.29C200.481 488 156.216 411.359 191.601 350.026L208 321.6Z" fill="currentColor"/>
+    <path d="M208 321.6H16L106.462 164.8L208 321.6Z" fill="currentColor"/>
+    <path d="M388.923 8L208 321.6L253.6 8H388.923Z" fill="currentColor"/>
+    <path d="M304 488H112L202.462 331.2L304 488Z" fill="currentColor"/>
+    <path d="M496 321.6H208L419.399 454.4L496 321.6Z" fill="currentColor"/>
+</svg>
@@ -8,12 +8,13 @@
 		ariaLabel?: string;
 		class?: string;
 		disabled?: boolean;
+		href?: string;
 		icon: Component;
 		iconSize?: string;
-		onclick: (e?: MouseEvent) => void;
+		onclick?: (e?: MouseEvent) => void;
 		size?: ButtonSize;
 		stopPropagationOnClick?: boolean;
-		tooltip: string;
+		tooltip?: string;
 		variant?: ButtonVariant;
 		tooltipSide?: TooltipSide;
 	}
@@ -22,6 +23,7 @@
 		icon,
 		tooltip,
 		variant = 'ghost',
+		href = '',
 		size = 'sm',
 		class: className = '',
 		disabled = false,
@@ -31,34 +33,49 @@
 		onclick,
 		ariaLabel
 	}: Props = $props();
+
+	let innerWidth = $state(0);
+	const showTooltip = $derived(!!tooltip && innerWidth > 768);
 </script>

-<Tooltip.Root>
-	<Tooltip.Trigger>
-		<!-- prevent another nested button element -->
-		{#snippet child({ props })}
-			<Button
-				{...props}
-				{variant}
-				{size}
-				{disabled}
-				onclick={(e: MouseEvent) => {
-					if (stopPropagationOnClick) e.stopPropagation();
+{#snippet button(props = {})}
+	<Button
+		{...props}
+		{href}
+		{variant}
+		{size}
+		{disabled}
+		onclick={(e: MouseEvent) => {
+			if (stopPropagationOnClick) e.stopPropagation();

-					onclick?.(e);
-				}}
-				class="h-6 w-6 p-0 {className} flex hover:bg-transparent data-[state=open]:bg-transparent!"
-				aria-label={ariaLabel || tooltip}
-			>
-				{#if icon}
-					{@const IconComponent = icon}
-					<IconComponent class={iconSize} />
-				{/if}
-			</Button>
-		{/snippet}
-	</Tooltip.Trigger>
+			onclick?.(e);
+		}}
+		class="h-6 w-6 p-0 {className} flex hover:bg-transparent data-[state=open]:bg-transparent!"
+		aria-label={ariaLabel || tooltip}
+	>
+		{#if icon}
+			{@const IconComponent = icon}

-	<Tooltip.Content side={tooltipSide}>
-		<p>{tooltip}</p>
-	</Tooltip.Content>
-</Tooltip.Root>
+			<IconComponent class={iconSize} />
+		{/if}
+	</Button>
+{/snippet}
+
+{#if showTooltip}
+	<Tooltip.Root>
+		<Tooltip.Trigger>
+			<!-- prevent another nested button element -->
+			{#snippet child({ props })}
+				{@render button(props)}
+			{/snippet}
+		</Tooltip.Trigger>
+
+		<Tooltip.Content side={tooltipSide}>
+			<p>{tooltip}</p>
+		</Tooltip.Content>
+	</Tooltip.Root>
+{:else}
+	{@render button({ href })}
+{/if}
+
+<svelte:window bind:innerWidth />
@@ -494,7 +494,7 @@
 	/>

 	<div
-		class="{INPUT_CLASSES} overflow-hidden rounded-3xl backdrop-blur-md {disabled
+		class="{INPUT_CLASSES} overflow-hidden rounded-4xl md:rounded-3xl backdrop-blur-md {disabled
 			? 'cursor-not-allowed opacity-60'
 			: ''}"
 		data-slot="input-area"
@@ -510,7 +510,7 @@
 		/>

 		<div
-			class="flex-column relative min-h-[48px] items-center rounded-3xl py-2 pb-2.25 shadow-sm transition-all focus-within:shadow-md md:!py-3"
+			class="flex-column relative min-h-12 items-center rounded-4xl md:rounded-3xl py-2 pb-2.25 shadow-sm transition-all focus-within:shadow-md md:py-3!"
 			onpaste={handlePaste}
 		>
 			<ChatFormTextarea
@@ -15,7 +15,7 @@
 <Tooltip.Root>
 	<Tooltip.Trigger class="w-full">
 		<Button
-			class="file-upload-button h-8 w-8 rounded-full p-0"
+			class="file-upload-button md:h-8 md:w-8 h-9 w-9 rounded-full p-0"
 			{disabled}
 			{onclick}
 			variant="secondary"
@@ -15,6 +15,7 @@
 	import { McpLogo } from '$lib/components/app';
 	import { PencilRuler, ChevronDown, ChevronRight } from '@lucide/svelte';
 	import { HealthCheckStatus } from '$lib/enums';
+	import { AttachmentAction } from '$lib/enums/attachment.enums';

 	interface Props {
 		class?: string;
@@ -270,14 +271,22 @@
 					</Collapsible.Root>
 				{/if}

-				<button type="button" class={sheetItemClass} onclick={onSystemPromptClick}>
+				<button
+					type="button"
+					class={sheetItemClass}
+					onclick={() => attachmentMenu.callbacks[AttachmentAction.SYSTEM_PROMPT_CLICK]()}
+				>
 					<MessageSquare class="h-4 w-4 shrink-0" />

 					<span>System Message</span>
 				</button>

 				{#if hasMcpPromptsSupport}
-					<button type="button" class={sheetItemClass} onclick={onMcpPromptClick}>
+					<button
+						type="button"
+						class={sheetItemClass}
+						onclick={() => attachmentMenu.callbacks[AttachmentAction.MCP_PROMPT_CLICK]()}
+					>
 						<Zap class="h-4 w-4 shrink-0" />

 						<span>MCP Prompt</span>
@@ -285,7 +294,11 @@
 				{/if}

 				{#if hasMcpResourcesSupport}
-					<button type="button" class={sheetItemClass} onclick={onMcpResourcesClick}>
+					<button
+						type="button"
+						class={sheetItemClass}
+						onclick={() => attachmentMenu.callbacks[AttachmentAction.MCP_RESOURCES_CLICK]()}
+					>
 						<FolderOpen class="h-4 w-4 shrink-0" />

 						<span>MCP Resources</span>
@@ -42,6 +42,7 @@
 		{hasMcpPromptsSupport}
 		{hasMcpResourcesSupport}
 		{onFileUpload}
+		{onSystemPromptClick}
 		{onMcpPromptClick}
 		{onMcpResourcesClick}
 	>
@@ -20,7 +20,7 @@
 		type="submit"
 		disabled={isDisabled}
 		class={[
-			'h-8 w-8 rounded-full p-0',
+			'md:h-8 md:w-8 h-9 w-9 rounded-full p-0',
 			showErrorState &&
 				'bg-red-400/10 text-red-400 hover:bg-red-400/20 hover:text-red-400 disabled:opacity-100'
 		]}
@@ -1,4 +1,5 @@
 <script lang="ts">
+	import { isMobile } from '$lib/stores/viewport.svelte';
 	import { autoResizeTextarea } from '$lib/utils';
 	import { onMount } from 'svelte';

@@ -37,7 +38,9 @@
 	}

 	export function focus() {
-		textareaElement?.focus();
+		if (isMobile.current) return;
+
+		textareaElement?.focus({ preventScroll: true });
 	}

 	export function resetHeight() {
@@ -231,7 +231,7 @@
 			editedContent = message.content;
 		}

-		textareaElement?.focus();
+		textareaElement?.focus({ preventScroll: true });
 		editedExtras = message.extra ? [...message.extra] : [];
 		editedUploadedFiles = [];

@@ -324,7 +324,7 @@
 	}
 </script>

-<div use:fadeInView>
+<div use:fadeInView class="chat-message">
 	{#if message.role === MessageRole.SYSTEM}
 		<ChatMessageSystem
 			bind:textareaElement
@@ -180,6 +180,9 @@

 	let displayedModel = $derived(message.model ?? null);

+	// model being switched to while it loads, so the selector bar tracks it
+	let pendingModel = $state<string | null>(null);
+
 	let isCurrentlyLoading = $derived(isLoading());
 	let isStreaming = $derived(isChatStreaming());
 	let hasNoContent = $derived(!message?.content?.trim());
@@ -207,6 +210,42 @@
 			isLastAssistantMessage
 	);

+	let assistantEl: HTMLDivElement | undefined = $state();
+	let lastUserMessageHeight = $state(0);
+	let assistantMarginTop = $state(0);
+
+	$effect(() => {
+		if (!assistantEl) return;
+
+		assistantMarginTop = Math.round(parseFloat(getComputedStyle(assistantEl).marginTop));
+
+		const chatMessageEl = assistantEl.closest('.chat-message');
+		const previousChatMessage = chatMessageEl?.previousElementSibling;
+		const userMessageEl = previousChatMessage?.querySelector(
+			'.chat-message-user'
+		) as HTMLElement | null;
+
+		if (!userMessageEl) {
+			lastUserMessageHeight = 0;
+			return;
+		}
+
+		const updateHeight = () => {
+			const rect = userMessageEl.getBoundingClientRect();
+			const marginTop = Math.round(parseFloat(getComputedStyle(userMessageEl).marginTop));
+			lastUserMessageHeight = Math.round(rect.height + marginTop);
+		};
+
+		updateHeight();
+
+		const resizeObserver = new ResizeObserver(updateHeight);
+		resizeObserver.observe(userMessageEl);
+
+		return () => {
+			resizeObserver.disconnect();
+		};
+	});
+
 	function handleCopyModel() {
 		void copyToClipboard(displayedModel ?? '');
 	}
@@ -219,12 +258,17 @@
 </script>

 <div
-	class="text-md group w-full leading-7.5 {className}"
+	bind:this={assistantEl}
+	class="chat-message-assistant text-md group w-full leading-7.5 {className}"
+	style:--last-user-message-height={lastUserMessageHeight > 0
+		? `${lastUserMessageHeight}px`
+		: undefined}
+	style:--assistant-margin-top={assistantMarginTop > 0 ? `${assistantMarginTop}px` : undefined}
 	role="group"
 	aria-label="Assistant message with actions"
 >
 	{#if showProcessingInfoTop}
-		<div class="mt-6 w-full max-w-[48rem]" in:fade>
+		<div class="mt-6 w-full max-w-3xl" in:fade>
 			<div class="processing-container">
 				<span class="processing-text">
 					{modelLoadingText ??
@@ -257,7 +301,7 @@
 	{/if}

 	{#if showProcessingInfoBottom}
-		<div class="mt-4 w-full max-w-[48rem]" in:fade>
+		<div class="mt-4 w-full max-w-3xl" in:fade>
 			<div class="processing-container">
 				<span class="processing-text">
 					{modelLoadingText ??
@@ -277,13 +321,19 @@
 			>
 				{#if isRouter}
 					<ModelsSelectorDropdown
-						currentModel={displayedModel}
+						currentModel={pendingModel ?? displayedModel}
 						disabled={isLoading()}
 						onModelChange={async (modelId: string, modelName: string) => {
 							const status = modelsStore.getModelStatus(modelId);

 							if (status !== ServerModelStatus.LOADED) {
-								await modelsStore.loadModel(modelId);
+								pendingModel = modelId;
+
+								try {
+									await modelsStore.loadModel(modelId);
+								} finally {
+									pendingModel = null;
+								}
 							}

 							onRegenerate(modelName);
@@ -351,6 +401,23 @@
 </div>

 <style>
+	:global(.chat-message):last-child .chat-message-assistant {
+		--assistant-min-height-offset: calc(
+			var(--last-user-message-height, 19rem) + var(--chat-form-height, 6rem) +
+				var(--chat-form-bottom-position, 0.5rem) + var(--chat-form-padding-top, 6rem) +
+				var(--assistant-margin-top, 3rem)
+		);
+		min-height: calc(100dvh - var(--assistant-min-height-offset));
+
+		@media (width > 768px) {
+			--assistant-min-height-offset: calc(
+				var(--last-user-message-height, 18rem) + var(--chat-form-height, 6rem) +
+					var(--chat-form-bottom-position, 1rem) + var(--chat-form-padding-top, 6rem) +
+					var(--assistant-margin-top, 3rem)
+			);
+		}
+	}
+
 	.processing-container {
 		display: flex;
 		flex-direction: column;
@@ -48,7 +48,7 @@

 <div
 	aria-label="User message with actions"
-	class="group flex flex-col items-end gap-3 md:gap-2 {className}"
+	class="chat-message-user group flex flex-col items-end gap-3 md:gap-2 {className}"
 	role="group"
 >
 	{#if editCtx.isEditing}
@@ -19,7 +19,7 @@
 		renderMarkdown = false,
 		textColorClass = 'text-foreground',
 		cardBgClass = 'dark:bg-primary/15',
-		maxHeightStyle = 'max-height: var(--max-message-height);'
+		maxHeightStyle = ''
 	}: Props = $props();

 	let isMultiline = $state(false);
@@ -59,7 +59,7 @@

 {#if content.trim()}
 	<Card
-		class="max-w-[80%] overflow-y-auto rounded-[1.125rem] border-none bg-primary/5 px-3.75 py-1.5 {textColorClass} backdrop-blur-md data-[multiline]:py-2.5 {cardBgClass}"
+		class="chat-message-user-bubble max-w-[80%] overflow-y-auto rounded-[1.125rem] border-none bg-primary/5 px-3.75 py-1.5 {textColorClass} backdrop-blur-md data-multiline:py-2.5 {cardBgClass}"
 		data-multiline={isMultiline ? '' : undefined}
 		style="{maxHeightStyle} overflow-wrap: anywhere; word-break: break-word;"
 	>
@@ -37,6 +37,7 @@
 	let allConversationMessages = $state<DatabaseMessage[]>([]);
 	let isVisible = $state(false);
 	let previousConversationId = $state<string | null>(null);
+	let previousRouteId = $state<string | null>(null);

 	const currentConfig = config();

@@ -157,8 +158,9 @@
 		});
 	});

-	beforeNavigate(() => {
+	beforeNavigate((navigation) => {
 		isVisible = false;
+		previousRouteId = navigation.from?.route.id ?? null;
 	});

 	afterNavigate(() => {
@@ -249,12 +251,13 @@
 </script>

 <div
-	class="transition-opacity delay-300 duration-500 ease-out
-		{isVisible ? 'opacity-100' : 'opacity-0'}"
+	class="transition-opacity duration-500 ease-out
+		{isVisible ? 'opacity-100' : 'opacity-0'}
+		{previousRouteId === '/(chat)/chat/[id]' ? '' : 'delay-300'}"
 >
 	{#each displayMessages as { message, toolMessages, isLastAssistantMessage, siblingInfo } (message.id)}
 		<ChatMessage
-			class="mx-auto mt-12 w-full max-w-[48rem]"
+			class="mx-auto mt-12 w-full max-w-3xl"
 			{message}
 			{toolMessages}
 			{isLastAssistantMessage}
@@ -1,31 +1,28 @@
 <script lang="ts">
-	import { Trash2 } from '@lucide/svelte';
-	import { afterNavigate } from '$app/navigation';
 	import { page } from '$app/state';
 	import {
 		ChatScreenForm,
 		ChatMessages,
 		ChatScreenDragOverlay,
 		ChatScreenProcessingInfo,
-		ChatScreenActionScrollDown,
-		DialogEmptyFileAlert,
-		DialogFileUploadError,
-		DialogChatError,
 		ServerLoadingSplash,
-		DialogConfirmation,
 		ChatScreenServerError
 	} from '$lib/components/app';
 	import { setProcessingInfoContext } from '$lib/contexts';
-	import { ErrorDialogType } from '$lib/enums';
 	import { createAutoScrollController } from '$lib/hooks/use-auto-scroll.svelte';
+	import { useChatScreenActiveModel } from '$lib/hooks/use-chat-screen-active-model.svelte';
+	import { useChatScreenDragAndDrop } from '$lib/hooks/use-chat-screen-drag-and-drop.svelte';
+	import { useChatScreenFileUpload } from '$lib/hooks/use-chat-screen-file-upload.svelte';
+	import { useChatScreenScroll } from '$lib/hooks/use-chat-screen-scroll.svelte';
 	import { useKeyboardShortcuts } from '$lib/hooks/use-keyboard-shortcuts.svelte';
+	import { device } from '$lib/stores/device.svelte';
+	import { isMobile } from '$lib/stores/viewport.svelte';
 	import {
 		chatStore,
 		errorDialog,
 		isLoading,
 		isChatStreaming,
 		isEditing,
-		getAddFilesHandler,
 		activeProcessingState
 	} from '$lib/stores/chat.svelte';
 	import {
@@ -34,138 +31,81 @@
 		activeConversation
 	} from '$lib/stores/conversations.svelte';
 	import { config } from '$lib/stores/settings.svelte';
-	import { serverLoading, serverError, isRouterMode } from '$lib/stores/server.svelte';
-	import { modelsStore, modelOptions, selectedModelId } from '$lib/stores/models.svelte';
-	import { isFileTypeSupported, filterFilesByModalities } from '$lib/utils';
-	import { parseFilesToMessageExtras, processFilesToChatUploaded } from '$lib/utils/browser-only';
-	import { onMount } from 'svelte';
+	import { serverLoading, serverError } from '$lib/stores/server.svelte';
+	import { parseFilesToMessageExtras } from '$lib/utils/browser-only';
+	import { onDestroy, onMount } from 'svelte';
 	import ChatScreenGreeting from './ChatScreenGreeting.svelte';
+	import ChatScreenActionScrollDown from './ChatScreenActionScrollDown.svelte';
+	import ChatScreenDialogsAndAlerts from './ChatScreenDialogsAndAlerts.svelte';
+	import { ROUTES } from '$lib/constants';

 	let { showCenteredEmpty = false } = $props();

-	const autoScroll = createAutoScrollController();
-
-	let disableAutoScroll = $derived(Boolean(config().disableAutoScroll));
-	let chatScrollContainer: HTMLDivElement | undefined = $state();
-	let dragCounter = $state(0);
-	let isDragOver = $state(false);
-	let showFileErrorDialog = $state(false);
-	let uploadedFiles = $state<ChatUploadedFile[]>([]);
-
-	let fileErrorData = $state<{
-		generallyUnsupported: File[];
-		modalityUnsupported: File[];
-		modalityReasons: Record<string, string>;
-		supportedTypes: string[];
-	}>({
-		generallyUnsupported: [],
-		modalityUnsupported: [],
-		modalityReasons: {},
-		supportedTypes: []
-	});
-
-	let showDeleteDialog = $state(false);
-
-	let showEmptyFileDialog = $state(false);
-
-	let processingInfoVisible = $state(false);
-
-	let emptyFileNames = $state<string[]>([]);
-
-	let initialMessage = $state('');
-
-	let isEmpty = $derived(
-		showCenteredEmpty && !activeConversation() && activeMessages().length === 0 && !isLoading()
-	);
-
-	let activeErrorDialog = $derived(errorDialog());
-	let isServerLoading = $derived(serverLoading());
-	let hasPropsError = $derived(!!serverError());
-
-	let isCurrentConversationLoading = $derived(isLoading() || isChatStreaming());
-
-	let showProcessingInfo = $derived(
-		isCurrentConversationLoading ||
-			(config().keepStatsVisible && !!page.params.id) ||
-			activeProcessingState() !== null
-	);
-
-	let isRouter = $derived(isRouterMode());
-
-	let conversationModel = $derived(
-		chatStore.getConversationModel(activeMessages() as DatabaseMessage[])
-	);
-
-	let activeModelId = $derived.by(() => {
-		const options = modelOptions();
-
-		if (!isRouter) {
-			return options.length > 0 ? options[0].model : null;
-		}
-
-		const selectedId = selectedModelId();
-		if (selectedId) {
-			const model = options.find((m) => m.id === selectedId);
-			if (model) return model.model;
-		}
-
-		if (conversationModel) {
-			const model = options.find((m) => m.model === conversationModel);
-			if (model) return model.model;
-		}
-
-		return null;
-	});
-
-	let modelPropsVersion = $state(0);
-
 	setProcessingInfoContext({
 		get showProcessingInfo() {
 			return showProcessingInfo;
 		}
 	});

-	$effect(() => {
-		if (activeModelId) {
-			const cached = modelsStore.getModelProps(activeModelId);
+	let disableAutoScroll = $derived(Boolean(config().disableAutoScroll) || isMobile.current);
+	let isMobileUserScrolledUp = $state(false);
+	let mobileScrollDownHint = $state(false);
+	let mobileScrollDownHintLockedUntil = $state(0);
+	let emptyFileNames = $state<string[]>([]);
+	let initialMessage = $state('');
+	let showDeleteDialog = $state(false);
+	let showEmptyFileDialog = $state(false);
+	let isEmpty = $derived(
+		showCenteredEmpty && !activeConversation() && activeMessages().length === 0 && !isLoading()
+	);
+	let activeErrorDialog = $derived(errorDialog());
+	let isServerLoading = $derived(serverLoading());
+	let hasPropsError = $derived(!!serverError());
+	let isCurrentConversationLoading = $derived(isLoading() || isChatStreaming());
+	let showProcessingInfo = $derived(
+		isCurrentConversationLoading ||
+			(config().keepStatsVisible && !!page.params.id) ||
+			activeProcessingState() !== null
+	);
+	let chatFormBottomPosition = $derived.by(() => {
+		if (!isMobile.current) return '1rem';
+		if (device.isStandalone) return '1.5rem';
+		if (device.isIOSSafari) return '0.25rem';
+		return '0.5rem';
+	});

-			if (!cached) {
-				modelsStore.fetchModelProps(activeModelId).then(() => {
-					modelPropsVersion++;
-				});
+	const autoScroll = createAutoScrollController();
+	const scroll = useChatScreenScroll(autoScroll);
+	const activeModel = useChatScreenActiveModel();
+	const fileUpload = useChatScreenFileUpload({
+		capabilities: () => ({
+			hasVision: activeModel.hasVisionModality,
+			hasAudio: activeModel.hasAudioModality,
+			hasVideo: activeModel.hasVideoModality
+		}),
+		activeModelId: () => activeModel.activeModelId
+	});
+	const dragAndDrop = useChatScreenDragAndDrop({
+		onDrop: fileUpload.handleFileUpload
+	});
+	const { handleKeydown } = useKeyboardShortcuts({
+		deleteActiveConversation: () => {
+			if (activeConversation()) {
+				showDeleteDialog = true;
 			}
 		}
 	});

-	let hasAudioModality = $derived.by(() => {
-		if (activeModelId) {
-			void modelPropsVersion;
+	function handleMobileScroll() {
+		if (!isMobile.current) return;

-			return modelsStore.modelSupportsAudio(activeModelId);
-		}
+		const container = scroll.chatScrollContainer;
+		if (!container) return;

-		return false;
-	});
-
-	let hasVideoModality = $derived.by(() => {
-		if (activeModelId) {
-			void modelPropsVersion;
-
-			return modelsStore.modelSupportsVideo(activeModelId);
-		}
-
-		return false;
-	});
-
-	let hasVisionModality = $derived.by(() => {
-		if (activeModelId) {
-			void modelPropsVersion;
-
-			return modelsStore.modelSupportsVision(activeModelId);
-		}
-
-		return false;
-	});
+		const distanceFromBottom =
+			container.scrollHeight - container.clientHeight - container.scrollTop;
+		isMobileUserScrolledUp = distanceFromBottom > 300;
+	}

 	async function handleDeleteConfirm() {
 		const conversation = activeConversation();
@@ -177,27 +117,69 @@
 		showDeleteDialog = false;
 	}

-	function handleProcessingInfoVisibility(visible: boolean) {
-		processingInfoVisible = visible;
-	}
+	async function handleSendMessage(message: string, files?: ChatUploadedFile[]): Promise<boolean> {
+		const plainFiles = files ? $state.snapshot(files) : undefined;
+		const result = plainFiles
+			? await parseFilesToMessageExtras(plainFiles, activeModel.activeModelId ?? undefined)
+			: undefined;

-	function handleDragEnter(event: DragEvent) {
-		event.preventDefault();
-
-		dragCounter++;
-
-		if (event.dataTransfer?.types.includes('Files')) {
-			isDragOver = true;
+		if (result?.emptyFiles && result.emptyFiles.length > 0) {
+			emptyFileNames = result.emptyFiles;
+			showEmptyFileDialog = true;
+			if (files) {
+				const emptyFileNamesSet = new Set(result.emptyFiles);
+				fileUpload.uploadedFiles = fileUpload.uploadedFiles.filter(
+					(file) => !emptyFileNamesSet.has(file.name)
+				);
+			}
+			return false;
 		}
+
+		handleSendLikeScroll();
+
+		await chatStore.sendMessage(message, result?.extras);
+		return true;
 	}

-	function handleDragLeave(event: DragEvent) {
-		event.preventDefault();
+	function handleSendLikeScroll() {
+		if (!isMobile.current) {
+			autoScroll.enable();
+		}

-		dragCounter--;
+		setTimeout(() => {
+			const container = scroll.chatScrollContainer;
+			if (!container) return;

-		if (dragCounter === 0) {
-			isDragOver = false;
+			const lastUserBubble = container.querySelector(
+				'.chat-message:nth-last-child(2) .chat-message-user .chat-message-user-bubble'
+			) as HTMLElement | null;
+
+			if (isMobile.current) {
+				// Keep the last user message bubble just above the input on mobile
+				const bubbleHeight = lastUserBubble?.scrollHeight ?? 0;
+				const baseHeight = container.scrollHeight - innerHeight;
+
+				container.scrollTo({
+					top: bubbleHeight > 0 ? baseHeight - bubbleHeight : baseHeight,
+					behavior: 'smooth'
+				});
+			} else if (lastUserBubble) {
+				// On desktop, place the last user message near the top of the viewport
+				const topPadding = 24;
+				const bubbleRect = lastUserBubble.getBoundingClientRect();
+				container.scrollTo({
+					top: Math.max(0, container.scrollTop + bubbleRect.top - topPadding),
+					behavior: 'smooth'
+				});
+			} else {
+				autoScroll.scrollToBottom();
+			}
+		}, 100);
+
+		if (isMobile.current) {
+			autoScroll.setDisabled(disableAutoScroll);
+			mobileScrollDownHint = true;
+			mobileScrollDownHintLockedUntil = Date.now() + 500;
 		}
 	}

@@ -207,273 +189,138 @@
 		}
 	}

-	function handleDragOver(event: DragEvent) {
-		event.preventDefault();
-	}
-
-	function handleDrop(event: DragEvent) {
-		event.preventDefault();
-
-		isDragOver = false;
-		dragCounter = 0;
-
-		if (event.dataTransfer?.files) {
-			const files = Array.from(event.dataTransfer.files);
-
-			if (isEditing()) {
-				const handler = getAddFilesHandler();
-
-				if (handler) {
-					handler(files);
-					return;
-				}
-			}
-
-			processFiles(files);
-		}
-	}
-
-	function handleFileRemove(fileId: string) {
-		uploadedFiles = uploadedFiles.filter((f) => f.id !== fileId);
-	}
-
-	function handleFileUpload(files: File[]) {
-		processFiles(files);
-	}
-
-	const { handleKeydown } = useKeyboardShortcuts({
-		deleteActiveConversation: () => {
-			if (activeConversation()) {
-				showDeleteDialog = true;
-			}
-		}
-	});
-
 	async function handleSystemPromptAdd(draft: { message: string; files: ChatUploadedFile[] }) {
 		if (draft.message || draft.files.length > 0) {
 			chatStore.savePendingDraft(draft.message, draft.files);
 		}
-
 		await chatStore.addSystemPrompt();
 	}

-	function handleScroll() {
-		autoScroll.handleScroll();
-	}
-
-	async function handleSendMessage(message: string, files?: ChatUploadedFile[]): Promise<boolean> {
-		const plainFiles = files ? $state.snapshot(files) : undefined;
-		const result = plainFiles
-			? await parseFilesToMessageExtras(plainFiles, activeModelId ?? undefined)
-			: undefined;
-
-		if (result?.emptyFiles && result.emptyFiles.length > 0) {
-			emptyFileNames = result.emptyFiles;
-			showEmptyFileDialog = true;
-
-			if (files) {
-				const emptyFileNamesSet = new Set(result.emptyFiles);
-				uploadedFiles = uploadedFiles.filter((file) => !emptyFileNamesSet.has(file.name));
-			}
-			return false;
-		}
-
-		const extras = result?.extras;
-
-		// Enable autoscroll for user-initiated message sending
-		autoScroll.enable();
-		await chatStore.sendMessage(message, extras);
-		autoScroll.scrollToBottom();
-
-		return true;
-	}
-
-	async function processFiles(files: File[]) {
-		const generallySupported: File[] = [];
-		const generallyUnsupported: File[] = [];
-
-		for (const file of files) {
-			if (isFileTypeSupported(file.name, file.type)) {
-				generallySupported.push(file);
-			} else {
-				generallyUnsupported.push(file);
-			}
-		}
-
-		// Use model-specific capabilities for file validation
-		const capabilities = {
-			hasVision: hasVisionModality,
-			hasAudio: hasAudioModality,
-			hasVideo: hasVideoModality
-		};
-		const { supportedFiles, unsupportedFiles, modalityReasons } = filterFilesByModalities(
-			generallySupported,
-			capabilities
-		);
-
-		const allUnsupportedFiles = [...generallyUnsupported, ...unsupportedFiles];
-
-		if (allUnsupportedFiles.length > 0) {
-			const supportedTypes: string[] = ['text files', 'PDFs'];
-
-			if (hasVisionModality) supportedTypes.push('images');
-			if (hasAudioModality) supportedTypes.push('audio files');
-			if (hasVideoModality) supportedTypes.push('video files');
-
-			fileErrorData = {
-				generallyUnsupported,
-				modalityUnsupported: unsupportedFiles,
-				modalityReasons,
-				supportedTypes
-			};
-			showFileErrorDialog = true;
-		}
-
-		if (supportedFiles.length > 0) {
-			const processed = await processFilesToChatUploaded(
-				supportedFiles,
-				activeModelId ?? undefined
-			);
-			uploadedFiles = [...uploadedFiles, ...processed];
-		}
-	}
-
-	afterNavigate(() => {
-		if (!disableAutoScroll) {
+	$effect(() => {
+		const shouldDisableAutoScroll =
+			config().disableAutoScroll || (isMobile.current && isCurrentConversationLoading);
+		autoScroll.setDisabled(shouldDisableAutoScroll);
+		if (!shouldDisableAutoScroll) {
 			autoScroll.enable();
 		}
 	});

-	function handleMessagesReady() {
-		if (disableAutoScroll) return;
-
-		if (!autoScroll.userScrolledUp) {
-			requestAnimationFrame(() => {
-				autoScroll.scrollToBottom('instant');
-			});
-		}
-	}
-
 	onMount(() => {
+		const pendingDraft = chatStore.consumePendingDraft();
+		if (pendingDraft) {
+			initialMessage = pendingDraft.message;
+			fileUpload.uploadedFiles = pendingDraft.files;
+		}
+
 		autoScroll.startObserving();

 		if (!disableAutoScroll) {
 			autoScroll.enable();
 		}

-		const pendingDraft = chatStore.consumePendingDraft();
-		if (pendingDraft) {
-			initialMessage = pendingDraft.message;
-			uploadedFiles = pendingDraft.files;
+		if (isMobile.current && isCurrentConversationLoading) {
+			mobileScrollDownHint = true;
+			mobileScrollDownHintLockedUntil = Date.now() + 500;
 		}
+
+		handleMobileScroll();
 	});

-	$effect(() => {
-		autoScroll.setContainer(chatScrollContainer);
-	});
-
-	$effect(() => {
-		autoScroll.setDisabled(disableAutoScroll);
-	});
+	onDestroy(() => autoScroll.destroy());
 </script>

-{#if isDragOver}
+{#if dragAndDrop.isDragOver}
 	<ChatScreenDragOverlay />
 {/if}

-<svelte:window onkeydown={handleKeydown} />
+<svelte:window
+	onkeydown={handleKeydown}
+	onscroll={(e) => {
+		scroll.handleScroll(e);
+		handleMobileScroll();
+		if (e.isTrusted && Date.now() > mobileScrollDownHintLockedUntil) {
+			mobileScrollDownHint = false;
+		}
+	}}
+/>

 {#if isServerLoading}
 	<ServerLoadingSplash />
 {:else}
 	<div
-		bind:this={chatScrollContainer}
-		aria-label="Chat interface with file drop zone"
-		class="flex h-full flex-col overflow-y-auto px-4 md:px-6"
-		ondragenter={handleDragEnter}
-		ondragleave={handleDragLeave}
-		ondragover={handleDragOver}
-		ondrop={handleDrop}
-		onscroll={handleScroll}
+		class="chat-screen flex grow flex-col min-h-[calc(100dvh-1rem)] md:min-h-full px-4 md:py-0 pt-12 pb-48 md:pb-4"
+		style:--chat-form-bottom-position={chatFormBottomPosition}
+		ondragenter={dragAndDrop.dragHandlers.dragenter}
+		ondragleave={dragAndDrop.dragHandlers.dragleave}
+		ondragover={dragAndDrop.dragHandlers.dragover}
+		ondrop={dragAndDrop.dragHandlers.drop}
 		role="main"
 	>
-		<div class="flex grow flex-col pt-14">
-			{#if !isEmpty}
-				<ChatMessages
-					messages={activeMessages()}
-					onMessagesReady={handleMessagesReady}
-					onUserAction={() => {
-						autoScroll.enable();
-						if (!autoScroll.userScrolledUp) {
-							autoScroll.scrollToBottom();
-						}
-					}}
-				/>
-			{/if}
+		{#if !isEmpty}
+			<ChatMessages
+				messages={activeMessages()}
+				onUserAction={() => {
+					handleSendLikeScroll();
+				}}
+			/>
+		{/if}

-			<div
-				class={[
-					'pointer-events-none sticky right-4 left-4 mt-auto transition-all duration-200',
-					isEmpty ? 'bottom-[calc(50dvh-7rem)]' : 'bottom-4 pt-24 md:pt-32'
-				]}
-			>
-				<ChatScreenGreeting {isEmpty} />
+		<div
+			class={[
+				'pointer-events-none md:sticky fixed  mt-auto transition-all duration-200',
+				device.isStandalone
+					? 'bottom-6 right-4 left-4'
+					: device.isIOSSafari
+						? 'bottom-1 left-2 right-2'
+						: 'bottom-2 right-2 left-2',
+				isEmpty ? 'md:bottom-[calc(50dvh-7rem)] 2xl:bottom-[calc(50dvh-4rem)]' : 'md:bottom-4'
+			]}
+			style:padding-top={!isEmpty ? 'var(--chat-form-padding-top)' : undefined}
+		>
+			<ChatScreenGreeting {isEmpty} />

-				<ChatScreenActionScrollDown
-					container={chatScrollContainer}
-					hasProcessingInfoVisible={processingInfoVisible}
-				/>
+			<ChatScreenServerError />

-				<ChatScreenProcessingInfo onVisibilityChange={handleProcessingInfoVisibility} />
-
-				<ChatScreenServerError />
-
-				<div class="conversation-chat-form pointer-events-auto rounded-t-3xl">
-					<ChatScreenForm
-						disabled={hasPropsError || isEditing()}
-						{initialMessage}
-						isLoading={isCurrentConversationLoading}
-						onFileRemove={handleFileRemove}
-						onFileUpload={handleFileUpload}
-						onSend={handleSendMessage}
-						onStop={() => chatStore.stopGeneration()}
-						onSystemPromptAdd={handleSystemPromptAdd}
-						bind:uploadedFiles
+			<div class="pointer-events-none flex flex-col gap-6 items-center w-full">
+				{#if (isMobile.current ? mobileScrollDownHint || isMobileUserScrolledUp : autoScroll.userScrolledUp) && page.url.hash.includes(ROUTES.CHAT) && page.params.id}
+					<ChatScreenActionScrollDown
+						onclick={() => {
+							mobileScrollDownHint = false;
+							scroll.chatScrollContainer?.scrollTo({
+								top: scroll.chatScrollContainer.scrollHeight,
+								behavior: 'smooth'
+							});
+						}}
 					/>
-				</div>
+				{/if}
+
+				{#if showProcessingInfo}
+					<ChatScreenProcessingInfo />
+				{/if}
 			</div>
+
+			<ChatScreenForm
+				class="pointer-events-auto conversation-chat-form"
+				disabled={hasPropsError || isEditing()}
+				{initialMessage}
+				isLoading={isCurrentConversationLoading}
+				onFileRemove={fileUpload.handleFileRemove}
+				onFileUpload={fileUpload.handleFileUpload}
+				onSend={handleSendMessage}
+				onStop={() => chatStore.stopGeneration()}
+				onSystemPromptAdd={handleSystemPromptAdd}
+				bind:uploadedFiles={fileUpload.uploadedFiles}
+			/>
 		</div>
 	</div>
 {/if}

-<DialogFileUploadError bind:open={showFileErrorDialog} {fileErrorData} />
-
-<DialogConfirmation
-	bind:open={showDeleteDialog}
-	title="Delete Conversation"
-	description="Are you sure you want to delete this conversation? This action cannot be undone and will permanently remove all messages in this conversation."
-	confirmText="Delete"
-	cancelText="Cancel"
-	variant="destructive"
-	icon={Trash2}
-	onConfirm={handleDeleteConfirm}
-	onCancel={() => (showDeleteDialog = false)}
-/>
-
-<DialogEmptyFileAlert
-	bind:open={showEmptyFileDialog}
-	emptyFiles={emptyFileNames}
-	onOpenChange={(open) => {
-		if (!open) {
-			emptyFileNames = [];
-		}
-	}}
-/>
-
-<DialogChatError
-	message={activeErrorDialog?.message ?? ''}
-	contextInfo={activeErrorDialog?.contextInfo}
-	onOpenChange={handleErrorDialogOpenChange}
-	open={Boolean(activeErrorDialog)}
-	type={activeErrorDialog?.type ?? ErrorDialogType.SERVER}
+<ChatScreenDialogsAndAlerts
+	{showDeleteDialog}
+	{handleDeleteConfirm}
+	{showEmptyFileDialog}
+	{emptyFileNames}
+	{activeErrorDialog}
+	{handleErrorDialogOpenChange}
+	{fileUpload}
 />
@@ -1,58 +1,18 @@
 <script lang="ts">
 	import { ArrowDown } from '@lucide/svelte';
-	import { Button } from '$lib/components/ui/button';
+	import ActionIcon from '$lib/components/app/actions/ActionIcon.svelte';

-	interface Props {
-		container: HTMLDivElement | undefined;
-		hasProcessingInfoVisible: boolean;
-	}
-
-	let { container, hasProcessingInfoVisible }: Props = $props();
-
-	let show = $state(false);
-
-	let buttonBottom = $derived(hasProcessingInfoVisible ? '2rem' : '0');
-
-	function checkVisibility() {
-		if (!container) return;
-		const { scrollTop, scrollHeight, clientHeight } = container;
-		const distanceFromBottom = scrollHeight - clientHeight - scrollTop;
-		show = distanceFromBottom > clientHeight * 0.5;
-	}
-
-	function scrollToBottom() {
-		if (container) {
-			container.scrollTo({
-				top: container.scrollHeight,
-				behavior: 'smooth'
-			});
-		}
-	}
-
-	$effect(() => {
-		const c = container;
-		if (c) {
-			c.addEventListener('scroll', checkVisibility);
-			checkVisibility();
-			return () => {
-				c.removeEventListener('scroll', checkVisibility);
-			};
-		}
-	});
+	let { onclick }: { onclick: (e?: MouseEvent) => void } = $props();
 </script>

-<div class="relative z-50 mx-auto mb-4 flex max-w-[48rem] justify-center">
-	<Button
-		onclick={scrollToBottom}
-		variant="secondary"
-		size="icon"
-		disabled={!show}
-		class="pointer-events-auto absolute h-10 w-10 rounded-full bg-background/80 shadow-lg backdrop-blur-sm transition-all duration-200 hover:bg-muted/80"
-		style="bottom: {buttonBottom}; transform: translateY({show ? '0' : '2rem'}); opacity: {show
-			? 1
-			: 0};"
-		aria-label="Scroll to bottom"
-	>
-		<ArrowDown class="h-4 w-4" />
-	</Button>
+<div class="pointer-events-auto flex justify-center relative h-0">
+	<ActionIcon
+		icon={ArrowDown}
+		{onclick}
+		ariaLabel="Scroll to bottom"
+		tooltip="Scroll to bottom"
+		size="lg"
+		iconSize="h-4 w-4"
+		class="h-9 w-9 rounded-full bg-accent text-accent-foreground absolute bottom-4 shadow-md"
+	/>
 </div>
@@ -0,0 +1,55 @@
+<script lang="ts">
+	import { Trash2 } from '@lucide/svelte';
+	import { ErrorDialogType } from '$lib/enums';
+	import {
+		DialogChatError,
+		DialogConfirmation,
+		DialogEmptyFileAlert,
+		DialogFileUploadError
+	} from '$lib/components/app';
+
+	let {
+		showDeleteDialog,
+		handleDeleteConfirm,
+		showEmptyFileDialog,
+		emptyFileNames,
+		activeErrorDialog,
+		handleErrorDialogOpenChange,
+		fileUpload
+	} = $props();
+</script>
+
+<DialogFileUploadError
+	bind:open={fileUpload.showFileErrorDialog}
+	fileErrorData={fileUpload.fileErrorData}
+/>
+
+<DialogConfirmation
+	bind:open={showDeleteDialog}
+	title="Delete Conversation"
+	description="Are you sure you want to delete this conversation? This action cannot be undone and will permanently remove all messages in this conversation."
+	confirmText="Delete"
+	cancelText="Cancel"
+	variant="destructive"
+	icon={Trash2}
+	onConfirm={handleDeleteConfirm}
+	onCancel={() => (showDeleteDialog = false)}
+/>
+
+<DialogEmptyFileAlert
+	bind:open={showEmptyFileDialog}
+	emptyFiles={emptyFileNames}
+	onOpenChange={(open) => {
+		if (!open) {
+			emptyFileNames = [];
+		}
+	}}
+/>
+
+<DialogChatError
+	message={activeErrorDialog?.message ?? ''}
+	contextInfo={activeErrorDialog?.contextInfo}
+	onOpenChange={handleErrorDialogOpenChange}
+	open={Boolean(activeErrorDialog)}
+	type={activeErrorDialog?.type ?? ErrorDialogType.SERVER}
+/>
@@ -2,6 +2,7 @@
 	import { afterNavigate } from '$app/navigation';
 	import { page } from '$app/state';
 	import { ChatForm } from '$lib/components/app';
+	import { isMobile } from '$lib/stores/viewport.svelte';
 	import { onMount } from 'svelte';
 	import { useDraftMessages } from '$lib/hooks/use-draft-messages.svelte';

@@ -32,7 +33,30 @@
 	}: Props = $props();

 	let chatFormRef: ChatForm | undefined = $state(undefined);
+	let formWrapperEl: HTMLDivElement | undefined = $state();
 	let chatId = $derived(page.params.id as string | undefined);
+
+	$effect(() => {
+		if (!formWrapperEl) return;
+
+		const formEl = formWrapperEl.querySelector('form') as HTMLElement | null;
+		if (!formEl) return;
+
+		const updateHeight = () => {
+			const height = Math.round(formEl.getBoundingClientRect().height);
+			document.documentElement.style.setProperty('--chat-form-height', `${height}px`);
+		};
+
+		updateHeight();
+
+		const resizeObserver = new ResizeObserver(updateHeight);
+		resizeObserver.observe(formEl);
+
+		return () => {
+			resizeObserver.disconnect();
+			document.documentElement.style.removeProperty('--chat-form-height');
+		};
+	});
 	let hasLoadingAttachments = $derived(uploadedFiles.some((f) => f.isLoading));
 	let message = $derived(initialMessage);
 	let previousIsLoading = $derived(isLoading);
@@ -83,12 +107,14 @@
 	}

 	onMount(() => {
-		setTimeout(() => chatFormRef?.focus(), 10);
+		if (!isMobile.current) {
+			setTimeout(() => chatFormRef?.focus(), 100);
+		}
 	});

 	afterNavigate((navigation) => {
-		if (navigation?.from != null) {
-			setTimeout(() => chatFormRef?.focus(), 10);
+		if (navigation?.from != null && !isMobile.current) {
+			setTimeout(() => chatFormRef?.focus(), 100);
 		}
 	});

@@ -108,12 +134,12 @@
 	});
 </script>

-<div class="relative mx-auto max-w-[48rem]">
+<div class="chat-screen-form-wrapper" bind:this={formWrapperEl}>
 	<ChatForm
+		class="mx-auto max-w-3xl {className}"
 		bind:this={chatFormRef}
 		bind:value={message}
 		bind:uploadedFiles
-		class={className}
 		{disabled}
 		{isLoading}
 		showMcpPromptButton
@@ -1,5 +1,4 @@
 <script lang="ts">
-	import { fadeInView } from '$lib/actions/fade-in-view.svelte';
 	import { serverStore } from '$lib/stores/server.svelte';

 	interface Props {
@@ -11,10 +10,9 @@

 <div
 	class={[
-		'pointer-events-none mb-4 hidden px-4 text-center',
-		isEmpty && 'pointer-events-auto block!'
+		'pointer-events-none mb-4 hidden px-4 text-center text-balance',
+		isEmpty && 'mb-[calc(50dvh-8rem)] md:mb-6 pointer-events-auto block!'
 	]}
-	use:fadeInView={{ duration: 300 }}
 >
 	<h1 class="mb-2 text-2xl font-semibold tracking-tight md:text-3xl">Hello there</h1>

--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
shaofeiqi	5c7c22c3e1	opencl: flush profiling batch at shutdown for incomplete batches (#25016 )	2026-06-25 18:48:24 -07:00
Sigbjørn Skjæret	beac5309f1	xcframework : disable mtmd video on i/tv/visionos (#25018 )	2026-06-26 00:13:59 +02:00
Tarek Dakhran	9d5d882d8c	model : Add label for LFM2.5-230M (#25008 )	2026-06-25 18:58:52 +02:00
Oliver Simons	1ec44d178d	CUDA: Various fixes to `cpy.cu` (#25000 ) * Add failing test-case to test-backend-ops Extracted from https://github.com/ggml-org/llama.cpp/issues/24072 * Minimize repro with help of AI N = 8 * (65535 - 1) + 1 = 524273 * Port and adjust workaround from https://github.com/LostRuins/koboldcpp/commit/0ba798341e0c70517cb226cb63c966b086a3b5b3 Fall-back should share code, also relax y-z constraint to be inclusive * Add test-case + fallback also for y dim * Fix x-guards which is 2^{31}-1, so inlusive of INT_MAX * Fix overflow problems for transposed copy kernel	2026-06-25 17:29:23 +02:00
Xuan-Son Nguyen	c7cddefcbd	misc: fix labeler (#25012 )	2026-06-25 17:23:37 +02:00
Xuan-Son Nguyen	e9d1b76d0a	server: use status code 403 for disabled features (#24970 ) * server: use status code 403 for disabled features * cont * fix test case	2026-06-25 16:36:40 +02:00
Xuan-Son Nguyen	099bf06952	misc: update lables (#24920 ) * misc: update lables * bring back examples, add mtmd	2026-06-25 16:26:56 +02:00
Xuan-Son Nguyen	60bc8866b1	common: refactor model handling (#24980 ) * common: refactor models handling * remote preset * cont * rm skip_download option * missing header * fix plan.model_files * fix --offline case * move hf_plan to download * refactor * rm redundant curr_ex, add comments * adapt	2026-06-25 15:17:51 +02:00
Kashif Rasul	e8ecce53b8	docs : Eagle3 qwen3 draft model support (#24977 ) * eagle3: accept Eagle3LlamaForCausalLM draft checkpoints * docs: add eagle3 speculative decoding section * docs: address eagle3 review comments * docs: add more angelslim eagle3 models * docs: add gpt-oss eagle3 models and link to pr 18039	2026-06-25 15:58:00 +03:00
Adrien Gallouët	683b04cc4a	app : add the llama download subcommand (#24982 ) * app : add the download command (with llama-download) Signed-off-by: Adrien Gallouët <angt@huggingface.co> * Remove llama-download tool for now Signed-off-by: Adrien Gallouët <angt@huggingface.co> --------- Signed-off-by: Adrien Gallouët <angt@huggingface.co>	2026-06-25 13:36:36 +02:00
fairydreaming	f728adab68	ggml : address integer overflows in binary ops CUDA implementation (#24706 ) * ggml : address integer overflows in binary ops CUDA implementation * ggml : add size_t casts to avoid integer overflows * ggml : add more asserts checking integer overflows in binary ops CUDA implementation --------- Co-authored-by: Stanisław Szymczyk <sszymczy@gmail.com>	2026-06-25 10:06:44 +02:00
Pascal	3e61ea0e2f	ui: fix always-show-sidebar-on-desktop setting after navigation refactor (#24979 )	2026-06-25 09:45:55 +02:00
Christopher Albert	fdbd6abee2	tests : synchronize contexts at end of test-thread-safety (#24935 ) Assisted-by: Claude	2026-06-25 09:22:51 +03:00
Abraham Gonzalez	e12a0128ab	build: include libmtmd in Apple XCFramework (#21935 ) Adds an opt-in LLAMA_BUILD_MTMD CMake option so build-xcframework.sh can link libmtmd.a into the framework binary without pulling in the rest of tools/ (which doesn't cross-build cleanly to iOS/tvOS/visionOS). - CMakeLists.txt: new option, default OFF. When on with LLAMA_BUILD_TOOLS=OFF, only the tools/mtmd subdir is added. Useful for any binding that wants just libmtmd (Apple XCFramework, WASM). - tools/mtmd/CMakeLists.txt: gate the CLI exe targets on LLAMA_BUILD_TOOLS. Gating on LLAMA_BUILD_COMMON is not enough — it defaults ON in standalone builds and visionOS xcodebuild then fails with "install TARGETS given no BUNDLE DESTINATION for MACOSX_BUNDLE executable target 'llama-mtmd-cli'". - build-xcframework.sh: turn the option on, pass -DLLAMA_BUILD_MTMD, add libmtmd.a to combine_static_libraries, and copy mtmd.h and mtmd-helper.h into the framework Headers dir. The umbrella module map then exposes them, so Swift / Obj-C consumers can import the mtmd C API directly. After this, nm on ios-arm64/llama.framework/llama shows 52 _mtmd_ symbols. Verified end-to-end: a Swift target links the produced framework and calls mtmd_default_marker, mtmd_bitmap_init, etc. without a shim on macos / iphoneos / iphonesimulator / xros slices. Co-authored-by: Abraham Gonzalez <abraham@theabecaster.com>	2026-06-25 08:37:30 +03:00
Sigbjørn Skjæret	b3ce5cedf4	quant : fix quantizing moe with mtp (#24986 )	2026-06-25 08:36:49 +03:00
David Spruill	e9fb3b3fc0	sycl : support --split-mode tensor (#24152 ) * Sycl tp stage1 (#1) * SYCL: tensor parallelism (--split-mode tensor) for dual-GPU Adds the comm_init/comm_free/comm_allreduce_tensor trio that the meta-backend queries via get_proc_address to enable backend-specific all-reduce, mirroring the pattern used by ggml-cuda.cu. For N=2 (the common dual-GPU case) implements a degenerate ring all-reduce with two size-branched paths: * Small (nelem < 32768): FP32 direct memcpy + per-device ADD kernel chained via depends_on(memcpy_event). 4 SYCL submissions/call. * Large (nelem >= 32768): BF16-compressed. Each device compresses FP32 -> BF16 in a local outbox, cross-device memcpys to the peer's inbox (HALF the PCIe bytes), then decompresses + adds into the local FP32 partial. 6 SYCL submissions/call but PCIe bytes halved -- wins for any tensor where PCIe dominates kernel time. Threshold and BF16 path pattern mirror the CUDA NCCL allreduce. Storage: ONE persistent uint8_t buffer per device, 4 * nelem bytes (matches both path layouts: FP32 nelem floats; BF16 outbox+inbox = 2 * nelem uint16_t each). Single alloc+free per device keeps the SYCL pool's strict-LIFO invariant trivial. Initial impl handles N=2 FP32 contiguous tensors. Other cases return false, causing the meta-backend to use its generic butterfly fallback. Per-call sync is intentionally omitted. SYCL in-order queue semantics ensure that the meta-backend's next compute on the same per-device queue waits for our final ADD, and the next allreduce's first op on the same persistent buffer waits via the same queue. Only comm_free does an explicit final wait. OneCCL is NOT used: OneCCL 2021.17 hardcodes single-device-per-process in communicator_impl.hpp:47 (condition devices.size() == 1), which is incompatible with llama.cpp's single-process multi-GPU model. Measured on dual Intel Arc Pro B70 (NEO 26.05.x, oneAPI 2025.3 + DPC++ nightly): Llama-3.3-70B Q4_K_M, -sm tensor -fa 1 -ctk f16 -ctv f16: pp512 = 377.08 t/s (vs 313.65 layer mode = +20.2%) tg128 = 17.40 t/s (vs 9.74 layer mode = +78.6%) Qwen3-Coder-Next-80B-A3B Q3_K_M (MoE): pp512 = 216.56 t/s (vs 156.58 meta-backend butterfly = +38.3%) tg128 = 17.60 t/s (vs 14.31 meta-backend butterfly = +23.0%) Qwen3-4B Q4_K_M: pp64 = 984.51 t/s, tg16 = 49.29 t/s Llama-3.3-70B in SYCL TP now comfortably beats production layer mode on both prefill and decode. Coder-Next-80B-A3B (MoE) also wins on both — the BF16 path is what unlocks the many-medium-allreduces prefill pattern. Build/CMake: no changes. No new dependencies. ~210 lines added across ggml-sycl.h and ggml-sycl.cpp. * Fix comments * documentation update to address PR feedback * Bring over my device-to-device memcpy chagnes * move the dev2dev_memcpy calls to the upstream 7-parameter variety * Fix a typo and remove a trailing whitespace	2026-06-25 08:35:21 +03:00
Neo Zhang	9c10954865	sycl : fix the failed UT cases of conv_3d (#24900 )	2026-06-25 08:27:58 +03:00
lhez	fdb2c11c70	opencl: support non-contig rows in norm (#24965 )	2026-06-24 19:21:25 -07:00
Piotr Wilkin (ilintar)	09cedfd699	chat: harden caps check (#24973 )	2026-06-25 02:49:22 +02:00
Max Krasnyansky	8be759e6f7	hexagon: MUL_MAT and MUL_MAT_ID rework : 32x32 tiled weight repack, kernel-params, cached graphs (#24954 ) * hex-mm: new weight layout and fusion updates * hvx-mm: unroll the new tiled vec_dots to optimize hvx register util * hex-mm: optimize dyn.quant format for q8_0 and q8_1 to reduce overhead in vec_dots. * hvx-mm: parallel quantizer per block for large rows * hvx-mm: simplify and futher optimize dyn.quant and vec_dots * hvx-mm: keep intermediate per tile accumulators in fp16 * hmx-mm: optimize weight dequant by aligning the repacked tiles with the DMA * hmx-mm: remove qweight scratch and just use vtcm_weight * hmx-mm: remove all unused and obsolete code * hmx-mm: the new tiled repack format is here to stay -- rename all x4x2 to _tiled * hmx-mm: improve activation processing with dma prefetch * hex-mm: fix hmx/hvx fallback logic and MUL_MAT_ID allocation (unbreaks OLMoE) * hex-mm: align the weight tiles with dma just like we did in hmx-mm * hex-mm: factor out common mm bits into htp/matmul-ops.h * hex-mm: start moving mm kernel selection to the host * hex-mm: move all of the matmul param compute into the host * hmx-mm: restore pipelined mode * hmx-mm: unroll the dequant functions to optimize register usage * hmx-mm: further improve activation process * hex-mm: use vtcm_seq_alloc for all vtcm allocations and define more common functions * hex-mm: improve mm optimizer to acount for number of activation threads * hex-mm: fix matmul-id kernel params selection (unbreaks OLMoE and LFM) * hexagon: remove support for arch < v73 since HMX is now required for most use-cases * hex-mm: cleanup naming for consistency * hex-mm: make sure matmul fusion accounts for vtcm allocation * hex-mm: minor cleanup for kernel_params definition * hex-mm: replace hardcoded limits with proper checks for vtcm requirements * hex-mm: add support for non-tiled mm as a fallback option and factor out hvx kernels into separate header * hex-mm: remove unused functions * hex-mm: add shorthand for MM_SELECT in run-tool script * hvx-mm: factor out hvx/hmx microkernels and unify matmul entry and dispatch * hex-mm: further cleanup matmul fallback path * hex-mm: refactor matmul entry point and dispatch a bit further * hexagon: update cmake build to enable hmx for everything * hex-ops: optimize kernel_param updates and include summary in the logs * hex-mm: add support for GGML_HEXAGON_MM_SELECT * hex-mm: add hex-common header * hex-mm: pass correct number of tasks to workpool * hex-mm: add proper checks for no-work in dyn.quant tasks * hex-mm: convert all quantizers into a macro * hex-mm: fix hvx-flat fallback to pass all MUL_MAT tests * hex-mm: vectorize q8_1 quantizer * hex-mm: improve fused ffn mm stride handling * hex-mm: consistent use of n_threads and pipeline in kernel_params * hexagon: minor formatting * hex-mm: update MUL_MAT_ID kernel_param handling to make sure host/npu are in sync * hvx-mm: go back to accumulating in fp32 in tiled hvx kernels, more accurate and same perf * hvx-mm: unroll the loops and remove masking that is not needed for tiled accums * hmx-mm: optimize activation processing (slit loops, some unrolling, etc) * hmx-mm: minor optimization for output processing * hex-mm: consistent use of uint32_t and size_t in mm kernels * hex-mm: remove legacy restrictions for rows to be multiple of 256 * hexagon: replace sprintf with snprintf * hex-mm: relax hardcoded nrows checks and rely on VTCM size requirements * hexagon: minor alignment fix * hexagon: fix trailing spaces * hex-mm: relax padding from 256 to 128 (leftovers) * hex-mm: remove redundant checks for weight align to 128 we always use 2D dma for the weights and align them properly * hmx-mm: MUL_MAT_ID better work distribution between hvx threads and hmx tracing * hex-mm: specialize per-token mmid activation handling * hex-profile: update python scripts to handle kernel-params section in the logging output * hex-mm: move n_prefetch (aka dma_depth) into kernel params and remove unused fields * hex-trace: use easier to parse format, simply and fix post-proc scripts * hmx-mm: relax 32 row limit for output processing which helps utilization * hmx-mm: use start-chunk idx for tracing info * hmx-mm: parameterize activation dma pipeline * hexagon: add support for simple graph caching to avoid recomputing kernel-params * hex-mm: remove left-over repack functions * hex-mm: tighten n_prefetch asserts * hex-mm: remove duplicate round/align_up helper * hexagon: cleanup common header used in host/npu * hexagon: update early wakeup threshold * hmx-mm: define cost constants and update solver to assume that repacked ne[1] is padded to 32 * hmx-mm: make precompute_matmul a bit more readable (split into smaller functions, etc) * hex-mm: remove n_threads constraint * hex-mm: minor formatting updates * hex-mm: remove obsolete profiling logs * hex-mm: restore hardcode gate to refuse lm-head to avoid repacking that tensor	2026-06-24 12:14:25 -07:00
Saba Fallah	894bb27af3	mtmd: model: unlimited-ocr: converter + parity test (#24969 )	2026-06-24 18:20:22 +02:00
Xuan-Son Nguyen	fb401045cc	common: remove unused json-partial (#24968 )	2026-06-24 18:12:16 +02:00
Wagner Bruna	51eae8cfca	vulkan: allow reducing the graph submission batches to avoid timeouts (#24872 )	2026-06-24 16:29:24 +02:00
liminfei-amd	1191758c5d	vulkan: fail the build when a shader fails to compile (#24450 ) * vulkan-shaders-gen: fail the build when a shader fails to compile vulkan-shaders-gen did not detect shader-compile subprocess failures, so a broken libggml-vulkan could be produced while the build reported success and the breakage only surfaced at run time. execute_command() discarded the child exit code (POSIX waitpid passed nullptr for status; the Windows branch never called GetExitCodeProcess) and string_to_spv decided success only from whether stderr was empty, so a non-zero exit with empty stderr, or a subprocess that failed to launch, was treated as success. Return the child exit code from execute_command() (WEXITSTATUS on POSIX, GetExitCodeProcess on Windows), treat a non-zero exit or non-empty stderr or a launch exception as a failure, and record it in an atomic flag. main() checks the flag after process_shaders() and returns EXIT_FAILURE before writing the output files, so the build stops instead of emitting a broken backend. Fixes #24393 Signed-off-by: liminfei-amd <91481003+liminfei-amd@users.noreply.github.com> * vulkan-shaders-gen: simplify compile_failed access and drop unreachable return Address review feedback on #24450: - Access the std::atomic<bool> compile_failed directly (= / implicit bool) instead of .store()/.load(); the flag stays atomic because the worker threads in process_shaders() set it concurrently. - Remove the unreachable trailing return -1 in execute_command(): on POSIX the child _exit()s after execvp and the parent returns (fork()<0 throws); on Windows the block returns the exit code. Signed-off-by: liminfei-amd <91481003+liminfei-amd@users.noreply.github.com> --------- Signed-off-by: liminfei-amd <91481003+liminfei-amd@users.noreply.github.com>	2026-06-24 11:42:03 +02:00
Pascal	00139b660b	ui: loading bar below the model picker (#24931 ) * ui: show model load progress on the selector trigger Mirror the in-dropdown stage progress as a thin bar on the selector trigger, so the active model's load percent stays visible when the menu is closed. Same status gating and composite fraction as the dropdown row, so both bars track the selected model in sync. Suggested-by: Julien Chaumond <@julien-c> * ui: show model load progress bar on the in-conversation model selector * ui: tune model load indicator to a pulsing highlight (suggested by @ngxson) Also wire the indicator onto the mobile sheet trigger, which was missing it since mobile uses the sheet instead of the dropdown. * ui: thin (@allozaur) pulsating (@ngxson) model load bar	2026-06-24 10:50:44 +02:00
Aleksander Grygier	ef9c13d4c2	ui: New Logo + Navigation cleanup & Mobile UI/UX improvements (#24897 ) * chore: `npm audit fix --force` * feat: Update sidebar toggle to use Logo * refactor: Clean up favicon SVG * feat: Refactor logo component and implement theme-aware favicon generation * feat: Add configurable padding to generated PWA assets * test: Add unit tests for writeThemeFavicons * refactor: Componentization * feat: WIP * feat: WIP * feat: WIP * feat: Mobile UI * feat: add SEARCH route constant * feat: create SidebarNavigationSearchResults component * refactor: use SidebarNavigationSearchResults in conversation list * feat: enable mobile search navigation in sidebar actions * feat: add mobile search route and page * fix: prevent sidebar overflow on mobile viewports * fix: Mobile sidebar * feat: Mobile Search WIP * feat: Mobile WIP * feat: Add PWA standalone detection and refine mobile UI * feat: Improve mobile layout, sidebar handling, and chat scrolling * feat: Improve mobile sidebar visibility and iOS Safari chat spacing * fix: Disable auto-scroll on mobile * chore: Linting * fix: Wrong condition * feat: Mobile chat scroll * refactor: WIP * fix: Desktop initial scroll always working again * fix: Partial fix for mobile auto-scroll / initial scroll * fix: Desktop auto-scroll on initial load and during streaming * fix: Mobile scrolling logic * refactor: Clean up * feat: Improve start UI * feat: Add `delay` to `fadeInView` * feat: Auto-scroll button * refactor: Cleanup * refactor: Extract chat dialogs and alerts into dedicated component * refactor: Reorganize ChatScreen component structure and initialization * feat: Improve auto-scroll after sending message * feat: UI improvements * fix: Settings link * feat: UI improvements * fix: better UI spacing * fix: Remove unneeded logic * fix: Chat Processing Info UI rendering * feat: Improve mobile UI * feat: UI improvement * fix: Conditional transition delay for Chat Messages based on route from * fix: Delay mobile sidebar collapse for smoother transitions * fix: Mobile scroll down button + sidebar pointer events * fix: Mobile UI * fix: Auto scrolling * fix: Implement dynamic height calculations for chat auto-scroll positioning and UI elements * fix: Retrieve `autofocus` for Chat Form textarea * fix: Use proper class Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> * refactor: extract scroll-to-bottom logic and fix message send flow * fix: update viewport store usage and remove conflicting autofocus * feat: add accessibility labels to scroll down button * fix: correct HTML structure in sidebar empty states * fix: dynamically toggle processing info visibility * chore: remove commented exports and fix formatting * fix * fix: Mobile Chat Form Add Action Sheet interactions --------- Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>	2026-06-24 10:21:33 +02:00