server: fix report progress for loading spec models, add "stages" list (#24870 )

* server: fix report progress for loading spec models, add "stages" list * improve * nits * nits 2
server: refactor batch construction (#24843 )
2026-06-21 21:27:37 +02:00 · 2026-06-21 17:36:52 +02:00 · 2026-06-21 14:16:11 +02:00 · 2026-06-21 14:12:15 +02:00 · 2026-06-21 14:04:52 +02:00 · 2026-06-21 13:40:52 +02:00
109 changed files with 2993 additions and 1437 deletions
@@ -13,6 +13,20 @@ ARG APP_REVISION=N/A
 # BUILD STAGE
 # Compile all binary files and libraries
 # ==============================================================================
+ARG NODE_VERSION=24
+
+FROM docker.io/node:$NODE_VERSION AS web
+
+ARG APP_VERSION
+
+WORKDIR /app/tools/ui
+
+COPY tools/ui/package.json tools/ui/package-lock.json ./
+RUN npm ci
+
+COPY tools/ui/ ./
+RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
+
 FROM ${CANN_BASE_IMAGE} AS build

 # -- Install build dependencies --
@@ -26,6 +40,8 @@ WORKDIR /app
 # -- Copy project files --
 COPY . .

+COPY --from=web /app/tools/ui/dist tools/ui/dist
+
 # -- Set CANN environment variables (required for compilation) --
 # Using ENV instead of `source` allows environment variables to persist across the entire image layer
 ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
@@ -3,6 +3,20 @@ ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
 ARG APP_REVISION=N/A

+ARG NODE_VERSION=24
+
+FROM docker.io/node:$NODE_VERSION AS web
+
+ARG APP_VERSION
+
+WORKDIR /app/tools/ui
+
+COPY tools/ui/package.json tools/ui/package-lock.json ./
+RUN npm ci
+
+COPY tools/ui/ ./
+RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
+
 FROM docker.io/ubuntu:$UBUNTU_VERSION AS build

 ARG TARGETARCH
@@ -16,6 +30,8 @@ WORKDIR /app

 COPY . .

+COPY --from=web /app/tools/ui/dist tools/ui/dist
+
 RUN if [ "$TARGETARCH" = "amd64" ] || [ "$TARGETARCH" = "arm64" ]; then \
        cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON; \
    else \
@@ -11,6 +11,20 @@ ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
 ARG APP_REVISION=N/A

+ARG NODE_VERSION=24
+
+FROM docker.io/node:$NODE_VERSION AS web
+
+ARG APP_VERSION
+
+WORKDIR /app/tools/ui
+
+COPY tools/ui/package.json tools/ui/package-lock.json ./
+RUN npm ci
+
+COPY tools/ui/ ./
+RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
+
 FROM ${BASE_CUDA_DEV_CONTAINER} AS build

 ARG GCC_VERSION
@@ -26,6 +40,8 @@ WORKDIR /app

 COPY . .

+COPY --from=web /app/tools/ui/dist tools/ui/dist
+
 RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
    export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
    fi && \
@@ -5,6 +5,20 @@ ARG APP_REVISION=N/A

 ## Build Image

+ARG NODE_VERSION=24
+
+FROM docker.io/node:$NODE_VERSION AS web
+
+ARG APP_VERSION
+
+WORKDIR /app/tools/ui
+
+COPY tools/ui/package.json tools/ui/package-lock.json ./
+RUN npm ci
+
+COPY tools/ui/ ./
+RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
+
 FROM docker.io/intel/deep-learning-essentials:$ONEAPI_VERSION AS build

 ARG GGML_SYCL_F16=ON
@@ -22,6 +36,8 @@ WORKDIR /app

 COPY . .

+COPY --from=web /app/tools/ui/dist tools/ui/dist
+
 RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
        echo "GGML_SYCL_F16 is set" \
        && export OPT_SYCL_F16="-DGGML_SYCL_F16=ON" \
@@ -10,6 +10,20 @@ ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
 ARG APP_REVISION=N/A

+ARG NODE_VERSION=24
+
+FROM docker.io/node:$NODE_VERSION AS web
+
+ARG APP_VERSION
+
+WORKDIR /app/tools/ui
+
+COPY tools/ui/package.json tools/ui/package-lock.json ./
+RUN npm ci
+
+COPY tools/ui/ ./
+RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
+
 FROM ${BASE_MUSA_DEV_CONTAINER} AS build

 # MUSA architecture to build for (defaults to all supported archs)
@@ -29,6 +43,8 @@ WORKDIR /app

 COPY . .

+COPY --from=web /app/tools/ui/dist tools/ui/dist
+
 RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
        export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
    fi && \
@@ -22,6 +22,20 @@ ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
 ARG APP_REVISION=N/A

+ARG NODE_VERSION=24
+
+FROM docker.io/node:$NODE_VERSION AS web
+
+ARG APP_VERSION
+
+WORKDIR /app/tools/ui
+
+COPY tools/ui/package.json tools/ui/package-lock.json ./
+RUN npm ci
+
+COPY tools/ui/ ./
+RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
+
 ## Build Image
 FROM docker.io/ubuntu:${UBUNTU_VERSION} AS build

@@ -69,6 +83,8 @@ WORKDIR /app

 COPY . .

+COPY --from=web /app/tools/ui/dist tools/ui/dist
+
 # Build Stage
 RUN bash -c "source ${OpenVINO_DIR}/setupvars.sh && \
    cmake -B build/ReleaseOV -G Ninja \
@@ -11,6 +11,20 @@ ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
 ARG APP_REVISION=N/A

+ARG NODE_VERSION=24
+
+FROM docker.io/node:$NODE_VERSION AS web
+
+ARG APP_VERSION
+
+WORKDIR /app/tools/ui
+
+COPY tools/ui/package.json tools/ui/package-lock.json ./
+RUN npm ci
+
+COPY tools/ui/ ./
+RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
+
 ### Build image
 FROM ${BASE_ROCM_DEV_CONTAINER} AS build

@@ -38,6 +52,8 @@ WORKDIR /app

 COPY . .

+COPY --from=web /app/tools/ui/dist tools/ui/dist
+
 RUN HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
    cmake -S . -B build \
        -DGGML_HIP=ON \
@@ -3,6 +3,20 @@ ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
 ARG APP_REVISION=N/A

+ARG NODE_VERSION=24
+
+FROM docker.io/node:$NODE_VERSION AS web
+
+ARG APP_VERSION
+
+WORKDIR /app/tools/ui
+
+COPY tools/ui/package.json tools/ui/package-lock.json ./
+RUN npm ci
+
+COPY tools/ui/ ./
+RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
+
 FROM docker.io/ubuntu:$UBUNTU_VERSION AS build

 # Install build tools
@@ -17,6 +31,8 @@ WORKDIR /app

 COPY . .

+COPY --from=web /app/tools/ui/dist tools/ui/dist
+
 RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=ON -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON && \
    cmake --build build --config Release -j$(nproc)

@@ -3,6 +3,20 @@ ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
 ARG APP_REVISION=N/A

+ARG NODE_VERSION=24
+
+FROM docker.io/node:$NODE_VERSION AS web
+
+ARG APP_VERSION
+
+WORKDIR /app/tools/ui
+
+COPY tools/ui/package.json tools/ui/package-lock.json ./
+RUN npm ci
+
+COPY tools/ui/ ./
+RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
+
 FROM docker.io/ubuntu:$UBUNTU_VERSION AS build

 RUN apt-get update && \
@@ -14,6 +28,8 @@ WORKDIR /app

 COPY . .

+COPY --from=web /app/tools/ui/dist tools/ui/dist
+
 RUN cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_ZENDNN=ON && \
    cmake --build build -j $(nproc)

@@ -10,6 +10,8 @@

 build*/

+tools/ui/node_modules/
+
 models/*

 /llama-cli
@@ -58,6 +58,13 @@ jobs:
          git tag ${{ steps.srctag.outputs.name }} || exit 0
          git push origin ${{ steps.srctag.outputs.name }} || exit 0

+  build_ui:
+    name: Build UI
+    needs: create_tag
+    uses: ./.github/workflows/ui-build.yml
+    with:
+      hf_ui_version: ${{ needs.create_tag.outputs.source_tag }}
+
  prepare_matrices:
    name: Prepare Docker matrices
    runs-on: ubuntu-24.04
@@ -79,7 +86,7 @@ jobs:
          [
            { "tag": "cpu", "dockerfile": ".devops/cpu.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04" },
            { "tag": "cpu", "dockerfile": ".devops/cpu.Dockerfile", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-arm" },
-            { "tag": "cpu", "dockerfile": ".devops/s390x.Dockerfile", "platforms": "linux/s390x", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-s390x" },
+            { "tag": "cpu", "dockerfile": ".devops/s390x.Dockerfile", "platforms": "linux/s390x", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-s390x", "prebuilt_ui": true },
            { "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "12.8.1", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
            { "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "12.8.1", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" },
            { "tag": "cuda13", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "13.3.0", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
@@ -135,7 +142,7 @@ jobs:

  push_to_registry:
    name: Push Docker image to Docker Registry
-    needs: [prepare_matrices, create_tag]
+    needs: [prepare_matrices, create_tag, build_ui]

    runs-on: ${{ matrix.config.runs_on }}
    strategy:
@@ -150,6 +157,13 @@ jobs:
          fetch-depth: 0
          ref: ${{ needs.create_tag.outputs.source_tag }}

+      - name: Download prebuilt UI
+        if: ${{ matrix.config.prebuilt_ui == true }}
+        uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8
+        with:
+          name: ui-build
+          path: tools/ui/dist
+
      - name: Set up QEMU
        if: ${{ contains(matrix.config.platforms, 'linux/amd64') }}
        uses: docker/setup-qemu-action@ce360397dd3f832beb865e1373c09c0e9f86d70a # v4
@@ -1627,6 +1627,7 @@ jobs:
            **Windows:**
            - [Windows x64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-x64.zip)
            - [Windows arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-arm64.zip)
+            - [Windows arm64 (OpenCL Adreno)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-opencl-adreno-arm64.zip)
            - [Windows x64 (CUDA 12)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-12.4-x64.zip) - [CUDA 12.4 DLLs](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/cudart-llama-bin-win-cuda-12.4-x64.zip)
            - [Windows x64 (CUDA 13)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-13.3-x64.zip) - [CUDA 13.3 DLLs](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/cudart-llama-bin-win-cuda-13.3-x64.zip)
            - [Windows x64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-vulkan-x64.zip)
@@ -25,13 +25,3 @@ Commits:
 - Do not explicitly set the git author in commits - rely on the default git config
 - Always use `--no-gpg-sign` when committing
 - Never `git push` without explicit confirmation from the user
-
-Resources (read on demand):
- [CONTRIBUTING.md](CONTRIBUTING.md)
- [Build documentation](docs/build.md)
- [Server usage documentation](tools/server/README.md)
- [Server development documentation](tools/server/README-dev.md)
- [PEG parser](docs/development/parsing.md)
- [Auto parser](docs/autoparser.md)
- [Jinja engine](common/jinja/README.md)
- [PR template](.github/pull_request_template.md)
@@ -17,6 +17,7 @@
 #   define NOMINMAX
 #endif
 #include <windows.h>
+#include <shellapi.h>
 #endif

 #define JSON_ASSERT GGML_ASSERT
@@ -302,7 +303,6 @@ static handle_model_result common_params_handle_model(struct common_params_model

    if (!model.docker_repo.empty()) {
        model.path = common_docker_resolve_model(model.docker_repo);
-        model.name = model.docker_repo;
    } else if (!model.hf_repo.empty()) {
        // If -m was used with -hf, treat the model "path" as the hf_file to download
        if (model.hf_file.empty() && !model.path.empty()) {
@@ -322,7 +322,6 @@ static handle_model_result common_params_handle_model(struct common_params_model
            throw std::runtime_error("failed to download model from Hugging Face");
        }

-        model.name = model.hf_repo;
        model.path = download_result.model_path;

        if (!download_result.mmproj_path.empty()) {
@@ -893,7 +892,44 @@ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<com
    return true;
 }

+#ifdef _WIN32
+struct utf8_argv {
+    std::vector<std::string> buf;
+    std::vector<char*> ptrs;
+};
+
+static utf8_argv make_utf8_argv() {
+    utf8_argv out;
+    int wargc = 0;
+    LPWSTR* wargv = CommandLineToArgvW(GetCommandLineW(), &wargc);
+    if (!wargv) return out;
+
+    out.buf.reserve(wargc);
+    for (int i = 0; i < wargc; ++i) {
+        int n = WideCharToMultiByte(CP_UTF8, WC_ERR_INVALID_CHARS, wargv[i], -1, nullptr, 0, nullptr, nullptr);
+        if (n <= 0) { out.buf.emplace_back(); continue; }
+        auto& s = out.buf.emplace_back();
+        s.resize(static_cast<size_t>(n - 1));
+        (void)WideCharToMultiByte(CP_UTF8, 0, wargv[i], -1, s.data(), n, nullptr, nullptr);
+    }
+    LocalFree(wargv);
+
+    out.ptrs.reserve(out.buf.size() + 1);
+    for (auto& s : out.buf) out.ptrs.push_back(s.data());
+    out.ptrs.push_back(nullptr);
+    return out;
+}
+#endif
+
 bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
+#ifdef _WIN32
+    auto utf8 = make_utf8_argv();
+    // repair argv only when it matches the process command line
+    if (static_cast<int>(utf8.buf.size()) == argc) {
+        argv = utf8.ptrs.data();
+    }
+#endif
+
    auto ctx_arg = common_params_parser_init(params, ex, print_usage);
    const common_params params_org = ctx_arg.params; // the example can modify the default params

@@ -2830,62 +2866,26 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.api_prefix = value;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX"));
-    // Deprecated: use --ui-config instead (kept for backward compat)
    add_opt(common_arg(
-        {"--webui-config"}, "JSON",
-        "[DEPRECATED: use --ui-config] JSON that provides default WebUI settings (overrides WebUI defaults)",
-        [](common_params & params, const std::string & value) {
-            params.ui_config_json = value;
-            params.webui_config_json = value;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG"));
-
-    add_opt(common_arg(
-        {"--ui-config"}, "JSON",
+        {"--ui-config", "--webui-config"}, "JSON",
        "JSON that provides default UI settings (overrides UI defaults)",
        [](common_params & params, const std::string & value) {
            params.ui_config_json = value;
-            params.webui_config_json = value;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_UI_CONFIG"));
-
-    // Deprecated: use --ui-config-file instead (kept for backward compat)
    add_opt(common_arg(
-        {"--webui-config-file"}, "PATH",
-        "[DEPRECATED: use --ui-config-file] JSON file that provides default WebUI settings (overrides WebUI defaults)",
-        [](common_params & params, const std::string & value) {
-            params.ui_config_json = read_file(value);
-            params.webui_config_json = params.ui_config_json;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG_FILE"));
-
-    add_opt(common_arg(
-        {"--ui-config-file"}, "PATH",
+        {"--ui-config-file", "--webui-config-file"}, "PATH",
        "JSON file that provides default UI settings (overrides UI defaults)",
        [](common_params & params, const std::string & value) {
            params.ui_config_json = read_file(value);
-            params.webui_config_json = params.ui_config_json;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_UI_CONFIG_FILE"));
-
-    // Deprecated: use --ui-mcp-proxy instead (kept for backward compat)
    add_opt(common_arg(
-        {"--webui-mcp-proxy"},
-        {"--no-webui-mcp-proxy"},
-        "[DEPRECATED: use --ui-mcp-proxy/--no-ui-mcp-proxy] experimental: whether to enable MCP CORS proxy",
-        [](common_params & params, bool value) {
-            params.ui_mcp_proxy = value;
-            params.webui_mcp_proxy = value;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_MCP_PROXY"));
-
-    add_opt(common_arg(
-        {"--ui-mcp-proxy"},
-        {"--no-ui-mcp-proxy"},
+        {"--ui-mcp-proxy", "--webui-mcp-proxy"},
+        {"--no-ui-mcp-proxy", "--no-webui-mcp-proxy"},
        "experimental: whether to enable MCP CORS proxy - do not enable in untrusted environments (default: disabled)",
        [](common_params & params, bool value) {
            params.ui_mcp_proxy = value;
-            params.webui_mcp_proxy = value;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_UI_MCP_PROXY"));
    add_opt(common_arg(
@@ -2897,24 +2897,26 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.server_tools = parse_csv_row(value);
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TOOLS"));
-    // Deprecated: use --ui/--no-ui instead (kept for backward compat)
    add_opt(common_arg(
-        {"--webui"},
-        {"--no-webui"},
-        "[DEPRECATED: use --ui/--no-ui] whether to enable the Web UI",
+        {"-ag", "--agent"},
+        {"-no-ag", "--no-agent"},
+        "whether to enable CORS proxy and all built-in tools - do not enable in untrusted environments (default: disabled)",
        [](common_params & params, bool value) {
-            params.ui = value;
-            params.webui = value;
+            if (value) {
+                params.server_tools = {"all"};
+                params.ui_mcp_proxy = true;
+            } else {
+                params.server_tools.clear();
+                params.ui_mcp_proxy = false;
+            }
        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI"));
-
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_AGENT"));
    add_opt(common_arg(
-        {"--ui"},
-        {"--no-ui"},
+        {"--ui", "--webui"},
+        {"--no-ui", "--no-webui"},
        string_format("whether to enable the Web UI (default: %s)", params.ui ? "enabled" : "disabled"),
        [](common_params & params, bool value) {
            params.ui = value;
-            params.webui = value;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_UI"));
    add_opt(common_arg(
@@ -2945,7 +2947,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_API_KEY"));
    add_opt(common_arg(
        {"--api-key-file"}, "FNAME",
-        "path to file containing API keys (default: none)",
+        "path to file containing API keys, one per line; lines starting with a hash are treated as comments (default: none)",
        [](common_params & params, const std::string & value) {
            std::ifstream key_file(value);
            if (!key_file) {
@@ -2953,7 +2955,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            }
            std::string key;
            while (std::getline(key_file, key)) {
-                if (!key.empty()) {
+                if (!key.empty() && key[0] != '#') {
                    params.api_keys.push_back(key);
                }
            }
@@ -1074,6 +1074,18 @@ std::vector<common_file_info> fs_list(const std::string & path, bool include_dir
    return files;
 }

+std::ifstream fs_open_ifstream(const std::string & fname, std::ios_base::openmode mode) {
+#ifdef _WIN32
+    int wlen = MultiByteToWideChar(CP_UTF8, 0, fname.c_str(), -1, NULL, 0);
+    if (!wlen) { return std::ifstream(); }
+    std::vector<wchar_t> wfname(wlen);
+    (void)MultiByteToWideChar(CP_UTF8, 0, fname.c_str(), -1, wfname.data(), wlen);
+    return std::ifstream(wfname.data(), mode);
+#else
+    return std::ifstream(fname, mode);
+#endif
+}
+
 //
 // TTY utils
 //
@@ -2034,7 +2046,7 @@ bool common_prompt_batch_decode(
 }

 size_t common_prompt_checkpoint::size() const {
-    return data_tgt.size() + data_dft.size();
+    return data_tgt.size() + data_dft.size() + data_spec.size();
 }

 bool common_prompt_checkpoint::empty() const {
@@ -2049,6 +2061,7 @@ void common_prompt_checkpoint::clear() {

    data_tgt.clear();
    data_dft.clear();
+    data_spec.clear();
 }

 void common_prompt_checkpoint::update_pos(
@@ -2138,4 +2151,5 @@ void common_prompt_checkpoint::clear_tgt() {

 void common_prompt_checkpoint::clear_dft() {
    data_dft.clear();
+    data_spec.clear();
 }
@@ -295,7 +295,16 @@ struct common_params_model {
    std::string hf_repo     = ""; // HF repo                                                // NOLINT
    std::string hf_file     = ""; // HF file                                                // NOLINT
    std::string docker_repo = ""; // Docker repo                                            // NOLINT
-    std::string name        = ""; // in format <user>/<model>[:<tag>] (tag is optional)     // NOLINT
+
+    std::string get_name() {
+        if (!hf_repo.empty()) {
+            return hf_repo;
+        }
+        if (!docker_repo.empty()) {
+            return docker_repo;
+        }
+        return path;
+    }
 };

 // draft-model-based speculative decoding parameters
@@ -363,7 +372,7 @@ struct common_params_speculative {

    uint32_t need_n_rs_seq() const {
        bool needs_rs_seq = std::any_of(types.begin(), types.end(), [&](auto t) {
-            return t == COMMON_SPECULATIVE_TYPE_DRAFT_MTP;
+            return t == COMMON_SPECULATIVE_TYPE_DRAFT_MTP || t == COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3;
        });

        return needs_rs_seq ? draft.n_max : 0u;
@@ -624,12 +633,6 @@ struct common_params {

    // UI configs
    bool ui = true;
-
-    // Deprecated: use ui, ui_mcp_proxy, ui_config_json instead
-    bool webui = ui;
-    bool webui_mcp_proxy = false;
-    std::string webui_config_json;
-
    bool ui_mcp_proxy = false;
    std::string ui_config_json;

@@ -848,6 +851,9 @@ struct common_file_info {
 };
 std::vector<common_file_info> fs_list(const std::string & path, bool include_directories);

+// fs open, also handle UTF8 on Windows
+std::ifstream fs_open_ifstream(const std::string & fname, std::ios_base::openmode mode);
+
 //
 // TTY utils
 //
@@ -1065,6 +1071,10 @@ struct common_prompt_checkpoint {
    std::vector<uint8_t> data_tgt;
    std::vector<uint8_t> data_dft;

+    // (optional) speculative-decoding implementation state stashed with the checkpoint
+    // (e.g. eagle3's deferred-boundary g_embd row)
+    std::vector<uint8_t> data_spec;
+
    size_t size() const;

    bool empty() const;
@@ -686,59 +686,62 @@ value set_statement::execute_impl(context & ctx) {
    return mk_val<value_undefined>();
 }

+static inline void bind_parameters(const std::string & name, const statements & this_args, const func_args & args, context & ctx) {
+    const size_t expected_count = this_args.size();
+    const size_t input_count = args.count();
+
+    JJ_DEBUG("Invoking '%s' with %zu input arguments (expected %zu)", name.c_str(), input_count, expected_count);
+    for (size_t i = 0; i < expected_count; ++i) {
+        if (i < input_count) {
+            if (is_stmt<identifier>(this_args[i])) {
+                // normal parameter
+                std::string param_name = cast_stmt<identifier>(this_args[i])->val;
+                value param_value = args.get_kwarg_or_pos(param_name, i);
+                JJ_DEBUG("  Binding parameter '%s' to argument of type %s", param_name.c_str(), param_value->type().c_str());
+                ctx.set_val(param_name, param_value);
+            } else if (is_stmt<keyword_argument_expression>(this_args[i])) {
+                // default argument used as normal parameter
+                auto kwarg = cast_stmt<keyword_argument_expression>(this_args[i]);
+                if (!is_stmt<identifier>(kwarg->key)) {
+                    throw std::runtime_error("Keyword argument key must be an identifier in '" + name + "'");
+                }
+                std::string param_name = cast_stmt<identifier>(kwarg->key)->val;
+                value param_value = args.get_kwarg_or_pos(param_name, i);
+                JJ_DEBUG("  Binding parameter '%s' to argument of type %s", param_name.c_str(), param_value->type().c_str());
+                ctx.set_val(param_name, param_value);
+            } else {
+                throw std::runtime_error("Invalid parameter type in '" + name + "'");
+            }
+        } else {
+            auto & default_arg = this_args[i];
+            if (is_stmt<keyword_argument_expression>(default_arg)) {
+                auto kwarg = cast_stmt<keyword_argument_expression>(default_arg);
+                if (!is_stmt<identifier>(kwarg->key)) {
+                    throw std::runtime_error("Keyword argument key must be an identifier in '" + name + "'");
+                }
+                std::string param_name = cast_stmt<identifier>(kwarg->key)->val;
+                JJ_DEBUG("  Binding parameter '%s' to default argument of type %s", param_name.c_str(), kwarg->val->type().c_str());
+                ctx.set_val(param_name, kwarg->val->execute(args.ctx));
+            } else {
+                throw std::runtime_error("Not enough arguments provided to '" + name + "'");
+            }
+            //std::string param_name = cast_stmt<identifier>(default_args[i])->val;
+            //JJ_DEBUG("  Binding parameter '%s' to default", param_name.c_str());
+            //ctx.var[param_name] = default_args[i]->execute(ctx);
+        }
+    }
+}
+
 value macro_statement::execute_impl(context & ctx) {
    if (!is_stmt<identifier>(this->name)) {
        throw std::runtime_error("Macro name must be an identifier");
    }
    std::string name = cast_stmt<identifier>(this->name)->val;

-    const func_handler func = [this, name, &ctx](const func_args & args) -> value {
-        size_t expected_count = this->args.size();
-        size_t input_count = args.count();
+    const func_handler func = [this, name](const func_args & args) -> value {
+        context macro_ctx(args.ctx); // new scope for macro execution

-        JJ_DEBUG("Invoking macro '%s' with %zu input arguments (expected %zu)", name.c_str(), input_count, expected_count);
-        context macro_ctx(ctx); // new scope for macro execution
-
-        // bind parameters
-        for (size_t i = 0; i < expected_count; ++i) {
-            if (i < input_count) {
-                if (is_stmt<identifier>(this->args[i])) {
-                    // normal parameter
-                    std::string param_name = cast_stmt<identifier>(this->args[i])->val;
-                    value param_value = args.get_kwarg_or_pos(param_name, i);
-                    JJ_DEBUG("  Binding parameter '%s' to argument of type %s", param_name.c_str(), param_value->type().c_str());
-                    macro_ctx.set_val(param_name, param_value);
-                } else if (is_stmt<keyword_argument_expression>(this->args[i])) {
-                    // default argument used as normal parameter
-                    auto kwarg = cast_stmt<keyword_argument_expression>(this->args[i]);
-                    if (!is_stmt<identifier>(kwarg->key)) {
-                        throw std::runtime_error("Keyword argument key must be an identifier in macro '" + name + "'");
-                    }
-                    std::string param_name = cast_stmt<identifier>(kwarg->key)->val;
-                    value param_value = args.get_kwarg_or_pos(param_name, i);
-                    JJ_DEBUG("  Binding parameter '%s' to argument of type %s", param_name.c_str(), param_value->type().c_str());
-                    macro_ctx.set_val(param_name, param_value);
-                } else {
-                    throw std::runtime_error("Invalid parameter type in macro '" + name + "'");
-                }
-            } else {
-                auto & default_arg = this->args[i];
-                if (is_stmt<keyword_argument_expression>(default_arg)) {
-                    auto kwarg = cast_stmt<keyword_argument_expression>(default_arg);
-                    if (!is_stmt<identifier>(kwarg->key)) {
-                        throw std::runtime_error("Keyword argument key must be an identifier in macro '" + name + "'");
-                    }
-                    std::string param_name = cast_stmt<identifier>(kwarg->key)->val;
-                    JJ_DEBUG("  Binding parameter '%s' to default argument of type %s", param_name.c_str(), kwarg->val->type().c_str());
-                    macro_ctx.set_val(param_name, kwarg->val->execute(ctx));
-                } else {
-                    throw std::runtime_error("Not enough arguments provided to macro '" + name + "'");
-                }
-                //std::string param_name = cast_stmt<identifier>(default_args[i])->val;
-                //JJ_DEBUG("  Binding parameter '%s' to default", param_name.c_str());
-                //macro_ctx.var[param_name] = default_args[i]->execute(ctx);
-            }
-        }
+        bind_parameters(name, this->args, args, macro_ctx);

        // execute macro body
        JJ_DEBUG("Executing macro '%s' body with %zu statements", name.c_str(), this->body.size());
@@ -752,6 +755,46 @@ value macro_statement::execute_impl(context & ctx) {
    return mk_val<value_undefined>();
 }

+value call_statement::execute_impl(context & ctx) {
+    auto call_expr = cast_stmt<call_expression>(this->call);
+    if (!call_expr) {
+        throw std::runtime_error("Call statement requires a valid call expression");
+    }
+
+    value callee_val = call_expr->callee->execute(ctx);
+    if (!is_val<value_func>(callee_val)) {
+        throw std::runtime_error("Callee is not a function: got " + callee_val->type());
+    }
+    auto * callee_func = cast_val<value_func>(callee_val);
+
+    context caller_ctx(ctx); // new scope for caller execution
+
+    const func_handler func = [this, caller_ctx = std::move(caller_ctx)](const func_args & args) -> value {
+        context block_ctx(caller_ctx); // new scope for block execution
+
+        bind_parameters("caller", this->caller_args, args, block_ctx);
+
+        JJ_DEBUG("Executing call body with %zu statements", this->body.size());
+        auto res = exec_statements(this->body, block_ctx);
+        JJ_DEBUG("Call body execution complete, result: %s", res->val_str.str().c_str());
+        return res;
+    };
+
+    context call_ctx(ctx);
+    call_ctx.set_val("caller", mk_val<value_func>("caller", func));
+
+    func_args args(call_ctx);
+
+    for (const auto & arg_expr : call_expr->args) {
+        auto arg_val = arg_expr->execute(ctx);
+        JJ_DEBUG("  Argument type: %s", arg_val->type().c_str());
+        args.push_back(arg_val);
+    }
+
+    JJ_DEBUG("Calling macro '%s' with %zu arguments", callee_func->name.c_str(), args.count());
+    return callee_func->invoke(args);
+}
+
 value member_expression::execute_impl(context & ctx) {
    value object = this->object->execute(ctx);

@@ -552,6 +552,7 @@ struct call_statement : public statement {
        for (const auto & arg : this->caller_args) chk_type<expression>(arg);
    }
    std::string type() const override { return "CallStatement"; }
+    value execute_impl(context & ctx) override;
 };

 struct ternary_expression : public expression {
@@ -233,27 +233,27 @@ struct BuiltinRule {
 };

 static std::unordered_map<std::string, BuiltinRule> PRIMITIVE_RULES = {
-    {"boolean", {"(\"true\" | \"false\") space", {}}},
+    {"boolean", {"(\"true\" | \"false\")", {}}},
    {"decimal-part", {"[0-9]{1,16}", {}}},
    {"integral-part", {"[0] | [1-9] [0-9]{0,15}", {}}},
-    {"number", {"(\"-\"? integral-part) (\".\" decimal-part)? ([eE] [-+]? integral-part)? space", {"integral-part", "decimal-part"}}},
-    {"integer", {"(\"-\"? integral-part) space", {"integral-part"}}},
+    {"number", {"(\"-\"? integral-part) (\".\" decimal-part)? ([eE] [-+]? integral-part)?", {"integral-part", "decimal-part"}}},
+    {"integer", {"(\"-\"? integral-part)", {"integral-part"}}},
    {"value", {"object | array | string | number | boolean | null", {"object", "array", "string", "number", "boolean", "null"}}},
-    {"object", {"\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? \"}\" space", {"string", "value"}}},
-    {"array", {"\"[\" space ( value (\",\" space value)* )? \"]\" space", {"value"}}},
-    {"uuid", {"\"\\\"\" [0-9a-fA-F]{8} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{12} \"\\\"\" space", {}}},
+    {"object", {"\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? space \"}\"", {"string", "value"}}},
+    {"array", {"\"[\" space ( value (\",\" space value)* )? space \"]\"", {"value"}}},
+    {"uuid", {"\"\\\"\" [0-9a-fA-F]{8} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{12} \"\\\"\"", {}}},
    {"char",   {"[^\"\\\\\\x7F\\x00-\\x1F] | [\\\\] ([\"\\\\bfnrt] | \"u\" [0-9a-fA-F]{4})", {}}},
-    {"string", {"\"\\\"\" char* \"\\\"\" space", {"char"}}},
-    {"null", {"\"null\" space", {}}},
+    {"string", {"\"\\\"\" char* \"\\\"\"", {"char"}}},
+    {"null", {"\"null\"", {}}},
 };

 static std::unordered_map<std::string, BuiltinRule> STRING_FORMAT_RULES = {
    {"date", {"[0-9]{4} \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )", {}}},
    {"time", {"([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9]{3} )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )", {}}},
    {"date-time", {"date \"T\" time", {"date", "time"}}},
-    {"date-string", {"\"\\\"\" date \"\\\"\" space", {"date"}}},
-    {"time-string", {"\"\\\"\" time \"\\\"\" space", {"time"}}},
-    {"date-time-string", {"\"\\\"\" date-time \"\\\"\" space", {"date-time"}}}
+    {"date-string", {"\"\\\"\" date \"\\\"\"", {"date"}}},
+    {"time-string", {"\"\\\"\" time \"\\\"\"", {"time"}}},
+    {"date-time-string", {"\"\\\"\" date-time \"\\\"\"", {"date-time"}}}
 };

 static bool is_reserved_name(const std::string & name) {
@@ -551,16 +551,16 @@ private:
            }
            return join_seq();
        };
-        return _add_rule(name, "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\" space");
+        return _add_rule(name, "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\"");
    }

    /*
        Returns a rule that matches a JSON string that is none of the provided strings

        not_strings({"a"})
-            -> ["] ( [a] char+ | [^"a] char* )? ["] space
+            -> ["] ( [a] char+ | [^"a] char* )? ["]
        not_strings({"and", "also"})
-            -> ["] ( [a] ([l] ([s] ([o] char+ | [^"o] char*) | [^"s] char*) | [n] ([d] char+ | [^"d] char*) | [^"ln] char*) | [^"a] char* )? ["] space
+            -> ["] ( [a] ([l] ([s] ([o] char+ | [^"o] char*) | [^"s] char*) | [n] ([d] char+ | [^"d] char*) | [^"ln] char*) | [^"a] char* )? ["]
    */
    std::string _not_strings(const std::vector<std::string> & strings) {

@@ -619,7 +619,7 @@ private:
        if (!trie.is_end_of_string) {
            out << "?";
        }
-        out << " [\"] space";
+        out << " [\"]";
        return out.str();
    }

@@ -725,7 +725,7 @@ private:
            rule += " )?";
        }

-        rule += " \"}\" space";
+        rule += " space \"}\"";

        return rule;
    }
@@ -858,14 +858,14 @@ public:
            return _add_rule(rule_name, _generate_union_rule(name, schema_types));
        }
        if (schema.contains("const")) {
-            return _add_rule(rule_name, _generate_constant_rule(schema["const"]) + " space");
+            return _add_rule(rule_name, _generate_constant_rule(schema["const"]));
        }
        if (schema.contains("enum")) {
            std::vector<std::string> enum_values;
            for (const auto & v : schema["enum"]) {
                enum_values.push_back(_generate_constant_rule(v));
            }
-            return _add_rule(rule_name, "(" + string_join(enum_values, " | ") + ") space");
+            return _add_rule(rule_name, "(" + string_join(enum_values, " | ") + ")");
        }
        if ((schema_type.is_null() || schema_type == "object")
                && (schema.contains("properties") ||
@@ -933,7 +933,7 @@ public:
                    }
                }
                if (!enum_intersection.empty()) {
-                    return _add_rule(rule_name, "(" + string_join(enum_intersection, " | ") + ") space");
+                    return _add_rule(rule_name, "(" + string_join(enum_intersection, " | ") + ")");
                }
            }
            return _add_rule(rule_name, _build_object_rule(properties, required, hybrid_name, json()));
@@ -948,7 +948,7 @@ public:
                    }
                    rule += visit(items[i], name + (name.empty() ? "" : "-") + "tuple-" + std::to_string(i));
                }
-                rule += " \"]\" space";
+                rule += " space \"]\"";
                return _add_rule(rule_name, rule);
            }
            std::string item_rule_name = visit(items, name + (name.empty() ? "" : "-") + "item");
@@ -956,7 +956,7 @@ public:
            json max_items_json = schema.contains("maxItems") ? schema["maxItems"] : json();
            int max_items = max_items_json.is_number_integer() ? max_items_json.get<int>() : std::numeric_limits<int>::max();

-            return _add_rule(rule_name, "\"[\" space " + build_repetition(item_rule_name, min_items, max_items, "\",\" space") + " \"]\" space");
+            return _add_rule(rule_name, "\"[\" space " + build_repetition(item_rule_name, min_items, max_items, "\",\" space") + " space \"]\"");
        }
        if ((schema_type.is_null() || schema_type == "string") && schema.contains("pattern")) {
            return _visit_pattern(schema["pattern"], rule_name);
@@ -972,7 +972,7 @@ public:
            std::string char_rule = _add_primitive("char", PRIMITIVE_RULES.at("char"));
            int min_len = schema.contains("minLength") ? schema["minLength"].get<int>() : 0;
            int max_len = schema.contains("maxLength") ? schema["maxLength"].get<int>() : std::numeric_limits<int>::max();
-            return _add_rule(rule_name, "\"\\\"\" " + build_repetition(char_rule, min_len, max_len) + " \"\\\"\" space");
+            return _add_rule(rule_name, "\"\\\"\" " + build_repetition(char_rule, min_len, max_len) + " \"\\\"\"");
        }
        if (schema_type == "integer" && (schema.contains("minimum") || schema.contains("exclusiveMinimum") || schema.contains("maximum") || schema.contains("exclusiveMaximum"))) {
            int64_t min_value = std::numeric_limits<int64_t>::min();
@@ -990,7 +990,7 @@ public:
            std::stringstream out;
            out << "(";
            build_min_max_int(min_value, max_value, out);
-            out << ") space";
+            out << ")";
            return _add_rule(rule_name, out.str());
        }
        if (schema.empty() || schema_type == "object") {
@@ -6,13 +6,14 @@
 #include "unicode.h"

 #include <algorithm>
+#include <deque>
 #include <initializer_list>
 #include <map>
 #include <memory>
 #include <nlohmann/json.hpp>
 #include <regex>
+#include <set>
 #include <stdexcept>
-#include <unordered_set>

 // Trick to catch missing branches
 template <typename T>
@@ -88,40 +89,7 @@ struct trie {
        return match_result{match_result::NO_MATCH};
    }

-    struct prefix_and_next {
-        std::vector<uint32_t> prefix;
-        std::vector<uint32_t> next_chars;
-    };
-
-    std::vector<prefix_and_next> collect_prefix_and_next() {
-        std::vector<uint32_t>        prefix;
-        std::vector<prefix_and_next> result;
-        collect_prefix_and_next(0, prefix, result);
-        return result;
-    }
-
  private:
-    void collect_prefix_and_next(size_t index, std::vector<uint32_t> & prefix, std::vector<prefix_and_next> & out) {
-        if (!nodes[index].is_word) {
-            if (!nodes[index].children.empty()) {
-                std::vector<uint32_t> chars;
-                chars.reserve(nodes[index].children.size());
-                for (const auto & p : nodes[index].children) {
-                    chars.push_back(p.first);
-                }
-                out.emplace_back(prefix_and_next{prefix, chars});
-            }
-        }
-
-        for (const auto & p : nodes[index].children) {
-            uint32_t ch = p.first;
-            auto child = p.second;
-            prefix.push_back(ch);
-            collect_prefix_and_next(child, prefix, out);
-            prefix.pop_back();
-        }
-    }
-
    size_t create_node() {
        size_t index = nodes.size();
        nodes.emplace_back();
@@ -153,6 +121,65 @@ struct trie {
    }
 };

+// Aho-Corasick automaton
+struct aho_corasick {
+    trie                t;
+    std::vector<size_t> fail;      // failure links
+    std::vector<size_t> order;     // states in BFS order
+    std::vector<bool>   terminal;  // match states (directly or via a suffix link)
+    std::set<uint32_t>  alphabet;  // every character with a transition
+
+    aho_corasick(const std::vector<std::string> & strings) : t(strings) {
+        const auto & nodes = t.nodes;
+        const size_t n = nodes.size();
+
+        fail.assign(n, 0);
+        order.reserve(n);
+
+        std::deque<size_t> queue{ 0 };
+        while (!queue.empty()) {
+            size_t u = queue.front();
+            queue.pop_front();
+            order.push_back(u);
+            for (const auto & [ch, v] : nodes[u].children) {
+                if (u != 0) {
+                    size_t f = fail[u];
+                    while (f && nodes[f].children.find(ch) == nodes[f].children.end()) {
+                        f = fail[f];
+                    }
+                    auto it = nodes[f].children.find(ch);
+                    fail[v] = (it != nodes[f].children.end() && it->second != v) ? it->second : 0;
+                }
+                queue.push_back(v);
+            }
+        }
+
+        terminal.assign(n, false);
+        for (size_t u : order) {
+            terminal[u] = nodes[u].is_word || (u != 0 && terminal[fail[u]]);
+        }
+
+        for (const auto & node : nodes) {
+            for (const auto & [ch, v] : node.children) {
+                alphabet.insert(ch);
+            }
+        }
+    }
+
+    size_t num_states()          const { return t.nodes.size(); }
+    bool   is_terminal(size_t s) const { return terminal[s]; }
+
+    // follow failure links until a transition on `ch` exists.
+    size_t next(size_t state, uint32_t ch) const {
+        const auto & nodes = t.nodes;
+        while (state && nodes[state].children.find(ch) == nodes[state].children.end()) {
+            state = fail[state];
+        }
+        auto it = nodes[state].children.find(ch);
+        return it != nodes[state].children.end() ? it->second : 0;
+    }
+};
+
 static std::pair<uint32_t, size_t> parse_hex_escape(const std::string & str, size_t pos, int hex_count) {
    if (pos + hex_count > str.length()) {
        return {0, 0};
@@ -992,12 +1019,12 @@ void common_peg_arena::resolve_refs() {
 }

 std::string common_peg_arena::dump(common_peg_parser_id id) const {
-    std::unordered_set<common_peg_parser_id> visited;
+    std::set<common_peg_parser_id> visited;
    return dump_impl(id, visited);
 }

 std::string common_peg_arena::dump_impl(common_peg_parser_id                       id,
-                                        std::unordered_set<common_peg_parser_id> & visited) const {
+                                        std::set<common_peg_parser_id> & visited) const {
    // Check for cycles
    if (visited.count(id)) {
        return "[cycle]";
@@ -1342,7 +1369,7 @@ common_peg_parser common_peg_parser_builder::json_object() {
 common_peg_parser common_peg_parser_builder::json_array() {
    return rule("json-array", [this]() {
        auto ws = space();
-        auto elements = sequence({json(), zero_or_more(sequence({literal(","), ws, json()}))});
+        auto elements = sequence({json(), zero_or_more(sequence({ws, literal(","), ws, json()}))});
        return sequence({
            literal("["),
            ws,
@@ -1502,61 +1529,74 @@ static std::string gbnf_escape_char_class(uint32_t c) {
    return std::string(buf);
 }

-static std::string gbnf_excluding_pattern(const std::vector<std::string> & strings) {
-    trie matcher(strings);
-    auto pieces = matcher.collect_prefix_and_next();
+// GBNF grammar matching strings that contain no string in `strings` as a
+// substring. Emits the complement of an Aho-Corasick automaton DFA and returns
+// the start state rule name.
+//
+// ref: https://github.com/ggml-org/llama.cpp/pull/24839
+static std::string gbnf_excluding_grammar(const common_grammar_builder & builder,
+                                          const std::string &            prefix,
+                                          const std::vector<std::string> & strings) {
+    aho_corasick ac(strings);

-    std::string pattern;
-    std::string trailing;  // optional proper-prefix of a delimiter, allowed only at the very end
-    for (size_t i = 0; i < pieces.size(); ++i) {
-        if (i > 0) {
-            pattern += " | ";
+    auto state_name = [&](size_t s) -> std::string {
+        if (s == 0) {
+            return prefix;
        }
+        std::string num = std::to_string(s);
+        num = num.size() == 1 ? ("0" + num) : num;
+        return prefix + "-" + num;
+    };

-        const auto & pre = pieces[i].prefix;
-        const auto & chars = pieces[i].next_chars;
-
-        std::string cls;
-        cls.reserve(chars.size());
+    auto char_class = [](const std::vector<uint32_t> & chars, bool negate) {
+        std::string s = negate ? "[^" : "[";
        for (uint32_t ch : chars) {
-            cls += gbnf_escape_char_class(ch);
+            s += gbnf_escape_char_class(ch);
+        }
+        return s + "]";
+    };
+
+    for (size_t q = 0; q < ac.num_states(); q++) {
+        if (ac.is_terminal(q)) {
+            continue; // match states are dropped
        }

-        if (!pre.empty()) {
-            std::string pre_literal = gbnf_format_literal(common_unicode_cpts_to_utf8(pre));
-            pattern += pre_literal + " [^" + cls + "]";
-            // Each interior alternative consumes a delimiter-prefix plus a disambiguating
-            // char, so the repetition alone cannot match a value that *ends* on a proper
-            // prefix of a delimiter (e.g. a trailing "\n" when the delimiter is
-            // "\n</parameter>\n"). The runtime until() (greedy first-match) accepts such
-            // values, so without this the grammar would reject input the parser accepts.
-            // Allow the value to terminate on any proper prefix as an optional tail.
-            // This makes the grammar a slight superset of the runtime language (a value
-            // may end on the longest prefix, which greedy first-match would not itself
-            // produce); harmless for constrained generation, which only needs to admit
-            // every runtime-valid string.
-            if (!trailing.empty()) {
-                trailing += " | ";
+        std::map<size_t, std::vector<uint32_t>> buckets;
+        std::vector<uint32_t> excluded;
+        for (uint32_t c : ac.alphabet) {
+            size_t d = ac.next(q, c);
+            if (ac.is_terminal(d)) {
+                excluded.push_back(c); // completes a forbidden string -> omit
+            } else if (d != 0) {
+                buckets[d].push_back(c); // specific non-root destination
+                excluded.push_back(c);
            }
-            trailing += pre_literal;
-        } else {
-            pattern += "[^" + cls + "]";
        }
+
+        std::string rhs = "|"; // every state is accepting
+        for (const auto & [d, chars] : buckets) {
+            rhs += " " + char_class(chars, false) + " " + state_name(d) + " |";
+        }
+        rhs += " " + char_class(excluded, true) + " " + state_name(0);
+
+        builder.add_rule(state_name(q), rhs);
    }

-    std::string result = "(" + pattern + ")*";
-    if (!trailing.empty()) {
-        result += " (" + trailing + ")?";
+    // An empty delimiter makes the start state terminal. Emit an entry rule
+    // that matches nothing so the returned reference stays valid.
+    if (ac.is_terminal(0)) {
+        builder.add_rule(prefix, "|");
    }
-    return result;
+
+    return state_name(0);
 }

-static std::unordered_set<std::string> collect_reachable_rules(
+static std::set<std::string> collect_reachable_rules(
    const common_peg_arena & arena,
    const common_peg_parser_id & rule
 ) {
-    std::unordered_set<std::string> reachable;
-    std::unordered_set<std::string> visited;
+    std::set<std::string> reachable;
+    std::set<std::string> visited;

    std::function<void(common_peg_parser_id)> visit = [&](common_peg_parser_id id) {
        const auto & parser = arena.get(id);
@@ -1765,7 +1805,7 @@ void common_peg_arena::build_grammar(const common_grammar_builder & builder, boo
                if (p.delimiters.empty()) {
                    return ".*";
                }
-                return gbnf_excluding_pattern(p.delimiters);
+                return gbnf_excluding_grammar(builder, "until-" + std::to_string(id), p.delimiters);
            } else if constexpr (std::is_same_v<T, common_peg_schema_parser>) {
                if (schema_delegates(p)) {
                    return to_gbnf(p.child);
@@ -1789,7 +1829,7 @@ void common_peg_arena::build_grammar(const common_grammar_builder & builder, boo
    };

    // Collect reachable rules
-    std::unordered_set<std::string> reachable_rules;
+    std::set<std::string> reachable_rules;

    if (lazy) {
        // Collect rules reachable from trigger rules
@@ -3,8 +3,8 @@
 #include <nlohmann/json_fwd.hpp>

 #include <memory>
+#include <set>
 #include <unordered_map>
-#include <unordered_set>
 #include <string>
 #include <string_view>
 #include <functional>
@@ -335,7 +335,7 @@ class common_peg_arena {
    friend class common_peg_parser_builder;

  private:
-    std::string dump_impl(common_peg_parser_id id, std::unordered_set<common_peg_parser_id> & visited) const;
+    std::string dump_impl(common_peg_parser_id id, std::set<common_peg_parser_id> & visited) const;

    common_peg_parser_id add_parser(common_peg_parser_variant parser);
    void add_rule(const std::string & name, common_peg_parser_id id);
@@ -161,6 +161,10 @@ struct common_speculative_impl {

    virtual void accept(llama_seq_id seq_id, uint16_t n_accepted, bool is_other) = 0;

+    // (optional) serialize/restore per-seq internal state (e.g. eagle3's deferred boundary).
+    virtual bool get_state(llama_seq_id /*seq_id*/, std::vector<uint8_t> & /*data*/) const { return false; }
+    virtual void set_state(llama_seq_id /*seq_id*/, const std::vector<uint8_t> & /*data*/) {}
+
    // true if this implementation requires the target context to extract post-norm embeddings
    virtual bool need_embd() const = 0;

@@ -841,6 +845,49 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
                    (size_t) n_embd_dec * sizeof(float));
    }

+    // we only need to stash the deferred boundary's g_embd row for recurrent/hybrid targets:
+    // their single-position checkpoints drop it on restore
+    bool need_boundary_stash() const {
+        const llama_model * model_tgt = llama_get_model(params.ctx_tgt);
+        return llama_model_is_recurrent(model_tgt) || llama_model_is_hybrid(model_tgt);
+    }
+
+    bool get_state(llama_seq_id seq_id, std::vector<uint8_t> & data) const override {
+        if (!need_boundary_stash()) {
+            return false;
+        }
+        if (seq_id < 0 || seq_id >= (llama_seq_id) n_seq || pending_pos_last[seq_id] < 0) {
+            return false;
+        }
+
+        const llama_pos          pos = pending_pos_last[seq_id];
+        const std::vector<float> & g = pending_g_last[seq_id];
+
+        data.resize(sizeof(llama_pos) + g.size() * sizeof(float));
+        std::memcpy(data.data(),                     &pos,     sizeof(llama_pos));
+        std::memcpy(data.data() + sizeof(llama_pos), g.data(), g.size() * sizeof(float));
+        return true;
+    }
+
+    void set_state(llama_seq_id seq_id, const std::vector<uint8_t> & data) override {
+        if (!need_boundary_stash()) {
+            return;
+        }
+        if (seq_id < 0 || seq_id >= (llama_seq_id) n_seq) {
+            return;
+        }
+        if (data.size() != sizeof(llama_pos) + (size_t) n_embd_dec * sizeof(float)) {
+            return;
+        }
+
+        llama_pos pos = -1;
+        std::memcpy(&pos, data.data(), sizeof(llama_pos));
+
+        pending_pos_last[seq_id] = pos;
+        pending_g_last[seq_id].resize(n_embd_dec);
+        std::memcpy(pending_g_last[seq_id].data(), data.data() + sizeof(llama_pos), (size_t) n_embd_dec * sizeof(float));
+    }
+
    bool need_embd() const override {
        return false;
    }
@@ -858,7 +905,13 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {

    int32_t n_embd = 0;

-    bool is_mem_shared = false;
+    // One MTP draft driver, three modes (set once in the ctor):
+    //   is_mem_shared (gemma4): shares the target KV, runs all heads in one graph.
+    //   chain_heads (step35): n_mtp_layers trained heads, one per draft step.
+    //   neither (qwen35 / qwen35moe): a single trained MTP head.
+    int32_t n_mtp_layers  = 1;
+    bool    is_mem_shared = false;   // gemma4
+    bool    chain_heads   = false;   // derived in the ctor: n_mtp_layers > 1 && !is_mem_shared

    // Per-sequence cross-batch carryover: pair (h_p, x_{p+1}) at MTP pos p+1.
    // The last h-row of one process() call needs the first token of the NEXT
@@ -873,10 +926,8 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
    std::vector<std::vector<float>> verify_h;
    std::vector<int32_t> verify_h_rows;

-    // Per-seq draft length from the last draft() call, used in accept() to
-    // roll back ctx_dft's recurrent state past the AR draft's redundant
-    // pre-advancement before process() mirrored the verify batch.
-    std::vector<uint16_t> last_n_drafted;
+    std::vector<int>                i_last;
+    std::vector<std::vector<float>> chain_h;

    common_speculative_impl_draft_mtp(const common_params_speculative & params, uint32_t n_seq)
        : common_speculative_impl(COMMON_SPECULATIVE_TYPE_DRAFT_MTP, n_seq)
@@ -889,6 +940,7 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
        n_embd = llama_model_n_embd_out(llama_get_model(ctx_dft));
        GGML_ASSERT(n_embd == llama_model_n_embd(llama_get_model(ctx_tgt)) &&
                "MTP input row width must match the target h_nextn width");
+        n_mtp_layers = std::max(1, (int) llama_model_n_layer_nextn(llama_get_model(ctx_dft)));

        LOG_INF("%s: adding speculative implementation 'draft-mtp'\n", __func__);
        LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%.2f, n_embd=%d, backend_sampling=%d\n", __func__, this->params.n_max, this->params.n_min, this->params.p_min, n_embd, (int) this->params.backend_sampling);
@@ -935,16 +987,25 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
        llama_set_embeddings_nextn(ctx_dft, true, /*masked*/ true);

        is_mem_shared = llama_get_ctx_other(ctx_dft) == ctx_tgt;
+        chain_heads   = n_mtp_layers > 1 && !is_mem_shared;
+
+        if (chain_heads) {
+            this->params.n_max = std::min(this->params.n_max, n_mtp_layers);
+
+            chain_h.assign(n_seq, {});
+            for (auto & c : chain_h) {
+                c.reserve((size_t) (this->params.n_max + 1) * n_embd);
+            }
+        }

        pending_h.assign(n_seq, std::vector<float>(n_embd, 0.0f));

+        i_last.assign(n_seq, -1);
        i_batch_beg.assign(n_seq, -1);
        i_batch_end.assign(n_seq, -1);

        verify_h.assign(n_seq, {});
        verify_h_rows.assign(n_seq, 0);
-
-        last_n_drafted.assign(n_seq, 0);
    }

    ~common_speculative_impl_draft_mtp() override {
@@ -1050,9 +1111,34 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
                set_h(i_batch_beg[seq_id], pending_h[seq_id].data());
            }

-            const int32_t rc = llama_decode(ctx_dft, batch);
-            if (rc != 0) {
-                LOG_ERR("%s: llama_decode(ctx_dft) failed rc=%d (pos=%d)\n", __func__, (int) rc, (int) batch_in.pos[0]);
+            auto * mem_dft = llama_get_memory(ctx_dft);
+
+            bool ok = true;
+            for (int head = 0; head < n_mtp_layers; ++head) {
+                if (chain_heads) {
+                    // ref: https://github.com/ggml-org/llama.cpp/pull/24340/changes#r3413498544
+                    for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
+                        if (i_batch_beg[seq_id] < 0) {
+                            continue;
+                        }
+                        llama_memory_seq_rm(mem_dft, seq_id, batch_in.pos[i_batch_beg[seq_id]], -1);
+                    }
+                    llama_set_nextn_layer_offset(ctx_dft, head);
+                }
+
+                const int32_t rc = llama_decode(ctx_dft, batch);
+                if (rc != 0) {
+                    LOG_ERR("%s: llama_decode(ctx_dft) head=%d failed rc=%d (pos=%d)\n",
+                            __func__, head, (int) rc, (int) batch_in.pos[0]);
+                    ok = false;
+                    break;
+                }
+            }
+
+            if (chain_heads) {
+                llama_set_nextn_layer_offset(ctx_dft, 0); // restore default for non-draft decodes
+            }
+            if (!ok) {
                return false;
            }
        }
@@ -1087,7 +1173,6 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
        int n_drafting = 0;
        std::vector<bool> drafting(n_seq);

-        const float * h_row = nullptr;
        const size_t row_bytes = (size_t) n_embd * sizeof(float);

        for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
@@ -1102,22 +1187,43 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
            common_sampler_reset(smpls[seq_id].get());

            common_batch_add(batch, dp.id_last, dp.n_past, { seq_id }, true);
+            std::memcpy(batch.embd + (size_t) (batch.n_tokens - 1) * n_embd, pending_h[seq_id].data(), row_bytes);

-            h_row = pending_h[seq_id].data();
-            std::memcpy(batch.embd + n_embd*(batch.n_tokens - 1), h_row, row_bytes);
-        }
+            i_last[seq_id] = batch.n_tokens - 1;

-        int ret = llama_decode(ctx_dft, batch);
-        if (ret != 0) {
-            LOG_WRN("%s: llama_decode returned %d\n", __func__, ret);
-            return;
+            if (chain_heads) {
+                chain_h[seq_id].assign(pending_h[seq_id].begin(), pending_h[seq_id].end());
+            }
        }

        int i = 0;

        while (n_drafting > 0) {
-            int i_batch = 0;
+            // each step decodes under a different head, i.e. a different decoder layer, and
+            // KV is per layer. process() filled this layer's KV only for positions < n_past
+            // (prompt + accepted prefix) — nothing in the draft region yet. so reset the
+            // draft region (the seq_rm lower bound is n_past, leaving the prompt KV intact)
+            // and select head i so it rebuilds its own layer's KV there; decoding just the
+            // latest token would leave its attention reading cells only another head wrote.
+            if (chain_heads) {
+                auto * mem_dft = llama_get_memory(ctx_dft);
+                for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
+                    if (drafting[seq_id]) {
+                        llama_memory_seq_rm(mem_dft, seq_id, dparams[seq_id].n_past, -1);
+                    }
+                }
+                llama_set_nextn_layer_offset(ctx_dft, i);
+            }

+            int ret = llama_decode(ctx_dft, batch);
+            if (ret != 0) {
+                LOG_WRN("%s: llama_decode[%d] returned %d\n", __func__, i, ret);
+                break;
+            }
+
+            // rebuild the batch for the next step: the growing-KV paths re-add only the
+            // new token (the KV already holds the prefix), while chained heads re-add the
+            // whole prefix at the next head. dropped sequences are simply not re-added.
            common_batch_clear(batch);

            for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
@@ -1127,9 +1233,8 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {

                auto * smpl = smpls[seq_id].get();

-                common_sampler_sample(smpl, ctx_dft, i_batch, true);
-                h_row = llama_get_embeddings_nextn_ith(ctx_dft, i_batch);
-                ++i_batch;
+                common_sampler_sample(smpl, ctx_dft, i_last[seq_id], true);
+                const float * h_row = llama_get_embeddings_nextn_ith(ctx_dft, i_last[seq_id]);

                const auto * cur_p = common_sampler_get_candidates(smpl, true);

@@ -1163,30 +1268,41 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
                    continue;
                }

-                if (is_mem_shared) {
+                if (chain_heads) {
+                    // ref: https://github.com/ggml-org/llama.cpp/pull/24340#discussion_r3448031546
+                    chain_h[seq_id].insert(chain_h[seq_id].end(), h_row, h_row + n_embd);
+
+                    const int n_rows = (int) result.size() + 1; // id_last + tokens drafted so far
+                    for (int t = 0; t < n_rows; ++t) {
+                        const llama_token tok = (t == 0) ? dp.id_last : result[t - 1];
+                        common_batch_add(batch, tok, dp.n_past + t, { seq_id }, t == n_rows - 1);
+                        std::memcpy(batch.embd + (size_t) (batch.n_tokens - 1) * n_embd,
+                                    chain_h[seq_id].data() + (size_t) t * n_embd, row_bytes);
+                    }
+                } else if (is_mem_shared) {
                    // note: with shared memory (e.g. Gemma4 assistants) we use the same position for all draft tokens
                    // ref: https://github.com/huggingface/transformers/blob/effde20942e3f82a1b97449f60b3a48c5ff96145/docs/source/en/model_doc/gemma4_assistant.md?plain=1#L36-L37
                    common_batch_add(batch, id, dp.n_past, { seq_id }, true);
+                    std::memcpy(batch.embd + (size_t) (batch.n_tokens - 1) * n_embd, h_row, row_bytes);
                } else {
                    common_batch_add(batch, id, dp.n_past + i + 1, { seq_id }, true);
+                    std::memcpy(batch.embd + (size_t) (batch.n_tokens - 1) * n_embd, h_row, row_bytes);
                }
-                std::memcpy(batch.embd + n_embd*(batch.n_tokens - 1), h_row, row_bytes);
+
+                i_last[seq_id] = batch.n_tokens - 1;
            }

            if (batch.n_tokens == 0) {
                break;
            }

-            // evaluate the drafted tokens on the draft model
-            ret = llama_decode(ctx_dft, batch);
-            if (ret != 0) {
-                LOG_WRN("%s: llama_decode[%d] returned %d\n", __func__, i, ret);
-                break;
-            }
-
            ++i;
        }

+        if (chain_heads) {
+            llama_set_nextn_layer_offset(ctx_dft, 0); // restore default for non-draft decodes
+        }
+
        for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
            auto & dp = dparams[seq_id];
            if (!dp.drafting) {
@@ -1196,8 +1312,6 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
            if (dp.result->size() < (size_t) params.n_min) {
                dp.result->clear();
            }
-
-            last_n_drafted[seq_id] = (uint16_t) dp.result->size();
        }
    }

@@ -1810,7 +1924,7 @@ common_speculative * common_speculative_init(common_params_speculative & params,

        bool has_draft_simple = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE));
        bool has_draft_eagle3 = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3)) && params.draft.ctx_dft != nullptr;
-        bool has_mtp = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_MTP)) && params.draft.ctx_dft != nullptr;
+        bool has_draft_mtp    = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_MTP))    && params.draft.ctx_dft != nullptr;



@@ -1848,7 +1962,7 @@ common_speculative * common_speculative_init(common_params_speculative & params,
        if (has_draft_eagle3) {
            configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3, params));
        }
-        if (has_mtp) {
+        if (has_draft_mtp) {
            configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_DRAFT_MTP, params));
        }
    }
@@ -2118,6 +2232,31 @@ void common_speculative_accept(common_speculative * spec, llama_seq_id seq_id, u
    }
 }

+// TODO: support the case of more than one speculative implementations having a state
+bool common_speculative_get_state(common_speculative * spec, llama_seq_id seq_id, std::vector<uint8_t> & data) {
+    if (spec == nullptr) {
+        return false;
+    }
+
+    for (auto & impl : spec->impls) {
+        if (impl->get_state(seq_id, data)) {
+            return true;
+        }
+    }
+
+    return false;
+}
+
+void common_speculative_set_state(common_speculative * spec, llama_seq_id seq_id, const std::vector<uint8_t> & data) {
+    if (spec == nullptr) {
+        return;
+    }
+
+    for (auto & impl : spec->impls) {
+        impl->set_state(seq_id, data);
+    }
+}
+
 void common_speculative_print_stats(const common_speculative * spec) {
    if (spec == nullptr) {
        return;
@@ -68,6 +68,10 @@ void common_speculative_draft(common_speculative * spec);
 // informs the speculative context that n_accepted tokens were accepted by the target model
 void common_speculative_accept(common_speculative * spec, llama_seq_id, uint16_t n_accepted);

+// (optional) get/set internal state
+bool common_speculative_get_state(common_speculative * spec, llama_seq_id seq_id, std::vector<uint8_t> & data);
+void common_speculative_set_state(common_speculative * spec, llama_seq_id seq_id, const std::vector<uint8_t> & data);
+
 // print statistics about the speculative decoding
 void common_speculative_print_stats(const common_speculative * spec);

@@ -126,7 +126,7 @@ class BailingMoeV2Model(TextModel):
        if (rope_dim := hparams.get("head_dim")) is None:
            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]

-        self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
+        self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.rope_parameters.get("partial_rotary_factor", 0.5)))
        self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
        self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
@@ -1119,8 +1119,10 @@ class TextModel(ModelBase):

        rope_theta = self.find_hparam(["global_rope_theta", "rope_global_theta", "rope_theta_global", "rope_theta", "rotary_emb_base"], optional=True)
        local_rope_theta = self.find_hparam(["local_rope_theta", "rope_local_theta", "rope_theta_local", "swa_rope_theta", "rope_local_base_freq"], optional=True)
+        partial_rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct", "rope_percent"], optional=True)
+        original_max_position_embeddings = self.find_hparam(["original_max_position_embeddings"], optional=True)

-        # Ensure "rope_theta" and "rope_type" is mirrored in rope_parameters
+        # Ensure global params are mirrored in rope_parameters
        if "full_attention" not in self.rope_parameters and "sliding_attention" not in self.rope_parameters:
            if local_rope_theta is not None:
                self.rope_parameters["sliding_attention"] = {"rope_theta": local_rope_theta}
@@ -1128,6 +1130,10 @@ class TextModel(ModelBase):
                self.rope_parameters["rope_theta"] = rope_theta
            if "rope_type" not in self.rope_parameters and (rope_type := self.rope_parameters.get("type")) is not None:
                self.rope_parameters["rope_type"] = rope_type
+            if "partial_rotary_factor" not in self.rope_parameters and partial_rotary_factor is not None:
+                self.rope_parameters["partial_rotary_factor"] = partial_rotary_factor
+            if "original_max_position_embeddings" not in self.rope_parameters and original_max_position_embeddings is not None:
+                self.rope_parameters["original_max_position_embeddings"] = original_max_position_embeddings

    @classmethod
    def __init_subclass__(cls):
@@ -148,7 +148,7 @@ class ChatGLMModel(TextModel):
            rope_dim = self.hparams["attention_dim"]
        else:
            rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
-        self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
+        self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.rope_parameters.get("partial_rotary_factor", 0.5)))
        self.gguf_writer.add_add_bos_token(False)
        rope_freq = 10000
        if "rope_ratio" in self.hparams:
@@ -161,7 +161,7 @@ class DeciModel(TextModel):
                factor = rope_params.get("factor", 8.0)
                low_freq_factor = rope_params.get("low_freq_factor", 1.0)
                high_freq_factor = rope_params.get("high_freq_factor", 4.0)
-                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
+                old_context_len = rope_params.get("original_max_position_embeddings", 8192)

                low_freq_wavelen = old_context_len / low_freq_factor
                high_freq_wavelen = old_context_len / high_freq_factor
@@ -24,7 +24,7 @@ class ExaoneModel(TextModel):

        assert (hparams["activation_function"] == "silu")

-        rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"], optional=True)
+        rotary_factor = self.rope_parameters.get("partial_rotary_factor")
        rotary_factor = rotary_factor if rotary_factor is not None else 1.0
        self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))

@@ -39,7 +39,7 @@ class ExaoneModel(TextModel):
                factor = rope_params.get("factor", 8.0)
                low_freq_factor = rope_params.get("low_freq_factor", 1.0)
                high_freq_factor = rope_params.get("high_freq_factor", 4.0)
-                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
+                old_context_len = rope_params.get("original_max_position_embeddings", 8192)

                low_freq_wavelen = old_context_len / low_freq_factor
                high_freq_wavelen = old_context_len / high_freq_factor
@@ -104,7 +104,7 @@ class Exaone4Model(TextModel):
                factor = rope_params.get("factor", 16.0)
                low_freq_factor = rope_params.get("low_freq_factor", 1.0)
                high_freq_factor = rope_params.get("high_freq_factor", 4.0)
-                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
+                old_context_len = rope_params.get("original_max_position_embeddings", 8192)

                low_freq_wavelen = old_context_len / low_freq_factor
                high_freq_wavelen = old_context_len / high_freq_factor
@@ -693,7 +693,7 @@ class Gemma4Model(Gemma3Model):
            self.gguf_writer.add_head_count_kv(value_arr)

        # handle n_rot differently for global vs swa layers
-        partial_rotary_factor_swa = self.hparams.get("partial_rotary_factor", 1.0)
+        partial_rotary_factor_swa = self.rope_parameters.get("partial_rotary_factor", 1.0)
        n_rot_full = int(head_dim_full) # "proportional" is used, see generate_extra_tensors
        n_rot_swa = int(head_dim_swa * partial_rotary_factor_swa)
        self.gguf_writer.add_rope_dimension_count(n_rot_full)
@@ -124,7 +124,7 @@ class Glm4MoeModel(TextModel):
                self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
            )
        self.gguf_writer.add_rope_dimension_count(
-            int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5))
+            int(rope_dim * self.rope_parameters.get("partial_rotary_factor", 0.5))
        )

        # MoE parameters - Use only routed expert count (shared experts handled separately)
@@ -226,7 +226,7 @@ class GlmMoeDsaModel(DeepseekV2Model):
        super().set_gguf_parameters()

        rope_dim = self.hparams["qk_rope_head_dim"]
-        partial_rotary_factor = self.hparams.get("partial_rotary_factor", 1.0)
+        partial_rotary_factor = self.rope_parameters.get("partial_rotary_factor", 1.0)
        self.gguf_writer.add_rope_dimension_count(int(rope_dim * partial_rotary_factor))

        # NextN/MTP prediction layers
@@ -289,7 +289,7 @@ class LlamaModel(TextModel):
                factor = rope_params.get("factor", 8.0)
                low_freq_factor = rope_params.get("low_freq_factor", 1.0)
                high_freq_factor = rope_params.get("high_freq_factor", 4.0)
-                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
+                old_context_len = rope_params.get("original_max_position_embeddings", 8192)

                low_freq_wavelen = old_context_len / low_freq_factor
                high_freq_wavelen = old_context_len / high_freq_factor
@@ -154,7 +154,7 @@ class MimoV2Model(TextModel):
        self.gguf_writer.add_expert_count(self.hparams["n_routed_experts"])
        self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"])

-        rope_dim = int(self.hparams["head_dim"] * self.hparams["partial_rotary_factor"])
+        rope_dim = int(self.hparams["head_dim"] * self.rope_parameters["partial_rotary_factor"])
        self.gguf_writer.add_rope_dimension_count(rope_dim)

        self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("layernorm_epsilon", 1e-5))
@@ -32,11 +32,9 @@ class MiniCPMModel(TextModel):
    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
        rope_dims = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]

-        rope_scaling = self.find_hparam(['rope_scaling'], True)
-        if rope_scaling is not None:
-            long_factors = rope_scaling.get('long_factor', None)
-            short_factors = rope_scaling.get('short_factor', None)
-
+        long_factors = self.rope_parameters.get('long_factor')
+        short_factors = self.rope_parameters.get('short_factor')
+        if long_factors or short_factors:
            if long_factors is None or short_factors is None:
                raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')

@@ -85,13 +83,11 @@ class MiniCPM3Model(TextModel):
        self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])

    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
-        rope_scaling = self.find_hparam(['rope_scaling'], True)
-        if rope_scaling is not None:
+        long_factors = self.rope_parameters.get('long_factor')
+        short_factors = self.rope_parameters.get('short_factor')
+        if long_factors or short_factors:
            rope_dims = self.hparams["qk_rope_head_dim"]

-            long_factors = rope_scaling.get('long_factor', None)
-            short_factors = rope_scaling.get('short_factor', None)
-
            if long_factors is None or short_factors is None:
                raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')

@@ -125,17 +125,18 @@ class NemotronModel(TextModel):
        self.gguf_writer.add_layer_norm_eps(f_norm_eps)

        # * Partial RoPE
-        rot_pct = self.find_hparam(["partial_rotary_factor", "rope_pct", "rope_percent"])
+        rot_pct = self.rope_parameters["partial_rotary_factor"]
        n_embd = self.find_hparam(["hidden_size", "n_embd"])
        n_head = self.find_hparam(["num_attention_heads", "n_head"])
        self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)

        # * RopeScaling for Nemotron
-        if "rope_scaling" not in self.hparams or self.hparams["rope_scaling"] is None:
+        factor = self.hparams.get("factor") or self.rope_parameters.get("factor")
+        if factor is None:
            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
        else:
            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
-            self.gguf_writer.add_rope_scaling_factor(self.hparams["factor"])
+            self.gguf_writer.add_rope_scaling_factor(factor)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        # * Adding +1 to LayerNorm's weights here to implement layernorm1p w/o changing anything on the GGML engine side
@@ -18,7 +18,7 @@ class Phi2Model(TextModel):
    model_arch = gguf.MODEL_ARCH.PHI2

    def set_gguf_parameters(self):
-        rot_pct = self.find_hparam(["partial_rotary_factor"])
+        rot_pct = self.rope_parameters["partial_rotary_factor"]
        n_embd = self.find_hparam(["hidden_size", "n_embd"])
        n_head = self.find_hparam(["num_attention_heads", "n_head"])

@@ -149,8 +149,8 @@ class Phi3MiniModel(TextModel):
        n_head_kv = self.find_hparam(["num_key_value_heads", "n_head_kv"])
        rms_eps = self.find_hparam(["rms_norm_eps"])
        max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
-        orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
-        rot_pct = self.hparams.get("partial_rotary_factor", 1.0)
+        orig_max_pos_embds = self.rope_parameters["original_max_position_embeddings"]
+        rot_pct = self.rope_parameters.get("partial_rotary_factor", 1.0)
        rope_dims = int(rot_pct * n_embd) // n_head

        self.gguf_writer.add_context_length(max_pos_embds)
@@ -174,18 +174,19 @@ class Phi3MiniModel(TextModel):
        n_embd = self.find_hparam(["hidden_size", "n_embd"])
        n_head = self.find_hparam(["num_attention_heads", "n_head"])
        max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
-        orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
-        rot_pct = self.hparams.get("partial_rotary_factor", 1.0)
+        orig_max_pos_embds = self.rope_parameters["original_max_position_embeddings"]
+        rot_pct = self.rope_parameters.get("partial_rotary_factor", 1.0)
        rope_dims = int(rot_pct * n_embd) // n_head

        # write rope scaling for long context (128k) model
-        rope_scaling = self.find_hparam(['rope_scaling'], True)
-        if rope_scaling is None:
+        long_factors = self.rope_parameters.get('long_factor')
+        short_factors = self.rope_parameters.get('short_factor')
+        if not long_factors:
            return

        scale = max_pos_embds / orig_max_pos_embds

-        rope_scaling_type = rope_scaling.get('rope_type', rope_scaling.get('type', '')).lower()
+        rope_scaling_type = self.rope_parameters.get('rope_type', '').lower()
        if len(rope_scaling_type) == 0:
            raise KeyError('Missing the required key rope_scaling.type')

@@ -198,9 +199,6 @@ class Phi3MiniModel(TextModel):

        self.gguf_writer.add_rope_scaling_attn_factors(attn_factor)

-        long_factors = rope_scaling.get('long_factor', None)
-        short_factors = rope_scaling.get('short_factor', None)
-
        if long_factors is None or short_factors is None:
            raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')

@@ -280,7 +280,7 @@ class Qwen3NextModel(Qwen2MoeModel):
        self.gguf_writer.add_full_attention_interval(self.hparams.get("full_attention_interval", 4))
        if (rope_dim := self.hparams.get("head_dim")) is None:
            rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
-        self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.25)))
+        self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.rope_parameters.get("partial_rotary_factor", 0.25)))

    @classmethod
    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
@@ -28,7 +28,7 @@ class StableLMModel(TextModel):
        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
        self.gguf_writer.add_block_count(self.block_count)
        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
-        rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"])
+        rotary_factor = self.rope_parameters["partial_rotary_factor"]
        self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
        self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"])
@@ -314,7 +314,7 @@ class Step35Model(TextModel):
        factor = float(rope_params.get("factor", 8.0))
        low_freq_factor = float(rope_params.get("low_freq_factor", 1.0))
        high_freq_factor = float(rope_params.get("high_freq_factor", 4.0))
-        old_context_len = int(rope_params.get("original_max_position_embeddings", self.hparams.get("original_max_position_embeddings", 8192)))
+        old_context_len = int(rope_params.get("original_max_position_embeddings", 8192))

        low_freq_wavelen = old_context_len / low_freq_factor
        high_freq_wavelen = old_context_len / high_freq_factor
@@ -198,18 +198,18 @@ class BuiltinRule:
 SPACE_RULE = '| " " | "\\n"{1,2} [ \\t]{0,20}'

 PRIMITIVE_RULES = {
-    'boolean'      : BuiltinRule('("true" | "false") space', []),
+    'boolean'      : BuiltinRule('("true" | "false")', []),
    'decimal-part' : BuiltinRule('[0-9]{1,16}', []),
    'integral-part': BuiltinRule('[0] | [1-9] [0-9]{0,15}', []),
-    'number'       : BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space', ['integral-part', 'decimal-part']),
-    'integer'      : BuiltinRule('("-"? integral-part) space', ['integral-part']),
+    'number'       : BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)?', ['integral-part', 'decimal-part']),
+    'integer'      : BuiltinRule('("-"? integral-part)', ['integral-part']),
    'value'        : BuiltinRule('object | array | string | number | boolean | null', ['object', 'array', 'string', 'number', 'boolean', 'null']),
-    'object'       : BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']),
-    'array'        : BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']),
-    'uuid'         : BuiltinRule(r'"\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\"" space', []),
+    'object'       : BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? space "}"', ['string', 'value']),
+    'array'        : BuiltinRule('"[" space ( value ("," space value)* )? space "]"', ['value']),
+    'uuid'         : BuiltinRule(r'"\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\""', []),
    'char'         : BuiltinRule(r'[^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})', []),
-    'string'       : BuiltinRule(r'"\"" char* "\"" space', ['char']),
-    'null'         : BuiltinRule('"null" space', []),
+    'string'       : BuiltinRule(r'"\"" char* "\""', ['char']),
+    'null'         : BuiltinRule('"null"', []),
 }

 # TODO: support "uri", "email" string formats
@@ -217,9 +217,9 @@ STRING_FORMAT_RULES = {
    'date'            : BuiltinRule('[0-9]{4} "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []),
    'time'            : BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9]{3} )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []),
    'date-time'       : BuiltinRule('date "T" time', ['date', 'time']),
-    'date-string'     : BuiltinRule('"\\"" date "\\"" space', ['date']),
-    'time-string'     : BuiltinRule('"\\"" time "\\"" space', ['time']),
-    'date-time-string': BuiltinRule('"\\"" date-time "\\"" space', ['date-time']),
+    'date-string'     : BuiltinRule('"\\"" date "\\""', ['date']),
+    'time-string'     : BuiltinRule('"\\"" time "\\""', ['time']),
+    'date-time-string': BuiltinRule('"\\"" date-time "\\""', ['date-time']),
 }

 DOTALL = '[\\U00000000-\\U0010FFFF]'
@@ -319,7 +319,7 @@ class SchemaConverter:
                out.append(f'[^"{"".join(rejects)}] {char_rule}*')
        visit(trie)

-        out.append(f' ){"" if trie.is_end_of_string else "?"} ["] space')
+        out.append(f' ){"" if trie.is_end_of_string else "?"} ["]')
        return ''.join(out)

    def _add_rule(self, name, rule):
@@ -549,7 +549,7 @@ class SchemaConverter:
        return self._add_rule(
            name,
            to_rule(transform()) if self._raw_pattern \
-                else "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\" space")
+                else "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\"")


    def _resolve_ref(self, ref):
@@ -580,10 +580,10 @@ class SchemaConverter:
            return self._add_rule(rule_name, self._generate_union_rule(name, [{**schema, 'type': t} for t in schema_type]))

        elif 'const' in schema:
-            return self._add_rule(rule_name, self._generate_constant_rule(schema['const']) + ' space')
+            return self._add_rule(rule_name, self._generate_constant_rule(schema['const']))

        elif 'enum' in schema:
-            rule = '(' + ' | '.join((self._generate_constant_rule(v) for v in schema['enum'])) + ') space'
+            rule = '(' + ' | '.join((self._generate_constant_rule(v) for v in schema['enum'])) + ')'
            return self._add_rule(rule_name, rule)

        elif schema_type in (None, 'object') and \
@@ -624,7 +624,7 @@ class SchemaConverter:
                    enum_intersection &= s

                if enum_intersection:
-                    rule = '(' + ' | '.join((self._generate_constant_rule(v) for v in sorted(enum_intersection))) + ') space'
+                    rule = '(' + ' | '.join((self._generate_constant_rule(v) for v in sorted(enum_intersection))) + ')'
                    return self._add_rule(rule_name, rule)

            return self._add_rule(rule_name, self._build_object_rule(properties, required, hybrid_name, additional_properties=None))
@@ -638,12 +638,12 @@ class SchemaConverter:
                    ' "," space '.join(
                        self.visit(item, f'{name}{"-" if name else ""}tuple-{i}')
                        for i, item in enumerate(items)) +
-                    ' "]" space')
+                    ' space "]"')
            else:
                item_rule_name = self.visit(items, f'{name}{"-" if name else ""}item')
                min_items = schema.get("minItems", 0)
                max_items = schema.get("maxItems")
-                return self._add_rule(rule_name, '"[" space ' + _build_repetition(item_rule_name, min_items, max_items, separator_rule='"," space') + ' "]" space')
+                return self._add_rule(rule_name, '"[" space ' + _build_repetition(item_rule_name, min_items, max_items, separator_rule='"," space') + ' space "]"')

        elif schema_type in (None, 'string') and 'pattern' in schema:
            return self._visit_pattern(schema['pattern'], rule_name)
@@ -663,7 +663,7 @@ class SchemaConverter:
            min_len = schema.get('minLength', 0)
            max_len = schema.get('maxLength')

-            return self._add_rule(rule_name, r'"\"" ' + _build_repetition(char_rule, min_len, max_len) + r' "\"" space')
+            return self._add_rule(rule_name, r'"\"" ' + _build_repetition(char_rule, min_len, max_len) + r' "\""')

        elif schema_type in (None, 'integer') and \
                ('minimum' in schema or 'exclusiveMinimum' in schema or 'maximum' in schema or 'exclusiveMaximum' in schema):
@@ -680,7 +680,7 @@ class SchemaConverter:

            out = ["("]
            _generate_min_max_int(min_value, max_value, out)
-            out.append(") space")
+            out.append(")")
            return self._add_rule(rule_name, ''.join(out))

        elif (schema_type == 'object') or (len(schema) == 0):
@@ -765,7 +765,7 @@ class SchemaConverter:
                rule += ' )'
            rule += ' )?'

-        rule += ' "}" space'
+        rule += ' space "}"'

        return rule

@@ -5,7 +5,7 @@ project("ggml" C CXX ASM)
 ### GGML Version
 set(GGML_VERSION_MAJOR 0)
 set(GGML_VERSION_MINOR 15)
-set(GGML_VERSION_PATCH 1)
+set(GGML_VERSION_PATCH 2)
 set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")

 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
@@ -2417,15 +2417,14 @@ void ggml_backend_amx_mul_mat(const ggml_compute_params * params, struct ggml_te
            // Q4_K, Q5_K, Q6_K, IQ4_XS handles 8 TILE_K per blck_size
            GGML_ASSERT(TILE_K == blck_size || TILE_K * 8 == blck_size);

-            parallel_for_ggml(params, n_batch, [&](int begin, int end) {
-                for (int batch_idx = begin; batch_idx < end; ++batch_idx) {
+            parallel_for_ggml(params, n_batch * M, [&](int begin, int end) {
+                for (int idx = begin; idx < end; ++idx) {
+                    int batch_idx = idx / M;
+                    int m         = idx % M;
                    int64_t src1_offset = ggml_batch_offset(src1, batch_idx, ne2);
                    const float * A_data = (const float *)((const char *)src1->data + src1_offset);
                    char * wdata_batch = (char *)wdata + batch_idx * M * row_size_A;
-
-                    for (int m = 0; m < M; ++m) {
-                        from_float<vec_dot_type>(A_data + m * K, wdata_batch + m * row_size_A, K);
-                    }
+                    from_float<vec_dot_type>(A_data + m * K, wdata_batch + m * row_size_A, K);
                }
            });
        });
@@ -2345,7 +2345,7 @@ class tinyBLAS_Q0_PPC {
            else if (n_aligned % 16 == 0) nc = 16;
            else                          nc = 8;
        }
-        bool can_use_tiled = n_aligned > 0 && (m % mc == 0) && (k % kc == 0);
+        bool can_use_tiled = n_aligned > 0 && (m % mc == 0);
        if (can_use_tiled) {
            matmul_tiled(m, n_aligned, mc, nc, kc);
            if (n > n_aligned) {
@@ -3063,13 +3063,14 @@ class tinyBLAS_Q0_PPC {
            int64_t ii = (job / xtiles) * mc;
            int64_t jj = (job % xtiles) * nc;
            for (int64_t kk = 0; kk < k; kk += kc) {
+                int64_t k_cur = MIN(kc, k - kk);
                if constexpr(is_Ablock_q4) {
-                    packNormal_q4_fp16(A + ii * lda + kk, lda, mc, kc, (uint8_t *)A_pack);
+                    packNormal_q4_fp16(A + ii * lda + kk, lda, mc, k_cur, (uint8_t *)A_pack);
                } else {
-                    packNormal_q8_fp16(A + ii * lda + kk, lda, mc, kc, (uint8_t *)A_pack);
+                    packNormal_q8_fp16(A + ii * lda + kk, lda, mc, k_cur, (uint8_t *)A_pack);
                }
-                packNormal_q8_fp16(B + jj * ldb + kk, ldb, nc, kc, (uint8_t *)B_pack);
-                KERNEL_Q0(ii, jj, mc, nc, kc, kk, A_pack, B_pack);
+                packNormal_q8_fp16(B + jj * ldb + kk, ldb, nc, k_cur, (uint8_t *)B_pack);
+                KERNEL_Q0(ii, jj, mc, nc, k_cur, kk, A_pack, B_pack);
            }
        }
    }
@@ -0,0 +1,81 @@
+#include "col2im-1d.cuh"
+#include "convert.cuh"
+
+// col2im_1d: scatter-add GEMM columns to 1D signal (gather approach)
+// columns: [K*OC, T_in]  ->  output: [T_out, OC]
+// Supports F32, F16, BF16 data with F32 accumulator.
+
+template <typename T>
+static __global__ void col2im_1d_kernel(
+        const T * __restrict__ col,
+        T       * __restrict__ dst,
+        const int T_in, const uint3 T_out_fd,
+        const int OC, const int K, const int K_OC,
+        const int s0, const int p0, const int total) {
+
+    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (idx >= total) return;
+
+    // dst layout: [T_out, OC], ne[0]=T_out fastest
+    const uint2 qr  = fast_div_modulo((uint32_t)idx, T_out_fd);  // qr.x = idx / T_out, qr.y = idx % T_out
+    const int oc    = (int)qr.x;
+    const int t_out = (int)qr.y;
+    const int t_abs = t_out + p0;  // absolute position in uncropped signal
+
+    // Gather: find all (t_in, k) where t_in*s + k == t_abs, 0 <= k < K
+    int t_in_min = (t_abs - K + s0) / s0;  // ceil((t_abs - K + 1) / s)
+    if (t_in_min < 0) t_in_min = 0;
+    int t_in_max = t_abs / s0;
+    if (t_in_max >= T_in) t_in_max = T_in - 1;
+
+    float sum = 0.0f;
+    for (int t_in = t_in_min; t_in <= t_in_max; t_in++) {
+        const int k = t_abs - t_in * s0;
+        // col layout: [K*OC, T_in], column index = oc * K + k
+        sum += ggml_cuda_cast<float>(col[(oc * K + k) + t_in * K_OC]);
+    }
+
+    dst[idx] = ggml_cuda_cast<T>(sum);
+}
+
+void ggml_cuda_op_col2im_1d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
+    const int32_t OC = ((const int32_t *)(dst->op_params))[1];
+    const int32_t p0 = ((const int32_t *)(dst->op_params))[2];
+
+    const int K_OC = (int) src0->ne[0];
+    const int T_in = (int) src0->ne[1];
+    const int K    = K_OC / OC;
+    const int T_out = (int) dst->ne[0];
+
+    const uint3 T_out_fd = init_fastdiv_values((uint32_t)T_out);
+
+    const int total = T_out * OC;
+    const int block_size = 256;
+    const int num_blocks = (total + block_size - 1) / block_size;
+
+    switch (src0->type) {
+        case GGML_TYPE_F32: {
+            col2im_1d_kernel<<<num_blocks, block_size, 0, stream>>>(
+                (const float *)src0->data, (float *)dst->data,
+                T_in, T_out_fd, OC, K, K_OC, s0, p0, total);
+        } break;
+        case GGML_TYPE_F16: {
+            col2im_1d_kernel<<<num_blocks, block_size, 0, stream>>>(
+                (const half *)src0->data, (half *)dst->data,
+                T_in, T_out_fd, OC, K, K_OC, s0, p0, total);
+        } break;
+        case GGML_TYPE_BF16: {
+            col2im_1d_kernel<<<num_blocks, block_size, 0, stream>>>(
+                (const nv_bfloat16 *)src0->data, (nv_bfloat16 *)dst->data,
+                T_in, T_out_fd, OC, K, K_OC, s0, p0, total);
+        } break;
+        default:
+            GGML_ABORT("col2im_1d: unsupported type");
+    }
+}
@@ -0,0 +1,3 @@
+#include "common.cuh"
+
+void ggml_cuda_op_col2im_1d(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
@@ -11,6 +11,7 @@
 #include "ggml-cuda/argsort.cuh"
 #include "ggml-cuda/binbcast.cuh"
 #include "ggml-cuda/clamp.cuh"
+#include "ggml-cuda/col2im-1d.cuh"
 #include "ggml-cuda/concat.cuh"
 #include "ggml-cuda/conv-transpose-1d.cuh"
 #include "ggml-cuda/conv2d.cuh"
@@ -3051,6 +3052,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
        case GGML_OP_CONV_TRANSPOSE_1D:
            ggml_cuda_op_conv_transpose_1d(ctx,dst);
            break;
+        case GGML_OP_COL2IM_1D:
+            ggml_cuda_op_col2im_1d(ctx, dst);
+            break;
        case GGML_OP_POOL_2D:
            ggml_cuda_op_pool2d(ctx, dst);
            break;
@@ -5316,6 +5320,14 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
                }
                return false;
            } break;
+        case GGML_OP_COL2IM_1D:
+            {
+                ggml_type src0_type = op->src[0]->type;
+                return (src0_type == GGML_TYPE_F32 || src0_type == GGML_TYPE_F16 || src0_type == GGML_TYPE_BF16) &&
+                    op->type == src0_type &&
+                    ggml_is_contiguous(op->src[0]) &&
+                    ggml_is_contiguous(op);
+            } break;
        case GGML_OP_SILU_BACK:
            return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
            break;
@@ -183,24 +183,25 @@ static inline void hvx_transpose_32x32_f32(HVX_Vector m[32]) {
 // transposed into VTCM.
 //
 // VTCM layouts (per thread):
-//   src1_T : {d_inner_per_thread, d_conv}   — staged once per launch (small).
-//   src0_T : {d_inner_tile,     ncs}        — staged per d_inner-tile.
+//   src1_T : {d_inner_stride, d_conv}       - staged once per launch (small).
+//   src0_T : {d_inner_tile,     ncs}        - staged per d_inner-tile.
 //
 // d_inner_tile is chosen so that per-thread VTCM stays under the budget.
 // Each thread iterates ceil(d_inner_per_thread d_inner_tile) tiles serially.
 #define HTP_SSM_CONV_VTCM_BUDGET (1u << 20) // 1 MiB per thread

-// Scalar transpose: src1 {d_conv, d_inner} (DDR) -> {d_inner_per_thread, d_conv} (VTCM)
+// Scalar transpose: src1 {d_conv, d_inner} (DDR) -> {d_inner_stride, d_conv} (VTCM)
 static inline void transpose_src1(const float * src1_data,
                                  uint32_t      src1_stride_inner,
                                  uint32_t      i1_off,
                                  uint32_t      d_inner_per_thread,
+                                  uint32_t      d_inner_stride,
                                  uint32_t      d_conv,
                                  float *       src1_T) {
    for (uint32_t i = 0; i < d_inner_per_thread; ++i) {
        const float * src_row = src1_data + (i1_off + i) * src1_stride_inner;
        for (uint32_t j = 0; j < d_conv; ++j) {
-            src1_T[j * d_inner_per_thread + i] = src_row[j];
+            src1_T[j * d_inner_stride + i] = src_row[j];
        }
    }
 }
@@ -280,6 +281,7 @@ static void ssm_conv_thread_f32_f32_hvx(unsigned int nth, unsigned int ith, void
    }

    const uint32_t d_inner_per_thread = ir1 - ir0;
+    const uint32_t d_inner_stride     = scctx->nrows_per_thread;
    const uint32_t d_inner_tile       = scctx->d_inner_tile;

    const float * src0_data = (const float *) src0->data;
@@ -290,8 +292,8 @@ static void ssm_conv_thread_f32_f32_hvx(unsigned int nth, unsigned int ith, void
    float * src0_T = (float *)(octx->src0_spad.data + ith * octx->src0_spad.size_per_thread);
    float * src1_T = (float *)(octx->src1_spad.data + ith * octx->src1_spad.size_per_thread);

-    // Stage src1 weights once into VTCM in {d_inner_per_thread, d_conv} layout.
-    transpose_src1(src1_data, src1_stride_inner, ir0, d_inner_per_thread, d_conv, src1_T);
+    // Stage src1 weights once into VTCM in {d_inner_stride, d_conv} layout.
+    transpose_src1(src1_data, src1_stride_inner, ir0, d_inner_per_thread, d_inner_stride, d_conv, src1_T);

    const uint32_t C_TILE = VLEN_FP32;

@@ -314,7 +316,7 @@ static void ssm_conv_thread_f32_f32_hvx(unsigned int nth, unsigned int ith, void
                    HVX_Vector acc = hvx_vec_splat_f32(0.0f);
                    for (uint32_t j = 0; j < d_conv; ++j) {
                        HVX_Vector x = *(const HVX_Vector *) (src0_T + (t + j) * d_inner_tile + cb);
-                        HVX_Vector w = *(const HVX_Vector *) (src1_T + j * d_inner_per_thread + tile_off + cb);
+                        HVX_Vector w = *(const HVX_Vector *) (src1_T + j * d_inner_stride + tile_off + cb);
                        acc          = Q6_Vqf32_vadd_Vqf32Vqf32(acc, Q6_Vqf32_vmpy_VsfVsf(x, w));
                    }
                    HVX_Vector res = Q6_Vsf_equals_Vqf32(acc);
@@ -362,8 +364,7 @@ int op_ssm_conv_f32(struct htp_ops_context * octx) {
            use_hvx = 1;
        }

-        scctx.nrows_per_thread  = (d_inner + n_threads - 1) / n_threads;
-        scctx.nrows_per_thread += (scctx.nrows_per_thread & 1);
+        scctx.nrows_per_thread = hex_round_up((d_inner + n_threads - 1) / n_threads, VLEN_FP32);

        const uint32_t d_inner_per_thread = scctx.nrows_per_thread;
        const uint32_t ncs                = src0->ne[0];
@@ -3788,7 +3788,7 @@ static void ggml_webgpu_init_memset_pipeline(webgpu_global_context & ctx) {
    ctx->memset_pipeline = ggml_webgpu_create_pipeline(ctx->device, wgsl_memset, "memset", constants);
 }

-static void create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) {
+static void ggml_backend_webgpu_request_adapter(wgpu::Instance & instance, wgpu::Adapter & adapter) {
    wgpu::RequestAdapterOptions options = {};

 #ifndef __EMSCRIPTEN__
@@ -3800,17 +3800,20 @@ static void create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) {
    options.nextInChain                   = &adapterTogglesDesc;
 #endif

-    ctx->webgpu_global_ctx->instance.WaitAny(
-        ctx->webgpu_global_ctx->instance.RequestAdapter(
-            &options, wgpu::CallbackMode::AllowSpontaneous,
-            [&ctx](wgpu::RequestAdapterStatus status, wgpu::Adapter adapter, const char * message) {
-                if (status != wgpu::RequestAdapterStatus::Success) {
-                    GGML_LOG_ERROR("ggml_webgpu: Failed to get an adapter: %s\n", message);
-                    return;
-                }
-                ctx->webgpu_global_ctx->adapter = std::move(adapter);
-            }),
-        UINT64_MAX);
+    instance.WaitAny(instance.RequestAdapter(
+                         &options, wgpu::CallbackMode::AllowSpontaneous,
+                         [&adapter](wgpu::RequestAdapterStatus status, wgpu::Adapter _adapter, const char * message) {
+                             if (status != wgpu::RequestAdapterStatus::Success) {
+                                 GGML_LOG_ERROR("ggml_webgpu: Failed to get an adapter: %s\n", message);
+                                 return;
+                             }
+                             adapter = std::move(_adapter);
+                         }),
+                     UINT64_MAX);
+}
+
+static void create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) {
+    ggml_backend_webgpu_request_adapter(ctx->webgpu_global_ctx->instance, ctx->webgpu_global_ctx->adapter);
    GGML_ASSERT(ctx->webgpu_global_ctx->adapter != nullptr);

    ctx->webgpu_global_ctx->adapter.GetLimits(&ctx->webgpu_global_ctx->capabilities.limits);
@@ -4543,20 +4546,7 @@ ggml_backend_reg_t ggml_backend_webgpu_reg() {
    // Probe for adapter support
    wgpu::Adapter adapter;
    if (ctx->webgpu_global_ctx->instance != nullptr) {
-        wgpu::RequestAdapterOptions options = {};
-
-        // probe for adapter support
-        ctx->webgpu_global_ctx->instance.WaitAny(
-            ctx->webgpu_global_ctx->instance.RequestAdapter(
-                &options, wgpu::CallbackMode::AllowSpontaneous,
-                [&adapter](wgpu::RequestAdapterStatus status, wgpu::Adapter _adapter, const char * message) {
-                    if (status != wgpu::RequestAdapterStatus::Success) {
-                        GGML_LOG_ERROR("ggml_webgpu: Failed to get an adapter: %s\n", message);
-                        return;
-                    }
-                    adapter = std::move(_adapter);
-                }),
-            UINT64_MAX);
+        ggml_backend_webgpu_request_adapter(ctx->webgpu_global_ctx->instance, adapter);
    }

    // WebGPU backend requires f16 support and, on native, implicit device synchronization.
@@ -600,18 +600,15 @@ FILE * ggml_fopen(const char * fname, const char * mode) {
    // convert fname (UTF-8)
    wchar_t * wfname = ggml_mbstowcs(fname);
    if (wfname) {
-        // convert mode (ANSI)
-        wchar_t * wmode = GGML_MALLOC((strlen(mode) + 1) * sizeof(wchar_t));
-        wchar_t * wmode_p = wmode;
-        do {
-            *wmode_p++ = (wchar_t)*mode;
-        } while (*mode++);
-
-        // open file
-        file = _wfopen(wfname, wmode);
+        // convert mode (UTF-8)
+        wchar_t * wmode = ggml_mbstowcs(mode);
+        if (wmode) {
+            // open file
+            file = _wfopen(wfname, wmode);
+            GGML_FREE(wmode);
+        }

        GGML_FREE(wfname);
-        GGML_FREE(wmode);
    }

    return file;
@@ -558,14 +558,15 @@ extern "C" {
    LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model);
    LLAMA_API enum llama_rope_type       llama_model_rope_type(const struct llama_model * model);

-    LLAMA_API int32_t llama_model_n_ctx_train(const struct llama_model * model);
-    LLAMA_API int32_t llama_model_n_embd     (const struct llama_model * model);
-    LLAMA_API int32_t llama_model_n_embd_inp (const struct llama_model * model);
-    LLAMA_API int32_t llama_model_n_embd_out (const struct llama_model * model);
-    LLAMA_API int32_t llama_model_n_layer    (const struct llama_model * model);
-    LLAMA_API int32_t llama_model_n_head     (const struct llama_model * model);
-    LLAMA_API int32_t llama_model_n_head_kv  (const struct llama_model * model);
-    LLAMA_API int32_t llama_model_n_swa      (const struct llama_model * model);
+    LLAMA_API int32_t llama_model_n_ctx_train  (const struct llama_model * model);
+    LLAMA_API int32_t llama_model_n_embd       (const struct llama_model * model);
+    LLAMA_API int32_t llama_model_n_embd_inp   (const struct llama_model * model);
+    LLAMA_API int32_t llama_model_n_embd_out   (const struct llama_model * model);
+    LLAMA_API int32_t llama_model_n_layer      (const struct llama_model * model);
+    LLAMA_API int32_t llama_model_n_layer_nextn(const struct llama_model * model);
+    LLAMA_API int32_t llama_model_n_head       (const struct llama_model * model);
+    LLAMA_API int32_t llama_model_n_head_kv    (const struct llama_model * model);
+    LLAMA_API int32_t llama_model_n_swa        (const struct llama_model * model);

    // Get the model's RoPE frequency scaling factor
    LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model);
@@ -1 +1 @@
-3af5f5760e19a96427f5f7a93b79cbdf3d4b265b
+707321c4cf6d21cb4bc831aa8b687dbf01a521ce
@@ -5,7 +5,7 @@ import os
 import sys
 import subprocess

-HTTPLIB_VERSION = "refs/tags/v0.47.0"
+HTTPLIB_VERSION = "refs/tags/v0.48.0"

 vendor = {
    "https://github.com/nlohmann/json/releases/latest/download/json.hpp":     "vendor/nlohmann/json.hpp",
@@ -20,6 +20,7 @@ set(LLAMA_UI_GZIP     "" CACHE STRING "Apply gzip compress to assets to save ban

 set(DIST_DIR     "${UI_BINARY_DIR}/dist")
 set(SRC_DIST_DIR "${UI_SOURCE_DIR}/dist")
+set(WORK_DIR     "${UI_BINARY_DIR}/ui-src")
 set(STAMP_FILE   "${UI_BINARY_DIR}/.ui-stamp")
 set(UI_CPP       "${UI_BINARY_DIR}/ui.cpp")
 set(UI_H         "${UI_BINARY_DIR}/ui.h")
@@ -64,6 +65,22 @@ function(npm_build_should_skip out_var)
    set(${out_var} TRUE PARENT_SCOPE)
 endfunction()

+function(stage_sources)
+    if(EXISTS "${WORK_DIR}")
+        file(GLOB staged RELATIVE "${WORK_DIR}" "${WORK_DIR}/*")
+        list(REMOVE_ITEM staged "node_modules")
+        foreach(entry ${staged})
+            file(REMOVE_RECURSE "${WORK_DIR}/${entry}")
+        endforeach()
+    endif()
+
+    file(COPY "${UI_SOURCE_DIR}/"
+        DESTINATION "${WORK_DIR}"
+        NO_SOURCE_PERMISSIONS
+        PATTERN "node_modules" EXCLUDE
+    )
+endfunction()
+
 function(npm_build out_var)
    set(${out_var} FALSE PARENT_SCOPE)

@@ -89,14 +106,16 @@ function(npm_build out_var)
        return()
    endif()

+    stage_sources()
+
    # npm writes node_modules/.package-lock.json on every successful install,
    # so a package-lock.json newer than this marker means node_modules is stale
-    set(NPM_MARKER "${UI_SOURCE_DIR}/node_modules/.package-lock.json")
+    set(NPM_MARKER "${WORK_DIR}/node_modules/.package-lock.json")
    set(need_install FALSE)
    if(NOT EXISTS "${NPM_MARKER}")
        set(need_install TRUE)
    else()
-        file(TIMESTAMP "${UI_SOURCE_DIR}/package-lock.json" lock_ts)
+        file(TIMESTAMP "${WORK_DIR}/package-lock.json" lock_ts)
        file(TIMESTAMP "${NPM_MARKER}" marker_ts)
        if(lock_ts STRGREATER marker_ts)
            set(need_install TRUE)
@@ -107,7 +126,7 @@ function(npm_build out_var)
        message(STATUS "UI: running npm install")
        execute_process(
            COMMAND ${NPM_EXECUTABLE} install
-            WORKING_DIRECTORY "${UI_SOURCE_DIR}"
+            WORKING_DIRECTORY "${WORK_DIR}"
            RESULT_VARIABLE rc
            ERROR_VARIABLE  err
        )
@@ -124,7 +143,7 @@ function(npm_build out_var)
    execute_process(
        COMMAND ${CMAKE_COMMAND} -E env "LLAMA_UI_OUT_DIR=${DIST_DIR}" "LLAMA_UI_VERSION=${HF_VERSION}" "LLAMA_BUILD_NUMBER=${LLAMA_BUILD_NUMBER}"
                ${NPM_EXECUTABLE} run build
-        WORKING_DIRECTORY "${UI_SOURCE_DIR}"
+        WORKING_DIRECTORY "${WORK_DIR}"
        RESULT_VARIABLE rc
        ERROR_VARIABLE  err
    )
@@ -1156,6 +1156,10 @@ void llama_context::set_embeddings_layer_inp(uint32_t lid, bool enable) {
    sched_need_reserve = true;
 }

+void llama_context::set_nextn_layer_offset(int32_t offset) {
+    cparams.nextn_layer_offset = offset;
+}
+
 void llama_context::set_causal_attn(bool value) {
    LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value);

@@ -3699,6 +3703,10 @@ void llama_set_embeddings_layer_inp(llama_context * ctx, uint32_t lid, bool valu
    ctx->set_embeddings_layer_inp(lid, value);
 }

+void llama_set_nextn_layer_offset(llama_context * ctx, int32_t offset) {
+    ctx->set_nextn_layer_offset(offset);
+}
+
 llama_memory_t llama_get_memory(const struct llama_context * ctx) {
    if (!ctx) {
        return nullptr;
@@ -115,6 +115,7 @@ struct llama_context {
    void set_embeddings (bool value);
    void set_embeddings_nextn(bool value, bool masked);
    void set_embeddings_layer_inp(uint32_t lid, bool enable);
+    void set_nextn_layer_offset(int32_t offset);
    void set_causal_attn(bool value);
    void set_warmup(bool value);

@@ -18,6 +18,8 @@ struct llama_cparams {
    int32_t  n_threads;       // number of threads to use for generation
    int32_t  n_threads_batch; // number of threads to use for batch processing

+    int32_t  nextn_layer_offset = 0;
+
    float rope_freq_base;
    float rope_freq_scale;

@@ -95,6 +95,11 @@ LLAMA_API llama_memory_breakdown llama_get_memory_breakdown(const struct llama_c
 // If masked == false, output the embeddings for all tokens in the batch regardless of batch.logits
 LLAMA_API void llama_set_embeddings_nextn(struct llama_context * ctx, bool value, bool masked);

+// Select which appended NextN block the DECODER_MTP graph runs (offset past
+// the trunk: il = n_layer() + offset). Used by the speculative NextN driver to
+// chain multiple trained NextN heads. Default 0 (first head).
+LLAMA_API void llama_set_nextn_layer_offset(struct llama_context * ctx, int32_t offset);
+
 // mirrors:
 // LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
 LLAMA_API float * llama_get_embeddings_nextn(struct llama_context * ctx);
@@ -682,9 +682,16 @@ struct llm_graph_params {
            }
        }

+        // TODO: https://github.com/ggml-org/llama.cpp/pull/24340#discussion_r3448035248
+        if (cparams.nextn_layer_offset != other.cparams.nextn_layer_offset) {
+            return false;
+        }
+
        return
-            cparams.embeddings  == other.cparams.embeddings  &&
-            cparams.causal_attn == other.cparams.causal_attn &&
+            cparams.embeddings              == other.cparams.embeddings              &&
+            cparams.embeddings_nextn        == other.cparams.embeddings_nextn        &&
+            cparams.embeddings_nextn_masked == other.cparams.embeddings_nextn_masked &&
+            cparams.causal_attn             == other.cparams.causal_attn             &&
            arch  == other.arch  &&
            gtype == other.gtype &&
            cvec  == other.cvec  &&
@@ -2312,6 +2312,10 @@ int32_t llama_model_n_layer(const llama_model * model) {
    return model->hparams.n_layer();
 }

+int32_t llama_model_n_layer_nextn(const llama_model * model) {
+    return model->hparams.n_layer_nextn;
+}
+
 int32_t llama_model_n_head(const llama_model * model) {
    return model->hparams.n_head();
 }
@@ -932,8 +932,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::

    // copy the KV pairs from the input file
    gguf_set_kv     (ctx_out.get(), ml.metadata);
-    gguf_set_val_u32(ctx_out.get(), "general.quantization_version", GGML_QNT_VERSION); // TODO: use LLM_KV
-    gguf_set_val_u32(ctx_out.get(), "general.file_type", ftype); // TODO: use LLM_KV
+    gguf_set_val_u32(ctx_out.get(), ml.llm_kv(LLM_KV_GENERAL_QUANTIZATION_VERSION).c_str(), GGML_QNT_VERSION);
+    gguf_set_val_u32(ctx_out.get(), ml.llm_kv(LLM_KV_GENERAL_FILE_TYPE).c_str(), ftype);

    // Remove split metadata
    gguf_remove_key(ctx_out.get(), ml.llm_kv(LLM_KV_SPLIT_NO).c_str());
@@ -101,11 +101,11 @@ void llama_model_glm_dsa::load_arch_tensors(llama_model_loader &) {
        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags);

        // DSA indexer
-        layer.indexer_k_norm   = create_tensor(tn(LLM_TENSOR_INDEXER_K_NORM,   "weight", i), {hparams.indexer_head_size}, flags);
-        layer.indexer_k_norm_b = create_tensor(tn(LLM_TENSOR_INDEXER_K_NORM,   "bias",   i), {hparams.indexer_head_size}, flags);
-        layer.indexer_proj     = create_tensor(tn(LLM_TENSOR_INDEXER_PROJ,     "weight", i), {n_embd, hparams.indexer_n_head}, flags);
-        layer.indexer_attn_k   = create_tensor(tn(LLM_TENSOR_INDEXER_ATTN_K,   "weight", i), {n_embd, hparams.indexer_head_size}, flags);
-        layer.indexer_attn_q_b = create_tensor(tn(LLM_TENSOR_INDEXER_ATTN_Q_B, "weight", i), {q_lora_rank, hparams.indexer_n_head * hparams.indexer_head_size}, flags);
+        layer.indexer_k_norm   = create_tensor(tn(LLM_TENSOR_INDEXER_K_NORM,   "weight", i), {hparams.indexer_head_size}, flags | TENSOR_NOT_REQUIRED);
+        layer.indexer_k_norm_b = create_tensor(tn(LLM_TENSOR_INDEXER_K_NORM,   "bias",   i), {hparams.indexer_head_size}, flags | TENSOR_NOT_REQUIRED);
+        layer.indexer_proj     = create_tensor(tn(LLM_TENSOR_INDEXER_PROJ,     "weight", i), {n_embd, hparams.indexer_n_head}, flags | TENSOR_NOT_REQUIRED);
+        layer.indexer_attn_k   = create_tensor(tn(LLM_TENSOR_INDEXER_ATTN_K,   "weight", i), {n_embd, hparams.indexer_head_size}, flags | TENSOR_NOT_REQUIRED);
+        layer.indexer_attn_q_b = create_tensor(tn(LLM_TENSOR_INDEXER_ATTN_Q_B, "weight", i), {q_lora_rank, hparams.indexer_n_head * hparams.indexer_head_size}, flags | TENSOR_NOT_REQUIRED);
        if (i < (int) hparams.n_layer_dense_lead) {
            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, flags);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, flags);
@@ -156,6 +156,8 @@ llama_model_qwen35::graph::graph(const llama_model & model, const llm_graph_para

    // MTP/NextN layers are loaded as extra decoder blocks but not executed in the main pass.
    for (int il = 0; il < n_layer; ++il) {
+        res->t_layer_inp[il] = inpL;
+
        ggml_tensor * inpSA = inpL;

        cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il);
@@ -179,6 +179,8 @@ llama_model_qwen35moe::graph::graph(const llama_model & model, const llm_graph_p

    // MTP/NextN layers are loaded as extra decoder blocks but not executed in the main pass.
    for (int il = 0; il < n_layer; ++il) {
+        res->t_layer_inp[il] = inpL;
+
        ggml_tensor * inpSA = inpL;

        cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il);
@@ -112,7 +112,7 @@ void llama_model_step35::load_arch_tensors(llama_model_loader & ml) {
        layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, TENSOR_NOT_REQUIRED);
    };

-    auto load_block_mtp = [&](int i, bool is_first_mtp) {
+    auto load_block_mtp = [&](int i) {
        auto & layer = layers[i];

        const uint32_t n_head_l      = hparams.n_head(i);
@@ -121,15 +121,12 @@ void llama_model_step35::load_arch_tensors(llama_model_loader & ml) {

        // The MTP block is a full Step3p5 decoder layer (mtp_block) plus the
        // NextN-specific wiring (enorm/hnorm/eh_proj + optional shared head).
-        // `mtp_flags` becomes NOT_REQUIRED when the GGUF is trunk-only.
-        //
-        // Only the FIRST MTP block (i == n_main) is required for the
-        // single-block MTP runtime; trailing MTP blocks are always tolerated
-        // as missing so pruned GGUFs (block 0 only) load cleanly. Override
-        // mtp_flags to NOT_REQUIRED for those.
-        const int eff_mtp_flags = is_first_mtp ? mtp_flags : (mtp_flags | TENSOR_NOT_REQUIRED);
+        // Multi-block MTP: every declared MTP block is required (the draft chain
+        // runs all n_layer_nextn heads), so each block uses the captured
+        // `mtp_flags` directly — already NOT_REQUIRED for a trunk-only GGUF,
+        // which keeps that path correct.

-        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, eff_mtp_flags);
+        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, mtp_flags);
        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, TENSOR_NOT_REQUIRED);
        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, TENSOR_NOT_REQUIRED);

@@ -140,12 +137,12 @@ void llama_model_step35::load_arch_tensors(llama_model_loader & ml) {
            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot_max/2}, TENSOR_NOT_REQUIRED | TENSOR_DUPLICATED);
        }

-        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head_l, n_embd_k_gqa, n_embd_v_gqa, eff_mtp_flags);
-        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_v * n_head_l, n_embd}, eff_mtp_flags);
+        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head_l, n_embd_k_gqa, n_embd_v_gqa, mtp_flags);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_v * n_head_l, n_embd}, mtp_flags);

        layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), {n_embd, n_head_l}, TENSOR_NOT_REQUIRED);

-        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, eff_mtp_flags);
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, mtp_flags);

        // dense MLP (leading dense blocks) — present if the MTP block isn't MoE
        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, TENSOR_NOT_REQUIRED);
@@ -165,9 +162,9 @@ void llama_model_step35::load_arch_tensors(llama_model_loader & ml) {
        layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, TENSOR_NOT_REQUIRED);

        // NextN-specific tensors that define the MTP block.
-        layer.nextn.eh_proj          = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ,          "weight", i), { 2 * n_embd, n_embd }, eff_mtp_flags);
-        layer.nextn.enorm            = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM,            "weight", i), { n_embd },              eff_mtp_flags);
-        layer.nextn.hnorm            = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM,            "weight", i), { n_embd },              eff_mtp_flags);
+        layer.nextn.eh_proj          = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ,          "weight", i), { 2 * n_embd, n_embd }, mtp_flags);
+        layer.nextn.enorm            = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM,            "weight", i), { n_embd },              mtp_flags);
+        layer.nextn.hnorm            = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM,            "weight", i), { n_embd },              mtp_flags);
        layer.nextn.embed_tokens     = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS,     "weight", i), { n_embd, n_vocab },     TENSOR_NOT_REQUIRED);
        layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab },     TENSOR_NOT_REQUIRED);
        layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd },              TENSOR_NOT_REQUIRED);
@@ -176,13 +173,11 @@ void llama_model_step35::load_arch_tensors(llama_model_loader & ml) {
    for (int i = 0; i < n_layer; ++i) {
        load_block_trunk(i, trunk_flags);
    }
-    // Only the first MTP block (i == n_main) is required at runtime — the
-    // single-block-MTP graph in build_arch_graph always uses that one.
-    // Trailing MTP blocks are loaded if present (so an un-pruned GGUF with
-    // all MTP layers still works) but tolerated when absent via the pruning
-    // path. See scripts/prune_step35_extra_mtp.py for the pruner.
+    // All n_layer_nextn MTP blocks are required — the multi-block draft chain
+    // runs every head (head k at offset k). The GGUF declares the count via
+    // step35.nextn_predict_layers.
    for (int i = n_layer; i < n_layer_all; ++i) {
-        load_block_mtp(i, /*is_first_mtp=*/ i == n_layer);
+        load_block_mtp(i);
    }
 }

@@ -372,13 +367,14 @@ llama_model_step35::graph_mtp::graph_mtp(const llama_model & model, const llm_gr
    : llm_graph_context(params) {
    GGML_ASSERT(hparams.n_layer_nextn > 0 && "STEP35 MTP requires n_layer_nextn > 0");

-    // Single-block MTP only: always run the first trained MTP block (Qwen
-    // MTP / vLLM single-MTP-layer style). Multi-block round-robin proved to
-    // be a much deeper refactor than this PR justifies; the trailing MTP
-    // blocks are loaded with TENSOR_NOT_REQUIRED so pruned GGUFs (with just
-    // block 0) also work — see load_arch_tensors below and
-    // scripts/prune_step35_extra_mtp.py.
-    const int il = hparams.n_layer();
+    // Multi-block MTP: the DECODER_MTP graph runs the MTP head selected by
+    // cparams.nextn_layer_offset (0 = first trained head). The speculative driver
+    // bumps the offset per draft step to chain heads 45->46->47. offset 0 keeps
+    // single-block behavior identical to before.
+    const int il = hparams.n_layer() + cparams.nextn_layer_offset;
+    GGML_ASSERT(cparams.nextn_layer_offset >= 0 &&
+                cparams.nextn_layer_offset < (int) hparams.n_layer_nextn &&
+                "nextn_layer_offset out of range [0, n_layer_nextn)");
    const auto & layer = model.layers[il];

    GGML_ASSERT(layer.nextn.eh_proj && "MTP block missing nextn.eh_proj");
@@ -536,6 +532,9 @@ llama_model_step35::graph_mtp::graph_mtp(const llama_model & model, const llm_gr
    cur = ggml_add(ctx0, cur, ffn_inp);
    cb(cur, "mtp_post_ffn", il);

+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+    cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+
    // Pre-norm hidden state: used by the AR draft loop to seed the next MTP step.
    cb(cur, "h_nextn", -1);
    res->t_h_nextn = cur;
@@ -129,8 +129,86 @@ void test_gbnf_generation(testing &t) {
        });

        assert_gbnf_equal(t, R"""(
-            root ::= ([^<] | "<" [^/] | "</" [^t] | "</t" [^a] | "</ta" [^g] | "</tag" [^>])* ("<" | "</" | "</t" | "</ta" | "</tag")?
+            root ::= until-0
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
+            until-0 ::= | [<] until-0-01 | [^<] until-0
+            until-0-01 ::= | [<] until-0-01 | [/] until-0-02 | [^/<] until-0
+            until-0-02 ::= | [<] until-0-01 | [t] until-0-03 | [^<t] until-0
+            until-0-03 ::= | [<] until-0-01 | [a] until-0-04 | [^<a] until-0
+            until-0-04 ::= | [<] until-0-01 | [g] until-0-05 | [^<g] until-0
+            until-0-05 ::= | [<] until-0-01 | [^<>] until-0
+        )""", gbnf);
+    });
+
+    t.test("until grammar overlapping delimiter", [](testing &t) {
+        auto parser = build_peg_parser([](common_peg_parser_builder & p)  {
+            return p.until("\n</parameter>\n");
+        });
+
+        auto gbnf = build_grammar([&](const common_grammar_builder & builder) {
+            parser.build_grammar(builder);
+        });
+
+        assert_gbnf_equal(t, R"""(
+            root ::= until-0
+            space ::= | " " | "\n"{1,2} [ \t]{0,20}
+            until-0 ::= | [\n] until-0-01 | [^\n] until-0
+            until-0-01 ::= | [\n] until-0-01 | [<] until-0-02 | [^\n<] until-0
+            until-0-02 ::= | [\n] until-0-01 | [/] until-0-03 | [^\n/] until-0
+            until-0-03 ::= | [\n] until-0-01 | [p] until-0-04 | [^\np] until-0
+            until-0-04 ::= | [\n] until-0-01 | [a] until-0-05 | [^\na] until-0
+            until-0-05 ::= | [\n] until-0-01 | [r] until-0-06 | [^\nr] until-0
+            until-0-06 ::= | [\n] until-0-01 | [a] until-0-07 | [^\na] until-0
+            until-0-07 ::= | [\n] until-0-01 | [m] until-0-08 | [^\nm] until-0
+            until-0-08 ::= | [\n] until-0-01 | [e] until-0-09 | [^\ne] until-0
+            until-0-09 ::= | [\n] until-0-01 | [t] until-0-10 | [^\nt] until-0
+            until-0-10 ::= | [\n] until-0-01 | [e] until-0-11 | [^\ne] until-0
+            until-0-11 ::= | [\n] until-0-01 | [r] until-0-12 | [^\nr] until-0
+            until-0-12 ::= | [\n] until-0-01 | [>] until-0-13 | [^\n>] until-0
+            until-0-13 ::= | [^\n] until-0
+        )""", gbnf);
+    });
+
+    // DeepSeek-V3.2 tag prefix. The DSML token (｜DSML｜) embeds U+FF5C,
+    // so the delimiter mixes ASCII and multi-byte codepoints.
+    t.test("until grammar unicode delimiter", [](testing &t) {
+        auto parser = build_peg_parser([](common_peg_parser_builder & p)  {
+            return p.until("<｜DSML｜");
+        });
+
+        auto gbnf = build_grammar([&](const common_grammar_builder & builder) {
+            parser.build_grammar(builder);
+        });
+
+        assert_gbnf_equal(t, R"""(
+            root ::= until-0
+            space ::= | " " | "\n"{1,2} [ \t]{0,20}
+            until-0 ::= | [<] until-0-01 | [^<] until-0
+            until-0-01 ::= | [<] until-0-01 | [\uFF5C] until-0-02 | [^<\uFF5C] until-0
+            until-0-02 ::= | [<] until-0-01 | [D] until-0-03 | [^<D] until-0
+            until-0-03 ::= | [<] until-0-01 | [S] until-0-04 | [^<S] until-0
+            until-0-04 ::= | [<] until-0-01 | [M] until-0-05 | [^<M] until-0
+            until-0-05 ::= | [<] until-0-01 | [L] until-0-06 | [^<L] until-0
+            until-0-06 ::= | [<] until-0-01 | [^<\uFF5C] until-0
+        )""", gbnf);
+    });
+
+    t.test("until grammar multiple delimiters", [](testing &t) {
+        auto parser = build_peg_parser([](common_peg_parser_builder & p)  {
+            return p.until_one_of({"ab", "cd", "ef"});
+        });
+
+        auto gbnf = build_grammar([&](const common_grammar_builder & builder) {
+            parser.build_grammar(builder);
+        });
+
+        assert_gbnf_equal(t, R"""(
+            root ::= until-0
+            space ::= | " " | "\n"{1,2} [ \t]{0,20}
+            until-0 ::= | [a] until-0-01 | [c] until-0-03 | [e] until-0-05 | [^ace] until-0
+            until-0-01 ::= | [a] until-0-01 | [c] until-0-03 | [e] until-0-05 | [^abce] until-0
+            until-0-03 ::= | [a] until-0-01 | [c] until-0-03 | [e] until-0-05 | [^acde] until-0
+            until-0-05 ::= | [a] until-0-01 | [c] until-0-03 | [e] until-0-05 | [^acef] until-0
        )""", gbnf);
    });

@@ -10,7 +10,7 @@
 #undef NDEBUG
 #include <cassert>

-int main(void) {
+static void test(void) {
    common_params params;

    printf("test-arg-parser: make sure there is no duplicated arguments in any examples\n\n");
@@ -210,3 +210,13 @@ int main(void) {

    printf("test-arg-parser: all tests OK\n\n");
 }
+
+int main(void) {
+    try {
+        test();
+    } catch (std::exception & e) {
+        fprintf(stderr, "test-arg-parser: exception: %s\n", e.what());
+        return 1;
+    }
+    return 0;
+}
@@ -5022,14 +5022,14 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
        tst.test("Hello, world!\nWhat's up?").tools({ special_function_tool }).expect(message_assist).expect_reconstruction().run();

        tst.test(
-             "```json\n\"42\" \n```")
+             "```json\n\"42\"\n```")
            .reasoning_format(COMMON_REASONING_FORMAT_AUTO)
            .json_schema(const_schema)
            .expect_content(R"("42")")
            .run();

        tst.test(
-             "\"42\" \n")
+             "\"42\"\n")
            .reasoning_format(COMMON_REASONING_FORMAT_AUTO)
            .json_schema(const_schema)
            .expect_content(R"("42")")
@@ -995,6 +995,32 @@ static void test_macros(testing & t) {
        json::object(),
        "Hello, John Smith,Hi, Jane Doe"
    );
+
+    test_template(t, "macro with caller",
+        "\
+{%- macro nest_dict(o, i, ff='') %}\n\
+  {{- caller(ff) }}\n\
+  {%- for k, v in o|items %}\n\
+    {{- i + k + ': ' }}\n\
+    {%- if v is mapping %}\n\
+      {{- '{' }}\n\
+      {% call(f) nest_dict(v, i + '    ') %}\n\
+        {{- 'fail' if ff is undefined }}\n\
+      {%- endcall %}\n\
+      {{- i + '}' }}\n\
+    {% else %}\n\
+      {{- v|string }}\n\
+    {% endif %}\n\
+  {%- endfor %}\n\
+{%- endmacro %}\n\
+{%- call(f) nest_dict({'root1': 1, 'root2': {'nest1': 1, 'nest2': {'nest3': 2}}}, '    ', 'Dict') %}\n\
+  {{- 'fail' if ff is defined }}\n\
+  {{- f + ' {' }}\n\
+{% endcall %}\n\
+{{- '}' }}",
+        json::object(),
+        "Dict {\n    root1: 1\n    root2: {\n        nest1: 1\n        nest2: {\n            nest3: 2\n        }\n    }\n}"
+    );
 }

 static void test_namespace(testing & t) {
@@ -92,7 +92,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
            "minimum": 0
        })""",
        R"""(
-            root ::= ([0] | [1-9] [0-9]{0,15}) space
+            root ::= ([0] | [1-9] [0-9]{0,15})
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
        )"""
    });
@@ -105,7 +105,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
            "minimum": 1
        })""",
        R"""(
-            root ::= ([1-9] [0-9]{0,15}) space
+            root ::= ([1-9] [0-9]{0,15})
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
        )"""
    });
@@ -118,7 +118,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
            "minimum": 3
        })""",
        R"""(
-            root ::= ([1-2] [0-9]{1,15} | [3-9] [0-9]{0,15}) space
+            root ::= ([1-2] [0-9]{1,15} | [3-9] [0-9]{0,15})
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
        )"""
    });
@@ -131,7 +131,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
            "minimum": 9
        })""",
        R"""(
-            root ::= ([1-8] [0-9]{1,15} | [9] [0-9]{0,15}) space
+            root ::= ([1-8] [0-9]{1,15} | [9] [0-9]{0,15})
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
        )"""
    });
@@ -144,7 +144,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
            "minimum": 10
        })""",
        R"""(
-            root ::= ([1] ([0-9]{1,15}) | [2-9] [0-9]{1,15}) space
+            root ::= ([1] ([0-9]{1,15}) | [2-9] [0-9]{1,15})
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
        )"""
    });
@@ -157,7 +157,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
            "minimum": 25
        })""",
        R"""(
-            root ::= ([1] [0-9]{2,15} | [2] ([0-4] [0-9]{1,14} | [5-9] [0-9]{0,14}) | [3-9] [0-9]{1,15}) space
+            root ::= ([1] [0-9]{2,15} | [2] ([0-4] [0-9]{1,14} | [5-9] [0-9]{0,14}) | [3-9] [0-9]{1,15})
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
        )"""
    });
@@ -170,7 +170,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
            "maximum": 30
        })""",
        R"""(
-            root ::= ("-" [1-9] [0-9]{0,15} | [0-9] | ([1-2] [0-9] | [3] "0")) space
+            root ::= ("-" [1-9] [0-9]{0,15} | [0-9] | ([1-2] [0-9] | [3] "0"))
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
        )"""
    });
@@ -183,7 +183,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
            "minimum": -5
        })""",
        R"""(
-            root ::= ("-" ([0-5]) | [0] | [1-9] [0-9]{0,15}) space
+            root ::= ("-" ([0-5]) | [0] | [1-9] [0-9]{0,15})
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
        )"""
    });
@@ -196,7 +196,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
            "minimum": -123
        })""",
        R"""(
-            root ::= ("-" ([0-9] | ([1-8] [0-9] | [9] [0-9]) | "1" ([0-1] [0-9] | [2] [0-3])) | [0] | [1-9] [0-9]{0,15}) space
+            root ::= ("-" ([0-9] | ([1-8] [0-9] | [9] [0-9]) | "1" ([0-1] [0-9] | [2] [0-3])) | [0] | [1-9] [0-9]{0,15})
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
        )"""
    });
@@ -209,7 +209,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
            "maximum": -5
        })""",
        R"""(
-            root ::= ("-" ([0-4] [0-9]{1,15} | [5-9] [0-9]{0,15})) space
+            root ::= ("-" ([0-4] [0-9]{1,15} | [5-9] [0-9]{0,15}))
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
        )"""
    });
@@ -222,7 +222,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
            "maximum": 1
        })""",
        R"""(
-            root ::= ("-" [1-9] [0-9]{0,15} | [0-1]) space
+            root ::= ("-" [1-9] [0-9]{0,15} | [0-1])
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
        )"""
    });
@@ -235,7 +235,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
            "maximum": 100
        })""",
        R"""(
-            root ::= ("-" [1-9] [0-9]{0,15} | [0-9] | ([1-8] [0-9] | [9] [0-9]) | "100") space
+            root ::= ("-" [1-9] [0-9]{0,15} | [0-9] | ([1-8] [0-9] | [9] [0-9]) | "100")
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
        )"""
    });
@@ -249,7 +249,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
            "maximum": 23
        })""",
        R"""(
-            root ::= ([0-9] | ([1] [0-9] | [2] [0-3])) space
+            root ::= ([0-9] | ([1] [0-9] | [2] [0-3]))
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
        )"""
    });
@@ -263,7 +263,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
            "maximum": 300
        })""",
        R"""(
-            root ::= (([1] ([5-9]) | [2-9] [0-9]) | ([1-2] [0-9]{2} | [3] "00")) space
+            root ::= (([1] ([5-9]) | [2-9] [0-9]) | ([1-2] [0-9]{2} | [3] "00"))
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
        )"""
    });
@@ -277,7 +277,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
            "maximum": 30
        })""",
        R"""(
-            root ::= ([5-9] | ([1-2] [0-9] | [3] "0")) space
+            root ::= ([5-9] | ([1-2] [0-9] | [3] "0"))
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
        )"""
    });
@@ -291,7 +291,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
            "maximum": 42
        })""",
        R"""(
-            root ::= ("-" ([0-9] | ([1-8] [0-9] | [9] [0-9]) | "1" ([0-1] [0-9] | [2] [0-3])) | [0-9] | ([1-3] [0-9] | [4] [0-2])) space
+            root ::= ("-" ([0-9] | ([1-8] [0-9] | [9] [0-9]) | "1" ([0-1] [0-9] | [2] [0-3])) | [0-9] | ([1-3] [0-9] | [4] [0-2]))
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
        )"""
    });
@@ -305,7 +305,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
            "maximum": 10
        })""",
        R"""(
-            root ::= ("-" ([0-9] | "10") | [0-9] | "10") space
+            root ::= ("-" ([0-9] | "10") | [0-9] | "10")
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
        )"""
    });
@@ -333,17 +333,17 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
        "empty schema (object)",
        "{}",
        R"""(
-            array ::= "[" space ( value ("," space value)* )? "]" space
-            boolean ::= ("true" | "false") space
+            array ::= "[" space ( value ("," space value)* )? space "]"
+            boolean ::= ("true" | "false")
            char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
            decimal-part ::= [0-9]{1,16}
            integral-part ::= [0] | [1-9] [0-9]{0,15}
-            null ::= "null" space
-            number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
-            object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space
+            null ::= "null"
+            number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)?
+            object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? space "}"
            root ::= object
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
-            string ::= "\"" char* "\"" space
+            string ::= "\"" char* "\""
            value ::= object | array | string | number | boolean | null
        )"""
    });
@@ -361,17 +361,17 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
        })""",
        R"""(
            date ::= [0-9]{4} "-" ( "0" [1-9] | "1" [0-2] ) "-" ( "0" [1-9] | [1-2] [0-9] | "3" [0-1] )
-            date-string ::= "\"" date "\"" space
+            date-string ::= "\"" date "\""
            date-time ::= date "T" time
-            date-time-string ::= "\"" date-time "\"" space
-            root ::= "[" space tuple-0 "," space uuid "," space tuple-2 "," space tuple-3 "]" space
+            date-time-string ::= "\"" date-time "\""
+            root ::= "[" space tuple-0 "," space uuid "," space tuple-2 "," space tuple-3 space "]"
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
            time ::= ([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9]{3} )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )
-            time-string ::= "\"" time "\"" space
+            time-string ::= "\"" time "\""
            tuple-0 ::= date-string
            tuple-2 ::= time-string
            tuple-3 ::= date-time-string
-            uuid ::= "\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\"" space
+            uuid ::= "\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\""
        )"""
    });

@@ -383,7 +383,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
        })""",
        R"""(
            char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
-            root ::= "\"" char* "\"" space
+            root ::= "\"" char* "\""
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
        )"""
    });
@@ -397,7 +397,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
        })""",
        R"""(
            char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
-            root ::= "\"" char+ "\"" space
+            root ::= "\"" char+ "\""
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
        )"""
    });
@@ -411,7 +411,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
        })""",
        R"""(
            char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
-            root ::= "\"" char{3,} "\"" space
+            root ::= "\"" char{3,} "\""
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
        )"""
    });
@@ -425,7 +425,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
        })""",
        R"""(
            char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
-            root ::= "\"" char{0,3} "\"" space
+            root ::= "\"" char{0,3} "\""
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
        )"""
    });
@@ -440,7 +440,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
        })""",
        R"""(
            char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
-            root ::= "\"" char{1,4} "\"" space
+            root ::= "\"" char{1,4} "\""
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
        )"""
    });
@@ -452,7 +452,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
            "type": "boolean"
        })""",
        R"""(
-            root ::= ("true" | "false") space
+            root ::= ("true" | "false")
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
        )"""
    });
@@ -465,7 +465,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
        })""",
        R"""(
            integral-part ::= [0] | [1-9] [0-9]{0,15}
-            root ::= ("-"? integral-part) space
+            root ::= ("-"? integral-part)
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
        )"""
    });
@@ -477,7 +477,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
            "const": "foo"
        })""",
        R"""(
-            root ::= "\"foo\"" space
+            root ::= "\"foo\""
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
        )"""
    });
@@ -489,7 +489,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
            "const": 123
        })""",
        R"""(
-            root ::= "123" space
+            root ::= "123"
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
        )"""
    });
@@ -501,7 +501,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
            "enum": ["red", "amber", "green", null, 42, ["foo"]]
        })""",
        R"""(
-            root ::= ("\"red\"" | "\"amber\"" | "\"green\"" | "null" | "42" | "[\"foo\"]") space
+            root ::= ("\"red\"" | "\"amber\"" | "\"green\"" | "null" | "42" | "[\"foo\"]")
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
        )"""
    });
@@ -515,9 +515,9 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
        })""",
        R"""(
            char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
-            root ::= "[" space (string ("," space string)*)? "]" space
+            root ::= "[" space (string ("," space string)*)? space "]"
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
-            string ::= "\"" char* "\"" space
+            string ::= "\"" char* "\""
        )"""
    });

@@ -529,12 +529,12 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
            "prefixItems": { "type": "string" }
        })""",
        R"""(
-            alternative-0 ::= "[" space (string ("," space string)*)? "]" space
+            alternative-0 ::= "[" space (string ("," space string)*)? space "]"
            char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
-            null ::= "null" space
+            null ::= "null"
            root ::= alternative-0 | null
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
-            string ::= "\"" char* "\"" space
+            string ::= "\"" char* "\""
        )"""
    });

@@ -546,9 +546,9 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
        })""",
        R"""(
            char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
-            root ::= "[" space string "]" space
+            root ::= "[" space string space "]"
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
-            string ::= "\"" char* "\"" space
+            string ::= "\"" char* "\""
        )"""
    });

@@ -562,10 +562,10 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
            char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
            decimal-part ::= [0-9]{1,16}
            integral-part ::= [0] | [1-9] [0-9]{0,15}
-            number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
-            root ::= "[" space string "," space number "]" space
+            number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)?
+            root ::= "[" space string "," space number space "]"
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
-            string ::= "\"" char* "\"" space
+            string ::= "\"" char* "\""
        )"""
    });

@@ -577,18 +577,18 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
            "items": {}
        })""",
        R"""(
-            array ::= "[" space ( value ("," space value)* )? "]" space
-            boolean ::= ("true" | "false") space
+            array ::= "[" space ( value ("," space value)* )? space "]"
+            boolean ::= ("true" | "false")
            char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
            decimal-part ::= [0-9]{1,16}
            integral-part ::= [0] | [1-9] [0-9]{0,15}
            item ::= object
-            null ::= "null" space
-            number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
-            object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space
-            root ::= "[" space (item ("," space item)*)? "]" space
+            null ::= "null"
+            number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)?
+            object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? space "}"
+            root ::= "[" space (item ("," space item)*)? space "]"
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
-            string ::= "\"" char* "\"" space
+            string ::= "\"" char* "\""
            value ::= object | array | string | number | boolean | null
        )"""
    });
@@ -602,18 +602,18 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
            "prefixItems": { "type": "string" }
        })""",
        R"""(
-            array ::= "[" space ( value ("," space value)* )? "]" space
-            boolean ::= ("true" | "false") space
+            array ::= "[" space ( value ("," space value)* )? space "]"
+            boolean ::= ("true" | "false")
            char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
            decimal-part ::= [0-9]{1,16}
            integral-part ::= [0] | [1-9] [0-9]{0,15}
            item ::= object
-            null ::= "null" space
-            number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
-            object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space
-            root ::= "[" space (item ("," space item)*)? "]" space
+            null ::= "null"
+            number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)?
+            object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? space "}"
+            root ::= "[" space (item ("," space item)*)? space "]"
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
-            string ::= "\"" char* "\"" space
+            string ::= "\"" char* "\""
            value ::= object | array | string | number | boolean | null
        )"""
    });
@@ -627,7 +627,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
        R"""(
            decimal-part ::= [0-9]{1,16}
            integral-part ::= [0] | [1-9] [0-9]{0,15}
-            root ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
+            root ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)?
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
        )"""
    });
@@ -642,8 +642,8 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
            "minItems": 2
        })""",
        R"""(
-            boolean ::= ("true" | "false") space
-            root ::= "[" space boolean ("," space boolean)+ "]" space
+            boolean ::= ("true" | "false")
+            root ::= "[" space boolean ("," space boolean)+ space "]"
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
        )"""
    });
@@ -658,8 +658,8 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
            "maxItems": 0
        })""",
        R"""(
-            boolean ::= ("true" | "false") space
-            root ::= "[" space  "]" space
+            boolean ::= ("true" | "false")
+            root ::= "[" space  space "]"
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
        )"""
    });
@@ -674,8 +674,8 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
            "maxItems": 1
        })""",
        R"""(
-            boolean ::= ("true" | "false") space
-            root ::= "[" space boolean? "]" space
+            boolean ::= ("true" | "false")
+            root ::= "[" space boolean? space "]"
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
        )"""
    });
@@ -690,8 +690,8 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
            "maxItems": 2
        })""",
        R"""(
-            boolean ::= ("true" | "false") space
-            root ::= "[" space (boolean ("," space boolean)?)? "]" space
+            boolean ::= ("true" | "false")
+            root ::= "[" space (boolean ("," space boolean)?)? space "]"
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
        )"""
    });
@@ -708,11 +708,11 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
        })""",
        R"""(
            decimal-part ::= [0-9]{1,16}
-            integer ::= ("-"? integral-part) space
+            integer ::= ("-"? integral-part)
            integral-part ::= [0] | [1-9] [0-9]{0,15}
            item ::= number | integer
-            number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
-            root ::= "[" space item ("," space item){2,4} "]" space
+            number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)?
+            root ::= "[" space item ("," space item){2,4} space "]"
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
        )"""
    });
@@ -730,8 +730,8 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
            "maxItems": 5
        })""",
        R"""(
-            item ::= ("-" ([0-9] | "1" [0-2]) | [0-9] | ([1-8] [0-9] | [9] [0-9]) | ([1] [0-9]{2} | [2] "0" [0-7])) space
-            root ::= "[" space item ("," space item){2,4} "]" space
+            item ::= ("-" ([0-9] | "1" [0-2]) | [0-9] | ([1-8] [0-9] | [9] [0-9]) | ([1] [0-9]{2} | [2] "0" [0-7]))
+            root ::= "[" space item ("," space item){2,4} space "]"
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
        )"""
    });
@@ -749,8 +749,8 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
            "maxItems": 5
        })""",
        R"""(
-            item ::= (([1] ([2-9]) | [2-9] [0-9]) | ([1] [0-9]{2} | [2] "0" [0-7])) space
-            root ::= "[" space item ("," space item){2,4} "]" space
+            item ::= (([1] ([2-9]) | [2-9] [0-9]) | ([1] [0-9]{2} | [2] "0" [0-7]))
+            root ::= "[" space item ("," space item){2,4} space "]"
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
        )"""
    });
@@ -763,7 +763,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
            "pattern": "^abc?d*efg+(hij)?kl$"
        })""",
        R"""(
-            root ::= "\"" ("ab" "c"? "d"* "ef" "g"+ ("hij")? "kl") "\"" space
+            root ::= "\"" ("ab" "c"? "d"* "ef" "g"+ ("hij")? "kl") "\""
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
        )"""
    });
@@ -776,7 +776,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
            "pattern": "^\\[\\]\\{\\}\\(\\)\\|\\+\\*\\?$"
        })""",
        R"""(
-            root ::= "\"" ("[]{}()|+*?") "\"" space
+            root ::= "\"" ("[]{}()|+*?") "\""
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
        )"""
    });
@@ -789,7 +789,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
            "pattern": "^\"$"
        })""",
        R"""(
-            root ::= "\"" ("\"") "\"" space
+            root ::= "\"" ("\"") "\""
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
        )"""
    });
@@ -802,7 +802,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
            "pattern": "^A|B|C|D$"
        })""",
        R"""(
-            root ::= "\"" ("A" | "B" | "C" | "D") "\"" space
+            root ::= "\"" ("A" | "B" | "C" | "D") "\""
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
        )"""
    });
@@ -816,7 +816,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
        })""",
        R"""(
            dot ::= [^\x0A\x0D]
-            root ::= "\"" (("(" root-1{1,3} ")")? root-1{3,3} "-" root-1{4,4} " " "a"{3,5} "nd" dot dot dot) "\"" space
+            root ::= "\"" (("(" root-1{1,3} ")")? root-1{3,3} "-" root-1{4,4} " " "a"{3,5} "nd" dot dot dot) "\""
            root-1 ::= [0-9]
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
        )"""
@@ -845,9 +845,9 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
            b-kv ::= "\"b\"" space ":" space string
            c-kv ::= "\"c\"" space ":" space string
            char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
-            root ::= "{" space b-kv "," space c-kv "," space a-kv "}" space
+            root ::= "{" space b-kv "," space c-kv "," space a-kv space "}"
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
-            string ::= "\"" char* "\"" space
+            string ::= "\"" char* "\""
        )"""
    });

@@ -865,9 +865,9 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
        R"""(
            a-kv ::= "\"a\"" space ":" space string
            char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
-            root ::= "{" space  (a-kv )? "}" space
+            root ::= "{" space  (a-kv )? space "}"
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
-            string ::= "\"" char* "\"" space
+            string ::= "\"" char* "\""
        )"""
    });

@@ -889,9 +889,9 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
            b-rest ::= ( "," space c-kv )?
            c-kv ::= "\"c\"" space ":" space string
            char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
-            root ::= "{" space  (a-kv a-rest | b-kv b-rest | c-kv )? "}" space
+            root ::= "{" space  (a-kv a-rest | b-kv b-rest | c-kv )? space "}"
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
-            string ::= "\"" char* "\"" space
+            string ::= "\"" char* "\""
        )"""
    });

@@ -915,9 +915,9 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
            char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
            d-kv ::= "\"d\"" space ":" space string
            d-rest ::= ( "," space c-kv )?
-            root ::= "{" space b-kv "," space a-kv ( "," space ( d-kv d-rest | c-kv ) )? "}" space
+            root ::= "{" space b-kv "," space a-kv ( "," space ( d-kv d-rest | c-kv ) )? space "}"
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
-            string ::= "\"" char* "\"" space
+            string ::= "\"" char* "\""
        )"""
    });

@@ -930,14 +930,14 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
        })""",
        R"""(
            additional-kv ::= string ":" space additional-value
-            additional-value ::= "[" space (number ("," space number)*)? "]" space
+            additional-value ::= "[" space (number ("," space number)*)? space "]"
            char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
            decimal-part ::= [0-9]{1,16}
            integral-part ::= [0] | [1-9] [0-9]{0,15}
-            number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
-            root ::= "{" space  (additional-kv ( "," space additional-kv )* )? "}" space
+            number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)?
+            root ::= "{" space  (additional-kv ( "," space additional-kv )* )? space "}"
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
-            string ::= "\"" char* "\"" space
+            string ::= "\"" char* "\""
        )"""
    });

@@ -949,17 +949,17 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
            "additionalProperties": true
        })""",
        R"""(
-            array ::= "[" space ( value ("," space value)* )? "]" space
-            boolean ::= ("true" | "false") space
+            array ::= "[" space ( value ("," space value)* )? space "]"
+            boolean ::= ("true" | "false")
            char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
            decimal-part ::= [0-9]{1,16}
            integral-part ::= [0] | [1-9] [0-9]{0,15}
-            null ::= "null" space
-            number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
-            object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space
+            null ::= "null"
+            number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)?
+            object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? space "}"
            root ::= object
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
-            string ::= "\"" char* "\"" space
+            string ::= "\"" char* "\""
            value ::= object | array | string | number | boolean | null
        )"""
    });
@@ -971,17 +971,17 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
            "type": "object"
        })""",
        R"""(
-            array ::= "[" space ( value ("," space value)* )? "]" space
-            boolean ::= ("true" | "false") space
+            array ::= "[" space ( value ("," space value)* )? space "]"
+            boolean ::= ("true" | "false")
            char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
            decimal-part ::= [0-9]{1,16}
            integral-part ::= [0] | [1-9] [0-9]{0,15}
-            null ::= "null" space
-            number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
-            object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space
+            null ::= "null"
+            number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)?
+            object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? space "}"
            root ::= object
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
-            string ::= "\"" char* "\"" space
+            string ::= "\"" char* "\""
            value ::= object | array | string | number | boolean | null
        )"""
    });
@@ -994,7 +994,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
            "additionalProperties": false
        })""",
        R"""(
-            root ::= "{" space  "}" space
+            root ::= "{" space  space "}"
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
        )"""
    });
@@ -1012,15 +1012,15 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
        })""",
        R"""(
            a-kv ::= "\"a\"" space ":" space number
-            additional-k ::= ["] ( [a] char+ | [^"a] char* )? ["] space
+            additional-k ::= ["] ( [a] char+ | [^"a] char* )? ["]
            additional-kv ::= additional-k ":" space string
            char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
            decimal-part ::= [0-9]{1,16}
            integral-part ::= [0] | [1-9] [0-9]{0,15}
-            number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
-            root ::= "{" space a-kv ( "," space ( additional-kv ( "," space additional-kv )* ) )? "}" space
+            number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)?
+            root ::= "{" space a-kv ( "," space ( additional-kv ( "," space additional-kv )* ) )? space "}"
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
-            string ::= "\"" char* "\"" space
+            string ::= "\"" char* "\""
        )"""
    });

@@ -1037,13 +1037,13 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
        R"""(
            a-kv ::= "\"a\"" space ":" space number
            a-rest ::= ( "," space additional-kv )*
-            additional-k ::= ["] ( [a] char+ | [^"a] char* )? ["] space
+            additional-k ::= ["] ( [a] char+ | [^"a] char* )? ["]
            additional-kv ::= additional-k ":" space number
            char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
            decimal-part ::= [0-9]{1,16}
            integral-part ::= [0] | [1-9] [0-9]{0,15}
-            number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
-            root ::= "{" space  (a-kv a-rest | additional-kv ( "," space additional-kv )* )? "}" space
+            number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)?
+            root ::= "{" space  (a-kv a-rest | additional-kv ( "," space additional-kv )* )? space "}"
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
        )"""
    });
@@ -1061,7 +1061,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
            "additionalProperties": {"type": "number"}
        })""",
        R"""(
-            additional-k ::= ["] ( [a] ([l] ([s] ([o] char+ | [^"o] char*) | [^"s] char*) | [n] ([d] char+ | [^"d] char*) | [^"ln] char*) | [^"a] char* )? ["] space
+            additional-k ::= ["] ( [a] ([l] ([s] ([o] char+ | [^"o] char*) | [^"s] char*) | [n] ([d] char+ | [^"d] char*) | [^"ln] char*) | [^"a] char* )? ["]
            additional-kv ::= additional-k ":" space number
            also-kv ::= "\"also\"" space ":" space number
            also-rest ::= ( "," space additional-kv )*
@@ -1069,8 +1069,8 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
            char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
            decimal-part ::= [0-9]{1,16}
            integral-part ::= [0] | [1-9] [0-9]{0,15}
-            number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
-            root ::= "{" space and-kv ( "," space ( also-kv also-rest | additional-kv ( "," space additional-kv )* ) )? "}" space
+            number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)?
+            root ::= "{" space and-kv ( "," space ( also-kv also-rest | additional-kv ( "," space additional-kv )* ) )? space "}"
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
        )"""
    });
@@ -1090,13 +1090,13 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
            -rest ::= ( "," space a-kv )? a-rest
            a-kv ::= "\"a\"" space ":" space integer
            a-rest ::= ( "," space additional-kv )*
-            additional-k ::= ["] ( [a] char+ | [^"a] char* ) ["] space
+            additional-k ::= ["] ( [a] char+ | [^"a] char* ) ["]
            additional-kv ::= additional-k ":" space integer
            char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
-            integer ::= ("-"? integral-part) space
+            integer ::= ("-"? integral-part)
            integral-part ::= [0] | [1-9] [0-9]{0,15}
-            root ::= ("-"? integral-part) space
-            root0 ::= "{" space  (-kv -rest | a-kv a-rest | additional-kv ( "," space additional-kv )* )? "}" space
+            root ::= ("-"? integral-part)
+            root0 ::= "{" space  (-kv -rest | a-kv a-rest | additional-kv ( "," space additional-kv )* )? space "}"
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
        )"""
    });
@@ -1116,12 +1116,12 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
            a-rest ::= ( "," space aa-kv )? aa-rest
            aa-kv ::= "\"aa\"" space ":" space integer
            aa-rest ::= ( "," space additional-kv )*
-            additional-k ::= ["] ( [a] ([a] char+ | [^"a] char*) | [^"a] char* )? ["] space
+            additional-k ::= ["] ( [a] ([a] char+ | [^"a] char*) | [^"a] char* )? ["]
            additional-kv ::= additional-k ":" space integer
            char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
-            integer ::= ("-"? integral-part) space
+            integer ::= ("-"? integral-part)
            integral-part ::= [0] | [1-9] [0-9]{0,15}
-            root ::= "{" space  (a-kv a-rest | aa-kv aa-rest | additional-kv ( "," space additional-kv )* )? "}" space
+            root ::= "{" space  (a-kv a-rest | aa-kv aa-rest | additional-kv ( "," space additional-kv )* )? space "}"
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
        )"""
    });
@@ -1141,12 +1141,12 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
            ab-rest ::= ( "," space ac-kv )? ac-rest
            ac-kv ::= "\"ac\"" space ":" space integer
            ac-rest ::= ( "," space additional-kv )*
-            additional-k ::= ["] ( [a] ([b] char+ | [c] char+ | [^"bc] char*) | [^"a] char* )? ["] space
+            additional-k ::= ["] ( [a] ([b] char+ | [c] char+ | [^"bc] char*) | [^"a] char* )? ["]
            additional-kv ::= additional-k ":" space integer
            char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
-            integer ::= ("-"? integral-part) space
+            integer ::= ("-"? integral-part)
            integral-part ::= [0] | [1-9] [0-9]{0,15}
-            root ::= "{" space  (ab-kv ab-rest | ac-kv ac-rest | additional-kv ( "," space additional-kv )* )? "}" space
+            root ::= "{" space  (ab-kv ab-rest | ac-kv ac-rest | additional-kv ( "," space additional-kv )* )? space "}"
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
        )"""
    });
@@ -1173,11 +1173,11 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
        })""",
        R"""(
            char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
-            ref-definitions-foo ::= "{" space ref-definitions-foo-a-kv "}" space
+            ref-definitions-foo ::= "{" space ref-definitions-foo-a-kv space "}"
            ref-definitions-foo-a-kv ::= "\"a\"" space ":" space string
            root ::= ref-definitions-foo
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
-            string ::= "\"" char* "\"" space
+            string ::= "\"" char* "\""
        )"""
    });

@@ -1204,10 +1204,10 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
            alternative-1 ::= ref-definitions-bar
            decimal-part ::= [0-9]{1,16}
            integral-part ::= [0] | [1-9] [0-9]{0,15}
-            number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
-            ref-definitions-bar ::= "{" space  (ref-definitions-bar-b-kv )? "}" space
+            number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)?
+            ref-definitions-bar ::= "{" space  (ref-definitions-bar-b-kv )? space "}"
            ref-definitions-bar-b-kv ::= "\"b\"" space ":" space number
-            ref-definitions-foo ::= "{" space  (ref-definitions-foo-a-kv )? "}" space
+            ref-definitions-foo ::= "{" space  (ref-definitions-foo-a-kv )? space "}"
            ref-definitions-foo-a-kv ::= "\"a\"" space ":" space number
            root ::= alternative-0 | alternative-1
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
@@ -1241,14 +1241,14 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
            b ::= b-0 | boolean
            b-0 ::= string
            b-kv ::= "\"b\"" space ":" space b
-            boolean ::= ("true" | "false") space
+            boolean ::= ("true" | "false")
            char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
            decimal-part ::= [0-9]{1,16}
            integral-part ::= [0] | [1-9] [0-9]{0,15}
-            number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
-            root ::= "{" space  (a-kv a-rest | b-kv )? "}" space
+            number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)?
+            root ::= "{" space  (a-kv a-rest | b-kv )? space "}"
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
-            string ::= "\"" char* "\"" space
+            string ::= "\"" char* "\""
        )"""
    });

@@ -1290,8 +1290,8 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
            d-rest ::= ( "," space c-kv )?
            decimal-part ::= [0-9]{1,16}
            integral-part ::= [0] | [1-9] [0-9]{0,15}
-            number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
-            root ::= "{" space a-kv "," space b-kv ( "," space ( d-kv d-rest | c-kv ) )? "}" space
+            number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)?
+            root ::= "{" space a-kv "," space b-kv ( "," space ( d-kv d-rest | c-kv ) )? space "}"
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
        )"""
    });
@@ -1311,7 +1311,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
            }
        })""",
        R"""(
-            root ::= ("\"a\"" | "\"b\"") space
+            root ::= ("\"a\"" | "\"b\"")
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
        )"""
    });
@@ -1336,7 +1336,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
            }
        })""",
        R"""(
-            root ::= ("\"b\"" | "\"c\"") space
+            root ::= ("\"b\"" | "\"c\"")
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
        )"""
    });
@@ -1378,13 +1378,13 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
        R"""(
            decimal-part ::= [0-9]{1,16}
            integral-part ::= [0] | [1-9] [0-9]{0,15}
-            number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
-            number- ::= "{" space number-number-kv "}" space
+            number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)?
+            number- ::= "{" space number-number-kv space "}"
            number-kv ::= "\"number\"" space ":" space number-
-            number-number ::= "{" space number-number-root-kv "}" space
+            number-number ::= "{" space number-number-root-kv space "}"
            number-number-kv ::= "\"number\"" space ":" space number-number
            number-number-root-kv ::= "\"root\"" space ":" space number
-            root ::= "{" space number-kv "}" space
+            root ::= "{" space number-kv space "}"
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
        )"""
    });
@@ -1394,17 +1394,17 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
        "description only (no type) treated as unconstrained",
        R"""({"description": "The 0-based index of the last line to be retrieved (inclusive). If None, read until the end of the file."})""",
        R"""(
-            array ::= "[" space ( value ("," space value)* )? "]" space
-            boolean ::= ("true" | "false") space
+            array ::= "[" space ( value ("," space value)* )? space "]"
+            boolean ::= ("true" | "false")
            char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
            decimal-part ::= [0-9]{1,16}
            integral-part ::= [0] | [1-9] [0-9]{0,15}
-            null ::= "null" space
-            number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
-            object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space
+            null ::= "null"
+            number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)?
+            object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? space "}"
            root ::= value
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
-            string ::= "\"" char* "\"" space
+            string ::= "\"" char* "\""
            value ::= object | array | string | number | boolean | null
        )"""
    });
@@ -1428,9 +1428,9 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
            "type": "object"
        })""",
        R"""(
-            code ::= "\" \\r \\n \\\" \\\\ \"" space
+            code ::= "\" \\r \\n \\\" \\\\ \""
            code-kv ::= "\"code\"" space ":" space code
-            root ::= "{" space code-kv "}" space
+            root ::= "{" space code-kv space "}"
            space ::= | " " | "\n"{1,2} [ \t]{0,20}
        )"""
    });
@@ -1547,7 +1547,7 @@ int main() {
                "pattern": "^(?:foo|bar)baz$"
            })""",
            R"""(
-                root ::= "\"" (("foo" | "bar") "baz") "\"" space
+                root ::= "\"" (("foo" | "bar") "baz") "\""
                space ::= | " " | "\n"{1,2} [ \t]{0,20}
            )""",
        });
@@ -1560,7 +1560,7 @@ int main() {
                "pattern": "^(?:(?:ab)+c)?d$"
            })""",
            R"""(
-                root ::= "\"" ((("ab")+ "c")? "d") "\"" space
+                root ::= "\"" ((("ab")+ "c")? "d") "\""
                space ::= | " " | "\n"{1,2} [ \t]{0,20}
            )""",
        });
@@ -161,7 +161,7 @@
 | `-mmu, --mmproj-url URL` | URL to a multimodal projector file. see tools/mtmd/README.md<br/>(env: LLAMA_ARG_MMPROJ_URL) |
 | `--mmproj-auto, --no-mmproj, --no-mmproj-auto` | whether to use multimodal projector file (if available), useful when using -hf (default: enabled)<br/>(env: LLAMA_ARG_MMPROJ_AUTO) |
 | `--mmproj-offload, --no-mmproj-offload` | whether to enable GPU offloading for multimodal projector (default: enabled)<br/>(env: LLAMA_ARG_MMPROJ_OFFLOAD) |
-| `--image, --audio FILE` | path to an image or audio file. use with multimodal models, use comma-separated values for multiple files |
+| `--image, --audio, --video FILE` | path to an image, audio, or video file. use with multimodal models, use comma-separated values for multiple files |
 | `--image-min-tokens N` | minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MIN_TOKENS) |
 | `--image-max-tokens N` | maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MAX_TOKENS) |
 | `--chat-template-kwargs STRING` | sets additional params for the json template parser, must be a valid json object string, e.g. '{"key1":"value1","key2":"value2"}'<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_KWARGS) |
@@ -174,6 +174,7 @@
 | `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, granite-4.1, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-vl, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
 | `--skip-chat-parsing, --no-skip-chat-parsing` | force a pure content parser, even if a Jinja template is specified; model will output everything in the content section, including any reasoning and/or tool calls (default: disabled)<br/>(env: LLAMA_ARG_SKIP_CHAT_PARSING) |
 | `--simple-io` | use basic IO for better compatibility in subprocesses and limited consoles |
+| `--log-prompts-dir PATH` | Log prompts to directory (only used for debugging, default: disabled) |
 | `--spec-draft-hf, -hfd, -hfrd, --hf-repo-draft <user>/<model>[:quant]` | Same as --hf-repo, but for the draft model (default: unused)<br/>(env: LLAMA_ARG_SPEC_DRAFT_HF_REPO) |
 | `--spec-draft-threads, -td, --threads-draft N` | number of threads to use during generation (default: same as --threads) |
 | `--spec-draft-threads-batch, -tbd, --threads-batch-draft N` | number of threads to use during batch and prompt processing (default: same as --threads-draft) |
@@ -202,7 +202,7 @@ struct cli_context {

    // TODO: support remote files in the future (http, https, etc)
    std::string load_input_file(const std::string & fname, bool is_media) {
-        std::ifstream file(fname, std::ios::binary);
+        std::ifstream file = fs_open_ifstream(fname, std::ios::binary);
        if (!file) {
            return "";
        }
@@ -13,6 +13,14 @@
 #include <sstream>
 #include <vector>
 #include <memory>
+#include <fstream>
+
+#ifdef _WIN32
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
+#include <windows.h>
+#endif

 // Internal header for clip.cpp

@@ -661,6 +669,22 @@ struct clip_image_f32_batch {
 // common utils
 //

+#ifdef _WIN32
+static std::ifstream open_ifstream_binary(const std::string & fname) {
+    int wlen = MultiByteToWideChar(CP_UTF8, 0, fname.c_str(), -1, NULL, 0);
+    if (!wlen) {
+        throw std::runtime_error("failed to convert filename to UTF-16: " + fname);
+    }
+    std::vector<wchar_t> wfname(wlen);
+    (void)MultiByteToWideChar(CP_UTF8, 0, fname.c_str(), -1, wfname.data(), wlen);
+    return std::ifstream(wfname.data(), std::ios::binary);
+}
+#else
+static std::ifstream open_ifstream_binary(const std::string & fname) {
+    return std::ifstream(fname, std::ios::binary);
+}
+#endif
+
 static std::string string_format(const char * fmt, ...) {
    va_list ap;
    va_list ap2;
@@ -534,7 +534,7 @@ ggml_tensor * clip_graph::build_vit(
 ggml_tensor * clip_graph::build_inp() {
    ggml_tensor * inp_raw = build_inp_raw();
    ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
-    inp = ggml_reshape_2d(ctx0, inp, n_patches, n_embd);
+    inp = ggml_reshape_3d(ctx0, inp, n_patches, n_embd, n_batch);
    inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp));
    if (model.patch_bias) {
        inp = ggml_add(ctx0, inp, model.patch_bias);
@@ -1045,8 +1045,17 @@ struct clip_model_loader {
    bool has_vision = false;
    bool has_audio  = false;

+    mtmd_progress_callback progress_callback = nullptr;
+    void * progress_callback_user_data = nullptr;
+
    // TODO @ngxson : we should not pass clip_ctx here, it should be clip_model
-    clip_model_loader(const char * fname, bool skip_tensors = false) : fname(fname) {
+    clip_model_loader(const char * fname,
+            bool skip_tensors = false,
+            mtmd_progress_callback progress_cb = nullptr,
+            void * progress_user_data = nullptr)
+        : fname(fname),
+          progress_callback(progress_cb),
+          progress_callback_user_data(progress_user_data) {
        struct ggml_context * meta = nullptr;

        struct gguf_init_params params = {
@@ -1675,6 +1684,9 @@ struct clip_model_loader {
                    // note: some models having hparams.image_size == 0, which means the image size is dynamic
                    throw std::runtime_error(string_format("%s: image_size (%d) cannot be negative\n", __func__, hparams.image_size));
                }
+                if (hparams.image_size > 65536) {
+                    throw std::runtime_error(string_format("%s: image_size (%d) is too large (max 65536)\n", __func__, hparams.image_size));
+                }
                if (hparams.patch_size <= 0) {
                    throw std::runtime_error(string_format("%s: patch_size (%d) must be greater than 0\n", __func__, hparams.patch_size));
                }
@@ -1723,6 +1735,19 @@ struct clip_model_loader {
                LOG_INF("%s: audio_n_fft:        %d\n", __func__, hparams.audio_n_fft);
                LOG_INF("%s: audio_window_len:   %d\n", __func__, hparams.audio_window_len);
                LOG_INF("%s: audio_hop_len:      %d\n", __func__, hparams.audio_hop_len);
+
+                // GEMMA4UA is encoder-free: it uses n_mel_bins as a raw-waveform frame size (640) and has no FFT/filterbank, so the mel-range and FFT
+                // checks below do not apply to it.
+                const bool fft_based = model.proj_type != PROJECTOR_TYPE_GEMMA4UA;
+
+                // Validate audio hparams loaded from GGUF metadata
+                if (hparams.n_mel_bins <= 0 || (fft_based && hparams.n_mel_bins > 256)) {
+                    throw std::runtime_error(string_format("%s: n_mel_bins (%d) must be in range [1, 256]\n", __func__, hparams.n_mel_bins));
+                }
+                if (fft_based && (hparams.audio_sample_rate <= 0 || hparams.audio_n_fft <= 0 || hparams.audio_hop_len <= 0 || hparams.audio_window_len <= 0)) {
+                    throw std::runtime_error(string_format("%s: audio hparams invalid: sample_rate=%d n_fft=%d window_len=%d hop_len=%d\n",
+                        __func__, hparams.audio_sample_rate, hparams.audio_n_fft, hparams.audio_window_len, hparams.audio_hop_len));
+                }
            }
            LOG_INF("\n");
            LOG_INF("%s: model size:         %.2f MiB\n", __func__, model_size / 1024.0 / 1024.0);
@@ -1736,7 +1761,7 @@ struct clip_model_loader {
        std::map<std::string, size_t> tensor_offset;
        std::vector<ggml_tensor *> tensors_to_load;

-        auto fin = std::ifstream(fname, std::ios::binary);
+        auto fin = open_ifstream_binary(fname);
        if (!fin) {
            throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
        }
@@ -2771,37 +2796,60 @@ struct clip_model_loader {
        }

        // load data
-        if (!ctx_clip.no_alloc) {
+        {
            std::vector<uint8_t> read_buf;

+            // start loading event
+            if (progress_callback){
+                progress_callback(0.0, progress_callback_user_data);
+            }
+
+            // compute total tensor data size for progress reporting
+            size_t total_data_size = 0;
+            for (auto & t : tensors_to_load) {
+                total_data_size += ggml_nbytes(t);
+            }
+
            // alloc memory and offload data
            ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(ctx_clip.backend);
            ctx_clip.buf.reset(ggml_backend_alloc_ctx_tensors_from_buft(ctx_clip.ctx_data.get(), buft));
            ggml_backend_buffer_set_usage(ctx_clip.buf.get(), GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
-            for (auto & t : tensors_to_load) {
-                ggml_tensor * cur = ggml_get_tensor(ctx_clip.ctx_data.get(), t->name);
-                GGML_ASSERT(cur && "tensor not found in ctx_data");
-                auto it_off = tensor_offset.find(t->name);
-                GGML_ASSERT(it_off != tensor_offset.end() && "no offset for tensor");
-                const size_t offset = it_off->second;
-                fin.seekg(offset, std::ios::beg);
-                if (!fin) {
-                    throw std::runtime_error(string_format("%s: failed to seek for tensor %s\n", __func__, t->name));
-                }
-                size_t num_bytes = ggml_nbytes(cur);
-                if (ggml_backend_buft_is_host(buft)) {
-                    // for the CPU and Metal backend, we can read directly into the tensor
-                    fin.read(reinterpret_cast<char *>(cur->data), num_bytes);
-                } else {
-                    // read into a temporary buffer first, then copy to device memory
-                    read_buf.resize(num_bytes);
-                    fin.read(reinterpret_cast<char *>(read_buf.data()), num_bytes);
-                    ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
+            // read the weight from file
+            if (!ctx_clip.no_alloc) {
+                size_t data_loaded = 0;
+                for (auto & t : tensors_to_load) {
+                    ggml_tensor * cur = ggml_get_tensor(ctx_clip.ctx_data.get(), t->name);
+                    GGML_ASSERT(cur && "tensor not found in ctx_data");
+                    auto it_off = tensor_offset.find(t->name);
+                    GGML_ASSERT(it_off != tensor_offset.end() && "no offset for tensor");
+                    const size_t offset = it_off->second;
+                    fin.seekg(offset, std::ios::beg);
+                    if (!fin) {
+                        throw std::runtime_error(string_format("%s: failed to seek for tensor %s\n", __func__, t->name));
+                    }
+                    size_t num_bytes = ggml_nbytes(cur);
+                    if (ggml_backend_buft_is_host(buft)) {
+                        // for the CPU and Metal backend, we can read directly into the tensor
+                        fin.read(reinterpret_cast<char *>(cur->data), num_bytes);
+                    } else {
+                        // read into a temporary buffer first, then copy to device memory
+                        read_buf.resize(num_bytes);
+                        fin.read(reinterpret_cast<char *>(read_buf.data()), num_bytes);
+                        ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
+                    }
+                    data_loaded += num_bytes;
+                    if (progress_callback && total_data_size > 0) {
+                        const float progress = (float)data_loaded / (float)total_data_size;
+                        if (!progress_callback(progress, progress_callback_user_data)) {
+                            throw std::runtime_error(string_format("%s: model loading cancelled by progress_callback\n", __func__));
+                        }
+                    }
                }
+                LOG_DBG("%s: loaded %zu tensors from %s\n", __func__, tensors_to_load.size(), fname.c_str());
+            } else {
+                LOG_DBG("%s: no_alloc is set, skipping tensor data loading (%zu tensors)\n", __func__, tensors_to_load.size());
            }
            fin.close();
-
-            LOG_DBG("%s: loaded %zu tensors from %s\n", __func__, tensors_to_load.size(), fname.c_str());
        }

    }
@@ -2831,6 +2879,12 @@ struct clip_model_loader {
            img.set_size({sz, sz}, false, false);
            LOG_INF("%s: warmup with image size = %d x %d\n", __func__, sz, sz);
        } else {
+            // GEMMA4UA uses n_mel_bins as a raw-waveform frame size (640), not a mel-bin count,
+            // so the [1, 256] bound only applies to FFT-based models.
+            const bool fft_based = ctx_clip.model.proj_type != PROJECTOR_TYPE_GEMMA4UA;
+            if (hparams.n_mel_bins <= 0 || (fft_based && hparams.n_mel_bins > 256)) {
+                throw std::runtime_error(string_format("%s: invalid n_mel_bins (%d), must be in [1, 256]\n", __func__, hparams.n_mel_bins));
+            }
            img.set_size({hparams.warmup_audio_size, hparams.n_mel_bins}, false, false);
            LOG_INF("%s: warmup with audio size = %d\n", __func__, hparams.warmup_audio_size);
        }
@@ -2994,7 +3048,13 @@ struct clip_model_loader {
            }
            return;
        }
-        output = gguf_get_val_u32(ctx_gguf.get(), i);
+        const uint32_t val = gguf_get_val_u32(ctx_gguf.get(), i);
+        // sanity check
+        if (val > (uint32_t) INT32_MAX) {
+            throw std::runtime_error(string_format("%s: value %u for key '%s' exceeds INT32_MAX\n",
+                __func__, val, key.c_str()));
+        }
+        output = (int) val;
    }

    void get_f32(const std::string & key, float & output, bool required = true) const {
@@ -3077,7 +3137,10 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
    clip_ctx * ctx_audio = nullptr;

    try {
-        clip_model_loader loader(fname);
+        clip_model_loader loader(fname,
+            /* skip_tensors */ false,
+            ctx_params.progress_callback,
+            ctx_params.progress_callback_user_data);
        bool skip_audio = false;

        if (loader.has_vision) {
@@ -24,6 +24,9 @@ struct clip_image_size {
        return !(*this == other);
    }
    int area() const {
+        // avoid overflow when computing area
+        GGML_ASSERT(width  >= 0 && width  <= 46000);
+        GGML_ASSERT(height >= 0 && height <= 46000);
        return width * height;
    }
 };
@@ -51,6 +54,8 @@ struct clip_context_params {
    ggml_backend_sched_eval_callback cb_eval;
    void * cb_eval_user_data;
    bool no_alloc;
+    mtmd_progress_callback progress_callback;
+    void * progress_callback_user_data;
 };

 struct clip_init_result {
@@ -8,7 +8,9 @@ ggml_cgraph * clip_graph_internvl::build() {
    ggml_tensor * inp = build_inp();

    // add CLS token
-    inp = ggml_concat(ctx0, inp, model.class_embedding, 1);
+    ggml_tensor * cls_repeated = ggml_repeat_4d(ctx0, model.class_embedding,
+            model.class_embedding->ne[0], 1, n_batch, 1);
+    inp = ggml_concat(ctx0, inp, cls_repeated, 1);

    // The larger models use a different ViT, which uses RMS norm instead of layer norm
    // ref: https://github.com/ggml-org/llama.cpp/pull/13443#issuecomment-2869786188
@@ -24,14 +26,15 @@ ggml_cgraph * clip_graph_internvl::build() {
                            nullptr);

    // remove CLS token
-    cur = ggml_view_2d(ctx0, cur,
-        n_embd, n_patches,
-        ggml_row_size(cur->type, n_embd), 0);
+    cur = ggml_view_3d(ctx0, cur,
+        n_embd, n_patches, n_batch,
+        cur->nb[1], cur->nb[2], 0);
+    cur = ggml_cont(ctx0, cur);

    // pixel shuffle
    {
        const int scale_factor = model.hparams.n_merge;
-        const int bsz    = 1; // batch size, always 1 for now since we don't support batching
+        const int bsz    = n_batch;
        const int height = n_patches_y;
        const int width  = n_patches_x;
        GGML_ASSERT(scale_factor > 0);
@@ -44,9 +47,10 @@ ggml_cgraph * clip_graph_internvl::build() {
            bsz);
        cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
        // flatten to 2D
-        cur = ggml_cont_2d(ctx0, cur,
+        cur = ggml_cont_3d(ctx0, cur,
            n_embd * scale_factor * scale_factor,
-            cur->ne[1] * cur->ne[2]);
+            cur->ne[1] * cur->ne[2],
+            cur->ne[3]);
    }

    // projector (always using GELU activation)
@@ -80,6 +80,7 @@ struct clip_graph_minicpmv4_6 : clip_graph {
 struct clip_graph_internvl : clip_graph {
    clip_graph_internvl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
    ggml_cgraph * build() override;
+    bool support_batch() const override { return true; }
 };

 struct clip_graph_nemotron_v2_vl : clip_graph {
@@ -32,8 +32,8 @@ void mtmd_audio_cache::fill_hann_window(uint32_t length, bool periodic) {
    }
 }

-void mtmd_audio_cache::fill_mel_filterbank_matrix(int   n_mel,
-                                                  int   n_fft,
+void mtmd_audio_cache::fill_mel_filterbank_matrix(int64_t n_mel,
+                                                  int64_t n_fft,
                                                  int   sample_rate,
                                                  float fmin,
                                                  float fmax,
@@ -86,11 +86,16 @@ void mtmd_audio_cache::fill_mel_filterbank_matrix(int   n_mel,
        hz_pts[i] = mel_to_hz(mel_pts[i]);
    }

-    const int n_fft_bins = n_fft / 2 + 1;
+    const int64_t n_fft_bins = n_fft / 2 + 1;
+
+    // Validate allocation size
+    if ((size_t)n_mel * (size_t)n_fft_bins > SIZE_MAX) {
+        GGML_ASSERT(false && "mel filterbank allocation too large");
+    }

    // filterbank
-    std::vector<float> out(n_mel * n_fft_bins, 0);
-    for (int m = 0; m < n_mel; ++m) {
+    std::vector<float> out((size_t)n_mel * (size_t)n_fft_bins, 0);
+    for (int64_t m = 0; m < n_mel; ++m) {
        const double f_left   = hz_pts[m];
        const double f_center = hz_pts[m + 1];
        const double f_right  = hz_pts[m + 2];
@@ -266,8 +271,8 @@ static void ifft(const mtmd_audio_cache & cache, float * in, int N, float * out)
 }

 struct filter_params {
-    int32_t n_mel;
-    int32_t n_fft_bins;
+    int64_t n_mel;
+    int64_t n_fft_bins;
    int32_t hann_window_size;
    int32_t hop_length;
    int32_t sample_rate;
@@ -293,8 +298,8 @@ static void log_mel_spectrogram_worker_thread(int                        ith,
    std::vector<float> fft_in(frame_size * 2, 0.0);
    std::vector<float> fft_out(frame_size * 2 * 2 * 2);

-    int n_fft_bins = params.n_fft_bins;
-    int i = ith;
+    int64_t n_fft_bins = params.n_fft_bins;
+    int64_t i = ith;

    const auto & filters = cache.filters;

@@ -302,17 +307,18 @@ static void log_mel_spectrogram_worker_thread(int                        ith,
    GGML_ASSERT(n_fft_bins == 1 + (frame_size / 2));
    GGML_ASSERT(cache.sin_vals.size() == cache.cos_vals.size());
    // calculate FFT only when fft_in are not all zero
-    for (; i < std::min(n_samples / frame_step + 1, out.n_len); i += n_threads) {
-        const int offset = i * frame_step;
+    for (; i < std::min((int64_t)(n_samples / frame_step + 1), out.n_len); i += n_threads) {
+        const int64_t offset = i * frame_step;

        // apply Hann window (~10% faster)
-        for (int j = 0; j < std::min(frame_size, n_samples - offset); j++) {
+        const int valid_len = std::min(frame_size, std::max(0, n_samples - (int)offset));
+        for (int j = 0; j < valid_len; j++) {
            fft_in[j] = hann[j] * samples[offset + j];
        }

        // fill the rest with zeros
-        if (n_samples - offset < frame_size) {
-            std::fill(fft_in.begin() + (n_samples - offset), fft_in.end(), 0.0);
+        if (valid_len < frame_size) {
+            std::fill(fft_in.begin() + valid_len, fft_in.end(), 0.0);
        }

        // FFT
@@ -325,7 +331,7 @@ static void log_mel_spectrogram_worker_thread(int                        ith,
        }

        // mel spectrogram
-        for (int j = 0; j < out.n_mel; j++) {
+        for (int64_t j = 0; j < out.n_mel; j++) {
            double sum = 0.0;
            // unroll loop (suggested by GH user @lunixbochs)
            int k = 0;
@@ -339,21 +345,21 @@ static void log_mel_spectrogram_worker_thread(int                        ith,
            }
            // handle n_fft remainder
            for (; k < n_fft_bins; k++) {
-                sum += fft_out[k] * filters.data[j * n_fft_bins + k];
+                sum += fft_out[k] * filters.data[(size_t)j * n_fft_bins + k];
            }
            sum = std::max(sum, (double)params.mel_floor);
            sum = params.use_natural_log
                ? log(sum)
                : log10(sum);
-            out.data[j * out.n_len + i] = sum;
+            out.data[(size_t)j * out.n_len + i] = sum;
        }
    }

    // Otherwise fft_out are all zero
    double sum = params.use_natural_log ? log(1e-10) : log10(1e-10);
    for (; i < out.n_len; i += n_threads) {
-        for (int j = 0; j < out.n_mel; j++) {
-            out.data[j * out.n_len + i] = sum;
+        for (int64_t j = 0; j < out.n_mel; j++) {
+            out.data[(size_t)j * out.n_len + i] = sum;
        }
    }
 }
@@ -437,16 +443,21 @@ static bool log_mel_spectrogram(
    GGML_ASSERT(params.hop_length > 0);
    out.n_mel = params.n_mel;
    out.n_len = (n_samples - frame_size) / frame_step + 1;
-    // TODO: handle these checks better
-    if (out.n_mel > 0 && (unsigned long)out.n_len > SIZE_MAX / out.n_mel) {
-        LOG_ERR("%s: size overflow\n", __func__);
+    // Validate dimensions before allocation to prevent integer overflow
+    if (out.n_mel <= 0 || out.n_len <= 0) {
+        LOG_ERR("%s: invalid mel dimensions n_mel=%lld n_len=%lld\n", __func__, (long long)out.n_mel, (long long)out.n_len);
+        return false;
+    }
+    const size_t total_size = (size_t)out.n_mel * (size_t)out.n_len;
+    if (total_size > SIZE_MAX / sizeof(float)) {
+        LOG_ERR("%s: size overflow: n_mel=%lld n_len=%lld\n", __func__, (long long)out.n_mel, (long long)out.n_len);
        return false;
    }
    if (n_samples < frame_size) {
        LOG_ERR("%s: not enough samples after padding\n", __func__);
        return false;
    }
-    out.data.resize(out.n_mel * out.n_len);
+    out.data.resize(total_size);

    {
        std::vector<std::thread> workers(n_threads - 1);
@@ -464,38 +475,39 @@ static bool log_mel_spectrogram(
        }
    }

-    const int effective_n_len = n_samples_in / frame_step;
+    const int64_t effective_n_len = n_samples_in / frame_step;
    if (params.norm_per_feature) {
        GGML_ASSERT(effective_n_len > 1);
-        for (int i = 0; i < out.n_mel; i++) {
+        for (int64_t i = 0; i < out.n_mel; i++) {
            double mean = 0;
-            for (int j = 0; j < effective_n_len; ++j) {
-                mean += out.data[i * out.n_len + j];
+            for (int64_t j = 0; j < effective_n_len; ++j) {
+                mean += out.data[(size_t)i * out.n_len + j];
            }
            mean /= effective_n_len;

            double var = 0.0;
-            for (int j = 0; j < effective_n_len; ++j) {
-                const double value = out.data[i * out.n_len + j] - mean;
+            for (int64_t j = 0; j < effective_n_len; ++j) {
+                const double value = out.data[(size_t)i * out.n_len + j] - mean;
                var += value * value;
            }
            var /= effective_n_len - 1;  // unbiased
            const double mstd = std::sqrt(var + 1e-5);

-            for (int j = 0; j < effective_n_len; ++j) {
-                auto &value = out.data[i * out.n_len + j];
+            for (int64_t j = 0; j < effective_n_len; ++j) {
+                auto &value = out.data[(size_t)i * out.n_len + j];
                value        = (value - mean) / mstd;
            }

            // pad the rest with zeros
-            for (int j = effective_n_len; j < out.n_len; ++j) {
-                out.data[i * out.n_len + j] = 0.0;
+            for (int64_t j = effective_n_len; j < out.n_len; ++j) {
+                out.data[(size_t)i * out.n_len + j] = 0.0;
            }
        }
    } else if (!params.no_padding) {
        // Whisper-style clamping and normalization (NOT used by Gemma4)
        double mmax = -1e20;
-        for (int i = 0; i < out.n_mel*out.n_len; i++) {
+        const size_t mel_size = (size_t)out.n_mel * (size_t)out.n_len;
+        for (size_t i = 0; i < mel_size; i++) {
            if (out.data[i] > mmax) {
                mmax = out.data[i];
            }
@@ -503,7 +515,7 @@ static bool log_mel_spectrogram(

        mmax -= 8.0;

-        for (int i = 0; i < out.n_mel*out.n_len; i++) {
+        for (size_t i = 0; i < mel_size; i++) {
            if (out.data[i] < mmax) {
                out.data[i] = mmax;
            }
@@ -582,13 +594,13 @@ bool mtmd_audio_preprocessor_whisper::preprocess(const float *                 s
    // because the cgraph in clip.cpp only accepts 3000 frames each, we need to split the mel
    // we always expect the mel to have 3000 silent frames at the end
    if (DEBUG) {
-        printf("output: n_mel = %d, n_len = %d\n", out_full.n_mel, out_full.n_len);
+        printf("output: n_mel = %d, n_len = %d\n", (int) out_full.n_mel, (int) out_full.n_len);
    }
    const size_t frames_per_chunk = 3000;
    GGML_ASSERT((size_t) out_full.n_len > frames_per_chunk);
    for (size_t off = 0; off < (size_t) out_full.n_len; off += frames_per_chunk) {
-        int n_len = std::min(frames_per_chunk, (size_t) out_full.n_len - off);
-        if ((size_t) n_len < frames_per_chunk) {
+        int64_t n_len = std::min((int64_t)frames_per_chunk, out_full.n_len - (int64_t)off);
+        if (n_len < (int64_t)frames_per_chunk) {
            break;  // last incomplete chunk will always be a padded chunk, safe to ignore
        }

@@ -596,10 +608,10 @@ bool mtmd_audio_preprocessor_whisper::preprocess(const float *                 s
        out_chunk.n_len     = n_len;
        out_chunk.n_mel     = out_full.n_mel;
        out_chunk.n_len_org = out_full.n_mel;  // unused
-        out_chunk.data.reserve(out_chunk.n_mel * out_chunk.n_len);
+        out_chunk.data.reserve((size_t)out_chunk.n_mel * (size_t)out_chunk.n_len);

-        for (int i = 0; i < out_full.n_mel; i++) {
-            auto src = out_full.data.begin() + i * out_full.n_len + off;
+        for (int64_t i = 0; i < out_full.n_mel; i++) {
+            auto src = out_full.data.begin() + (size_t)i * out_full.n_len + off;
            out_chunk.data.insert(out_chunk.data.end(), src, src + frames_per_chunk);
        }

@@ -681,8 +693,8 @@ bool mtmd_audio_preprocessor_qwen3a::preprocess(const float *                 sa

    // The effective frame count: center-padded STFT gives ~n_samples/hop_length frames.
    // We take min(mel_full.n_len, n_samples/hop + 1) to avoid including excess frames.
-    const int n_eff = std::min(mel_full.n_len,
-                               (int)(n_samples / hparams.audio_hop_len) + 1);
+    const int64_t n_eff = std::min(mel_full.n_len,
+                               (int64_t)(n_samples / hparams.audio_hop_len) + 1);

    // Split into inference windows matching n_window_infer=800 from model config.
    // Each window is padded to the next multiple of chunk_size for the cgraph.
@@ -690,18 +702,18 @@ bool mtmd_audio_preprocessor_qwen3a::preprocess(const float *                 sa
    const int chunk_size  = 100; // conv sub-chunk size (n_window * 2, n_window=50)
    const int window_size = 800; // mel frames per forward pass (n_window_infer=800)

-    for (int off = 0; off < n_eff; off += window_size) {
-        const int win_eff    = std::min(window_size, n_eff - off);
-        const int n_chunks   = (win_eff + chunk_size - 1) / chunk_size;
-        const int n_padded   = n_chunks * chunk_size;
+    for (int64_t off = 0; off < n_eff; off += window_size) {
+        const int64_t win_eff  = std::min((int64_t)window_size, n_eff - off);
+        const int64_t n_chunks  = (win_eff + chunk_size - 1) / chunk_size;
+        const int64_t n_padded  = n_chunks * chunk_size;

        mtmd_audio_mel out;
        out.n_mel     = mel_full.n_mel;
        out.n_len     = n_padded;
        out.n_len_org = win_eff;
-        out.data.assign(out.n_mel * out.n_len, 0.0f);
-        for (int m = 0; m < out.n_mel; m++) {
-            const int copy_len = std::min(win_eff, mel_full.n_len - off);
+        out.data.assign((size_t)out.n_mel * (size_t)out.n_len, 0.0f);
+        for (int64_t m = 0; m < out.n_mel; m++) {
+            const int64_t copy_len = std::min((int64_t)win_eff, mel_full.n_len - off);
            if (copy_len > 0) {
                std::copy(mel_full.data.begin() + (size_t)m * mel_full.n_len + off,
                          mel_full.data.begin() + (size_t)m * mel_full.n_len + off + copy_len,
@@ -823,37 +835,38 @@ bool mtmd_audio_preprocessor_granite_speech::preprocess(const float *
    }

    double mmax = -1e20;
-    for (int i = 0; i < mel.n_mel * mel.n_len; i++) {
+    const size_t mel_size = (size_t)mel.n_mel * (size_t)mel.n_len;
+    for (size_t i = 0; i < mel_size; i++) {
        if (mel.data[i] > mmax) {
            mmax = mel.data[i];
        }
    }
    mmax -= 8.0;

-    for (int i = 0; i < mel.n_mel * mel.n_len; i++) {
+    for (size_t i = 0; i < mel_size; i++) {
        if (mel.data[i] < mmax) {
            mel.data[i] = mmax;
        }
        mel.data[i] = (mel.data[i] + 4.0) / 4.0;
    }

-    int n_frames = mel.n_len;
+    int64_t n_frames = mel.n_len;
    if (n_frames % 2 == 1) {
        n_frames--;
    }
-    const int n_mel     = mel.n_mel;
-    const int n_stacked = n_frames / 2;
+    const int64_t n_mel     = mel.n_mel;
+    const int64_t n_stacked = n_frames / 2;

    mtmd_audio_mel stacked;
    stacked.n_mel     = 2 * n_mel;
    stacked.n_len     = n_stacked;
-    stacked.n_len_org = (int)n_samples;
-    stacked.data.resize(2 * n_mel * n_stacked);
+    stacked.n_len_org = (int64_t)n_samples;
+    stacked.data.resize((size_t)2 * (size_t)n_mel * (size_t)n_stacked);

-    for (int t = 0; t < n_stacked; t++) {
-        for (int m = 0; m < n_mel; m++) {
-            stacked.data[m * n_stacked + t] = mel.data[m * mel.n_len + 2 * t];
-            stacked.data[(m + n_mel) * n_stacked + t] = mel.data[m * mel.n_len + 2 * t + 1];
+    for (int64_t t = 0; t < n_stacked; t++) {
+        for (int64_t m = 0; m < n_mel; m++) {
+            stacked.data[(size_t)m * n_stacked + t] = mel.data[(size_t)m * mel.n_len + 2 * t];
+            stacked.data[(size_t)(m + n_mel) * n_stacked + t] = mel.data[(size_t)m * mel.n_len + 2 * t + 1];
        }
    }

@@ -921,8 +934,8 @@ bool mtmd_audio_preprocessor_gemma4a::preprocess(const float *                 s
        const int hop = hparams.audio_hop_len;
        const int n_with_left = (int)chunk_len + pad_left;
        // PyTorch: unfold(size=frame_length+1, step=hop) on semicausal-padded waveform
-        const int pt_frames = (n_with_left - (hparams.audio_window_len + 1)) / hop + 1;
-        const int n_padded_needed = (pt_frames - 1) * hop + fft_size;
+        const int64_t pt_frames = (n_with_left - (hparams.audio_window_len + 1)) / hop + 1;
+        const int64_t n_padded_needed = (pt_frames - 1) * hop + fft_size;
        const int total_pad = std::max((int)(n_padded_needed - (int)chunk_len), pad_left);
        std::vector<float> padded_samples(total_pad + chunk_len, 0.0f);
        std::copy(chunk_ptr, chunk_ptr + chunk_len, padded_samples.data() + pad_left);
@@ -10,16 +10,16 @@
 #define MTMD_INTERNAL_HEADER

 struct mtmd_audio_mel {
-    int n_len;
-    int n_len_org;
-    int n_mel;
+    int64_t n_len;
+    int64_t n_len_org;
+    int64_t n_mel;

    std::vector<float> data;
 };

 struct mtmd_audio_mel_filters {
-    int32_t n_mel;
-    int32_t n_fft;
+    int64_t n_mel;
+    int64_t n_fft;

    std::vector<float> data;
 };
@@ -39,8 +39,8 @@ struct mtmd_audio_cache {

    // Build mel filterbank matrix [n_mel × n_fft_bins] at runtime.
    // n_fft_bins must be (N_fft / 2 + 1). Example: if N_fft=512 -> n_fft_bins=257.
-    void fill_mel_filterbank_matrix(int   n_mel,
-                                    int   n_fft,
+    void fill_mel_filterbank_matrix(int64_t n_mel,
+                                    int64_t n_fft,
                                    int   sample_rate,               // e.g. 16000
                                    float fmin             = 0.0f,   // e.g. 0.0
                                    float fmax             = -1.0f,  // e.g. sr/2; pass -1 for auto
@@ -32,9 +32,9 @@ static volatile bool g_is_generating = false;
 static volatile bool g_is_interrupted = false;

 /**
- * Please note that this is NOT a production-ready stuff.
+ * Please note that this is NOT a production-ready binary.
 * It is a playground for trying multimodal support in llama.cpp.
- * For contributors: please keep this code simple and easy to understand.
+ * For contributors: please keep this code simple and easy to understand. Do not add unnecessary complexity. The goal is to have a simple CLI for testing multimodal support.
 */

 static void show_additional_info(int /*argc*/, char ** argv) {
@@ -65,6 +65,14 @@ static void sigint_handler(int signo) {
 }
 #endif

+// this is only used by tests.sh to capture the response ; it's not meant to be used in production
+static void inject_test_response_marker() {
+    const char * env = std::getenv("MTMD_TEST_RESPONSE_MARKER");
+    if (env) {
+        LOG("%s\n", env);
+    }
+}
+
 struct mtmd_cli_context {
    mtmd::context_ptr ctx_vision;
    common_init_result_ptr llama_init;
@@ -79,6 +87,8 @@ struct mtmd_cli_context {
    mtmd::bitmaps bitmaps;
    std::vector<mtmd_helper::video_ptr> videos;

+    mtmd::batch_ptr mbatch;
+
    // chat template
    common_chat_templates_ptr tmpls;
    std::vector<common_chat_msg> chat_history;
@@ -233,6 +243,8 @@ static std::string chat_add_and_format(mtmd_cli_context & ctx, common_chat_msg &
 }

 static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg) {
+    inject_test_response_marker();
+
    bool add_bos = ctx.chat_history.empty();
    auto formatted_chat = chat_add_and_format(ctx, msg);
    LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.c_str());
@@ -259,20 +271,95 @@ static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg) {
    ctx.bitmaps.entries.clear();
    ctx.videos.clear();

-    llama_pos new_n_past;
-    if (mtmd_helper_eval_chunks(ctx.ctx_vision.get(),
-                ctx.lctx, // lctx
-                chunks.ptr.get(), // chunks
-                ctx.n_past, // n_past
-                0, // seq_id
-                ctx.n_batch, // n_batch
-                true, // logits_last
-                &new_n_past)) {
-        LOG_ERR("Unable to eval prompt\n");
-        return 1;
-    }
+    // batch encode all media chunks, then decode each
+    size_t n_chunks = mtmd_input_chunks_size(chunks.ptr.get());
+    for (size_t i = 0; i < n_chunks; i++) {
+        auto chunk = mtmd_input_chunks_get(chunks.ptr.get(), i);
+        auto chunk_type = mtmd_input_chunk_get_type(chunk);

-    ctx.n_past = new_n_past;
+        if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
+            // decode text chunk
+            llama_pos new_n_past = ctx.n_past;
+            res = mtmd_helper_eval_chunk_single(ctx.ctx_vision.get(),
+                        ctx.lctx,
+                        chunk,
+                        ctx.n_past,
+                        0, // seq_id
+                        ctx.n_batch,
+                        i == n_chunks - 1, // logits_last
+                        &new_n_past);
+            if (res != 0) {
+                LOG_ERR("Unable to eval text chunk %zu\n", i);
+                return 1;
+            }
+            ctx.n_past = new_n_past;
+        } else {
+            // media chunk: try to get embd from existing batch, or create a new batch
+            float * embd = nullptr;
+            if (ctx.mbatch) {
+                embd = mtmd_batch_get_output_embd(ctx.mbatch.get(), chunk);
+
+                if (embd) {
+                    LOG_DBG("found embd for media chunk %zu in existing batch\n", i);
+                } else {
+                    LOG_DBG("media chunk %zu not found in existing batch, creating new batch\n", i);
+                }
+            }
+
+            if (!embd) {
+                // create and encode a new batch with as many media chunks as possible
+                ctx.mbatch.reset(mtmd_batch_init(ctx.ctx_vision.get()));
+                res = mtmd_batch_add_chunk(ctx.mbatch.get(), chunk);
+                GGML_ASSERT(res == 0); // first chunk must always succeed
+
+                int n_added = 1;
+                // add as many subsequent media chunks as possible
+                for (size_t j = i + 1; j < n_chunks; j++) {
+                    auto next_chunk = mtmd_input_chunks_get(chunks.ptr.get(), j);
+                    auto next_type = mtmd_input_chunk_get_type(next_chunk);
+                    if (next_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
+                        break; // text chunk splits the batch
+                    }
+                    res = mtmd_batch_add_chunk(ctx.mbatch.get(), next_chunk);
+                    if (res != 0) {
+                        break; // batch full or incompatible
+                    }
+                    n_added++;
+                }
+
+                int64_t time_start = ggml_time_ms();
+                LOG_INF("encoding mtmd batch, n_chunks = %d (done = %zu, total = %zu)\n", n_added, i, n_chunks);
+                res = mtmd_batch_encode(ctx.mbatch.get());
+                if (res != 0) {
+                    LOG_ERR("Failed to encode mtmd batch, res = %d\n", res);
+                    return 1;
+                }
+                LOG_INF("mtmd batch encoding done in %d ms\n", (int)(ggml_time_ms() - time_start));
+
+                embd = mtmd_batch_get_output_embd(ctx.mbatch.get(), chunk);
+            }
+
+            GGML_ASSERT(embd != nullptr);
+
+            llama_pos new_n_past = ctx.n_past;
+            res = mtmd_helper_decode_image_chunk(ctx.ctx_vision.get(),
+                        ctx.lctx,
+                        chunk,
+                        embd,
+                        ctx.n_past,
+                        0, // seq_id
+                        ctx.n_batch,
+                        &new_n_past,
+                        nullptr, // callback
+                        nullptr  // user_data
+                    );
+            if (res != 0) {
+                LOG_ERR("Unable to decode media chunk %zu\n", i);
+                return 1;
+            }
+            ctx.n_past = new_n_past;
+        }
+    }

    LOG("\n");

@@ -309,6 +396,9 @@ int main(int argc, char ** argv) {

    int n_predict = params.n_predict < 0 ? INT_MAX : params.n_predict;

+    console::init(params.simple_io, params.use_color);
+    atexit([]() { console::cleanup(); });
+
    // Ctrl+C handling
    {
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
@@ -582,13 +582,29 @@ mtmd_helper_bitmap_wrapper mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx,
 }

 mtmd_helper_bitmap_wrapper mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname, bool placeholder) {
-    std::vector<unsigned char> buf;
+#ifdef _WIN32
+    int wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, NULL, 0);
+    if (!wlen) {
+        LOG_ERR("Unable to convert filename to UTF-16: %s\n", fname);
+        return {nullptr, nullptr};
+    }
+    std::vector<wchar_t> wfname(wlen);
+    wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, wfname.data(), wlen);
+    if (!wlen) {
+        LOG_ERR("Unable to convert filename to UTF-16: %s\n", fname);
+        return {nullptr, nullptr};
+    }
+    FILE * f = _wfopen(wfname.data(), L"rb");
+#else
    FILE * f = fopen(fname, "rb");
+#endif
    if (!f) {
        LOG_ERR("Unable to open file %s: %s\n", fname, strerror(errno));
        return {nullptr, nullptr};
    }

+    std::vector<unsigned char> buf;
+
    fseek(f, 0, SEEK_END);
    long file_size = ftell(f);
    fseek(f, 0, SEEK_SET);
@@ -251,6 +251,8 @@ mtmd_context_params mtmd_context_params_default() {
        /* cb_eval           */ nullptr,
        /* cb_eval_user_data */ nullptr,
        /* batch_max_tokens  */ 1024,
+        /* progress_callback */ nullptr,
+        /* progress_callback_user_data */ nullptr,
    };
    return params;
 }
@@ -345,6 +347,8 @@ struct mtmd_context {
            /* cb_eval           */ ctx_params.cb_eval,
            /* cb_eval_user_data */ ctx_params.cb_eval_user_data,
            /* no_alloc          */ no_alloc,
+            /* progress_callback */ ctx_params.progress_callback,
+            /* progress_callback_user_data */ ctx_params.progress_callback_user_data,
        };

        auto res = clip_init(mmproj_fname, ctx_clip_params);
@@ -1295,9 +1299,12 @@ struct mtmd_tokenizer {
            for (auto & mel_spec : mel_spec_chunks) {
                const bool is_placeholder = mel_spec.data.empty();

+                // Validate dimensions fit in clip_image_size (int)
+                GGML_ASSERT(mel_spec.n_len <= INT32_MAX && mel_spec.n_len >= 0);
+                GGML_ASSERT(mel_spec.n_mel <= INT32_MAX && mel_spec.n_mel >= 0);
                clip_image_f32 mel_f32;
                mel_f32.set_size(
-                    {mel_spec.n_len, mel_spec.n_mel},
+                    {(int)mel_spec.n_len, (int)mel_spec.n_mel},
                    is_placeholder, /* is_audio */ true);
                mel_f32.cpy_buf(mel_spec.data);

@@ -2130,9 +2137,12 @@ std::map<ggml_backend_dev_t, size_t> mtmd_get_memory_usage(const char * mmproj_f
    mtmd::context_ptr ctx;
    auto saved_log_callback = g_logger_state.log_callback;
    auto saved_log_user_data = g_logger_state.log_callback_user_data;
+
+    ctx_params.progress_callback = nullptr;
+
    try {
        mtmd_log_set(stub_log_callback, nullptr); // suppress logging
-        ctx.reset(new mtmd_context(mmproj_fname, nullptr, ctx_params));
+        ctx.reset(new mtmd_context(mmproj_fname, nullptr, ctx_params, true));
        mtmd_log_set(saved_log_callback, saved_log_user_data); // restore log callback
        std::map<ggml_backend_dev_t, size_t> total_mem;
        auto merge = [&](const struct clip_ctx * c) {
@@ -83,6 +83,8 @@ typedef struct mtmd_input_chunks mtmd_input_chunks;
 typedef struct mtmd_input_text   mtmd_input_text;
 typedef struct mtmd_batch        mtmd_batch;

+typedef bool (*mtmd_progress_callback)(float progress, void * user_data);
+
 struct mtmd_context_params {
    bool use_gpu;
    bool print_timings;
@@ -104,6 +106,12 @@ struct mtmd_context_params {
    int32_t batch_max_tokens; // maximum number of output tokens in a batch
                              // (note: this is not a hard-limit, the first image will always be added even if it exceeds this limit)
                              // (default: 1024)
+
+    // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
+    // If the provided progress_callback returns true, model loading continues.
+    // If it returns false, model loading is immediately aborted.
+    mtmd_progress_callback progress_callback;
+    void * progress_callback_user_data;
 };

 MTMD_API const char * mtmd_default_marker(void);
@@ -13,6 +13,8 @@ mkdir -p $SCRIPT_DIR/output
 PROJ_ROOT="$SCRIPT_DIR/../.."
 cd $PROJ_ROOT

+export MTMD_TEST_RESPONSE_MARKER="<MTMD_TEST_RESPONSE_MARKER>"
+
 # Check if the first argument is "big", then run test with big models
 # This is useful if we're running the script on a larger machine, so we can test the big models
 RUN_BIG_TESTS=false
@@ -28,6 +30,15 @@ if [ "${1:-}" = "huge" ]; then
    echo "Include BIG and HUGE models..."
 fi

+USE_VIDEO=false
+if [ "${1:-}" = "video" ]; then
+    USE_VIDEO=true
+    echo "Using video as input..."
+    # behavior of USE_VIDEO:
+    # do NOT check if the output contains "new york", only verify if the exit code is 0
+    # when printing the result, print the OK/FAIL line then print the generated text
+fi
+
 # Check if the second argument is "flash", then enable flash attention
 # This is useful to test if flash attention off works correctly
 FLASH_ATTN="on"
@@ -50,13 +61,20 @@ add_test_vision() {
    if [ $# -gt 0 ]; then
        extra_args=$(printf " %q" "$@")
    fi
+    if [ "$USE_VIDEO" = true ]; then
+        arr_file+=("test-3.mp4")
+    else
+        arr_file+=("test-1.jpeg")
+    fi
    arr_prefix+=("[vision]")
    arr_hf+=("$hf")
    arr_extra_args+=("$extra_args")
-    arr_file+=("test-1.jpeg")
 }

 add_test_audio() {
+    if [ "$USE_VIDEO" = true ]; then
+        return 0
+    fi
    local hf=$1
    shift
    local extra_args=""
@@ -166,19 +184,35 @@ for i in "${!arr_hf[@]}"; do
        cmd+=" -p \"what is the publisher name of the newspaper?\""
    fi

-    output=$(eval "$cmd" 2>&1 | tee /dev/tty)
+    exit_code=0
+    output=$(eval "$cmd" 2>&1 | tee /dev/tty) || exit_code=$?

    echo "$output" > $SCRIPT_DIR/output/$bin-$(echo "$hf" | tr '/' '-').log

-    # either contains "new york" or both "men" and "walk"
-    if echo "$output" | grep -iq "new york" \
-            || (echo "$output" | grep -iq "men" && echo "$output" | grep -iq "walk")
-    then
-        result="$prefix \033[32mOK\033[0m:   $hf"
+    if [ "$USE_VIDEO" = true ]; then
+        # for video, only check exit code; do not grep for "new york"
+        if [ $exit_code -eq 0 ]; then
+            result="$prefix \033[32mOK\033[0m:   $hf"
+        else
+            result="$prefix \033[31mFAIL\033[0m: $hf"
+        fi
+        # append generated text (after the response marker)
+        generated_text=$(echo "$output" | sed "1,/${MTMD_TEST_RESPONSE_MARKER}/d" | tail -10)
+        if [ -n "$generated_text" ]; then
+            result+="\n$generated_text"
+        fi
+        echo -e "$result"
    else
-        result="$prefix \033[31mFAIL\033[0m: $hf"
+        # either contains "new york" or both "men" and "walk"
+        if echo "$output" | grep -iq "new york" \
+                || (echo "$output" | grep -iq "men" && echo "$output" | grep -iq "walk")
+        then
+            result="$prefix \033[32mOK\033[0m:   $hf"
+        else
+            result="$prefix \033[31mFAIL\033[0m: $hf"
+        fi
+        echo -e "$result"
    fi
-    echo -e "$result"
    arr_res+=("$result")

    echo ""
@@ -180,6 +180,17 @@ That requires `JSON.stringify` when formatted to message content:
 }
 ```

+### Router mode: how child <--> router communicates
+
+Upon spawning a new child process using `subprocess`, both child and router listen to the stdout/stderr (combined)
+
+For the direction from child to router:
+- Generic messages are logs, it will be forwarded to router's stdout
+- Special state update messages are prefixed by `cmd_child_to_router:state:`, followed by a JSON. See `server_models::handle_child_state` for more
+
+For the direction from router to child:
+- When server sends `cmd_router_to_child:exit`, the child should exit gracefully --> if after `DEFAULT_STOP_TIMEOUT` and the child is still running, force-kill it
+
 ### Model management API (router mode)

 Model management API was added via PR [#23976](https://github.com/ggml-org/llama.cpp/pull/23976)
@@ -175,13 +175,12 @@ For the full list of features, please refer to [server's changelog](https://gith
 | `-np, --parallel N` | number of server slots (default: -1, -1 = auto)<br/>(env: LLAMA_ARG_N_PARALLEL) |
 | `-cb, --cont-batching, -nocb, --no-cont-batching` | whether to enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
 | `-mm, --mmproj FILE` | path to a multimodal projector file. see tools/mtmd/README.md<br/>note: if -hf is used, this argument can be omitted<br/>(env: LLAMA_ARG_MMPROJ) |
-| `-tk, --talker-model FILE` | path to the qwen3-omni talker gguf, enables the /v1/audio/speech endpoint<br/>(env: LLAMA_ARG_TALKER_MODEL) |
-| `-c2w, --code2wav-model FILE` | path to the qwen3-omni code2wav gguf, the talker code detokenizer<br/>(env: LLAMA_ARG_CODE2WAV_MODEL) |
 | `-mmu, --mmproj-url URL` | URL to a multimodal projector file. see tools/mtmd/README.md<br/>(env: LLAMA_ARG_MMPROJ_URL) |
 | `--mmproj-auto, --no-mmproj, --no-mmproj-auto` | whether to use multimodal projector file (if available), useful when using -hf (default: enabled)<br/>(env: LLAMA_ARG_MMPROJ_AUTO) |
 | `--mmproj-offload, --no-mmproj-offload` | whether to enable GPU offloading for multimodal projector (default: enabled)<br/>(env: LLAMA_ARG_MMPROJ_OFFLOAD) |
 | `--image-min-tokens N` | minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MIN_TOKENS) |
 | `--image-max-tokens N` | maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MAX_TOKENS) |
+| `--mtmd-batch-max-tokens N` | maximum number of image tokens per batch when encoding images (default: 1024)<br/>(env: LLAMA_ARG_MTMD_BATCH_MAX_TOKENS) |
 | `-a, --alias STRING` | set model name aliases, comma-separated (to be used by API)<br/>(env: LLAMA_ARG_ALIAS) |
 | `--tags STRING` | set model tags, comma-separated (informational, not used for routing)<br/>(env: LLAMA_ARG_TAGS) |
 | `--embd-normalize N` | normalisation for embeddings (default: 2) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm) |
@@ -190,23 +189,21 @@ For the full list of features, please refer to [server's changelog](https://gith
 | `--reuse-port` | allow multiple sockets to bind to the same port (default: disabled)<br/>(env: LLAMA_ARG_REUSE_PORT) |
 | `--path PATH` | path to serve static files from (default: )<br/>(env: LLAMA_ARG_STATIC_PATH) |
 | `--api-prefix PREFIX` | prefix path the server serves from, without the trailing slash (default: )<br/>(env: LLAMA_ARG_API_PREFIX) |
-| `--webui-config JSON` | [DEPRECATED: use --ui-config] JSON that provides default WebUI settings (overrides WebUI defaults)<br/>(env: LLAMA_ARG_WEBUI_CONFIG) |
-| `--ui-config JSON` | JSON that provides default UI settings (overrides UI defaults)<br/>(env: LLAMA_ARG_UI_CONFIG) |
-| `--webui-config-file PATH` | [DEPRECATED: use --ui-config-file] JSON file that provides default WebUI settings (overrides WebUI defaults)<br/>(env: LLAMA_ARG_WEBUI_CONFIG_FILE) |
-| `--ui-config-file PATH` | JSON file that provides default UI settings (overrides UI defaults)<br/>(env: LLAMA_ARG_UI_CONFIG_FILE) |
-| `--webui-mcp-proxy, --no-webui-mcp-proxy` | [DEPRECATED: use --ui-mcp-proxy/--no-ui-mcp-proxy] experimental: whether to enable MCP CORS proxy<br/>(env: LLAMA_ARG_WEBUI_MCP_PROXY) |
-| `--ui-mcp-proxy, --no-ui-mcp-proxy` | experimental: whether to enable MCP CORS proxy - do not enable in untrusted environments (default: disabled)<br/>(env: LLAMA_ARG_UI_MCP_PROXY) |
+| `--ui-config, --webui-config JSON` | JSON that provides default UI settings (overrides UI defaults)<br/>(env: LLAMA_ARG_UI_CONFIG) |
+| `--ui-config-file, --webui-config-file PATH` | JSON file that provides default UI settings (overrides UI defaults)<br/>(env: LLAMA_ARG_UI_CONFIG_FILE) |
+| `--ui-mcp-proxy, --webui-mcp-proxy, --no-ui-mcp-proxy, --no-webui-mcp-proxy` | experimental: whether to enable MCP CORS proxy - do not enable in untrusted environments (default: disabled)<br/>(env: LLAMA_ARG_UI_MCP_PROXY) |
 | `--tools TOOL1,TOOL2,...` | experimental: whether to enable built-in tools for AI agents - do not enable in untrusted environments (default: no tools)<br/>specify "all" to enable all tools<br/>available tools: read_file, file_glob_search, grep_search, exec_shell_command, write_file, edit_file, apply_diff, get_datetime<br/>(env: LLAMA_ARG_TOOLS) |
-| `--webui, --no-webui` | [DEPRECATED: use --ui/--no-ui] whether to enable the Web UI<br/>(env: LLAMA_ARG_WEBUI) |
-| `--ui, --no-ui` | whether to enable the Web UI (default: enabled)<br/>(env: LLAMA_ARG_UI) |
+| `-ag, --agent, -no-ag, --no-agent` | whether to enable CORS proxy and all built-in tools - do not enable in untrusted environments (default: disabled)<br/>(env: LLAMA_ARG_AGENT) |
+| `--ui, --webui, --no-ui, --no-webui` | whether to enable the Web UI (default: enabled)<br/>(env: LLAMA_ARG_UI) |
 | `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)<br/>(env: LLAMA_ARG_EMBEDDINGS) |
 | `--rerank, --reranking` | enable reranking endpoint on server (default: disabled)<br/>(env: LLAMA_ARG_RERANKING) |
 | `--api-key KEY` | API key to use for authentication, multiple keys can be provided as a comma-separated list (default: none)<br/>(env: LLAMA_API_KEY) |
-| `--api-key-file FNAME` | path to file containing API keys (default: none)<br/>(env: LLAMA_ARG_API_KEY_FILE) |
+| `--api-key-file FNAME` | path to file containing API keys, one per line; lines starting with a hash are treated as comments (default: none)<br/>(env: LLAMA_ARG_API_KEY_FILE) |
 | `--ssl-key-file FNAME` | path to file a PEM-encoded SSL private key<br/>(env: LLAMA_ARG_SSL_KEY_FILE) |
 | `--ssl-cert-file FNAME` | path to file a PEM-encoded SSL certificate<br/>(env: LLAMA_ARG_SSL_CERT_FILE) |
 | `--chat-template-kwargs STRING` | sets additional params for the json template parser, must be a valid json object string, e.g. '{"key1":"value1","key2":"value2"}'<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_KWARGS) |
 | `-to, --timeout N` | server read/write timeout in seconds (default: 3600)<br/>(env: LLAMA_ARG_TIMEOUT) |
+| `--sse-ping-interval N` | server SSE ping interval in seconds (-1 = disabled, default: 30)<br/>(env: LLAMA_ARG_SSE_PING_INTERVAL) |
 | `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
 | `--cache-prompt, --no-cache-prompt` | whether to enable prompt caching (default: enabled)<br/>(env: LLAMA_ARG_CACHE_PROMPT) |
 | `--cache-reuse N` | min chunk size to attempt reusing from the cache via KV shifting, requires prompt caching to be enabled (default: 0)<br/>[(card)](https://ggml.ai/f0.png)<br/>(env: LLAMA_ARG_CACHE_REUSE) |
@@ -231,6 +228,7 @@ For the full list of features, please refer to [server's changelog](https://gith
 | `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.10, 0.0 = disabled) |
 | `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) |
 | `--sleep-idle-seconds SECONDS` | number of seconds of idleness after which the server will sleep (default: -1; -1 = disabled) |
+| `--log-prompts-dir PATH` | Log prompts to directory (only used for debugging, default: disabled) |
 | `--spec-draft-hf, -hfd, -hfrd, --hf-repo-draft <user>/<model>[:quant]` | Same as --hf-repo, but for the draft model (default: unused)<br/>(env: LLAMA_ARG_SPEC_DRAFT_HF_REPO) |
 | `--spec-draft-threads, -td, --threads-draft N` | number of threads to use during generation (default: same as --threads) |
 | `--spec-draft-threads-batch, -tbd, --threads-batch-draft N` | number of threads to use during batch and prompt processing (default: same as --threads-draft) |
@@ -1861,9 +1859,37 @@ Example events:

 {
  "model": "...",
-  "event": "download_finished",
+  "event": "model_status",
  "data": {
-    "status": "loading"
+    "status": "loading",
+    "progress": {
+      "stages": ["text_model", "spec_model", "mmproj_model"],
+      "current": "text_model",
+      "value": 0.5
+    }
+  }
+}
+// note for "loading" status:
+// - subsequent events will follow the same order of "stages" list
+// - mmap is may report incorrect progress on some platforms; if you need exact progress, use --no-mmap
+
+{
+  "model": "...",
+  "event": "model_status",
+  "data": {
+    "status": "loaded",
+    "info": {
+      // note: only include info on first load
+      // waking up from sleep doesn't have this
+    }
+  }
+}
+
+{
+  "model": "...",
+  "event": "model_status",
+  "data": {
+    "status": "sleeping"
  }
 }

@@ -12,6 +12,7 @@
 #include <random>
 #include <sstream>
 #include <fstream>
+#include <limits>

 json format_error_response(const std::string & message, const enum error_type type) {
    std::string type_str;
@@ -1238,7 +1239,7 @@ json format_response_rerank(
 // other utils
 //

-std::vector<llama_token_data> get_token_probabilities(llama_context * ctx, int idx) {
+std::vector<llama_token_data> get_token_probabilities(llama_context * ctx, int idx, size_t n_top) {
    std::vector<llama_token_data> cur;

    const auto * logits = llama_get_logits_ith(ctx, idx);
@@ -1257,21 +1258,34 @@ std::vector<llama_token_data> get_token_probabilities(llama_context * ctx, int i
        }
    }

-    // sort tokens by logits
-    std::sort(cur.begin(), cur.end(), [](const llama_token_data & a, const llama_token_data & b) {
-        return a.logit > b.logit;
-    });
+    // sort tokens by logits (partial: only the leading `n_top` need ordering)
+    if (n_top > cur.size()) {
+        n_top = cur.size();
+    }
+    if (n_top > 0) {
+        std::partial_sort(cur.begin(), cur.begin() + n_top, cur.end(),
+            [](const llama_token_data & a, const llama_token_data & b) {
+                return a.logit > b.logit;
+            });
+    }

    // apply softmax
-    float max_l = cur[0].logit;
+    float max_l = -std::numeric_limits<float>::infinity();
+    if (n_top > 0) {
+        max_l = cur[0].logit; // partial_sort guarantees the absolute maximum is at index 0
+    } else {
+        for (const auto & t : cur) {
+            max_l = std::max(max_l, t.logit);
+        }
+    }
    float cum_sum = 0.0f;
-    for (size_t i = 0; i < cur.size(); ++i) {
-        float p = expf(cur[i].logit - max_l);
-        cur[i].p = p;
+    for (auto & t : cur) {
+        float p = expf(t.logit - max_l);
+        t.p = p;
        cum_sum += p;
    }
-    for (size_t i = 0; i < cur.size(); ++i) {
-        cur[i].p /= cum_sum;
+    for (auto & t : cur) {
+        t.p /= cum_sum;
    }

    return cur;
@@ -326,7 +326,7 @@ json format_response_rerank(
 // other utils
 //

-std::vector<llama_token_data> get_token_probabilities(llama_context * ctx, int idx);
+std::vector<llama_token_data> get_token_probabilities(llama_context * ctx, int idx, size_t n_top);

 std::string safe_json_to_str(const json & data);

@@ -22,8 +22,7 @@ struct server_context_meta {
    bool has_inp_image;
    bool has_inp_audio;
    bool has_inp_video;
-    json json_ui_settings;            // Primary: new name
-    json json_webui_settings;            // Deprecated: use json_ui_settings instead (kept for backward compat)
+    json json_ui_settings;
    int slot_n_ctx;
    enum llama_pooling_type pooling_type;

@@ -53,6 +52,31 @@ struct server_context_meta {
    uint64_t model_size;
 };

+enum server_state {
+    // SERVER_STATE_DOWNLOADING,
+    SERVER_STATE_LOADING,
+    SERVER_STATE_READY,
+    SERVER_STATE_SLEEPING,
+};
+
+static std::string server_state_to_str(server_state state) {
+    switch (state) {
+        case SERVER_STATE_LOADING:     return "loading";
+        case SERVER_STATE_READY:       return "ready";
+        case SERVER_STATE_SLEEPING:    return "sleeping";
+        default: GGML_ASSERT(false && "invalid server_state");
+    }
+}
+
+static server_state server_state_from_str(const std::string & str) {
+    if (str == "loading")     return SERVER_STATE_LOADING;
+    if (str == "ready")       return SERVER_STATE_READY;
+    if (str == "sleeping")    return SERVER_STATE_SLEEPING;
+    GGML_ASSERT(false && "invalid server_state string");
+}
+
+using server_state_callback_t = std::function<void(server_state, json /* payload */)>;
+
 struct server_context {
    std::unique_ptr<server_context_impl> impl;

@@ -80,9 +104,8 @@ struct server_context {
    // not thread-safe, should only be used from the main thread
    server_context_meta get_meta() const;

-    // register a callback to be called when sleeping state changes
-    // must be set before load_model() is called
-    void on_sleeping_changed(std::function<void(bool)> callback);
+    // note: must be set before load_model() is called
+    void set_state_callback(server_state_callback_t callback);
 };


@@ -7,9 +7,18 @@
 #include <unordered_set>
 #include <list>
 #include <map>
+#include <algorithm>
+#include <cctype>

 #include "server-http.h"

+static std::string proxy_header_to_lower(std::string header) {
+    std::transform(header.begin(), header.end(), header.begin(), [](unsigned char c) {
+        return std::tolower(c);
+    });
+    return header;
+}
+
 static server_http_res_ptr proxy_request(const server_http_req & req, std::string method) {
    std::string target_url = req.get_param("url");
    common_http_url parsed_url = common_http_parse_url(target_url);
@@ -33,11 +42,18 @@ static server_http_res_ptr proxy_request(const server_http_req & req, std::strin
    SRV_INF("proxying %s request to %s://%s:%i%s\n", method.c_str(), parsed_url.scheme.c_str(), parsed_url.host.c_str(), parsed_url.port, parsed_url.path.c_str());

    std::map<std::string, std::string> headers;
+    const std::string proxy_header_prefix = "x-llama-server-proxy-header-";
    for (auto [key, value] : req.headers) {
-        auto new_key = key;
-        if (string_starts_with(new_key, "x-proxy-header-")) {
-            string_replace_all(new_key, "x-proxy-header-", "");
+        const std::string lowered_key = proxy_header_to_lower(key);
+        if (!string_starts_with(lowered_key, proxy_header_prefix)) {
+            continue;
        }
+
+        auto new_key = key.substr(proxy_header_prefix.size());
+        if (new_key.empty()) {
+            continue;
+        }
+
        headers[new_key] = value;
    }

@@ -492,6 +492,8 @@ using server_http_req_ptr = std::unique_ptr<server_http_req>;
 static void process_handler_response(server_http_req_ptr && request, server_http_res_ptr & response, httplib::Response & res) {
    if (response->is_stream()) {
        res.status = response->status;
+        // Tell Nginx to not buffer any streamed response
+        response->headers["X-Accel-Buffering"] = "no";
        set_headers(res, response->headers);
        const std::string content_type = response->content_type;
        // convert to shared_ptr as both chunked_content_provider() and on_complete() need to use it
@@ -1,5 +1,6 @@
 #include "server-common.h"
 #include "server-models.h"
+#include "server-context.h"

 #include "build-info.h"
 #include "preset.h"
@@ -44,9 +45,7 @@ extern char **environ;
 #define DEFAULT_STOP_TIMEOUT 10 // seconds

 #define CMD_ROUTER_TO_CHILD_EXIT  "cmd_router_to_child:exit"
-#define CMD_CHILD_TO_ROUTER_READY "cmd_child_to_router:ready" // also sent when waking up from sleep
-#define CMD_CHILD_TO_ROUTER_SLEEP "cmd_child_to_router:sleep"
-#define CMD_CHILD_TO_ROUTER_INFO  "cmd_child_to_router:info:" // followed by json string
+#define CMD_CHILD_TO_ROUTER_STATE "cmd_child_to_router:state:" // followed by json string

 // address for child process, this is needed because router may run on 0.0.0.0
 // ref: https://github.com/ggml-org/llama.cpp/issues/17862
@@ -443,6 +442,7 @@ void server_models::load_models() {
                /* last_used     */ 0,
                /* args          */ std::vector<std::string>(),
                /* loaded_info   */ {},
+                /* progress      */ {},
                /* exit_code     */ 0,
                /* stop_timeout  */ DEFAULT_STOP_TIMEOUT,
                /* multimodal    */ mtmd_caps{false, false},
@@ -609,6 +609,7 @@ void server_models::load_models() {
                    /* last_used     */ 0,
                    /* args          */ std::vector<std::string>(),
                    /* loaded_info   */ {},
+                    /* progress      */ {},
                    /* exit_code     */ 0,
                    /* stop_timeout  */ DEFAULT_STOP_TIMEOUT,
                    /* multimodal    */ mtmd_caps{false, false},
@@ -904,12 +905,8 @@ void server_models::load(const std::string & name) {
                while (fgets(buffer, vec_buf.size(), stdout_file) != nullptr) {
                    LOG("[%5d] %s", port, buffer);
                    std::string str(buffer);
-                    if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_READY)) {
-                        this->update_status(name, SERVER_MODEL_STATUS_LOADED, 0);
-                    } else if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_INFO)) {
-                        this->update_loaded_info(name, str);
-                    } else if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_SLEEP)) {
-                        this->update_status(name, SERVER_MODEL_STATUS_SLEEPING, 0);
+                    if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_STATE)) {
+                        this->handle_child_state(name, str);
                    }
                }
            } else {
@@ -976,7 +973,10 @@ void server_models::load(const std::string & name) {
        subprocess_destroy(&child_proc->get());

        // update status and exit code
-        this->update_status(name, SERVER_MODEL_STATUS_UNLOADED, exit_code);
+        this->update_status(name, {
+            SERVER_MODEL_STATUS_UNLOADED,
+            exit_code
+        });
        SRV_INF("instance name=%s exited with status %d\n", name.c_str(), exit_code);
    });

@@ -1016,7 +1016,8 @@ struct server_models_download_res : public common_download_callback {
            common_download_model(model, opts);
            is_ok = true;
        } catch (const std::exception & e) {
-            SRV_ERR("download failed for model name=%s: %s\n", model.name.c_str(), e.what());
+            auto model_name = model.get_name();
+            SRV_ERR("download failed for model name=%s: %s\n", model_name.c_str(), e.what());
            is_ok = false;
        }
        return is_ok;
@@ -1036,7 +1037,7 @@ struct server_models_download_res : public common_download_callback {
 };

 void server_models::download(common_params_model && model, common_download_opts && opts) {
-    std::string name = model.name;
+    std::string name = model.get_name();
    GGML_ASSERT(name == model.hf_repo);

    std::unique_lock<std::mutex> lk(mutex);
@@ -1064,9 +1065,10 @@ void server_models::download(common_params_model && model, common_download_opts
    inst.th = std::thread([this, dl = std::move(dl)]() {
        dl->opts.callback = dl.get();
        bool ok = dl->run();
+        auto model_name = dl->model.get_name();
        SRV_INF("download finished for model name=%s with status=%s\n",
-                    dl->model.name.c_str(), ok ? "success" : "failure");
-        update_download_progress(dl->model.name, {}, true, ok);
+                    model_name.c_str(), ok ? "success" : "failure");
+        update_download_progress(model_name, {}, true, ok);
        // need_reload is set inside update_download_progress under the mutex;
        // the next load_models() call will clean up this instance
    });
@@ -1130,21 +1132,33 @@ void server_models::unload_all() {
    }
 }

-void server_models::update_status(const std::string & name, server_model_status status, int exit_code) {
+void server_models::update_status(const std::string & name, const update_status_args & args) {
    std::unique_lock<std::mutex> lk(mutex);
    auto it = mapping.find(name);
    if (it != mapping.end()) {
        auto & meta = it->second.meta;
-        meta.status    = status;
-        meta.exit_code = exit_code;
+        meta.status      = args.status;
+        meta.exit_code   = args.exit_code;
+        if (!args.loaded_info.is_null()) {
+            meta.loaded_info = args.loaded_info;
+        }
+        if (!args.progress.is_null()) {
+            meta.progress = args.progress;
+        }
    }
    // broadcast status change to SSE
    {
        json data = {
-            {"status", server_model_status_to_string(status)},
+            {"status", server_model_status_to_string(args.status)},
        };
-        if (status == SERVER_MODEL_STATUS_UNLOADED) {
-            data["exit_code"] = exit_code;
+        if (args.status == SERVER_MODEL_STATUS_UNLOADED) {
+            data["exit_code"] = args.exit_code;
+        }
+        if (!args.loaded_info.is_null()) {
+            data["info"] = args.loaded_info;
+        }
+        if (!args.progress.is_null()) {
+            data["progress"] = args.progress;
        }
        // note: notify_sse doesn't acquire the lock, so no deadlock here
        notify_sse("status_change", name, data);
@@ -1152,29 +1166,6 @@ void server_models::update_status(const std::string & name, server_model_status
    cv.notify_all();
 }

-void server_models::update_loaded_info(const std::string & name, std::string & raw_info) {
-    if (!string_starts_with(raw_info, CMD_CHILD_TO_ROUTER_INFO)) {
-        SRV_WRN("invalid loaded info format from child for model name=%s: %s\n", name.c_str(), raw_info.c_str());
-        return;
-    }
-
-    json info;
-    try {
-        info = json::parse(raw_info.substr(strlen(CMD_CHILD_TO_ROUTER_INFO)));
-    } catch (const std::exception & e) {
-        SRV_WRN("failed to parse loaded info from child for model name=%s: %s\n", name.c_str(), e.what());
-        return;
-    }
-
-    std::unique_lock<std::mutex> lk(mutex);
-    auto it = mapping.find(name);
-    if (it != mapping.end()) {
-        auto & meta = it->second.meta;
-        meta.loaded_info = info;
-    }
-    cv.notify_all();
-}
-
 void server_models::update_download_progress(const std::string & name, const common_download_progress & progress, bool done, bool ok) {
    json curr;
    {
@@ -1323,21 +1314,59 @@ server_http_res_ptr server_models::proxy_request(const server_http_req & req, co
    return proxy;
 }

-bool server_models::is_child_server() {
+void server_models::handle_child_state(const std::string & name, const std::string & raw_input) {
+    server_state state;
+    json payload;
+
+    try {
+        json data = json::parse(raw_input.substr(strlen(CMD_CHILD_TO_ROUTER_STATE)));
+        state = server_state_from_str(json_value(data, "state", std::string()));
+        payload = json_value(data, "payload", json{});
+    } catch (const std::exception & e) {
+        SRV_ERR("failed to parse child state update for name=%s: %s\n", name.c_str(), e.what());
+        return;
+    }
+
+    switch (state) {
+        case SERVER_STATE_LOADING:
+            {
+                update_status(name, {
+                    SERVER_MODEL_STATUS_LOADING,
+                    0,
+                    nullptr, // no loaded_info yet
+                    payload,
+                });
+            } break;
+        case SERVER_STATE_READY:
+            {
+                update_status(name, {
+                    SERVER_MODEL_STATUS_LOADED,
+                    0,
+                    // note: payload can be empty if this is a wakeup from sleep
+                    payload.size() > 0 ? payload : nullptr,
+                    {}, // reset progress info
+                });
+            } break;
+        case SERVER_STATE_SLEEPING:
+            {
+                update_status(name, { SERVER_MODEL_STATUS_SLEEPING });
+            } break;
+        default:
+            // should never happen, but just in case
+            GGML_ASSERT(false && "unexpected state from child server");
+    }
+}
+
+//
+// server_child
+//
+
+bool server_child::is_child() {
    const char * router_port = std::getenv("LLAMA_SERVER_ROUTER_PORT");
    return router_port != nullptr;
 }

-std::thread server_models::setup_child_server(const std::function<void(int)> & shutdown_handler, const json & model_info) {
-    // send a notification to the router server that a model instance is ready
-    common_log_pause(common_log_main());
-    fflush(stdout);
-    fprintf(stdout, "%s\n", CMD_CHILD_TO_ROUTER_READY);
-    fflush(stdout);
-    fprintf(stdout, "%s%s\n", CMD_CHILD_TO_ROUTER_INFO, safe_json_to_str(model_info).c_str());
-    fflush(stdout);
-    common_log_resume(common_log_main());
-
+std::thread server_child::setup(const std::function<void(int)> & shutdown_handler) {
    // setup thread for monitoring stdin
    return std::thread([shutdown_handler]() {
        // wait for EOF on stdin
@@ -1363,10 +1392,15 @@ std::thread server_models::setup_child_server(const std::function<void(int)> & s
    });
 }

-void server_models::notify_router_sleeping_state(bool is_sleeping) {
+void server_child::notify_to_router(const std::string & state, const json & payload) {
+    json data = {
+        {"state", state},
+        {"payload", payload},
+    };
+    std::lock_guard<std::mutex> lk(mtx_stdout);
    common_log_pause(common_log_main());
    fflush(stdout);
-    fprintf(stdout, "%s\n", is_sleeping ? CMD_CHILD_TO_ROUTER_SLEEP : CMD_CHILD_TO_ROUTER_READY);
+    fprintf(stdout, "%s%s\n", CMD_CHILD_TO_ROUTER_STATE, safe_json_to_str(data).c_str());
    fflush(stdout);
    common_log_resume(common_log_main());
 }
@@ -1462,9 +1496,9 @@ void server_models_routes::init_routes() {
            auto res = std::make_unique<server_http_res>();
            res_ok(res, {
                // TODO: add support for this on web UI
-                {"role",          "router"},
-                {"max_instances", params.models_max},
-                {"models_autoload", params.models_autoload},
+                {"role",                 "router"},
+                {"max_instances",        params.models_max},
+                {"models_autoload",      params.models_autoload},
                // this is a dummy response to make sure the UI doesn't break
                {"model_alias", "llama-server"},
                {"model_path",  "none"},
@@ -1473,11 +1507,9 @@ void server_models_routes::init_routes() {
                    {"n_ctx",  0},
                }},
                // New key
-                {"ui_settings",     ui_settings},
-                // Deprecated: use ui_settings instead (kept for backward compat)
-                {"webui_settings",  webui_settings},
-                {"build_info",     std::string(llama_build_info())},
-                {"cors_proxy_enabled", params.ui_mcp_proxy || params.webui_mcp_proxy},
+                {"ui_settings",          ui_settings},
+                {"build_info",           std::string(llama_build_info())},
+                {"cors_proxy_enabled",   params.ui_mcp_proxy},
            });
            return res;
        }
@@ -1646,7 +1678,6 @@ void server_models_routes::init_routes() {
        common_params_model model;
        common_download_opts opts;

-        model.name           = name;
        model.hf_repo        = name;
        opts.bearer_token    = params.hf_token;
        opts.download_mmproj = true;
@@ -72,6 +72,7 @@ struct server_model_meta {
    int64_t last_used = 0; // for LRU unloading
    std::vector<std::string> args; // args passed to the model instance, will be populated by render_args()
    json loaded_info; // info to be reflected via /v1/models endpoint ; if in DOWNLOADING state, it should contain download progress info
+    json progress; // reflect load or download progress info, if any
    int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED)
    int stop_timeout = 0; // seconds to wait before force-killing the model instance during shutdown
    mtmd_caps multimodal; // multimodal capabilities
@@ -170,9 +171,15 @@ public:
    // to stop the download, call unload()
    void download(common_params_model && model, common_download_opts && opts);

+    struct update_status_args {
+        server_model_status status;
+        int exit_code = 0; // only valid if status == UNLOADED
+        json loaded_info = nullptr;
+        json progress = nullptr;
+    };
    // update the status of a model instance (thread-safe)
-    void update_status(const std::string & name, server_model_status status, int exit_code);
-    void update_loaded_info(const std::string & name, std::string & raw_info);
+    // also send SSE notification to /models/sse endpoint
+    void update_status(const std::string & name, const update_status_args & args);
    void update_download_progress(const std::string & name, const common_download_progress & progress, bool done, bool ok = true);

    // remove a cache model from disk and update the list (thread-safe)
@@ -193,34 +200,44 @@ public:
    // proxy an HTTP request to the model instance
    server_http_res_ptr proxy_request(const server_http_req & req, const std::string & method, const std::string & name, bool update_last_used);

+    // handle message sent from server_child::notify_to_router()
+    // raw input must starts with CMD_CHILD_TO_ROUTER_STATE, followed by a JSON string
+    // this function is not thread-safe, must be called from instance's monitoring thread
+    // payload per state:
+    //     state = loading     -> payload = {} (TODO: add progress info)
+    //     state = ready       -> payload = model_info (json), or {} if wakeup from sleeping
+    //     state = sleeping    -> payload = {}
+    void handle_child_state(const std::string & name, const std::string & raw_input);
+};
+
+struct server_child {
+    // serializes the notify_to_router writes
+    std::mutex mtx_stdout;
+
    // return true if the current process is a child server instance
-    static bool is_child_server();
+    bool is_child();

-    // notify the router server that a model instance is ready
+    // register the shutdown_handler to be called by the router
    // return the monitoring thread (to be joined by the caller)
-    static std::thread setup_child_server(const std::function<void(int)> & shutdown_handler, const json & model_info);
+    std::thread setup(const std::function<void(int)> & shutdown_handler);

-    // notify the router server that the sleeping state has changed
-    static void notify_router_sleeping_state(bool sleeping);
+    // notify router server for status changes (e.g. loading, downloading, sleeping, etc.)
+    // message will be handled by server_models::handle_child_state() on the router side
+    void notify_to_router(const std::string & state_name, const json & payload);
 };

 struct server_models_routes {
    common_params params;
    json ui_settings = json::object();     // Primary: new name
-    json webui_settings = json::object();  // Deprecated: use ui_settings (kept for compat)
    std::atomic<bool> stopping = false;    // for graceful disconnecting SSE clients during shutdown
    server_models models;
    server_models_routes(const common_params & params, int argc, char ** argv)
            : params(params), models(params, argc, argv) {
-        // Support both new ui_config_json and deprecated webui_config_json
-        const std::string & cfg = !this->params.ui_config_json.empty()
-            ? this->params.ui_config_json
-            : this->params.webui_config_json;
+        const std::string & cfg = this->params.ui_config_json;
        if (!cfg.empty()) {
            try {
                json json_settings = json::parse(cfg);
                ui_settings = json_settings;
-                webui_settings = json_settings;  // Deprecated: keep in sync
            } catch (const std::exception & e) {
                LOG_ERR("%s: failed to parse UI config: %s\n", __func__, e.what());
                throw;
@@ -14,6 +14,9 @@ std::vector<std::unique_ptr<field>> make_llama_cmpl_schema(const common_params &
        fields.emplace_back(f);
    };

+    add((new field_bool("verbose", params.verbose))
+        ->set_desc("Include __verbose field in the response with additional debug information"));
+
    add((new field_bool("timings_per_token", params.timings_per_token))
        ->set_desc("Include prompt processing and text generation speed information in each response"));

@@ -11,6 +11,7 @@
 #include <cstring>
 #include <climits>
 #include <algorithm>
+#include <unordered_set>

 namespace fs = std::filesystem;

@@ -90,8 +90,10 @@ int llama_server(int argc, char ** argv) {
    llama_numa_init(params.numa);

    // router server never loads a model and must not touch the GPU
+    const bool is_router_server = params.model.path.empty()
+                               && params.model.hf_repo.empty();
+
    // skip device enumeration so the CUDA primary context stays uncreated
-    const bool is_router_server = params.model.path.empty();
    common_params_print_info(params, !is_router_server);

    if (!is_router_server) {
@@ -113,8 +115,9 @@ int llama_server(int argc, char ** argv) {
    }

    // for consistency between server router mode and single-model mode, we set the same model name as alias
-    if (params.model_alias.empty() && !params.model.name.empty()) {
-        params.model_alias.insert(params.model.name);
+    auto model_name = params.model.get_name();
+    if (params.model_alias.empty() && !model_name.empty()) {
+        params.model_alias.insert(model_name);
    }

    // struct that contains llama context and inference
@@ -227,8 +230,7 @@ int llama_server(int argc, char ** argv) {
    ctx_http.register_gcp_compat();

    // CORS proxy (EXPERIMENTAL, only used by the Web UI for MCP)
-    // Supports both new ui_mcp_proxy and deprecated webui_mcp_proxy fields
-    if (params.ui_mcp_proxy || params.webui_mcp_proxy) {
+    if (params.ui_mcp_proxy) {
        SRV_WRN("%s", "-----------------\n");
        SRV_WRN("%s", "CORS proxy is enabled, do not expose server to untrusted environments\n");
        SRV_WRN("%s", "This feature is EXPERIMENTAL and may be removed or changed in future versions\n");
@@ -256,6 +258,7 @@ int llama_server(int argc, char ** argv) {
    // Start the server
    //

+    server_child child; // only used in non-router mode
    std::function<void()> clean_up;

    if (is_router_server) {
@@ -301,15 +304,16 @@ int llama_server(int argc, char ** argv) {
            return 1;
        }

-        // load the model
-        SRV_INF("%s", "loading model\n");
-
-        if (server_models::is_child_server()) {
-            ctx_server.on_sleeping_changed([&](bool sleeping) {
-                server_models::notify_router_sleeping_state(sleeping);
+        // setup communication child --> router if necessary
+        if (child.is_child()) {
+            ctx_server.set_state_callback([&](server_state state, json payload) {
+                child.notify_to_router(server_state_to_str(state), payload);
            });
        }

+        // load the model
+        SRV_INF("%s", "loading model\n");
+
        if (!ctx_server.load_model(params)) {
            clean_up();
            if (ctx_http.thread.joinable()) {
@@ -366,9 +370,9 @@ int llama_server(int argc, char ** argv) {

        // optionally, notify router server that this instance is ready
        std::thread monitor_thread;
-        if (server_models::is_child_server()) {
-            json model_info = routes.get_model_info();
-            monitor_thread = server_models::setup_child_server(shutdown_handler, model_info);
+        if (child.is_child()) {
+            monitor_thread = child.setup(shutdown_handler);
+            child.notify_to_router(server_state_to_str(SERVER_STATE_READY), routes.get_model_info());
        }

        // this call blocks the main thread until queue_tasks.terminate() is called
@@ -79,9 +79,9 @@ def test_load_split_model():
    assert match_regex("(little|girl)+", res.body["content"])


-def test_no_webui():
+def test_no_ui():
    global server
-    # default: webui enabled
+    # default: UI enabled
    server.start()
    url = f"http://{server.server_host}:{server.server_port}"
    res = requests.get(url)
@@ -89,8 +89,8 @@ def test_no_webui():
    assert "<!doctype html>" in res.text
    server.stop()

-    # with --no-webui
-    server.no_webui = True
+    # with --no-ui, the UI should be disabled
+    server.no_ui = True
    server.start()
    res = requests.get(url)
    assert res.status_code == 404
@@ -603,3 +603,23 @@ def test_chat_completions_token_count():
        })
        assert res.status_code == 200
        assert res.body["input_tokens"] > 5
+
+
+def test_verbose_debug():
+    global server
+    server.start()
+    for verbose in [True, False]:
+        res = server.make_request("POST", "/chat/completions", data={
+            "max_tokens": 2,
+            "messages": [
+                {"role": "system", "content": "Book"},
+                {"role": "user", "content": "What is the best book"},
+            ],
+            "verbose": verbose,
+        })
+        assert res.status_code == 200
+        if verbose:
+            assert "__verbose" in res.body
+            assert "Book" in res.body["__verbose"]["prompt"]
+        else:
+            assert "__verbose" not in res.body
@@ -12,7 +12,7 @@ def create_server():

 def test_mcp_no_proxy():
    global server
-    server.webui_mcp_proxy = False
+    server.ui_mcp_proxy = False
    server.start()

    res = server.make_request("GET", "/cors-proxy")
@@ -21,7 +21,7 @@ def test_mcp_no_proxy():

 def test_mcp_proxy():
    global server
-    server.webui_mcp_proxy = True
+    server.ui_mcp_proxy = True
    server.start()

    url = f"http://{server.server_host}:{server.server_port}/cors-proxy?url=http://example.com"
@@ -32,7 +32,7 @@ def test_mcp_proxy():

 def test_mcp_proxy_custom_port():
    global server
-    server.webui_mcp_proxy = True
+    server.ui_mcp_proxy = True
    server.start()

    # try getting the server's models API via the proxy
@@ -1,6 +1,8 @@
 import pytest
 from openai import OpenAI
 from utils import *
+import threading
+from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer

 server = ServerPreset.tinyllama2()

@@ -105,6 +107,49 @@ def test_cors_options(origin: str, cors_header: str, cors_header_value: str):
    assert res.headers[cors_header] == cors_header_value


+def test_cors_proxy_only_forwards_explicit_proxy_headers():
+    class CaptureHeadersHandler(BaseHTTPRequestHandler):
+        def do_GET(self):
+            self.server.captured_headers = dict(self.headers)
+            self.send_response(200)
+            self.end_headers()
+            self.wfile.write(b"ok")
+
+        def log_message(self, format, *args):
+            pass
+
+    target = ThreadingHTTPServer(("127.0.0.1", 0), CaptureHeadersHandler)
+    target.captured_headers = {}
+    target_thread = threading.Thread(target=target.serve_forever, daemon=True)
+    target_thread.start()
+
+    try:
+        server = ServerPreset.tinyllama2()
+        server.api_key = TEST_API_KEY
+        server.ui_mcp_proxy = True
+        server.start()
+
+        res = server.make_request("GET", f"/cors-proxy?url=http://127.0.0.1:{target.server_port}/capture", headers={
+            "Authorization": f"Bearer {TEST_API_KEY}",
+            "Proxy-Authorization": "Basic secret",
+            "X-Api-Key": TEST_API_KEY,
+            "Cookie": "session=secret",
+            "x-llama-server-proxy-header-accept": "application/json",
+            "x-llama-server-proxy-header-authorization": "Bearer explicit",
+        })
+
+        assert res.status_code == 200
+        captured = {key.lower(): value for key, value in target.captured_headers.items()}
+        assert captured["accept"] == "application/json"
+        assert captured["authorization"] == "Bearer explicit"
+        assert "proxy-authorization" not in captured
+        assert "x-api-key" not in captured
+        assert "cookie" not in captured
+    finally:
+        target.shutdown()
+        target.server_close()
+
+
@pytest.mark.parametrize(
    "media_path, image_url, success",
    [
@@ -94,7 +94,7 @@ class ServerProcess:
    enable_ctx_shift: int | None = False
    spec_draft_n_min: int | None = None
    spec_draft_n_max: int | None = None
-    no_webui: bool | None = None
+    no_ui: bool | None = None
    jinja: bool | None = None
    reasoning_format: Literal['deepseek', 'none', 'nothink'] | None = None
    reasoning: Literal['on', 'off', 'auto'] | None = None
@@ -107,7 +107,7 @@ class ServerProcess:
    cache_ram: int | None = None
    no_cache_idle_slots: bool = False
    log_path: str | None = None
-    webui_mcp_proxy: bool = False
+    ui_mcp_proxy: bool = False
    backend_sampling: bool = False
    gcp_compat: bool = False

@@ -225,8 +225,8 @@ class ServerProcess:
            server_args.extend(["--spec-draft-n-max", self.spec_draft_n_max])
        if self.spec_draft_n_min:
            server_args.extend(["--spec-draft-n-min", self.spec_draft_n_min])
-        if self.no_webui:
-            server_args.append("--no-webui")
+        if self.no_ui:
+            server_args.append("--no-ui")
        if self.no_models_autoload:
            server_args.append("--no-models-autoload")
        if self.jinja:
@@ -251,8 +251,8 @@ class ServerProcess:
            server_args.extend(["--cache-ram", self.cache_ram])
        if self.no_cache_idle_slots:
            server_args.append("--no-cache-idle-slots")
-        if self.webui_mcp_proxy:
-            server_args.append("--webui-mcp-proxy")
+        if self.ui_mcp_proxy:
+            server_args.append("--ui-mcp-proxy")
        if self.backend_sampling:
            server_args.append("--backend_sampling")
        if self.gcp_compat:
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Xuan-Son Nguyen	7c082bc417	server: fix report progress for loading spec models, add "stages" list (#24870 ) * server: fix report progress for loading spec models, add "stages" list * improve * nits * nits 2	2026-06-21 17:36:52 +02:00
Xuan-Son Nguyen	bddfd2b113	server: refactor batch construction (#24843 ) * server: refactor batch construction * wip * wip 2 * wip 3 * wip 4 * add abort_all_slots * handle batch full more carefully * fix assert * rm debug log * small nits * (debug) add timings * debug: force llama_synchronize for accurate timings * address comments * disable DEBUG_TIMINGS	2026-06-21 14:16:11 +02:00
Xuan-Son Nguyen	0d135df48c	mtmd: fix mtmd_get_memory_usage (#24867 )	2026-06-21 14:12:15 +02:00
Sigbjørn Skjæret	bf533823cd	jinja : implement call statement (#24847 ) * implement call statement * undo unintended change * de-lambda * simplify * move caller context inside function handler	2026-06-21 14:04:52 +02:00
Xuan-Son Nguyen	2f89acc2bc	mtmd: add load progress callback (#24865 )	2026-06-21 13:40:52 +02:00
Xuan-Son Nguyen	bfa3219177	server: add "verbose" field to schema (#24864 )	2026-06-21 13:03:14 +02:00
Xuan-Son Nguyen	d6d899580d	server: real-time model load progress tracking via /models/sse (#24828 ) * server: real-time model load progress tracking via /models/sse * update docs * add mutex for notify_to_router * correct docs	2026-06-21 11:58:14 +02:00
Georgi Gerganov	8a118ee86c	minor : clean-up whitespaces (#24862 ) [no ci]	2026-06-21 11:37:12 +03:00
YiChen Lv	d789527482	spec : Support Step3.5/3.7 flash mtp3 (#24340 ) * add mtp_layer_offset + include nextn flags in graph reuse * add llama_set_mtp_layer_offset + llama_model_n_nextn_layer API * offset head select + require all MTP blocks * speculative multi-head process() * speculative multi-head draft() * gather outputs via inp_out_ids * cleanup * fix core * minor cleanup * merged draft_multi_head into draft() * mtp rename nextn * Apply suggestions from code review Co-authored-by: Aman Gupta <amangupta052@gmail.com> * clean-up comments * fix for multi seq * apply suggestions && chain-heads comment * add a reference for chain_heads discussion --------- Co-authored-by: Aman Gupta <amangupta052@gmail.com>	2026-06-21 11:33:18 +03:00
Aldehir Rojas	063d9c156e	common/peg : refactor until gbnf grammar generation (#24839 ) * common/peg : refactor until gbnf grammar into an ac automaton * cont : add a test with multiple strings * cont : pad state with 0s so rules line up * cont : clean up comments * cont : use set everywhere * cont : inline state num string padding * cont : add a ref to PR * cont : fix regression in server-tools.cpp	2026-06-20 21:15:06 -05:00
Aldehir Rojas	c57607016a	common/json-schema-to-grammar : align spacing rules with parsers (#24835 )	2026-06-20 17:43:04 -05:00
Guanhuai Zhang	4a80943174	fix(hexagon): use padded stride for ssm-conv weights (#24470 )	2026-06-20 14:58:49 -07:00
Adrien Gallouët	84de01a1f1	llama : use LLM_KV for quantization_version & file_type (#24802 ) Signed-off-by: Adrien Gallouët <angt@huggingface.co>	2026-06-20 20:07:01 +02:00
Xuan-Son Nguyen	75f460ac28	arg: try fixing test-args-parser randomly fails (#24826 ) * arg: try fixing test-args-parser randomly fails * return ref * try triggering the workflow * exception wrapper * wip * test * test 2 * arg: guard win32 utf8 argv override make_utf8_argv rebuilds argv from GetCommandLineW to fix utf8 handling of non ascii arguments on windows. the override runs unconditionally inside common_params_parse, so it also clobbers a programmatic argv passed by a caller. test-arg-parser builds a synthetic argv but then sees the real process command line instead, the model argument is never parsed, and the assert that expects success aborts via fastfail (0xC0000409). this shows up as a random failure in the openvino windows workflow. only override argv when its length matches the caller argc, so the utf8 repair still applies to real binaries while a programmatic argv stays intact. --------- Co-authored-by: Pascal <admin@serveurperso.com>	2026-06-20 19:45:27 +02:00
Muhammad Salem	8452824611	release: add missing link for win opencl adreno arm64 (#24809 )	2026-06-20 23:08:59 +08:00
Matti4	e27f308597	server: avoid forwarding auth headers in CORS proxy (#24373 ) * server: avoid forwarding auth headers in CORS proxy * format * fix test * fix e2e test --------- Co-authored-by: Xuan Son Nguyen <son@huggingface.co>	2026-06-20 15:34:47 +02:00
Aldehir Rojas	67e9fd3b74	docker : prebuild web UI for s390x build [no release] (#24829 )	2026-06-20 05:54:42 -05:00
davidrhodus	796f41bedc	model : glm-dsa load DSA indexer tensors as optional (#24770 ) GLM-5.2 ships the DSA "lightning indexer" on only a subset of layers (the "full" layers; others omit it), but the GLM_DSA loader created the five indexer tensors on every layer as required, so loading any GLM-5.2 GGUF failed with e.g. `missing tensor 'blk.3.indexer.k_norm.weight'`. GLM_DSA's graph is llama_model_deepseek2::graph (plain MLA) and does not use the indexer tensors (indexer runtime not yet implemented), so they are loaded-but-unused. Marking them TENSOR_NOT_REQUIRED lets layers without an indexer load as nullptr and the model runs as full MLA attention. DeepSeek-V3.2 (uniform indexer on all layers) is unaffected.	2026-06-20 13:48:24 +03:00
Adrien Gallouët	37a77fb057	ggml : optimize AMX (#24806 ) Flatten the partition over n_batch * M so every thread participates in the quantization \| CPU \| Model \| Test \| t/s OLD \| t/s NEW \| Speedup \| \|:--------------------------------\|:------------------------------\|:-------\|----------:\|----------:\|----------:\| \| Intel(R) Xeon(R) Platinum 8488C \| qwen35 0.8B IQ4_NL - 4.5 bpw \| pp512 \| 730.71 \| 779.86 \| 1.07 \| \| Intel(R) Xeon(R) Platinum 8488C \| qwen35 0.8B IQ4_NL - 4.5 bpw \| tg128 \| 87.88 \| 86.79 \| 0.99 \| \| Intel(R) Xeon(R) Platinum 8488C \| qwen35 0.8B IQ4_XS - 4.25 bpw \| pp512 \| 725.09 \| 1023.31 \| 1.41 \| \| Intel(R) Xeon(R) Platinum 8488C \| qwen35 0.8B IQ4_XS - 4.25 bpw \| tg128 \| 83.64 \| 83.62 \| 1.00 \| \| Intel(R) Xeon(R) Platinum 8488C \| qwen35 0.8B Q4_0 \| pp512 \| 820.51 \| 924.05 \| 1.13 \| \| Intel(R) Xeon(R) Platinum 8488C \| qwen35 0.8B Q4_0 \| tg128 \| 90.59 \| 92.46 \| 1.02 \| \| Intel(R) Xeon(R) Platinum 8488C \| qwen35 0.8B Q4_1 \| pp512 \| 776.88 \| 872.79 \| 1.12 \| \| Intel(R) Xeon(R) Platinum 8488C \| qwen35 0.8B Q4_1 \| tg128 \| 89.39 \| 90.94 \| 1.02 \| \| Intel(R) Xeon(R) Platinum 8488C \| qwen35 0.8B Q4_K_M \| pp512 \| 719.28 \| 1009.27 \| 1.40 \| \| Intel(R) Xeon(R) Platinum 8488C \| qwen35 0.8B Q4_K_M \| tg128 \| 80.62 \| 80.86 \| 1.00 \| \| Intel(R) Xeon(R) Platinum 8488C \| qwen35 0.8B Q4_K_S \| pp512 \| 732.29 \| 1077.29 \| 1.47 \| \| Intel(R) Xeon(R) Platinum 8488C \| qwen35 0.8B Q4_K_S \| tg128 \| 86.42 \| 83.53 \| 0.97 \| Signed-off-by: Adrien Gallouët <angt@huggingface.co>	2026-06-20 13:43:06 +03:00
Sigbjørn Skjæret	f4043fec01	convert : more consistent handling of rope_parameters (#24833 )	2026-06-20 13:42:36 +03:00
Masashi Yoshimura	f449e05537	ggml-webgpu: add adapter toggles for F16 on Vulkan + NVIDIA	2026-06-20 08:12:32 +09:00
Xuan-Son Nguyen	2b686a9120	server: refactor child --> router communication (#24821 ) * server: refactor child --> router communication * fix wakeup case * add docs * improve update_status() * nits	2026-06-20 01:02:26 +02:00
Adrien Gallouët	4b48a53b6c	server : optimize get_token_probabilities (#24796 ) Use std::partial_sort to order only the requested top-n tokens instead of the full vocabulary logprobs sort: vocab=128000 n_top=0 iters=100 full sort: 8555.6 us/op partial sort: 704.3 us/op Signed-off-by: Adrien Gallouët <angt@huggingface.co>	2026-06-19 23:26:54 +02:00
Xuan-Son Nguyen	e475fa2b5f	mtmd, arg: fix utf8 handling on windows (#24779 ) * mtmd, arg: fix utf8 handling on windows * also fix ggml_fopen * fix build fail * also fix CLI	2026-06-19 22:28:38 +02:00
Xuan-Son Nguyen	175147e8f6	server: remove all internal mentions about "webui" (#24817 )	2026-06-19 22:12:46 +02:00
Mikolaj Kucharski	fabde3bf51	arg: Add comment line support to --api-key-file (#23168 )	2026-06-19 17:33:54 +02:00
Alessandro de Oliveira Faria (A.K.A.CABELO)	0d2d9ccbf6	vendor : update cpp-httplib to 0.48.0 (#24787 )	2026-06-19 22:16:35 +08:00
Xuan-Son Nguyen	8c2d6f6475	server: add --agent arg, remove redundant webui naming compat (#24801 ) * server: add --agent arg, remove redundant webui naming compat * corrent env * fix the test * llama-gen-docs * nits: wordings	2026-06-19 16:06:13 +02:00
Aldehir Rojas	38724ab593	docker : build the UI (#24794 ) * docker : build the UI * cont : use existing APP_VERSION	2026-06-19 15:32:31 +02:00
Xuan-Son Nguyen	e2e7a9b2d0	mtmd: several bug fixes (#24784 ) * mtmd: several bug fixes * fix build * fix gemma4ua * add sanity check in get_u32() * fix build (2) * area() avoid overflow	2026-06-19 12:18:36 +02:00
Ruixiang Wang	b14e3fb90c	spec: support eagle3 for qwen3.5 & 3.6 (#24593 ) * spec: support qwen3.5 & 3.6 eagle3 draft * eagle3: Add deferred boundary checkpoints restore support for hybrid models * apply suggestions Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * spec: adapt to API change * spec: fix naming * cont : add TODO --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2026-06-19 13:08:50 +03:00
Xuan-Son Nguyen	159d093a43	server: fix non-bound n_discard value (ctx shifting) (#24786 ) * server: fix non-bound n_discard value * Update tools/server/server-context.cpp Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2026-06-19 10:53:44 +02:00
Georgi Gerganov	5fd2dc2c41	sync : ggml	2026-06-19 10:19:14 +03:00
Georgi Gerganov	1868af13ac	ggml : bump version to 0.15.2 (ggml/1548)	2026-06-19 10:19:14 +03:00
Georgi Gerganov	5bd21b8555	pi : remove docs from system prompt (#24791 )	2026-06-19 09:34:00 +03:00
Georgi Gerganov	80452d65b9	server : consolidate slot selection into get_available_slot (#24755 ) Absorb get_slot_by_id logic into get_available_slot so slot selection is handled by a single function call. When a specific slot id is requested, the LCP similarity check still runs to enable proper prompt cache updates. Assisted-by: pi:llama.cpp/Qwen3.6-27B	2026-06-19 09:22:34 +03:00
shalinib-ibm	8141e730f1	ggml-cpu: support K tails in power10 Q8/Q4 MMA matmul (#24753 ) * ggml-cpu: support K tails in Power10 MMA Q8/Q4 matmul This patch removes the requirement that K be divisible by kc in the tinyBlas_Q0_PPC tiled matmul path. Process the final K panel using its actual depth and pass the reduced panel size through packing and kernel execution. This allows more workloads to use the MMA kernel and reduces fallback to mnpack. * Apply suggestion from @taronaeo Co-authored-by: Aaron Teo <taronaeo@gmail.com> --------- Co-authored-by: Aaron Teo <taronaeo@gmail.com>	2026-06-19 08:55:38 +03:00
Xuan-Son Nguyen	db52540f73	mtmd: add batching support for internvl (#24775 )	2026-06-19 01:16:16 +02:00
Pascal	3a3edc9ac6	Ggml/cuda col2im 1d (#24417 ) * cuda: add GGML_OP_COL2IM_1D, follow-up to the CPU op * cuda: col2im_1d use fast_div_modulo for the index decomposition * cuda: col2im_1d tighten supports_op, type match and contiguous dst	2026-06-18 22:23:01 +02:00
Reguna	40f3aafc45	server: add "X-Accel-Buffering": "no" header to streaming endpoints (#24774 ) * server: add "X-Accel-Buffering": "no" header to streaming endpoints This header tells Nginx (as a reverse proxy) to NOT buffer responses. (only affects streaming endpoints) Without it, Nginx will break streaming with certain applications (notably the Pi coding harness).	2026-06-18 22:01:24 +02:00
Xuan-Son Nguyen	a6b3260a42	mtmd: add batching for mtmd-cli, add video tests (#24778 )	2026-06-18 21:55:04 +02:00
o7si	32eddaf2ea	cmake : fix ui build with read-only source (#24752 )	2026-06-18 18:59:18 +02:00