improve test

tests: add backend copy test
2026-06-30 17:47:40 +02:00 · 2026-06-17 17:42:56 +02:00 · 2026-06-17 16:04:35 +02:00
432 changed files with 18105 additions and 27879 deletions
@@ -13,20 +13,6 @@ ARG APP_REVISION=N/A
 # BUILD STAGE
 # Compile all binary files and libraries
 # ==============================================================================
-ARG NODE_VERSION=24
-
-FROM docker.io/node:$NODE_VERSION AS web
-
-ARG APP_VERSION
-
-WORKDIR /app/tools/ui
-
-COPY tools/ui/package.json tools/ui/package-lock.json ./
-RUN npm ci
-
-COPY tools/ui/ ./
-RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
-
 FROM ${CANN_BASE_IMAGE} AS build

 # -- Install build dependencies --
@@ -40,8 +26,6 @@ WORKDIR /app
 # -- Copy project files --
 COPY . .

-COPY --from=web /app/tools/ui/dist tools/ui/dist
-
 # -- Set CANN environment variables (required for compilation) --
 # Using ENV instead of `source` allows environment variables to persist across the entire image layer
 ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
@@ -145,7 +129,7 @@ ENTRYPOINT ["/app/tools.sh"]
 # ==============================================================================
 FROM base AS light

-COPY --from=build /app/full/llama /app/full/llama-cli /app/full/llama-completion /app
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app

 ENTRYPOINT [ "/app/llama-cli" ]

@@ -156,7 +140,7 @@ FROM base AS server

 ENV LLAMA_ARG_HOST=0.0.0.0

-COPY --from=build /app/full/llama /app/full/llama-server /app
+COPY --from=build /app/full/llama-server /app

 HEALTHCHECK --interval=5m CMD [ "curl", "-f", "http://localhost:8080/health" ]

@@ -3,20 +3,6 @@ ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
 ARG APP_REVISION=N/A

-ARG NODE_VERSION=24
-
-FROM docker.io/node:$NODE_VERSION AS web
-
-ARG APP_VERSION
-
-WORKDIR /app/tools/ui
-
-COPY tools/ui/package.json tools/ui/package-lock.json ./
-RUN npm ci
-
-COPY tools/ui/ ./
-RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
-
 FROM docker.io/ubuntu:$UBUNTU_VERSION AS build

 ARG TARGETARCH
@@ -30,8 +16,6 @@ WORKDIR /app

 COPY . .

-COPY --from=web /app/tools/ui/dist tools/ui/dist
-
 RUN if [ "$TARGETARCH" = "amd64" ] || [ "$TARGETARCH" = "arm64" ]; then \
        cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON; \
    else \
@@ -104,7 +88,7 @@ ENTRYPOINT ["/app/tools.sh"]
 ### Light, CLI only
 FROM base AS light

-COPY --from=build /app/full/llama /app/full/llama-cli /app/full/llama-completion /app
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app

 WORKDIR /app

@@ -115,7 +99,7 @@ FROM base AS server

 ENV LLAMA_ARG_HOST=0.0.0.0

-COPY --from=build /app/full/llama /app/full/llama-server /app
+COPY --from=build /app/full/llama-server /app

 WORKDIR /app

@@ -11,20 +11,6 @@ ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
 ARG APP_REVISION=N/A

-ARG NODE_VERSION=24
-
-FROM docker.io/node:$NODE_VERSION AS web
-
-ARG APP_VERSION
-
-WORKDIR /app/tools/ui
-
-COPY tools/ui/package.json tools/ui/package-lock.json ./
-RUN npm ci
-
-COPY tools/ui/ ./
-RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
-
 FROM ${BASE_CUDA_DEV_CONTAINER} AS build

 ARG GCC_VERSION
@@ -40,8 +26,6 @@ WORKDIR /app

 COPY . .

-COPY --from=web /app/tools/ui/dist tools/ui/dist
-
 RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
    export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
    fi && \
@@ -113,7 +97,7 @@ ENTRYPOINT ["/app/tools.sh"]
 ### Light, CLI only
 FROM base AS light

-COPY --from=build /app/full/llama /app/full/llama-cli /app/full/llama-completion /app
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app

 WORKDIR /app

@@ -124,7 +108,7 @@ FROM base AS server

 ENV LLAMA_ARG_HOST=0.0.0.0

-COPY --from=build /app/full/llama /app/full/llama-server /app
+COPY --from=build /app/full/llama-server /app

 WORKDIR /app

@@ -5,20 +5,6 @@ ARG APP_REVISION=N/A

 ## Build Image

-ARG NODE_VERSION=24
-
-FROM docker.io/node:$NODE_VERSION AS web
-
-ARG APP_VERSION
-
-WORKDIR /app/tools/ui
-
-COPY tools/ui/package.json tools/ui/package-lock.json ./
-RUN npm ci
-
-COPY tools/ui/ ./
-RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
-
 FROM docker.io/intel/deep-learning-essentials:$ONEAPI_VERSION AS build

 ARG GGML_SYCL_F16=ON
@@ -36,8 +22,6 @@ WORKDIR /app

 COPY . .

-COPY --from=web /app/tools/ui/dist tools/ui/dist
-
 RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
        echo "GGML_SYCL_F16 is set" \
        && export OPT_SYCL_F16="-DGGML_SYCL_F16=ON" \
@@ -141,7 +125,7 @@ ENTRYPOINT ["/app/tools.sh"]
 FROM base AS light

 COPY --from=build /app/lib/ /app
-COPY --from=build /app/full/llama /app/full/llama-cli /app/full/llama-completion /app
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app

 WORKDIR /app

@@ -153,7 +137,7 @@ FROM base AS server
 ENV LLAMA_ARG_HOST=0.0.0.0

 COPY --from=build /app/lib/ /app
-COPY --from=build /app/full/llama /app/full/llama-server /app
+COPY --from=build /app/full/llama-server /app

 WORKDIR /app

@@ -10,20 +10,6 @@ ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
 ARG APP_REVISION=N/A

-ARG NODE_VERSION=24
-
-FROM docker.io/node:$NODE_VERSION AS web
-
-ARG APP_VERSION
-
-WORKDIR /app/tools/ui
-
-COPY tools/ui/package.json tools/ui/package-lock.json ./
-RUN npm ci
-
-COPY tools/ui/ ./
-RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
-
 FROM ${BASE_MUSA_DEV_CONTAINER} AS build

 # MUSA architecture to build for (defaults to all supported archs)
@@ -43,8 +29,6 @@ WORKDIR /app

 COPY . .

-COPY --from=web /app/tools/ui/dist tools/ui/dist
-
 RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
        export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
    fi && \
@@ -115,7 +99,7 @@ ENTRYPOINT ["/app/tools.sh"]
 ### Light, CLI only
 FROM base AS light

-COPY --from=build /app/full/llama /app/full/llama-cli /app/full/llama-completion /app
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app

 WORKDIR /app

@@ -126,7 +110,7 @@ FROM base AS server

 ENV LLAMA_ARG_HOST=0.0.0.0

-COPY --from=build /app/full/llama /app/full/llama-server /app
+COPY --from=build /app/full/llama-server /app

 WORKDIR /app

@@ -1,12 +1,12 @@
-ARG OPENVINO_VERSION_MAJOR=2026.2.1
-ARG OPENVINO_VERSION_FULL=2026.2.1.21919.ede283a88e3
+ARG OPENVINO_VERSION_MAJOR=2026.2
+ARG OPENVINO_VERSION_FULL=2026.2.0.21903.52ddc073857
 ARG UBUNTU_VERSION=24.04

 # Intel GPU driver versions. https://github.com/intel/compute-runtime/releases
-ARG IGC_VERSION=v2.36.3
-ARG IGC_VERSION_FULL=2_2.36.3+21719
-ARG COMPUTE_RUNTIME_VERSION=26.22.38646.4
-ARG COMPUTE_RUNTIME_VERSION_FULL=26.22.38646.4-0
+ARG IGC_VERSION=v2.34.4
+ARG IGC_VERSION_FULL=2_2.34.4+21428
+ARG COMPUTE_RUNTIME_VERSION=26.18.38308.1
+ARG COMPUTE_RUNTIME_VERSION_FULL=26.18.38308.1-0
 ARG IGDGMM_VERSION=22.10.0

 # Intel NPU driver versions. https://github.com/intel/linux-npu-driver/releases
@@ -22,20 +22,6 @@ ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
 ARG APP_REVISION=N/A

-ARG NODE_VERSION=24
-
-FROM docker.io/node:$NODE_VERSION AS web
-
-ARG APP_VERSION
-
-WORKDIR /app/tools/ui
-
-COPY tools/ui/package.json tools/ui/package-lock.json ./
-RUN npm ci
-
-COPY tools/ui/ ./
-RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
-
 ## Build Image
 FROM docker.io/ubuntu:${UBUNTU_VERSION} AS build

@@ -83,8 +69,6 @@ WORKDIR /app

 COPY . .

-COPY --from=web /app/tools/ui/dist tools/ui/dist
-
 # Build Stage
 RUN bash -c "source ${OpenVINO_DIR}/setupvars.sh && \
    cmake -B build/ReleaseOV -G Ninja \
@@ -214,7 +198,7 @@ ENTRYPOINT ["/app/tools.sh"]
 ### Light, CLI only
 FROM base AS light

-COPY --from=build /app/full/llama /app/full/llama-cli /app/full/llama-completion /app/
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app/

 WORKDIR /app

@@ -225,7 +209,7 @@ FROM base AS server

 ENV LLAMA_ARG_HOST=0.0.0.0

-COPY --from=build /app/full/llama /app/full/llama-server /app/
+COPY --from=build /app/full/llama-server /app/

 WORKDIR /app

@@ -11,20 +11,6 @@ ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
 ARG APP_REVISION=N/A

-ARG NODE_VERSION=24
-
-FROM docker.io/node:$NODE_VERSION AS web
-
-ARG APP_VERSION
-
-WORKDIR /app/tools/ui
-
-COPY tools/ui/package.json tools/ui/package-lock.json ./
-RUN npm ci
-
-COPY tools/ui/ ./
-RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
-
 ### Build image
 FROM ${BASE_ROCM_DEV_CONTAINER} AS build

@@ -52,8 +38,6 @@ WORKDIR /app

 COPY . .

-COPY --from=web /app/tools/ui/dist tools/ui/dist
-
 RUN HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
    cmake -S . -B build \
        -DGGML_HIP=ON \
@@ -127,7 +111,7 @@ ENTRYPOINT ["/app/tools.sh"]
 ### Light, CLI only
 FROM base AS light

-COPY --from=build /app/full/llama /app/full/llama-cli /app/full/llama-completion /app
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app

 WORKDIR /app

@@ -138,7 +122,7 @@ FROM base AS server

 ENV LLAMA_ARG_HOST=0.0.0.0

-COPY --from=build /app/full/llama /app/full/llama-server /app
+COPY --from=build /app/full/llama-server /app

 WORKDIR /app

@@ -124,7 +124,7 @@ WORKDIR /llama.cpp/bin

 # Copy llama.cpp binaries and libraries
 COPY --from=collector /llama.cpp/bin/*.so /llama.cpp/bin
-COPY --from=collector /llama.cpp/bin/llama /llama.cpp/bin/llama-cli /llama.cpp/bin/llama-completion /llama.cpp/bin
+COPY --from=collector /llama.cpp/bin/llama-cli /llama.cpp/bin/llama-completion /llama.cpp/bin

 ENTRYPOINT [ "/llama.cpp/bin/llama-cli" ]

@@ -138,7 +138,7 @@ WORKDIR /llama.cpp/bin

 # Copy llama.cpp binaries and libraries
 COPY --from=collector /llama.cpp/bin/*.so /llama.cpp/bin
-COPY --from=collector /llama.cpp/bin/llama /llama.cpp/bin/llama-server /llama.cpp/bin
+COPY --from=collector /llama.cpp/bin/llama-server /llama.cpp/bin

 EXPOSE 8080

@@ -3,20 +3,6 @@ ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
 ARG APP_REVISION=N/A

-ARG NODE_VERSION=24
-
-FROM docker.io/node:$NODE_VERSION AS web
-
-ARG APP_VERSION
-
-WORKDIR /app/tools/ui
-
-COPY tools/ui/package.json tools/ui/package-lock.json ./
-RUN npm ci
-
-COPY tools/ui/ ./
-RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
-
 FROM docker.io/ubuntu:$UBUNTU_VERSION AS build

 # Install build tools
@@ -31,8 +17,6 @@ WORKDIR /app

 COPY . .

-COPY --from=web /app/tools/ui/dist tools/ui/dist
-
 RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=ON -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON && \
    cmake --build build --config Release -j$(nproc)

@@ -107,7 +91,7 @@ ENTRYPOINT ["/app/tools.sh"]
 ### Light, CLI only
 FROM base AS light

-COPY --from=build /app/full/llama /app/full/llama-cli /app/full/llama-completion /app
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app

 WORKDIR /app

@@ -118,7 +102,7 @@ FROM base AS server

 ENV LLAMA_ARG_HOST=0.0.0.0

-COPY --from=build /app/full/llama /app/full/llama-server /app
+COPY --from=build /app/full/llama-server /app

 WORKDIR /app

@@ -3,20 +3,6 @@ ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
 ARG APP_REVISION=N/A

-ARG NODE_VERSION=24
-
-FROM docker.io/node:$NODE_VERSION AS web
-
-ARG APP_VERSION
-
-WORKDIR /app/tools/ui
-
-COPY tools/ui/package.json tools/ui/package-lock.json ./
-RUN npm ci
-
-COPY tools/ui/ ./
-RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
-
 FROM docker.io/ubuntu:$UBUNTU_VERSION AS build

 RUN apt-get update && \
@@ -28,8 +14,6 @@ WORKDIR /app

 COPY . .

-COPY --from=web /app/tools/ui/dist tools/ui/dist
-
 RUN cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_ZENDNN=ON && \
    cmake --build build -j $(nproc)

@@ -97,7 +81,7 @@ ENTRYPOINT ["/app/tools.sh"]
 ### Light, CLI only
 FROM base AS light

-COPY --from=build /app/full/llama /app/full/llama-cli /app/full/llama-completion /app
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app

 WORKDIR /app

@@ -108,7 +92,7 @@ FROM base AS server

 ENV LLAMA_ARG_HOST=0.0.0.0

-COPY --from=build /app/full/llama /app/full/llama-server /app
+COPY --from=build /app/full/llama-server /app

 WORKDIR /app

@@ -10,8 +10,6 @@

 build*/

-tools/ui/node_modules/
-
 models/*

 /llama-cli
@@ -35,20 +35,8 @@ AMD ZenDNN:
 documentation:
    - changed-files:
        - any-glob-to-any-file:
-            - "**/*.md"
            - docs/**
            - media/**
-examples:
-    - all:
-        - changed-files:
-            - any-glob-to-any-file:
-                - app/**
-                - examples/**
-                - tools/**
-            - all-globs-to-all-files:
-                - '!tools/server/**'
-                - '!tools/mtmd/**'
-                - '!tools/ui/**'
 testing:
    - changed-files:
        - any-glob-to-any-file:
@@ -59,12 +47,28 @@ build:
            - cmake/**
            - CMakeLists.txt
            - CMakePresets.json
+examples:
+    - changed-files:
+        - any-glob-to-any-file:
+            - examples/**
+            - tools/**
 devops:
    - changed-files:
        - any-glob-to-any-file:
            - .devops/**
            - .github/**
            - ci/**
+python:
+    - changed-files:
+        - any-glob-to-any-file:
+            - "**/*.py"
+            - requirements/**
+            - gguf-py/**
+            - .flake8
+script:
+    - changed-files:
+        - any-glob-to-any-file:
+            - scripts/**
 android:
    - changed-files:
        - any-glob-to-any-file:
@@ -77,20 +81,9 @@ server:
    - changed-files:
        - any-glob-to-any-file:
            - tools/server/**
-mtmd:
-    - changed-files:
-        - any-glob-to-any-file:
-            - tools/mtmd/**
-conversion:
-    - changed-files:
-        - any-glob-to-any-file:
-            - conversion/**
-            - convert_*.py
-            - gguf-py/**
-vendor:
-    - changed-files:
-        - any-glob-to-any-file:
-            - vendor/**
+
+
+
 ggml:
    - changed-files:
        - any-glob-to-any-file:
@@ -68,8 +68,8 @@ jobs:

    env:
      # Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
-      OPENVINO_VERSION_MAJOR: "2026.2.1"
-      OPENVINO_VERSION_FULL: "2026.2.1.21919.ede283a88e3"
+      OPENVINO_VERSION_MAJOR: "2026.2"
+      OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857"

    steps:
      - name: Clone
@@ -96,8 +96,8 @@ jobs:

    env:
      # Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
-      OPENVINO_VERSION_MAJOR: "2026.2.1"
-      OPENVINO_VERSION_FULL: "2026.2.1.21919.ede283a88e3"
+      OPENVINO_VERSION_MAJOR: "2026.2"
+      OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857"

    steps:
      - name: Clone
@@ -39,8 +39,8 @@ jobs:

    env:
      # Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
-      OPENVINO_VERSION_MAJOR: "2026.2.1"
-      OPENVINO_VERSION_FULL: "2026.2.1.21919.ede283a88e3"
+      OPENVINO_VERSION_MAJOR: "2026.2"
+      OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857"

    steps:
      - name: Clone
@@ -96,8 +96,8 @@ jobs:

    env:
      # Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
-      OPENVINO_VERSION_MAJOR: "2026.2.1"
-      OPENVINO_VERSION_FULL: "2026.2.1.21919.ede283a88e3"
+      OPENVINO_VERSION_MAJOR: "2026.2"
+      OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857"

    steps:
      - name: Clone
@@ -266,8 +266,8 @@ jobs:

    env:
      # Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
-      OPENVINO_VERSION_MAJOR: "2026.2.1"
-      OPENVINO_VERSION_FULL: "2026.2.1.21919.ede283a88e3"
+      OPENVINO_VERSION_MAJOR: "2026.2"
+      OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857"

    steps:
      - name: Clone
@@ -58,13 +58,6 @@ jobs:
          git tag ${{ steps.srctag.outputs.name }} || exit 0
          git push origin ${{ steps.srctag.outputs.name }} || exit 0

-  build_ui:
-    name: Build UI
-    needs: create_tag
-    uses: ./.github/workflows/ui-build.yml
-    with:
-      hf_ui_version: ${{ needs.create_tag.outputs.source_tag }}
-
  prepare_matrices:
    name: Prepare Docker matrices
    runs-on: ubuntu-24.04
@@ -86,7 +79,7 @@ jobs:
          [
            { "tag": "cpu", "dockerfile": ".devops/cpu.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04" },
            { "tag": "cpu", "dockerfile": ".devops/cpu.Dockerfile", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-arm" },
-            { "tag": "cpu", "dockerfile": ".devops/s390x.Dockerfile", "platforms": "linux/s390x", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-s390x", "prebuilt_ui": true },
+            { "tag": "cpu", "dockerfile": ".devops/s390x.Dockerfile", "platforms": "linux/s390x", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-s390x" },
            { "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "12.8.1", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
            { "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "12.8.1", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" },
            { "tag": "cuda13", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "13.3.0", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
@@ -142,7 +135,7 @@ jobs:

  push_to_registry:
    name: Push Docker image to Docker Registry
-    needs: [prepare_matrices, create_tag, build_ui]
+    needs: [prepare_matrices, create_tag]

    runs-on: ${{ matrix.config.runs_on }}
    strategy:
@@ -157,13 +150,6 @@ jobs:
          fetch-depth: 0
          ref: ${{ needs.create_tag.outputs.source_tag }}

-      - name: Download prebuilt UI
-        if: ${{ matrix.config.prebuilt_ui == true }}
-        uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8
-        with:
-          name: ui-build
-          path: tools/ui/dist
-
      - name: Set up QEMU
        if: ${{ contains(matrix.config.platforms, 'linux/amd64') }}
        uses: docker/setup-qemu-action@ce360397dd3f832beb865e1373c09c0e9f86d70a # v4
@@ -46,13 +46,14 @@ jobs:

     steps:
       - id: check
+        # Hand the commit message to the script via env: expanding the expression inline in `run:` would let a crafted commit message inject shell commands.
         env:
           COMMIT_MESSAGE: ${{ github.event.head_commit.message }}
         run: |
           if [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then
             echo "should_release=true" >> $GITHUB_OUTPUT
           elif [[ "${{ github.event_name }}" == "push" && "${{ github.ref }}" == "refs/heads/master" ]]; then
-            if echo "$COMMIT_MESSAGE" | grep -q '\[no release\]'; then
+            if printf '%s\n' "$COMMIT_MESSAGE" | grep -qF '[no release]'; then
               echo "should_release=false" >> $GITHUB_OUTPUT
             else
               echo "should_release=true" >> $GITHUB_OUTPUT
@@ -446,8 +444,8 @@ jobs:

    env:
      # Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
-      OPENVINO_VERSION_MAJOR: "2026.2.1"
-      OPENVINO_VERSION_FULL: "2026.2.1.21919.ede283a88e3"
+      OPENVINO_VERSION_MAJOR: "2026.2"
+      OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857"

    steps:
      - name: Set OpenVINO version output
@@ -506,11 +504,8 @@ jobs:
          cmake -B build/ReleaseOV -G Ninja \
            -DCMAKE_BUILD_TYPE=Release \
            -DGGML_OPENVINO=ON \
-            -DCMAKE_INSTALL_RPATH='$ORIGIN' \
-            -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
-            -DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }} \
-            ${{ env.CMAKE_ARGS }}
-          cmake --build build/ReleaseOV --config Release --parallel
+            -DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }}
+          cmake --build build/ReleaseOV --config Release -j $(nproc)

      - name: ccache-clear
        uses: ./.github/actions/ccache-clear
@@ -524,26 +519,8 @@ jobs:
      - name: Pack artifacts
        id: pack_artifacts
        run: |
-          dest=./build/ReleaseOV/bin
-          OPENVINO_ROOT=./openvino_toolkit
-          ov_lib="$OPENVINO_ROOT/runtime/lib/intel64"
-
-          # Bundle OpenVINO runtime libs + TBB. Binaries built with RPATH=$ORIGIN
-          # load these siblings without setupvars.sh / LD_LIBRARY_PATH.
-          cp -P "$ov_lib"/libopenvino.so* \
-                "$ov_lib"/libopenvino_c.so* \
-                "$ov_lib"/libopenvino_*_plugin.so \
-                "$ov_lib"/libopenvino_intel_npu_compiler*.so \
-                "$OPENVINO_ROOT"/runtime/3rdparty/tbb/lib/*.so* \
-                "$dest"
-          cp -P /usr/lib/x86_64-linux-gnu/libOpenCL.so.1* "$dest" 2>/dev/null || true
-          cp "$ov_lib"/cache.json "$dest" 2>/dev/null || true
-
-          # OpenVINO licensing
-          cp -r "$OPENVINO_ROOT"/docs/licensing "$dest"/openvino-licensing
-
-          cp LICENSE "$dest"
-          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C "$dest" .
+          cp LICENSE ./build/ReleaseOV/bin/
+          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/ReleaseOV/bin .

      - name: Upload artifacts
        uses: actions/upload-artifact@v6
@@ -552,9 +529,6 @@ jobs:
          name: llama-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz

  windows-openvino:
-    needs: [check-release]
-    if: ${{ needs.check-release.outputs.should_release == 'true' }}
-
    runs-on: windows-2022

    outputs:
@@ -562,13 +536,14 @@ jobs:

     env:
       # Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
-      OPENVINO_VERSION_MAJOR: "2026.2.1"
-      OPENVINO_VERSION_FULL: "2026.2.1.21919.ede283a88e3"
+      OPENVINO_VERSION_MAJOR: "2026.2"
+      OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857"

     steps:
       - name: Set OpenVINO version output
         id: openvino_version
+        # Keep bash here: the Windows runner defaults to PowerShell, where $GITHUB_OUTPUT is not the environment variable and the output would not be recorded.
         shell: bash
         run: echo "value=${{ env.OPENVINO_VERSION_MAJOR }}" >> $GITHUB_OUTPUT

       - name: Clone
@@ -631,9 +604,7 @@ jobs:
            -A x64 ^
            -DCMAKE_BUILD_TYPE=Release ^
            -DGGML_OPENVINO=ON ^
-            -DLLAMA_BUILD_BORINGSSL=ON ^
-            -DCMAKE_TOOLCHAIN_FILE=C:\vcpkg\scripts\buildsystems\vcpkg.cmake ^
-            ${{ env.CMAKE_ARGS }}
+            -DCMAKE_TOOLCHAIN_FILE=C:\vcpkg\scripts\buildsystems\vcpkg.cmake

          cmake --build build\ReleaseOV --config Release -- /m

@@ -650,29 +621,8 @@ jobs:
        id: pack_artifacts
        shell: powershell
        run: |
-          # Locate the extracted OpenVINO toolkit root (same pattern as the Build step).
-          $OPENVINO_ROOT = (Get-ChildItem -Directory openvino_toolkit | Select-Object -First 1).FullName
-          if (-not $OPENVINO_ROOT) {
-            Write-Error "OpenVINO toolkit folder not found under .\openvino_toolkit"
-            exit 1
-          }
-
-          $dest = ".\build\ReleaseOV\bin\Release"
-
-          $ovBin = Join-Path $OPENVINO_ROOT 'runtime\bin\intel64\Release'
-          Copy-Item -Path (Join-Path $ovBin '*.dll')       -Destination $dest -Force
-          Copy-Item -Path (Join-Path $ovBin 'cache.json')  -Destination $dest -Force
-
-          $tbbBin = Join-Path $OPENVINO_ROOT 'runtime\3rdparty\tbb\bin'
-          Copy-Item -Path (Join-Path $tbbBin 'tbb*.dll') -Destination $dest -Force
-
-          # OpenVINO licensing
-          $licensingDest = Join-Path $dest 'openvino-licensing'
-          New-Item -ItemType Directory -Force -Path $licensingDest | Out-Null
-          Copy-Item -Path (Join-Path $OPENVINO_ROOT 'docs\licensing\*') -Destination $licensingDest -Recurse -Force
-
-          Copy-Item LICENSE $dest
-          7z a -snl llama-${{ steps.tag.outputs.name }}-bin-win-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.zip $dest\*
+          Copy-Item LICENSE .\build\ReleaseOV\bin\
+          7z a -snl llama-${{ steps.tag.outputs.name }}-bin-win-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.zip .\build\ReleaseOV\bin\*

      - name: Upload artifacts
        uses: actions/upload-artifact@v6
@@ -1674,7 +1624,6 @@ jobs:
            **Windows:**
            - [Windows x64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-x64.zip)
            - [Windows arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-arm64.zip)
-            - [Windows arm64 (OpenCL Adreno)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-opencl-adreno-arm64.zip)
            - [Windows x64 (CUDA 12)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-12.4-x64.zip) - [CUDA 12.4 DLLs](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/cudart-llama-bin-win-cuda-12.4-x64.zip)
            - [Windows x64 (CUDA 13)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-13.3-x64.zip) - [CUDA 13.3 DLLs](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/cudart-llama-bin-win-cuda-13.3-x64.zip)
            - [Windows x64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-vulkan-x64.zip)
@@ -25,3 +25,13 @@ Commits:
 - Do not explicitly set the git author in commits - rely on the default git config
 - Always use `--no-gpg-sign` when committing
 - Never `git push` without explicit confirmation from the user
+
+Resources (read on demand):
+- [CONTRIBUTING.md](CONTRIBUTING.md)
+- [Build documentation](docs/build.md)
+- [Server usage documentation](tools/server/README.md)
+- [Server development documentation](tools/server/README-dev.md)
+- [PEG parser](docs/development/parsing.md)
+- [Auto parser](docs/autoparser.md)
+- [Jinja engine](common/jinja/README.md)
+- [PR template](.github/pull_request_template.md)
@@ -222,16 +222,6 @@ if (LLAMA_BUILD_APP)
    add_subdirectory(app)
 endif()

-# Standalone libmtmd build without pulling in the rest of the tools/ tree.
-# Useful when packaging just the mtmd library for language bindings (e.g. an
-# Apple XCFramework, or a WASM build). When the full tools build is enabled,
-# mtmd is already built by the tools/ subdirectory above; this hook only fires
-# when LLAMA_BUILD_TOOLS is OFF to avoid double-adding the target.
-option(LLAMA_BUILD_MTMD "llama: build tools/mtmd library standalone" OFF)
-if (LLAMA_BUILD_MTMD AND NOT (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TOOLS))
-    add_subdirectory(tools/mtmd)
-endif()
-
 #
 # install
 #
@@ -10,7 +10,7 @@
 # ggml-org/ggml-rpc         : rgerganov
 # ggml-org/ggml-sycl        : arthw
 # ggml-org/ggml-vulkan      : 0cc4m, jeffbolznv
-# ggml-org/ggml-webgpu      : reeselevine, yomaytk
+# ggml-org/ggml-webgpu      : reeselevine
 # ggml-org/ggml-zdnn        : taronaeo
 # ggml-org/llama-common     : ggerganov, aldehir, angt, danbev, ngxson, pwilkin
 # ggml-org/llama-mtmd       : ngxson
@@ -142,9 +142,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [x] [GigaChat-20B-A3B](https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct)
 - [X] [Trillion-7B-preview](https://huggingface.co/trillionlabs/Trillion-7B-preview)
 - [x] [Ling models](https://huggingface.co/collections/inclusionAI/ling-67c51c85b34a7ea0aba94c32)
-- [x] [Liquid LFM2 models](https://huggingface.co/collections/LiquidAI/lfm2)
-- [x] [Liquid LFM2.5 models](https://huggingface.co/collections/LiquidAI/lfm25)
-- [x] [Liquid Nanos](https://huggingface.co/collections/LiquidAI/liquid-nanos)
+- [x] [LFM2 models](https://huggingface.co/collections/LiquidAI/lfm2-686d721927015b2ad73eaa38)
 - [x] [Hunyuan models](https://huggingface.co/collections/tencent/hunyuan-dense-model-6890632cda26b19119c9c5e7)
 - [x] [BailingMoeV2 (Ring/Ling 2.0) models](https://huggingface.co/collections/inclusionAI/ling-v2-68bf1dd2fc34c306c1fa6f86)
 - [x] [Mellum models](https://huggingface.co/JetBrains/models?search=mellum)
@@ -80,7 +80,7 @@ To protect sensitive data from potential leaks or unauthorized access, it is cru
 ### Untrusted environments or networks

 If you can't run your models in a secure and isolated environment or if it must be exposed to an untrusted network, make sure to take the following security precautions:
-* Do not use the RPC backend, [ggml-rpc-server](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) and [llama-server](https://github.com/ggml-org/llama.cpp/tree/master/tools/server) functionality (see https://github.com/ggml-org/llama.cpp/pull/13061).
+* Do not use the RPC backend, [rpc-server](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) and [llama-server](https://github.com/ggml-org/llama.cpp/tree/master/tools/server) functionality (see https://github.com/ggml-org/llama.cpp/pull/13061).
 * Confirm the hash of any downloaded artifact (e.g. pre-trained model weights) matches a known-good value.
 * Encrypt your data if sending it over the network.

@@ -1,6 +1,6 @@
 set(TARGET llama-app)

-add_executable(${TARGET} llama.cpp download.cpp)
+add_executable(${TARGET} llama.cpp)
 set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama)

 target_link_libraries(${TARGET} PRIVATE
@@ -1,71 +0,0 @@
-#include "arg.h"
-#include "common.h"
-#include "download.h"
-#include "log.h"
-
-#include <cstdio>
-#include <filesystem>
-
-static void print_usage(int /*argc*/, char ** argv) {
-    printf(
-        "\nexamples:\n"
-        "  %s -hf ggml-org/gemma-3-4b-it-qat-GGUF\n"
-        "  %s -hf ggml-org/gemma-3-4b-it-qat-GGUF:Q4_K_M\n"
-        "  %s -hf ggml-org/models -hff model.gguf\n"
-        "  %s -mu https://example.com/model.gguf -m model.gguf\n"
-        "\n",
-        argv[0], argv[0], argv[0], argv[0]
-    );
-}
-
-int llama_download(int argc, char ** argv);
-
-int llama_download(int argc, char ** argv) {
-    common_init();
-
-    common_params params;
-    params.verbosity = LOG_LEVEL_ERROR;
-
-    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_DOWNLOAD, print_usage)) {
-        return 1;
-    }
-
-    const bool has_source = !params.model.hf_repo.empty() || !params.model.url.empty() ||
-                            !params.model.path.empty()    || !params.model.docker_repo.empty();
-    if (!has_source) {
-        fprintf(stderr, "error: no model source specified (use --hf-repo, --model-url, --model or --docker-repo)\n");
-        return 1;
-    }
-
-    try {
-        common_models_handler handler = common_models_handler_init(params, LLAMA_EXAMPLE_DOWNLOAD);
-        common_models_handler_apply(handler, params);
-    } catch (const std::exception & e) {
-        fprintf(stderr, "error: %s\n", e.what());
-        return 1;
-    }
-
-    if (!params.models_preset.empty()) {
-        // -hf pointed at a preset repo: print the preset path and stop
-        printf("%s\n", params.models_preset.c_str());
-        return 0;
-    }
-    if (params.model.path.empty()) {
-        fprintf(stderr, "error: model download failed\n");
-        return 1;
-    }
-    if (!std::filesystem::exists(params.model.path)) {
-        fprintf(stderr, "error: model file does not exist: %s\n", params.model.path.c_str());
-        return 1;
-    }
-
-    printf("%s\n", params.model.path.c_str());
-    if (!params.mmproj.path.empty()) {
-        printf("%s\n", params.mmproj.path.c_str());
-    }
-    if (!params.speculative.draft.mparams.path.empty()) {
-        printf("%s\n", params.speculative.draft.mparams.path.c_str());
-    }
-
-    return 0;
-}
@@ -19,23 +19,22 @@ int llama_batched_bench(int argc, char ** argv);
 int llama_fit_params(int argc, char ** argv);
 int llama_quantize(int argc, char ** argv);
 int llama_perplexity(int argc, char ** argv);
-int llama_download(int argc, char ** argv);

-// Self-update is only supported for binaries built with llama-install.sh
+// Hands the update over to the install script, which downloads and swaps the binary.
+// Both parameters are ignored. Returns 0 when system() reports success, 1 otherwise.
+// The raw system() result is deliberately not returned: on POSIX it is a wait status
+// (a failing installer yields e.g. 256), which would be truncated to 0 if it were
+// used as the process exit code.
 static int llama_update(int argc, char ** argv) {
    (void) argc;
    (void) argv;

-#ifdef LLAMA_INSTALL_BUILD
 #if defined(_WIN32)
-    return system("powershell -NoProfile -ExecutionPolicy Bypass -Command \"irm https://llama.app/install.ps1 | iex\"");
+    const int status = system("powershell -NoProfile -ExecutionPolicy Bypass -Command \"irm https://llama.app/install.ps1 | iex\"");
 #else
-    return system("curl -fsSL https://llama.app/install.sh | sh");
+    const int status = system("curl -fsSL https://llama.app/install.sh | sh");
 #endif
-#else
-    printf("Updates are available only when installed from https://llama.app\n");
-    return 1;
-#endif
+    return status == 0 ? 0 : 1;
 }

 static const char * progname;
@@ -50,33 +44,23 @@ struct command {
    std::vector<std::string> aliases;
    bool hidden;
    int (*func)(int, char **);
-    bool flags = false; // allow --name
 };

-#ifdef LLAMA_INSTALL_BUILD
-#define UPDATE_HIDDEN false
-#else
-#define UPDATE_HIDDEN true
-#endif
-
 static const command cmds[] = {
-    {"serve",         "HTTP API server",                                    {"server"},   false,         llama_server       },
-    {"cli",           "Command-line interactive interface",                 {"client"},   false,         llama_cli          },
-    {"update",        "Update llama to the latest release",                 {},           UPDATE_HIDDEN, llama_update       },
-    {"download",      "Download a model",                                   {"get"},      false,         llama_download     },
-    {"completion",    "Text completion",                                    {"complete"}, true,          llama_completion   },
-    {"bench",         "Benchmark prompt processing and text generation",    {},           true,          llama_bench        },
-    {"batched-bench", "Benchmark batched decoding performance",             {},           true,          llama_batched_bench},
-    {"fit-params",    "Compute parameters to fit a model in device memory", {},           true,          llama_fit_params   },
-    {"quantize",      "Quantize a model",                                   {},           true,          llama_quantize     },
-    {"perplexity",    "Compute model perplexity and KL divergence",         {},           true,          llama_perplexity   },
-    {"version",       "Show version",                                       {},           false,         version,           true },
-    {"licenses",      "Show third-party licenses",                          {"credits"},  false,         licenses,          true },
-    {"help",          "Show available commands",                            {},           false,         help,              true },
+    {"serve",         "HTTP API server",                                    {"server"},   false, llama_server       },
+    {"cli",           "Command-line interactive interface",                 {"client"},   false, llama_cli          },
+    {"update",        "Update llama to the latest release",                 {},           false, llama_update       },
+    {"completion",    "Text completion",                                    {"complete"}, true,  llama_completion   },
+    {"bench",         "Benchmark prompt processing and text generation",    {},           true,  llama_bench        },
+    {"batched-bench", "Benchmark batched decoding performance",             {},           true,  llama_batched_bench},
+    {"fit-params",    "Compute parameters to fit a model in device memory", {},           true,  llama_fit_params   },
+    {"quantize",      "Quantize a model",                                   {},           true,  llama_quantize     },
+    {"perplexity",    "Compute model perplexity and KL divergence",         {},           true,  llama_perplexity   },
+    {"version",       "Show version",                                       {},           false, version            },
+    {"licenses",      "Show third-party licenses",                          {"credits"},  false, licenses           },
+    {"help",          "Show available commands",                            {},           false, help               },
 };

-#undef UPDATE_HIDDEN
-
 static int version(int argc, char ** argv) {
    printf("%s\n", llama_build_info());
    return 0;
@@ -109,10 +93,7 @@ static int help(int argc, char ** argv) {
    return 0;
 }

-static bool matches(std::string arg, const command & cmd) {
-    if (cmd.flags && arg.size() > 2 && arg[0] == '-' && arg[1] == '-') {
-        arg.erase(0, 2);
-    }
+static bool matches(const std::string & arg, const command & cmd) {
    if (arg == cmd.name) {
        return true;
    }
@@ -13,7 +13,6 @@ LLAMA_BUILD_EXAMPLES=OFF
 LLAMA_BUILD_TOOLS=OFF
 LLAMA_BUILD_TESTS=OFF
 LLAMA_BUILD_SERVER=OFF
-LLAMA_BUILD_MTMD=ON
 GGML_METAL=ON
 GGML_METAL_EMBED_LIBRARY=ON
 GGML_BLAS_DEFAULT=ON
@@ -40,7 +39,6 @@ COMMON_CMAKE_ARGS=(
    -DLLAMA_BUILD_TOOLS=${LLAMA_BUILD_TOOLS}
    -DLLAMA_BUILD_TESTS=${LLAMA_BUILD_TESTS}
    -DLLAMA_BUILD_SERVER=${LLAMA_BUILD_SERVER}
-    -DLLAMA_BUILD_MTMD=${LLAMA_BUILD_MTMD}
    -DGGML_METAL_EMBED_LIBRARY=${GGML_METAL_EMBED_LIBRARY}
    -DGGML_BLAS_DEFAULT=${GGML_BLAS_DEFAULT}
    -DGGML_METAL=${GGML_METAL}
@@ -128,8 +126,6 @@ setup_framework_structure() {
    cp ggml/include/ggml-cpu.h     ${header_path}
    cp ggml/include/ggml-blas.h    ${header_path}
    cp ggml/include/gguf.h         ${header_path}
-    cp tools/mtmd/mtmd.h           ${header_path}
-    cp tools/mtmd/mtmd-helper.h    ${header_path}

    # Create module map (common for all platforms)
    cat > ${module_path}module.modulemap << EOF
@@ -251,7 +247,6 @@ combine_static_libraries() {
        "${base_dir}/${build_dir}/ggml/src/${release_dir}/libggml-cpu.a"
        "${base_dir}/${build_dir}/ggml/src/ggml-metal/${release_dir}/libggml-metal.a"
        "${base_dir}/${build_dir}/ggml/src/ggml-blas/${release_dir}/libggml-blas.a"
-        "${base_dir}/${build_dir}/tools/mtmd/${release_dir}/libmtmd.a"
    )

    # Create temporary directory for processing
@@ -415,7 +410,6 @@ cmake -B build-ios-sim -G Xcode \
    -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
    -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
    -DLLAMA_OPENSSL=OFF \
-    -DMTMD_VIDEO=OFF \
    -S .
 cmake --build build-ios-sim --config Release -j $(sysctl -n hw.logicalcpu) -- -quiet

@@ -430,7 +424,6 @@ cmake -B build-ios-device -G Xcode \
    -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
    -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
    -DLLAMA_OPENSSL=OFF \
-    -DMTMD_VIDEO=OFF \
    -S .
 cmake --build build-ios-device --config Release -j $(sysctl -n hw.logicalcpu) -- -quiet

@@ -457,7 +450,6 @@ cmake -B build-visionos -G Xcode \
    -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
    -DLLAMA_OPENSSL=OFF \
    -DLLAMA_BUILD_SERVER=OFF \
-    -DMTMD_VIDEO=OFF \
    -S .
 cmake --build build-visionos --config Release -j $(sysctl -n hw.logicalcpu) -- -quiet

@@ -473,7 +465,6 @@ cmake -B build-visionos-sim -G Xcode \
    -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
    -DLLAMA_OPENSSL=OFF \
    -DLLAMA_BUILD_SERVER=OFF \
-    -DMTMD_VIDEO=OFF \
    -S .
 cmake --build build-visionos-sim --config Release -j $(sysctl -n hw.logicalcpu) -- -quiet

@@ -490,7 +481,6 @@ cmake -B build-tvos-sim -G Xcode \
    -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
    -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
    -DLLAMA_OPENSSL=OFF \
-    -DMTMD_VIDEO=OFF \
    -S .
 cmake --build build-tvos-sim --config Release -j $(sysctl -n hw.logicalcpu) -- -quiet

@@ -506,7 +496,6 @@ cmake -B build-tvos-device -G Xcode \
    -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
    -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
    -DLLAMA_OPENSSL=OFF \
-    -DMTMD_VIDEO=OFF \
    -S .
 cmake --build build-tvos-device --config Release -j $(sysctl -n hw.logicalcpu) -- -quiet

@@ -80,6 +80,8 @@ add_library(${TARGET}
    http.h
    imatrix-loader.cpp
    imatrix-loader.h
+    json-partial.cpp
+    json-partial.h
    json-schema-to-grammar.cpp
    llguidance.cpp
    log.cpp
@@ -17,7 +17,6 @@
 #   define NOMINMAX
 #endif
 #include <windows.h>
-#include <shellapi.h>
 #endif

 #define JSON_ASSERT GGML_ASSERT
@@ -286,17 +285,108 @@ static std::string clean_file_name(const std::string & fname) {
    return clean_fname;
 }

+// Looks for a `preset.ini` in the Hugging Face repo named by params.model.hf_repo and, when
+// one is downloaded, applies the section matching the repo tag to params.
+// Returns true if a remote preset was found; a missing preset is not an error.
+// Throws std::runtime_error when the preset exists but has no section for the tag.
+static bool common_params_handle_remote_preset(common_params & params, llama_example ex) {
+    GGML_ASSERT(!params.model.hf_repo.empty());
+
+    // the returned hf_repo is without tag
+    auto [hf_repo, hf_tag] = common_download_split_repo_tag(params.model.hf_repo);
+    if (hf_tag == "latest") {
+        // "latest" tag (default if not specified) is translated to "default" preset
+        hf_tag = "default";
+    }
+
+    std::string preset_url = common_get_model_endpoint() + hf_repo + "/resolve/main/preset.ini";
+    // local path used for caching the downloaded preset
+    auto preset_path = fs_get_cache_file(clean_file_name(hf_repo + "_preset.ini"));
+
+    common_download_opts opts;
+    opts.bearer_token = params.hf_token;
+    opts.offline      = params.offline;
+
+    LOG_TRC("%s: looking for remote preset at %s\n", __func__, preset_url.c_str());
+    const int status = common_download_file_single(preset_url, preset_path, opts);
+    if (status < 200 || status >= 400) {
+        // remote preset is optional, so we don't error out if not found
+        LOG_TRC("%s: no remote preset found, skipping\n", __func__);
+        return false;
+    }
+
+    LOG_TRC("%s: applying remote preset from %s\n", __func__, preset_url.c_str());
+    common_preset_context ctx(ex, /* only_remote_allowed */ true);
+    common_preset global;
+    auto remote_presets = ctx.load_from_ini(preset_path, global);
+    remote_presets = ctx.cascade(global, remote_presets);
+
+    const auto it = remote_presets.find(hf_tag);
+    if (it == remote_presets.end()) {
+        throw std::runtime_error("Remote preset.ini does not contain [" + std::string(hf_tag) + "] section");
+    }
+    common_preset preset = it->second;
+    LOG_INF("\n%s", preset.to_ini().c_str()); // to_ini already added trailing newline
+    preset.apply_to_params(params);
+    return true;
+}
+
 struct handle_model_result {
    bool found_mmproj = false;
    common_params_model mmproj;

    bool found_mtp = false;
    common_params_model mtp;
-
-    bool found_preset = false;
-    std::string preset_path;
 };

+// Resolves one model entry to a local path, downloading it when necessary.
+// Sources are tried in priority order: docker_repo, then hf_repo, then url; an entry with
+// none of them set is left untouched. The result reports the mmproj / mtp files that the
+// hf_repo download returned. Throws std::runtime_error when a download yields no model path.
+static handle_model_result common_params_handle_model(struct common_params_model & model,
+                                                      const common_download_opts & opts) {
+    handle_model_result result;
+
+    if (!model.docker_repo.empty()) {
+        model.path = common_docker_resolve_model(model.docker_repo);
+        model.name = model.docker_repo;
+    } else if (!model.hf_repo.empty()) {
+        // If -m was used with -hf, treat the model "path" as the hf_file to download
+        if (model.hf_file.empty() && !model.path.empty()) {
+            model.hf_file = model.path;
+            model.path = "";
+        }
+        common_download_opts hf_opts = opts;
+        auto downloaded = common_download_model(model, hf_opts);
+        if (downloaded.model_path.empty()) {
+            throw std::runtime_error("failed to download model from Hugging Face");
+        }
+        model.name = model.hf_repo;
+        model.path = downloaded.model_path;
+
+        if (!downloaded.mmproj_path.empty()) {
+            result.found_mmproj = true;
+            result.mmproj.path  = downloaded.mmproj_path;
+        }
+        if (!downloaded.mtp_path.empty()) {
+            result.found_mtp = true;
+            result.mtp.path  = downloaded.mtp_path;
+        }
+    } else if (!model.url.empty()) {
+        if (model.path.empty()) {
+            // default cache name: last path component of the url, without "#..." and "?..."
+            auto f = string_split<std::string>(model.url, '#').front();
+            f = string_split<std::string>(f, '?').front();
+            model.path = fs_get_cache_file(string_split<std::string>(f, '/').back());
+        }
+        if (common_download_model(model, opts).model_path.empty()) {
+            throw std::runtime_error("failed to download model from " + model.url);
+        }
+    }
+
+    return result;
+}
+
 const std::vector<ggml_type> kv_cache_types = {
    GGML_TYPE_F32,
    GGML_TYPE_F16,
@@ -340,242 +430,62 @@ static bool parse_bool_value(const std::string & value) {
    throw std::invalid_argument("the argument has been removed. " + msg);
 }

-//
-// common_models_handler
-//
-
-static std::string get_default_local_path(const std::string & url) {
-    auto f = string_split<std::string>(url, '#').front();
-    f = string_split<std::string>(f, '?').front();
-    return fs_get_cache_file(string_split<std::string>(f, '/').back());
-}
-
-common_models_handler common_models_handler_init(const common_params & params, llama_example curr_ex) {
-    common_download_hf_plan plan;
-    common_download_hf_plan plan_spec;
-    common_download_hf_plan plan_voc;
-    common_download_opts opts;
-
-    const bool spec_type_draft_mtp = std::find(params.speculative.types.begin(),
-                                        params.speculative.types.end(),
-                                        COMMON_SPECULATIVE_TYPE_DRAFT_MTP) != params.speculative.types.end();
-
-    // only download mmproj if the current example is using it
-    bool use_mmproj = false;
-    for (const auto & ex : mmproj_examples) {
-        if (curr_ex == ex) {
-            use_mmproj = true;
-            break;
-        }
-    }
-
-    opts.bearer_token    = params.hf_token;
-    opts.offline         = params.offline;
-    opts.download_mtp    = spec_type_draft_mtp;
-    opts.download_mmproj = use_mmproj && !params.no_mmproj
-                        && params.mmproj.path.empty() && params.mmproj.url.empty();
-
-    if (!params.model.hf_repo.empty()) {
-        plan = common_download_get_hf_plan(params.model, opts);
-    }
-
-    if (!params.speculative.draft.mparams.hf_repo.empty()) {
-        plan_spec = common_download_get_hf_plan(params.speculative.draft.mparams, opts);
-    }
-
-    if (!params.vocoder.model.hf_repo.empty()) {
-        plan_voc = common_download_get_hf_plan(params.vocoder.model, opts);
-    }
-
-    return common_models_handler{plan, plan_spec, plan_voc, opts};
-}
-
-bool common_models_handler_is_preset_repo(const common_models_handler & handler) {
-    return !handler.plan.preset.url.empty();
-}
-
-static std::vector<common_download_task> build_url_tasks(const common_params_model & model, common_download_opts opts) {
-    auto parts = common_download_get_all_parts(model.url);
-    std::vector<common_download_task> tasks;
-
-    // single-part: download straight to model.path if the user gave one (-m), else the cache default
-    if (parts.size() == 1) {
-        common_download_task task;
-        task.url        = parts[0];
-        task.local_path = model.path.empty() ? get_default_local_path(parts[0]) : model.path;
-        task.opts       = opts;
-        tasks.push_back(std::move(task));
-        return tasks;
-    }
-
-    // multi-part: place each part under the user's -m directory (if given), else the cache default
-    std::string base_dir;
-    if (!model.path.empty()) {
-        auto pos = model.path.rfind('/');
-        base_dir = pos == std::string::npos ? std::string(".") : model.path.substr(0, pos);
-    }
-
-    for (const auto & part : parts) {
-        common_download_task task;
-        task.url  = part;
-        task.opts = opts;
-
-        std::string local = get_default_local_path(part);
-        if (!base_dir.empty()) {
-            auto pos = local.rfind('/');
-            std::string name = pos == std::string::npos ? local : local.substr(pos + 1);
-            local = base_dir + "/" + name;
-        }
-        task.local_path = local;
-        tasks.push_back(std::move(task));
-    }
-    return tasks;
-}
-
-void common_models_handler_apply(common_models_handler & handler, common_params & params, common_download_callback * callback) {
-    std::vector<common_download_task> tasks;
-
-    auto & plan      = handler.plan;
-    auto & plan_spec = handler.plan_spec;
-    auto & plan_voc  = handler.plan_voc;
-
-    auto opts = handler.opts; // copy
-    opts.callback = callback;
-
-    // handle plain "url" if needed
-    auto handle_url = [&](common_params_model & model) {
-        if (!model.url.empty()) {
-            if (model.path.empty()) {
-                model.path = get_default_local_path(model.url);
-            }
-        }
-    };
-    handle_url(params.model);
-    handle_url(params.mmproj);
-    handle_url(params.vocoder.model);
-    handle_url(params.speculative.draft.mparams);
-
-    // optionally, if docker repo is set, resolve it
-    if (!params.model.docker_repo.empty()) {
-        params.model.url  = common_docker_resolve_model(params.model.docker_repo);
-        params.model.path = get_default_local_path(params.model.url);
-    }
-
-    // handle plain "url" tasks (non-hf)
-    if (!params.model.url.empty()) {
-        auto url_tasks = build_url_tasks(params.model, opts);
-        // the first part is what gets loaded, so point params.model.path at it
-        if (!url_tasks.empty()) {
-            std::string first_path = url_tasks.front().local_path;
-            url_tasks.front().on_done = [&]() { params.model.path = first_path; };
-        }
-        for (auto & task : url_tasks) {
-            tasks.push_back(std::move(task));
-        }
-    }
-    if (!params.mmproj.url.empty()) {
-        common_download_task task;
-        task.url        = params.mmproj.url;
-        task.local_path = params.mmproj.path;
-        task.opts       = opts;
-        tasks.push_back(task);
-    }
-    if (!params.vocoder.model.url.empty()) {
-        common_download_task task;
-        task.url        = params.vocoder.model.url;
-        task.local_path = params.vocoder.model.path;
-        task.opts       = opts;
-        tasks.push_back(task);
-    }
-    if (!params.speculative.draft.mparams.url.empty()) {
-        common_download_task task;
-        task.url        = params.speculative.draft.mparams.url;
-        task.local_path = params.speculative.draft.mparams.path;
-        task.opts       = opts;
-        tasks.push_back(task);
-    }
-
-    // handle hf_plan tasks
-    auto add_tasks = [&opts, &tasks](const hf_cache::hf_files & model_files, common_params_model & model) {
-        for (size_t i = 0; i < model_files.size(); ++i) {
-            auto & model_file = model_files[i];
-            bool is_first = (i == 0);
-            tasks.emplace_back(model_file, opts, [&, is_first]() {
-                if (is_first) {
-                    // only use first part as model path
-                    model.path = hf_cache::finalize_file(model_file);
-                } else {
-                    hf_cache::finalize_file(model_file);
-                }
-            });
-        }
-    };
-    if (!plan.model_files.empty()) {
-        add_tasks(plan.model_files, params.model);
-    }
-    if (!plan.mmproj.local_path.empty()) {
-        tasks.emplace_back(plan.mmproj, opts, [&]() {
-            params.mmproj.path = hf_cache::finalize_file(plan.mmproj);
-        });
-    }
-    if (!plan.mtp.local_path.empty()) {
-        tasks.emplace_back(plan.mtp, opts, [&]() {
-            // only fall back to the discovered MTP head when no draft was explicitly provided
-            if (params.speculative.draft.mparams.empty()) {
-                params.speculative.draft.mparams.path = hf_cache::finalize_file(plan.mtp);
-            } else {
-                hf_cache::finalize_file(plan.mtp);
-            }
-        });
-    }
-    if (!plan.preset.local_path.empty()) {
-        tasks.emplace_back(plan.preset, opts, [&]() {
-            // if HF repo is a preset repo, we simply run server in router mode with the preset.ini file
-            params.models_preset_hf = params.model.hf_repo; // only for showing a warning
-            params.models_preset    = hf_cache::finalize_file(plan.preset);
-            params.model = common_params_model{}; // make sure to clear model, so server starts in router mode
-        });
-    }
-
-    // handle plan_spec (e.g. --spec-draft-hf)
-    if (!plan_spec.model_files.empty()) {
-        add_tasks(plan_spec.model_files, params.speculative.draft.mparams);
-    }
-
-    // handle vocoder plan (e.g. --hf-repo-v)
-    if (!plan_voc.model_files.empty()) {
-        add_tasks(plan_voc.model_files, params.vocoder.model);
-    }
-
-    // run all tasks in parallel
-    if (!params.offline) {
-        // if duplicated files are found, only download once (but still call on_done for each task)
-        std::unordered_map<std::string, common_download_task *> unique_tasks;
-        for (auto & task : tasks) {
-            auto it = unique_tasks.find(task.local_path);
-            if (it == unique_tasks.end()) {
-                unique_tasks[task.local_path] = &task;
-            }
-        }
-        std::vector<common_download_task> unique_tasks_vec;
-        for (auto & pair : unique_tasks) {
-            unique_tasks_vec.push_back(*pair.second);
-        }
-        common_download_run_tasks(unique_tasks_vec);
-    }
-
-    // download successful, update params with the downloaded paths
-    for (const auto & task : tasks) {
-        if (task.on_done) {
-            task.on_done();
-        }
-    }
-}
-
 //
 // CLI argument parsing functions
 //

+// Resolves every model referenced by params (main model, mmproj, speculative draft and
+// vocoder) through common_params_handle_model(), which may download files and rewrites
+// the corresponding entries of params in place. curr_ex decides whether mmproj is handled.
+// Returns true on success and false when common_skip_download_exception is raised; any
+// other exception propagates to the caller unchanged.
+bool common_params_handle_models(common_params & params, llama_example curr_ex) {
+    const auto & spec_types = params.speculative.types;
+    const bool spec_type_draft_mtp = std::find(spec_types.begin(), spec_types.end(), COMMON_SPECULATIVE_TYPE_DRAFT_MTP) != spec_types.end();
+
+    common_download_opts opts;
+    opts.bearer_token    = params.hf_token;
+    opts.offline         = params.offline;
+    opts.skip_download   = params.skip_download;
+    opts.download_mtp    = spec_type_draft_mtp;
+    opts.download_mmproj = !params.no_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty();
+
+    // sub-models (draft, mmproj, vocoder) are explicitly specified by the user,
+    // so we should not auto-discover mtp/mmproj siblings for them
+    common_download_opts sub_opts = opts;
+    sub_opts.download_mtp    = false;
+    sub_opts.download_mmproj = false;
+
+    try {
+        auto res = common_params_handle_model(params.model, opts);
+        if (params.no_mmproj) {
+            params.mmproj = {};
+        } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
+            // optionally, handle mmproj model when -hf is specified
+            params.mmproj = res.mmproj;
+        }
+        // only download mmproj if the current example is using it
+        for (const auto & ex : mmproj_examples) {
+            if (curr_ex == ex) {
+                common_params_handle_model(params.mmproj, sub_opts);
+                break;
+            }
+        }
+
+        // when --spec-type mtp is set and no draft model was provided explicitly,
+        // fall back to the MTP head discovered alongside the -hf model
+        auto & draft = params.speculative.draft.mparams;
+        if (spec_type_draft_mtp && res.found_mtp && draft.path.empty() && draft.hf_repo.empty() && draft.url.empty()) {
+            draft.path = res.mtp.path;
+        }
+        common_params_handle_model(draft, sub_opts);
+        common_params_handle_model(params.vocoder.model, sub_opts);
+        return true;
+    } catch (const common_skip_download_exception &) {
+        return false;
+    }
+}
+
 static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) {
    common_params & params = ctx_arg.params;

@@ -691,6 +601,30 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
    // parse the first time to get -hf option (used for remote preset)
    parse_cli_args();

+    // export_graph_ops loads only metadata
+    const bool skip_model_download = ctx_arg.ex == LLAMA_EXAMPLE_EXPORT_GRAPH_OPS;
+
+    // maybe handle remote preset
+    if (!params.model.hf_repo.empty() && !skip_model_download) {
+        std::string cli_hf_repo = params.model.hf_repo;
+        bool has_preset = common_params_handle_remote_preset(params, ctx_arg.ex);
+
+        // special case: if hf_repo explicitly set by preset, we need to preserve it (ignore CLI value)
+        // this is useful when we have one HF repo pointing to other HF repos (one model - multiple GGUFs)
+        std::string preset_hf_repo = params.model.hf_repo;
+        bool preset_has_hf_repo = preset_hf_repo != cli_hf_repo;
+
+        if (has_preset) {
+            // re-parse CLI args to override preset values
+            parse_cli_args();
+        }
+
+        // preserve hf_repo from preset if needed
+        if (preset_has_hf_repo) {
+            params.model.hf_repo = preset_hf_repo;
+        }
+    }
+
    postprocess_cpu_params(params.cpuparams,       nullptr);
    postprocess_cpu_params(params.cpuparams_batch, &params.cpuparams);

@@ -701,26 +635,15 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
        throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
    }

-    const bool skip_model_download =
-        // server will call common_params_handle_models() later, so we skip it here
-        ctx_arg.ex == LLAMA_EXAMPLE_SERVER ||
-        // download calls common_params_handle_models() itself and prints the paths
-        ctx_arg.ex == LLAMA_EXAMPLE_DOWNLOAD ||
-        // export_graph_ops loads only metadata
-        ctx_arg.ex == LLAMA_EXAMPLE_EXPORT_GRAPH_OPS;
-
+    // handle model and download
    if (!skip_model_download) {
-        // handle model and download
-        common_models_handler handler = common_models_handler_init(params, ctx_arg.ex);
-        common_models_handler_apply(handler, params);
+        common_params_handle_models(params, ctx_arg.ex);
+    }

-        // model is required (except for server)
-        // TODO @ngxson : maybe show a list of available models in CLI in this case
-        if (params.model.path.empty()
-                && !params.usage
-                && !params.completion) {
-            throw std::invalid_argument("error: --model is required\n");
-        }
+    // model is required (except for server)
+    // TODO @ngxson : maybe show a list of available models in CLI in this case
+    if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER && !skip_model_download && !params.usage && !params.completion) {
+        throw std::invalid_argument("error: --model is required\n");
    }

    if (params.escape) {
@@ -784,19 +707,15 @@ static void common_params_print_usage(common_params_context & ctx_arg) {
            common_options.push_back(&opt);
        }
    }
-    bool first = true;
-    auto print_section = [&](const char * header, std::vector<common_arg *> & options) {
-        if (options.empty()) {
-            return;
-        }
-        printf("%s----- %s -----\n\n", first ? "" : "\n\n", header);
-        first = false;
-        print_options(options);
-    };
-    print_section("common params",           common_options);
-    print_section("sampling params",         sampling_options);
-    print_section("speculative params",      spec_options);
-    print_section("example-specific params", specific_options);
+    printf("----- common params -----\n\n");
+    print_options(common_options);
+    printf("\n\n----- sampling params -----\n\n");
+    print_options(sampling_options);
+    printf("\n\n----- speculative params -----\n\n");
+    print_options(spec_options);
+    // TODO: maybe convert enum llama_example to string
+    printf("\n\n----- example-specific params -----\n\n");
+    print_options(specific_options);
 }

 static void common_params_print_completion(common_params_context & ctx_arg) {
@@ -1018,44 +937,7 @@ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<com
    return true;
 }

-#ifdef _WIN32
-struct utf8_argv {
-    std::vector<std::string> buf;
-    std::vector<char*> ptrs;
-};
-
-static utf8_argv make_utf8_argv() {
-    utf8_argv out;
-    int wargc = 0;
-    LPWSTR* wargv = CommandLineToArgvW(GetCommandLineW(), &wargc);
-    if (!wargv) return out;
-
-    out.buf.reserve(wargc);
-    for (int i = 0; i < wargc; ++i) {
-        int n = WideCharToMultiByte(CP_UTF8, WC_ERR_INVALID_CHARS, wargv[i], -1, nullptr, 0, nullptr, nullptr);
-        if (n <= 0) { out.buf.emplace_back(); continue; }
-        auto& s = out.buf.emplace_back();
-        s.resize(static_cast<size_t>(n - 1));
-        (void)WideCharToMultiByte(CP_UTF8, 0, wargv[i], -1, s.data(), n, nullptr, nullptr);
-    }
-    LocalFree(wargv);
-
-    out.ptrs.reserve(out.buf.size() + 1);
-    for (auto& s : out.buf) out.ptrs.push_back(s.data());
-    out.ptrs.push_back(nullptr);
-    return out;
-}
-#endif
-
 bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
-#ifdef _WIN32
-    auto utf8 = make_utf8_argv();
-    // repair argv only when it matches the process command line
-    if (static_cast<int>(utf8.buf.size()) == argc) {
-        argv = utf8.ptrs.data();
-    }
-#endif
-
    auto ctx_arg = common_params_parser_init(params, ex, print_usage);
    const common_params params_org = ctx_arg.params; // the example can modify the default params

@@ -1196,9 +1078,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     * - if both {LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_*,} are set, we will prioritize the LLAMA_EXAMPLE_* matching current example
     */
    auto add_opt = [&](common_arg arg) {
-        // download only exposes the handful of args explicitly tagged for it
-        const bool inherit_common = ex != LLAMA_EXAMPLE_DOWNLOAD;
-        if ((arg.in_example(ex) || (inherit_common && arg.in_example(LLAMA_EXAMPLE_COMMON))) && !arg.is_exclude(ex)) {
+        if ((arg.in_example(ex) || arg.in_example(LLAMA_EXAMPLE_COMMON)) && !arg.is_exclude(ex)) {
            ctx_arg.options.push_back(std::move(arg));
        }
    };
@@ -1209,7 +1089,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params) {
            params.usage = true;
        }
-    ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_DOWNLOAD}));
+    ));
    add_opt(common_arg(
        {"--version"},
        "show version and build info",
@@ -2331,7 +2211,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params, bool value) {
            params.no_mmproj = !value;
        }
-    ).set_examples({LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DOWNLOAD}).set_env("LLAMA_ARG_MMPROJ_AUTO"));
+    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_AUTO"));
    add_opt(common_arg(
        {"--mmproj-offload"},
        {"--no-mmproj-offload"},
@@ -2730,14 +2610,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params, const std::string & value) {
            params.model.path = value;
        }
-    ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_DOWNLOAD}).set_env("LLAMA_ARG_MODEL"));
+    ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL"));
    add_opt(common_arg(
        {"-mu", "--model-url"}, "MODEL_URL",
        "model download url (default: unused)",
        [](common_params & params, const std::string & value) {
            params.model.url = value;
        }
-    ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_DOWNLOAD}).set_env("LLAMA_ARG_MODEL_URL"));
+    ).set_env("LLAMA_ARG_MODEL_URL"));
    add_opt(common_arg(
        { "-dr", "--docker-repo" }, "[<repo>/]<model>[:quant]",
        "Docker Hub model repository. repo is optional, default to ai/. quant is optional, default to :latest.\n"
@@ -2746,7 +2626,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params, const std::string & value) {
            params.model.docker_repo = value;
        }
-    ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_DOWNLOAD}).set_env("LLAMA_ARG_DOCKER_REPO"));
+    ).set_env("LLAMA_ARG_DOCKER_REPO"));
    add_opt(common_arg(
        {"-hf", "-hfr", "--hf-repo"}, "<user>/<model>[:quant]",
        "Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n"
@@ -2756,14 +2636,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params, const std::string & value) {
            params.model.hf_repo = value;
        }
-    ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_DOWNLOAD}).set_env("LLAMA_ARG_HF_REPO"));
+    ).set_env("LLAMA_ARG_HF_REPO"));
    add_opt(common_arg(
        {"-hff", "--hf-file"}, "FILE",
        "Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)",
        [](common_params & params, const std::string & value) {
            params.model.hf_file = value;
        }
-    ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_DOWNLOAD}).set_env("LLAMA_ARG_HF_FILE"));
+    ).set_env("LLAMA_ARG_HF_FILE"));
    add_opt(common_arg(
        {"-hfv", "-hfrv", "--hf-repo-v"}, "<user>/<model>[:quant]",
        "Hugging Face model repository for the vocoder model (default: unused)",
@@ -2784,14 +2664,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params, const std::string & value) {
            params.hf_token = value;
        }
-    ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_DOWNLOAD}).set_env("HF_TOKEN"));
-    add_opt(common_arg(
-        {"--mtp"},
-        "also download the multi-token prediction (MTP) head, if available (default: unused)",
-        [](common_params & params) {
-            params.speculative.types.push_back(COMMON_SPECULATIVE_TYPE_DRAFT_MTP);
-        }
-    ).set_examples({LLAMA_EXAMPLE_DOWNLOAD}));
+    ).set_env("HF_TOKEN"));
    add_opt(common_arg(
        {"--context-file"}, "FNAME",
        "file to load context from (use comma-separated values to specify multiple files)",
@@ -3001,26 +2874,62 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.api_prefix = value;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX"));
+    // Deprecated: use --ui-config instead (kept for backward compat)
    add_opt(common_arg(
-        {"--ui-config", "--webui-config"}, "JSON",
+        {"--webui-config"}, "JSON",
+        "[DEPRECATED: use --ui-config] JSON that provides default WebUI settings (overrides WebUI defaults)",
+        [](common_params & params, const std::string & value) {
+            params.ui_config_json = value;
+            params.webui_config_json = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG"));
+
+    add_opt(common_arg(
+        {"--ui-config"}, "JSON",
        "JSON that provides default UI settings (overrides UI defaults)",
        [](common_params & params, const std::string & value) {
            params.ui_config_json = value;
+            params.webui_config_json = value;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_UI_CONFIG"));
+
+    // Deprecated: use --ui-config-file instead (kept for backward compat)
    add_opt(common_arg(
-        {"--ui-config-file", "--webui-config-file"}, "PATH",
+        {"--webui-config-file"}, "PATH",
+        "[DEPRECATED: use --ui-config-file] JSON file that provides default WebUI settings (overrides WebUI defaults)",
+        [](common_params & params, const std::string & value) {
+            params.ui_config_json = read_file(value);
+            params.webui_config_json = params.ui_config_json;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG_FILE"));
+
+    add_opt(common_arg(
+        {"--ui-config-file"}, "PATH",
        "JSON file that provides default UI settings (overrides UI defaults)",
        [](common_params & params, const std::string & value) {
            params.ui_config_json = read_file(value);
+            params.webui_config_json = params.ui_config_json;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_UI_CONFIG_FILE"));
+
+    // Deprecated: use --ui-mcp-proxy instead (kept for backward compat)
    add_opt(common_arg(
-        {"--ui-mcp-proxy", "--webui-mcp-proxy"},
-        {"--no-ui-mcp-proxy", "--no-webui-mcp-proxy"},
+        {"--webui-mcp-proxy"},
+        {"--no-webui-mcp-proxy"},
+        "[DEPRECATED: use --ui-mcp-proxy/--no-ui-mcp-proxy] experimental: whether to enable MCP CORS proxy",
+        [](common_params & params, bool value) {
+            params.ui_mcp_proxy = value;
+            params.webui_mcp_proxy = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_MCP_PROXY"));
+
+    add_opt(common_arg(
+        {"--ui-mcp-proxy"},
+        {"--no-ui-mcp-proxy"},
        "experimental: whether to enable MCP CORS proxy - do not enable in untrusted environments (default: disabled)",
        [](common_params & params, bool value) {
            params.ui_mcp_proxy = value;
+            params.webui_mcp_proxy = value;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_UI_MCP_PROXY"));
    add_opt(common_arg(
@@ -3032,26 +2941,24 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.server_tools = parse_csv_row(value);
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TOOLS"));
+    // Deprecated: use --ui/--no-ui instead (kept for backward compat)
    add_opt(common_arg(
-        {"-ag", "--agent"},
-        {"-no-ag", "--no-agent"},
-        "whether to enable CORS proxy and all built-in tools - do not enable in untrusted environments (default: disabled)",
+        {"--webui"},
+        {"--no-webui"},
+        "[DEPRECATED: use --ui/--no-ui] whether to enable the Web UI",
        [](common_params & params, bool value) {
-            if (value) {
-                params.server_tools = {"all"};
-                params.ui_mcp_proxy = true;
-            } else {
-                params.server_tools.clear();
-                params.ui_mcp_proxy = false;
-            }
+            params.ui = value;
+            params.webui = value;
        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_AGENT"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI"));
+
    add_opt(common_arg(
-        {"--ui", "--webui"},
-        {"--no-ui", "--no-webui"},
+        {"--ui"},
+        {"--no-ui"},
        string_format("whether to enable the Web UI (default: %s)", params.ui ? "enabled" : "disabled"),
        [](common_params & params, bool value) {
            params.ui = value;
+            params.webui = value;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_UI"));
    add_opt(common_arg(
@@ -3082,7 +2989,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_API_KEY"));
    add_opt(common_arg(
        {"--api-key-file"}, "FNAME",
-        "path to file containing API keys, one per line; lines starting with a hash are treated as comments (default: none)",
+        "path to file containing API keys (default: none)",
        [](common_params & params, const std::string & value) {
            std::ifstream key_file(value);
            if (!key_file) {
@@ -3090,7 +2997,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            }
            std::string key;
            while (std::getline(key_file, key)) {
-                if (!key.empty() && key[0] != '#') {
+                if (!key.empty()) {
                    params.api_keys.push_back(key);
                }
            }
@@ -3748,7 +3655,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        "draft model for speculative decoding (default: unused)",
        [](common_params & params, const std::string & value) {
            params.speculative.draft.mparams.path = value;
-            params.speculative.draft.mparams.hf_file = value; // will be used if --spec-draft-hf is set
        }
    ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_DRAFT_MODEL"));
    add_opt(common_arg(
@@ -1,14 +1,12 @@
 #pragma once

 #include "common.h"
-#include "download.h"

 #include <set>
 #include <map>
 #include <string>
 #include <vector>
 #include <cstring>
-#include <memory>

 // pseudo-env variable to identify preset-only arguments
 #define COMMON_ARG_PRESET_LOAD_ON_STARTUP "__PRESET_LOAD_ON_STARTUP"
@@ -131,21 +129,11 @@ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<com
 // see: https://github.com/ggml-org/llama.cpp/issues/18163
 void common_params_add_preset_options(std::vector<common_arg> & args);

-struct common_models_handler {
-    common_download_hf_plan plan;
-    common_download_hf_plan plan_spec;
-    common_download_hf_plan plan_voc;
-    common_download_opts opts;
-};
-
-// initialize downloading opts and hf_plan if needed, but does not download anything yet
-common_models_handler common_models_handler_init(const common_params & params, llama_example curr_ex);
-
-// check if the model is a preset repo (i.e. has a preset file)
-bool common_models_handler_is_preset_repo(const common_models_handler & handler);
-
-// download and update params with the downloaded model path
-void common_models_handler_apply(common_models_handler & handler, common_params & params, common_download_callback * callback = nullptr);
+// populate model paths (main model, mmproj, etc.) from -hf if necessary
+// returns true if the model is ready to use
+// throws an exception on any error that prevents the model from being used (e.g. network error, model not found)
+// if params.skip_download is true, no download is attempted; returns false when the model is missing or invalid (e.g. ETag check failed)
+bool common_params_handle_models(common_params & params, llama_example curr_ex);

 // initialize argument parser context - used by test-arg-parser and preset
 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
@@ -395,11 +395,10 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte
                                           arguments.name_suffix) +
                           arguments.value_prefix +
                           (schema_info.resolves_to_string(param_schema) ?
-                                p.ac(p.tool_arg_string_value(until_suffix) +
-                                    p.tool_arg_close(p.literal(arguments.value_suffix)), arguments.value_suffix) :
-                                (p.tool_arg_json_value(p.schema(
-                                    p.json(), "tool-" + name + "-arg-" + param_name + "-schema", param_schema, false)) +
-                                    p.tool_arg_close(p.literal(arguments.value_suffix)))));
+                                p.tool_arg_string_value(until_suffix) :
+                                p.tool_arg_json_value(p.schema(
+                                    p.json(), "tool-" + name + "-arg-" + param_name + "-schema", param_schema, false))) +
+                           p.tool_arg_close(p.literal(arguments.value_suffix)));

            auto named_arg = p.rule("tool-" + name + "-arg-" + param_name, arg);
            if (is_required) {
@@ -90,93 +90,41 @@ std::string common_chat_msg::render_content(const std::string & delimiter) const
    return text;
 }

-common_chat_role common_chat_role_from_string(const std::string & role) {
-    if (role == "system")    { return COMMON_CHAT_ROLE_SYSTEM;    }
-    if (role == "assistant") { return COMMON_CHAT_ROLE_ASSISTANT; }
-    if (role == "user")      { return COMMON_CHAT_ROLE_USER;      }
-    if (role == "tool")      { return COMMON_CHAT_ROLE_TOOL;      }
-    return COMMON_CHAT_ROLE_UNKNOWN;
-}
-
-const char * common_chat_role_to_string(common_chat_role role) {
-    switch (role) {
-        case COMMON_CHAT_ROLE_SYSTEM:    return "system";
-        case COMMON_CHAT_ROLE_ASSISTANT: return "assistant";
-        case COMMON_CHAT_ROLE_USER:      return "user";
-        case COMMON_CHAT_ROLE_TOOL:      return "tool";
-        case COMMON_CHAT_ROLE_UNKNOWN:   return "";
-    }
-    return "";
-}
-
-json common_chat_msg_delimiters::to_json() const {
-    json result = json::array();
-    for (const auto & d : delimiters) {
-        result.push_back({
-            { "role",      common_chat_role_to_string(d.role) },
-            { "delimiter", d.delimiter                        },
-        });
-    }
-    return result;
-}
-
-common_chat_msg_delimiters common_chat_msg_delimiters_parse(const json & delimiters) {
-    common_chat_msg_delimiters result;
-
-    if (!delimiters.is_array()) {
-        return result;
+std::vector<common_chat_msg_span> common_chat_split_by_role(const std::string & prompt, const std::vector<common_chat_msg_delimiter> & delims) { // split a rendered prompt into per-role spans using the given delimiters
+    if (delims.empty() || prompt.empty()) { // no delimiters or empty prompt: nothing to split
+        return {};
    }

-    result.delimiters.reserve(delimiters.size());
-    for (const auto & d : delimiters) {
-        if (!d.is_object()) {
-            continue;
+    auto parser = build_peg_parser([&](common_peg_parser_builder & p) { // the grammar is assembled inside this callback
+        std::vector<std::string>       all_delims;
+        std::vector<common_peg_parser> tagged_messages;
+
+        all_delims.reserve(delims.size());
+        tagged_messages.reserve(delims.size());
+        for (const auto & d : delims) {
+            all_delims.push_back(d.delimiter); // collect every delimiter string for until_one_of below
        }
-        result.delimiters.push_back({
-            common_chat_role_from_string(d.value("role", std::string())),
-            d.value("delimiter", std::string()),
-        });
-    }

-    return result;
-}
-
-void common_chat_msg_delimiters::tokenize(const llama_vocab * vocab) {
-    for (auto & d : delimiters) {
-        d.tokens = common_tokenize(vocab, d.delimiter, false, true);
-    }
-}
-
-common_chat_msg_spans common_chat_msg_delimiters::split(const llama_tokens & tokens, const std::map<size_t, size_t> & skips) const {
-    std::vector<std::pair<common_chat_role, size_t>> matches;
-
-    auto skip = skips.begin();
-    for (size_t i = 0; i < tokens.size();) {
-        if (skip != skips.end() && i == skip->first) {
-            i += skip->second;
-            ++skip;
-            continue;
+        auto any_delim = p.until_one_of(all_delims); // NOTE(review): presumably consumes text up to the next occurrence of any delimiter - confirm
+        for (const auto & d : delims) {
+            tagged_messages.push_back(p.tag(d.role, p.literal(d.delimiter) + any_delim)); // one alternative per delimiter, tagged with its role
        }
-        for (const auto & d : delimiters) {
-            if (i + d.tokens.size() > tokens.size()) {
-                continue;
-            }
-            if (std::equal(d.tokens.begin(), d.tokens.end(), tokens.begin() + i)) {
-                matches.emplace_back(d.role, i);
-                break;
-            }
+
+        return any_delim + p.zero_or_more(p.choice(tagged_messages)) + p.end(); // leading text, then any number of tagged messages, then end of input
+    });
+
+    common_peg_parse_context ctx(prompt);
+    const auto result = parser.parse(ctx);
+    if (!result.success()) { // parse failed: report no spans instead of partial ones
+        return {};
+    }
+
+    std::vector<common_chat_msg_span> spans;
+    ctx.ast.visit(result, [&](const common_peg_ast_node & node) {
+        if (!node.tag.empty()) { // only tagged nodes carry a role
+            spans.push_back({ node.tag, node.start, node.end - node.start }); // role, pos, len; assumes node.start/node.end are offsets into prompt - TODO confirm
        }
-        i++;
-    }
-
-    matches.emplace_back(COMMON_CHAT_ROLE_UNKNOWN, tokens.size());
-
-    common_chat_msg_spans spans;
-    for (size_t i = 0; i + 1 < matches.size(); i++) {
-        const auto & curr = matches[i];
-        const auto & next = matches[i + 1];
-        spans.add(curr.first, curr.second, next.second - curr.second);
-    }
+    });

    return spans;
 }
@@ -1133,13 +1081,13 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp

    data.prompt            = prompt;
    data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs, /* messages_override= */ adjusted_messages);
-    data.message_delimiters = {
-        { COMMON_CHAT_ROLE_ASSISTANT, "<|start|>assistant" },
-        { COMMON_CHAT_ROLE_USER,      "<|start|>user"      },
-        { COMMON_CHAT_ROLE_SYSTEM,    "<|start|>developer" },
-        { COMMON_CHAT_ROLE_SYSTEM,    "<|start|>system"    },
-        { COMMON_CHAT_ROLE_TOOL,      "<|start|>functions" },
-    };
+    data.message_spans = common_chat_split_by_role(prompt, {
+        { "assistant", "<|start|>assistant" },
+        { "user",      "<|start|>user"      },
+        { "system",    "<|start|>developer" },
+        { "system",    "<|start|>system"    },
+        { "tool",      "<|start|>functions" },
+    });

    data.format            = COMMON_CHAT_FORMAT_PEG_NATIVE;
    data.supports_thinking = true;
@@ -1280,10 +1228,10 @@ static common_chat_params common_chat_params_init_gemma4(const common_chat_templ
        data.prompt += data.generation_prompt;
    }

-    data.message_delimiters = {
-        { COMMON_CHAT_ROLE_USER,      "<|turn>user"  },
-        { COMMON_CHAT_ROLE_ASSISTANT, "<|turn>model" },
-    };
+    data.message_spans = common_chat_split_by_role(data.prompt, {
+        { "user",      "<|turn>user\n"  },
+        { "assistant", "<|turn>model\n" },
+    });

    data.format            = COMMON_CHAT_FORMAT_PEG_GEMMA4;
    data.supports_thinking  = true;
@@ -2082,15 +2030,15 @@ static common_chat_params common_chat_params_init_cohere2moe(const common_chat_t
        RESULT_START, RESULT_END,
    };

-    // Declare per-role message delimiters. Tool results are rendered with the
+    // Split the rendered prompt into per-role message spans. Tool results are rendered with the
    // system token followed by <|START_TOOL_RESULT|>, so the "tool" delimiter must be listed before
    // the plain "system" one (it is a strict superset, and the role split tries delimiters in order).
-    data.message_delimiters = {
-        { COMMON_CHAT_ROLE_ASSISTANT, GEN_PREFIX },
-        { COMMON_CHAT_ROLE_USER,      TURN_START + USER },
-        { COMMON_CHAT_ROLE_TOOL,      TURN_START + SYSTEM + RESULT_START },
-        { COMMON_CHAT_ROLE_SYSTEM,    TURN_START + SYSTEM },
-    };
+    data.message_spans = common_chat_split_by_role(data.prompt, {
+        { "assistant", GEN_PREFIX },
+        { "user",      TURN_START + USER },
+        { "tool",      TURN_START + SYSTEM + RESULT_START },
+        { "system",    TURN_START + SYSTEM },
+    });

    auto has_tools         = inputs.tools.is_array() && !inputs.tools.empty();
    auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
@@ -2578,15 +2526,17 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
        autoparser.analyze_template(tmpl);
        auto auto_params = autoparser::peg_generator::generate_parser(tmpl, params, autoparser);

-        common_chat_msg_delimiters delimiters;
+        std::vector<common_chat_msg_delimiter> delimiters;
        if (!autoparser.assistant_start.empty()) {
-            delimiters.add(COMMON_CHAT_ROLE_ASSISTANT, autoparser.assistant_start);
+            delimiters.push_back({ "assistant", autoparser.assistant_start });
        }
        if (!autoparser.user_start.empty()) {
-            delimiters.add(COMMON_CHAT_ROLE_USER, autoparser.user_start);
+            delimiters.push_back({ "user", autoparser.user_start });
        }

-        auto_params.message_delimiters = std::move(delimiters);
+        if (!delimiters.empty()) {
+            auto_params.message_spans = common_chat_split_by_role(auto_params.prompt, delimiters);
+        }

        auto_params.supports_thinking = autoparser.reasoning.mode != autoparser::reasoning_mode::NONE;
        if (auto_params.supports_thinking) {
@@ -2758,9 +2708,5 @@ common_chat_msg common_chat_peg_parse(const common_peg_arena &          src_pars
 std::map<std::string, bool> common_chat_templates_get_caps(const common_chat_templates * chat_templates) {
    GGML_ASSERT(chat_templates != nullptr);
    GGML_ASSERT(chat_templates->template_default != nullptr);
-    if (chat_templates->template_tool_use != nullptr) {
-        // take the more expressive template when available
-        return chat_templates->template_tool_use->caps.to_map();
-    }
    return chat_templates->template_default->caps.to_map();
 }
@@ -143,75 +143,15 @@ struct common_chat_msg_diff {
    }
 };

-enum common_chat_role {
-    COMMON_CHAT_ROLE_UNKNOWN,
-    COMMON_CHAT_ROLE_SYSTEM,
-    COMMON_CHAT_ROLE_ASSISTANT,
-    COMMON_CHAT_ROLE_USER,
-    COMMON_CHAT_ROLE_TOOL
-};
-
-common_chat_role common_chat_role_from_string(const std::string & role);
-const char *     common_chat_role_to_string(common_chat_role role);
-
 struct common_chat_msg_span {
-    common_chat_role role = COMMON_CHAT_ROLE_UNKNOWN;
+    std::string role; // role name as a string, e.g. "assistant", "user", "system", "tool"
    std::size_t pos = 0;
    std::size_t len = 0;
-
-    bool valid() const {
-        return role != COMMON_CHAT_ROLE_UNKNOWN;
-    }
-};
-
-struct common_chat_msg_spans {
-    std::vector<common_chat_msg_span> spans;
-
-    void add(common_chat_role role, size_t pos, size_t len) {
-        spans.push_back({ role, pos, len });
-    }
-
-    bool is_user_start(int32_t pos) const {
-        for (auto it = spans.begin(); it != spans.end(); ++it) {
-            if (it->role == COMMON_CHAT_ROLE_USER && pos == (int32_t) it->pos) {
-                return true;
-            }
-        }
-        return false;
-    }
-
-    int32_t last_user_message_pos() const {
-        for (auto it = spans.rbegin(); it != spans.rend(); ++it) {
-            if (it->role == COMMON_CHAT_ROLE_USER) {
-                return (int32_t) it->pos;
-            }
-        }
-        return -1;
-    }
 };

 struct common_chat_msg_delimiter {
-    common_chat_role role = COMMON_CHAT_ROLE_UNKNOWN;
-    std::string      delimiter;
-    llama_tokens     tokens = {};
-};
-
-struct common_chat_msg_delimiters {
-    std::vector<common_chat_msg_delimiter> delimiters;
-
-    common_chat_msg_delimiters() = default;
-    common_chat_msg_delimiters(std::initializer_list<common_chat_msg_delimiter> delims) : delimiters(delims) {}
-
-    void add(common_chat_role role, const std::string & delimiter) {
-        delimiters.push_back({ role, delimiter });
-    }
-
-    void tokenize(const llama_vocab * vocab);
-
-    // split tokens into message spans. skips maps a start index to a length of a region to jump over without matching
-    common_chat_msg_spans split(const llama_tokens & tokens, const std::map<size_t, size_t> & skips = {}) const;
-
-    nlohmann::ordered_json to_json() const;
+    std::string role;      // role given to spans that begin with this delimiter
+    std::string delimiter; // literal text that marks the start of a message for this role
 };

 struct common_chat_tool {
@@ -279,7 +219,7 @@ struct common_chat_params {
    std::vector<std::string>            preserved_tokens;
    std::vector<std::string>            additional_stops;
    std::string                         parser;
-    common_chat_msg_delimiters          message_delimiters;
+    std::vector<common_chat_msg_span>   message_spans;
 };

 // per-message parsing syntax
@@ -385,4 +325,5 @@ struct common_chat_prompt_preset {

 common_chat_prompt_preset common_chat_get_asr_prompt(const common_chat_templates * chat_templates);

-common_chat_msg_delimiters common_chat_msg_delimiters_parse(const nlohmann::ordered_json & delimiters);
+std::vector<common_chat_msg_span> common_chat_split_by_role(const std::string & prompt, const std::vector<common_chat_msg_delimiter> & delims);
+
@@ -1074,18 +1074,6 @@ std::vector<common_file_info> fs_list(const std::string & path, bool include_dir
    return files;
 }

-std::ifstream fs_open_ifstream(const std::string & fname, std::ios_base::openmode mode) {
-#ifdef _WIN32
-    int wlen = MultiByteToWideChar(CP_UTF8, 0, fname.c_str(), -1, NULL, 0);
-    if (!wlen) { return std::ifstream(); }
-    std::vector<wchar_t> wfname(wlen);
-    (void)MultiByteToWideChar(CP_UTF8, 0, fname.c_str(), -1, wfname.data(), wlen);
-    return std::ifstream(wfname.data(), mode);
-#else
-    return std::ifstream(fname, mode);
-#endif
-}
-
 //
 // TTY utils
 //
@@ -2046,7 +2034,7 @@ bool common_prompt_batch_decode(
 }

 size_t common_prompt_checkpoint::size() const {
-    return data_tgt.size() + data_dft.size() + data_spec.size();
+    return data_tgt.size() + data_dft.size();
 }

 bool common_prompt_checkpoint::empty() const {
@@ -2061,7 +2049,6 @@ void common_prompt_checkpoint::clear() {

    data_tgt.clear();
    data_dft.clear();
-    data_spec.clear();
 }

 void common_prompt_checkpoint::update_pos(
@@ -2151,5 +2138,4 @@ void common_prompt_checkpoint::clear_tgt() {

 void common_prompt_checkpoint::clear_dft() {
    data_dft.clear();
-    data_spec.clear();
 }
@@ -96,7 +96,6 @@ enum llama_example {
    LLAMA_EXAMPLE_FIT_PARAMS,
    LLAMA_EXAMPLE_RESULTS,
    LLAMA_EXAMPLE_EXPORT_GRAPH_OPS,
-    LLAMA_EXAMPLE_DOWNLOAD,

    LLAMA_EXAMPLE_COUNT,
 };
@@ -291,25 +290,12 @@ struct common_params_sampling {
 };

 struct common_params_model {
-    std::string path        = ""; // model local path
-    std::string url         = ""; // model url to download
-    std::string hf_repo     = ""; // HF repo
-    std::string hf_file     = ""; // HF file
-    std::string docker_repo = ""; // Docker repo
-
-    std::string get_name() const {
-        if (!hf_repo.empty()) {
-            return hf_repo;
-        }
-        if (!docker_repo.empty()) {
-            return docker_repo;
-        }
-        return path;
-    }
-
-    bool empty() const {
-        return get_name().empty();
-    }
+    std::string path        = ""; // model local path                                       // NOLINT
+    std::string url         = ""; // model url to download                                  // NOLINT
+    std::string hf_repo     = ""; // HF repo                                                // NOLINT
+    std::string hf_file     = ""; // HF file                                                // NOLINT
+    std::string docker_repo = ""; // Docker repo                                            // NOLINT
+    std::string name        = ""; // in format <user>/<model>[:<tag>] (tag is optional)     // NOLINT
 };

 // draft-model-based speculative decoding parameters
@@ -372,12 +358,12 @@ struct common_params_speculative {
    common_params_speculative_ngram_cache ngram_cache;

    bool has_dft() const {
-        return !draft.mparams.empty();
+        return !draft.mparams.path.empty() || !draft.mparams.hf_repo.empty();
    }

    uint32_t need_n_rs_seq() const {
        bool needs_rs_seq = std::any_of(types.begin(), types.end(), [&](auto t) {
-            return t == COMMON_SPECULATIVE_TYPE_DRAFT_MTP || t == COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3;
+            return t == COMMON_SPECULATIVE_TYPE_DRAFT_MTP;
        });

        return needs_rs_seq ? draft.n_max : 0u;
@@ -524,6 +510,7 @@ struct common_params {
    int32_t control_vector_layer_start = -1; // layer range for control vector
    int32_t control_vector_layer_end   = -1; // layer range for control vector
    bool    offline                    = false;
+    bool    skip_download              = false; // skip model file downloading

    int32_t ppl_stride      = 0;     // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
    int32_t ppl_output_type = 0;     // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
@@ -613,7 +600,7 @@ struct common_params {
    bool    cache_prompt        = true;  // whether to enable prompt caching
    bool    cache_idle_slots    = true;  // save and clear idle slots upon starting a new task
    int32_t n_ctx_checkpoints   = 32;    // max number of context checkpoints per slot
-    int32_t checkpoint_min_step = 8192;  // minimum spacing between context checkpoints
+    int32_t checkpoint_min_step = 256;   // minimum spacing between context checkpoints
    int32_t cache_ram_mib       = 8192;  // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.

    std::string hostname      = "127.0.0.1";
@@ -637,6 +624,12 @@ struct common_params {

    // UI configs
    bool ui = true;
+
+    // Deprecated: use ui, ui_mcp_proxy, ui_config_json instead
+    bool webui = ui;
+    bool webui_mcp_proxy = false;
+    std::string webui_config_json;
+
    bool ui_mcp_proxy = false;
    std::string ui_config_json;

@@ -649,11 +642,10 @@ struct common_params {
    std::vector<std::string> server_tools;

    // router server configs
-    std::string models_dir    = "";     // directory containing models for the router server
-    std::string models_preset = "";     // directory containing model presets for the router server
-    int models_max = 4;                 // maximum number of models to load simultaneously
-    bool models_autoload = true;        // automatically load models when requested via the router server
-    std::string models_preset_hf = "";  // show a warning about remote presets on router loaded (if not empty)
+    std::string models_dir    = ""; // directory containing models for the router server
+    std::string models_preset = ""; // directory containing model presets for the router server
+    int models_max = 4;             // maximum number of models to load simultaneously
+    bool models_autoload = true;    // automatically load models when requested via the router server

    bool log_json = false;

@@ -855,9 +847,6 @@ struct common_file_info {
 };
 std::vector<common_file_info> fs_list(const std::string & path, bool include_directories);

-// fs open, also handle UTF8 on Windows
-std::ifstream fs_open_ifstream(const std::string & fname, std::ios_base::openmode mode);
-
 //
 // TTY utils
 //
@@ -1075,10 +1064,6 @@ struct common_prompt_checkpoint {
    std::vector<uint8_t> data_tgt;
    std::vector<uint8_t> data_dft;

-    // (optional) speculative-decoding implementation state stashed with the checkpoint
-    // (e.g. eagle3's deferred-boundary g_embd row)
-    std::vector<uint8_t> data_spec;
-
    size_t size() const;

    bool empty() const;
@@ -292,6 +292,10 @@ static int common_download_file_single_online(const std::string & url,

    const bool file_exists = std::filesystem::exists(path);

+    if (!file_exists && opts.skip_download) {
+        return -2; // file is missing and download is disabled
+    }
+
    if (file_exists && skip_etag) {
        LOG_DBG("%s: using cached file: %s\n", __func__, path.c_str());
        return 304; // 304 Not Modified - fake cached response
@@ -358,6 +362,9 @@ static int common_download_file_single_online(const std::string & url,
            return 304; // 304 Not Modified - fake cached response
        }
        // pass this point, the file exists but is different from the server version, so we need to redownload it
+        if (opts.skip_download) {
+            return -2; // special code to indicate that the download was skipped due to etag mismatch
+        }
        if (remove(path.c_str()) != 0) {
            LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
            return -1;
@@ -684,8 +691,18 @@ static void list_available_gguf_files(const hf_cache::hf_files & files) {
    }
 }

-common_download_hf_plan common_download_get_hf_plan(const common_params_model & model, const common_download_opts & opts) {
-    common_download_hf_plan plan;
+struct hf_plan { // files selected from a HF repo by get_hf_plan
+    hf_cache::hf_file primary;      // the selected primary model file
+    hf_cache::hf_files model_files; // result of get_split_files(all, primary)
+    hf_cache::hf_file mmproj;       // result of find_best_mmproj; only filled when download_mmproj is set
+    hf_cache::hf_file mtp;          // result of find_best_mtp; only filled when download_mtp is set
+};
+
+static hf_plan get_hf_plan(const common_params_model  & model,
+                           const common_download_opts & opts,
+                           bool download_mmproj,
+                           bool download_mtp) {
+    hf_plan plan;
    hf_cache::hf_files all;

    auto [repo, tag] = common_download_split_repo_tag(model.hf_repo);
@@ -700,14 +717,6 @@ common_download_hf_plan common_download_get_hf_plan(const common_params_model &
        return plan;
    }

-    // if preset.ini exists in the repo root, download only that file
-    for (const auto & f : all) {
-        if (f.path == "preset.ini") {
-            plan.preset = f;
-            return plan;
-        }
-    }
-
    hf_cache::hf_file primary;

    if (!model.hf_file.empty()) {
@@ -734,49 +743,115 @@ common_download_hf_plan common_download_get_hf_plan(const common_params_model &
    plan.primary = primary;
    plan.model_files = get_split_files(all, primary);

-    if (opts.download_mmproj) {
+    if (download_mmproj) {
        plan.mmproj = find_best_mmproj(all, primary.path);
    }
-    if (opts.download_mtp) {
+
+    if (download_mtp) {
        plan.mtp = find_best_mtp(all, primary.path);
    }

    return plan;
 }

-void common_download_run_tasks(const std::vector<common_download_task> & tasks) {
+struct download_task { // one file to fetch
+    std::string url;  // remote URL passed to common_download_file_single
+    std::string path; // local destination path passed to common_download_file_single
+};
+
+static std::vector<download_task> get_url_tasks(const common_params_model & model) { // builds the download list for model.url, one task per GGUF part
+    auto split = get_gguf_split_info(model.url);
+
+    if (split.count <= 1) { // not a multi-part file: download the URL as-is to model.path
+        return {{model.url, model.path}};
+    }
+
+    auto filename = split.prefix; // keep only the part of the prefix after the last '/'
+    if (auto pos = split.prefix.rfind('/'); pos != std::string::npos) {
+        filename = split.prefix.substr(pos + 1);
+    }
+
+    auto parent_path = std::filesystem::path(model.path).parent_path(); // parts are stored next to model.path
+    auto prefix_path = (parent_path / filename).string();
+
+    std::vector<download_task> tasks;
+    for (int i = 1; i <= split.count; i++) { // part indices are 1-based
+        auto suffix = string_format("-%05d-of-%05d.gguf", i, split.count);
+        tasks.push_back({split.prefix + suffix, prefix_path + suffix}); // same suffix on the remote URL and the local path
+    }
+    return tasks;
+}
+
+common_download_model_result common_download_model(const common_params_model  & model,
+                                                   const common_download_opts & opts) { // resolves and downloads the model files (HF repo, URL, or local path)
+    common_download_model_result result;
+    std::vector<download_task> tasks;
+    hf_plan hf;
+
+    bool download_mmproj = opts.download_mmproj;
+    bool download_mtp = opts.download_mtp;
+    bool is_hf = !model.hf_repo.empty(); // a HF repo takes precedence over model.url
+
+    if (is_hf) {
+        hf = get_hf_plan(model, opts, download_mmproj, download_mtp);
+        for (const auto & f : hf.model_files) {
+            tasks.push_back({f.url, f.local_path});
+        }
+        if (!hf.mmproj.path.empty()) {
+            tasks.push_back({hf.mmproj.url, hf.mmproj.local_path});
+        }
+        if (!hf.mtp.path.empty()) {
+            tasks.push_back({hf.mtp.url, hf.mtp.local_path});
+        }
+    } else if (!model.url.empty()) { // plain URL, expanded into all parts if it is a split GGUF
+        tasks = get_url_tasks(model);
+    } else { // nothing to download: report the local path as-is
+        result.model_path = model.path;
+        return result;
+    }
+
+    if (tasks.empty()) { // nothing was selected for download: return an empty result
+        return result;
+    }
+
    std::vector<std::future<int>> futures;
    for (const auto & task : tasks) {
        futures.push_back(std::async(std::launch::async,
-            [&task]() {
-                return common_download_file_single(task.url, task.local_path, task.opts, task.is_hf);
+            [&task, &opts, is_hf]() { // tasks is declared before futures, so it outlives every pending download
+                return common_download_file_single(task.url, task.path, opts, is_hf);
            }
        ));
    }

-    for (size_t i = 0; i < futures.size(); ++i) {
-        std::string url = tasks[i].url;
-        int status = futures[i].get();
+    for (auto & f : futures) {
+        int status = f.get();
+        if (status == -2 && opts.skip_download) { // skip_download mode: a file is missing or outdated
+            throw common_skip_download_exception();
+        }
        bool is_ok = is_http_status_ok(status);
        if (!is_ok) {
-            throw std::runtime_error(string_format("Download '%s' failed with status code: %d", url.c_str(), status));
+            return {}; // any failed download yields an empty result
        }
    }
-}

-std::vector<std::string> common_download_get_all_parts(const std::string & url) {
-    auto split = get_gguf_split_info(url);
+    if (is_hf) { // all downloads succeeded: finalize the HF cache entries
+        for (const auto & f : hf.model_files) {
+            hf_cache::finalize_file(f);
+        }
+        result.model_path = hf.primary.final_path;

-    if (split.count <= 1) {
-        return {url};
+        if (!hf.mmproj.path.empty()) {
+            result.mmproj_path = hf_cache::finalize_file(hf.mmproj);
+        }
+
+        if (!hf.mtp.path.empty()) {
+            result.mtp_path = hf_cache::finalize_file(hf.mtp);
+        }
+    } else { // URL download: report model.path
+        result.model_path = model.path;
    }

-    std::vector<std::string> parts;
-    for (int i = 1; i <= split.count; i++) {
-        auto suffix = string_format("-%05d-of-%05d.gguf", i, split.count);
-        parts.push_back(split.prefix + suffix);
-    }
-    return parts;
+    return result;
 }

 //
@@ -922,87 +997,3 @@ std::vector<common_cached_model_info> common_list_cached_models() {

    return result;
 }
-
-bool common_download_remove(const std::string & hf_repo_with_tag) {
-    namespace fs = std::filesystem;
-
-    auto [repo_id, tag] = common_download_split_repo_tag(hf_repo_with_tag);
-
-    if (tag.empty()) {
-        return hf_cache::remove_cached_repo(repo_id);
-    }
-
-    std::string tag_upper = tag;
-    for (char & c : tag_upper) {
-        c = (char) std::toupper((unsigned char) c);
-    }
-
-    auto files = hf_cache::get_cached_files(repo_id);
-    if (files.empty()) {
-        return false;
-    }
-
-    // collect snapshot entries whose tag matches
-    std::vector<fs::path> to_remove;
-    for (const auto & f : files) {
-        auto split = get_gguf_split_info(f.path);
-        if (split.tag == tag_upper) {
-            to_remove.emplace_back(f.local_path);
-        }
-    }
-
-    if (to_remove.empty()) {
-        return false;
-    }
-
-    // resolve blob paths from symlinks before deleting snapshot entries
-    std::vector<fs::path> blobs_to_check;
-    for (const auto & p : to_remove) {
-        std::error_code ec;
-        if (fs::is_symlink(p, ec)) {
-            auto target = fs::read_symlink(p, ec);
-            if (!ec) {
-                blobs_to_check.push_back((p.parent_path() / target).lexically_normal());
-            }
-        }
-    }
-
-    // remove snapshot entries
-    for (const auto & p : to_remove) {
-        std::error_code ec;
-        fs::remove(p, ec);
-        if (ec) {
-            LOG_WRN("%s: failed to remove %s: %s\n", __func__, p.string().c_str(), ec.message().c_str());
-        }
-    }
-
-    if (blobs_to_check.empty()) {
-        return true;
-    }
-
-    // collect blobs still referenced by remaining snapshot entries
-    std::unordered_set<std::string> still_referenced;
-    for (const auto & f : hf_cache::get_cached_files(repo_id)) {
-        fs::path p(f.local_path);
-        std::error_code ec;
-        if (fs::is_symlink(p, ec)) {
-            auto target = fs::read_symlink(p, ec);
-            if (!ec) {
-                still_referenced.insert((p.parent_path() / target).lexically_normal().string());
-            }
-        }
-    }
-
-    // remove orphaned blobs
-    for (const auto & blob : blobs_to_check) {
-        if (still_referenced.find(blob.string()) == still_referenced.end()) {
-            std::error_code ec;
-            fs::remove(blob, ec);
-            if (ec) {
-                LOG_WRN("%s: failed to remove blob %s: %s\n", __func__, blob.string().c_str(), ec.message().c_str());
-            }
-        }
-    }
-
-    return true;
-}
@@ -1,10 +1,7 @@
 #pragma once

-#include "hf-cache.h"
-
 #include <string>
 #include <vector>
-#include <functional>

 struct common_params_model;

@@ -50,40 +47,65 @@ struct common_cached_model_info {
    }
 };

-// Options for common_download_file_single
+// Options for common_download_model and common_download_file_single
 struct common_download_opts {
    std::string bearer_token;
    common_header_list headers;
    bool offline = false;
+    bool skip_download = false; // if true, nothing is (re)downloaded: a missing file or an ETag mismatch yields status -2, which common_download_model reports by throwing common_skip_download_exception
    bool download_mmproj = false;
    bool download_mtp = false;
    common_download_callback * callback = nullptr;
 };

-struct common_download_task {
-    common_download_opts opts;
-    std::string url;
-    std::string local_path;
-    std::function<void()> on_done;
-    bool is_hf = false;
-
-    common_download_task() = default;
-    common_download_task(hf_cache::hf_file f,
-            const common_download_opts & opts,
-            std::function<void()> on_done = nullptr)
-        : opts(opts), url(f.url), local_path(f.local_path), on_done(on_done), is_hf(true) {}
+// Result of common_download_model
+struct common_download_model_result {
+    std::string model_path;  // path of the primary model file; empty on failure
+    std::string mmproj_path; // finalized path of the mmproj file; empty if none was selected
+    std::string mtp_path;    // finalized path of the MTP file; empty if none was selected
 };

-void common_download_run_tasks(const std::vector<common_download_task> & tasks);
+// thrown by common_download_model when opts.skip_download=true and a file is missing or outdated (ETag mismatch); NOTE(review): std::runtime_error is declared in <stdexcept> - confirm this header gets it
+struct common_skip_download_exception : public std::runtime_error {
+    common_skip_download_exception() : std::runtime_error("skip download") {}
+};

-// if url is a multi-part GGUF file, returns all parts, otherwise returns the single file
-std::vector<std::string> common_download_get_all_parts(const std::string & url);
+// Download model from HuggingFace repo or URL
+//
+// input (via model struct):
+// - model.hf_repo: HF repo with optional tag, see common_download_split_repo_tag
+// - model.hf_file: specific file in the repo (requires hf_repo)
+// - model.url: simple download (used if hf_repo is empty)
+// - model.path: local file path
+//
+// tag matching (for HF repos without model.hf_file):
+// - if tag is specified, searches for GGUF matching that quantization
+// - if no tag, searches for Q4_K_M, then Q4_0, then first available GGUF
+//
+// split GGUF: multi-part files like "model-00001-of-00003.gguf" are automatically
+// detected and all parts are downloaded
+//
+// caching:
+// - HF repos: uses HuggingFace cache
+// - URLs: uses ETag-based caching
+//
+// when opts.offline=true, no network requests are made
+// when opts.download_mmproj=true, searches for mmproj in same directory as model or any parent directory
+// then with the closest quantization bits
+// when opts.download_mtp=true, applies the same sibling search for an MTP-head GGUF
+//
+// returns result with model_path, mmproj_path and mtp_path (empty when not found / on failure)
+common_download_model_result common_download_model(
+    const common_params_model & model,
+    const common_download_opts & opts = {}
+);

 // returns list of cached models
 std::vector<common_cached_model_info> common_list_cached_models();

 // download single file from url to local path
 // returns status code or -1 on error
+// returns -2 if skip_download=true and the file is missing or outdated (ETag mismatch), i.e. a download would have been required
 // skip_etag: if true, don't read/write .etag files (for HF cache where filename is the hash)
 int common_download_file_single(const std::string & url,
                                const std::string & path,
@@ -93,19 +115,3 @@ int common_download_file_single(const std::string & url,
 // resolve and download model from Docker registry
 // return local path to downloaded model file
 std::string common_docker_resolve_model(const std::string & docker);
-
-// Remove a cached model from disk
-// input format: "user/model" or "user/model:tag"
-// - if tag is omitted, removes the entire repo cache directory
-// - if tag is present, removes only files matching that tag (and orphaned blobs)
-// returns true if anything was removed
-bool common_download_remove(const std::string & hf_repo_with_tag);
-
-struct common_download_hf_plan {
-    hf_cache::hf_file primary;
-    hf_cache::hf_files model_files;
-    hf_cache::hf_file mmproj;
-    hf_cache::hf_file mtp;
-    hf_cache::hf_file preset; // if set, only this file is downloaded
-};
-common_download_hf_plan common_download_get_hf_plan(const common_params_model & model, const common_download_opts & opts);
@@ -495,19 +495,4 @@ std::string finalize_file(const hf_file & file) {
    return file.final_path;
 }

-bool remove_cached_repo(const std::string & repo_id) {
-    if (!is_valid_repo_id(repo_id)) {
-        LOG_WRN("%s: invalid repository: %s\n", __func__, repo_id.c_str());
-        return false;
-    }
-    fs::path repo_path = get_repo_path(repo_id);
-    std::error_code ec;
-    auto removed = fs::remove_all(repo_path, ec);
-    if (ec) {
-        LOG_ERR("%s: failed to remove repo cache %s: %s\n", __func__, repo_path.string().c_str(), ec.message().c_str());
-        return false;
-    }
-    return removed > 0;
-}
-
 } // namespace hf_cache
@@ -29,7 +29,4 @@ hf_files get_cached_files(const std::string & repo_id = {});
 // Create snapshot path (link or move/copy) and return it
 std::string finalize_file(const hf_file & file);

-// Remove the entire cached directory for a repo, returns true if removed
-bool remove_cached_repo(const std::string & repo_id);
-
 } // namespace hf_cache
@@ -686,62 +686,59 @@ value set_statement::execute_impl(context & ctx) {
    return mk_val<value_undefined>();
 }

-static inline void bind_parameters(const std::string & name, const statements & this_args, const func_args & args, context & ctx) {
-    const size_t expected_count = this_args.size();
-    const size_t input_count = args.count();
-
-    JJ_DEBUG("Invoking '%s' with %zu input arguments (expected %zu)", name.c_str(), input_count, expected_count);
-    for (size_t i = 0; i < expected_count; ++i) {
-        if (i < input_count) {
-            if (is_stmt<identifier>(this_args[i])) {
-                // normal parameter
-                std::string param_name = cast_stmt<identifier>(this_args[i])->val;
-                value param_value = args.get_kwarg_or_pos(param_name, i);
-                JJ_DEBUG("  Binding parameter '%s' to argument of type %s", param_name.c_str(), param_value->type().c_str());
-                ctx.set_val(param_name, param_value);
-            } else if (is_stmt<keyword_argument_expression>(this_args[i])) {
-                // default argument used as normal parameter
-                auto kwarg = cast_stmt<keyword_argument_expression>(this_args[i]);
-                if (!is_stmt<identifier>(kwarg->key)) {
-                    throw std::runtime_error("Keyword argument key must be an identifier in '" + name + "'");
-                }
-                std::string param_name = cast_stmt<identifier>(kwarg->key)->val;
-                value param_value = args.get_kwarg_or_pos(param_name, i);
-                JJ_DEBUG("  Binding parameter '%s' to argument of type %s", param_name.c_str(), param_value->type().c_str());
-                ctx.set_val(param_name, param_value);
-            } else {
-                throw std::runtime_error("Invalid parameter type in '" + name + "'");
-            }
-        } else {
-            auto & default_arg = this_args[i];
-            if (is_stmt<keyword_argument_expression>(default_arg)) {
-                auto kwarg = cast_stmt<keyword_argument_expression>(default_arg);
-                if (!is_stmt<identifier>(kwarg->key)) {
-                    throw std::runtime_error("Keyword argument key must be an identifier in '" + name + "'");
-                }
-                std::string param_name = cast_stmt<identifier>(kwarg->key)->val;
-                JJ_DEBUG("  Binding parameter '%s' to default argument of type %s", param_name.c_str(), kwarg->val->type().c_str());
-                ctx.set_val(param_name, kwarg->val->execute(args.ctx));
-            } else {
-                throw std::runtime_error("Not enough arguments provided to '" + name + "'");
-            }
-            //std::string param_name = cast_stmt<identifier>(default_args[i])->val;
-            //JJ_DEBUG("  Binding parameter '%s' to default", param_name.c_str());
-            //ctx.var[param_name] = default_args[i]->execute(ctx);
-        }
-    }
-}
-
 value macro_statement::execute_impl(context & ctx) {
    if (!is_stmt<identifier>(this->name)) {
        throw std::runtime_error("Macro name must be an identifier");
    }
    std::string name = cast_stmt<identifier>(this->name)->val;

-    const func_handler func = [this, name](const func_args & args) -> value {
-        context macro_ctx(args.ctx); // new scope for macro execution
+    const func_handler func = [this, name, &ctx](const func_args & args) -> value {
+        size_t expected_count = this->args.size();
+        size_t input_count = args.count();

-        bind_parameters(name, this->args, args, macro_ctx);
+        JJ_DEBUG("Invoking macro '%s' with %zu input arguments (expected %zu)", name.c_str(), input_count, expected_count);
+        context macro_ctx(ctx); // new scope for macro execution
+
+        // bind parameters
+        for (size_t i = 0; i < expected_count; ++i) {
+            if (i < input_count) {
+                if (is_stmt<identifier>(this->args[i])) {
+                    // normal parameter
+                    std::string param_name = cast_stmt<identifier>(this->args[i])->val;
+                    value param_value = args.get_kwarg_or_pos(param_name, i);
+                    JJ_DEBUG("  Binding parameter '%s' to argument of type %s", param_name.c_str(), param_value->type().c_str());
+                    macro_ctx.set_val(param_name, param_value);
+                } else if (is_stmt<keyword_argument_expression>(this->args[i])) {
+                    // default argument used as normal parameter
+                    auto kwarg = cast_stmt<keyword_argument_expression>(this->args[i]);
+                    if (!is_stmt<identifier>(kwarg->key)) {
+                        throw std::runtime_error("Keyword argument key must be an identifier in macro '" + name + "'");
+                    }
+                    std::string param_name = cast_stmt<identifier>(kwarg->key)->val;
+                    value param_value = args.get_kwarg_or_pos(param_name, i);
+                    JJ_DEBUG("  Binding parameter '%s' to argument of type %s", param_name.c_str(), param_value->type().c_str());
+                    macro_ctx.set_val(param_name, param_value);
+                } else {
+                    throw std::runtime_error("Invalid parameter type in macro '" + name + "'");
+                }
+            } else {
+                auto & default_arg = this->args[i];
+                if (is_stmt<keyword_argument_expression>(default_arg)) {
+                    auto kwarg = cast_stmt<keyword_argument_expression>(default_arg);
+                    if (!is_stmt<identifier>(kwarg->key)) {
+                        throw std::runtime_error("Keyword argument key must be an identifier in macro '" + name + "'");
+                    }
+                    std::string param_name = cast_stmt<identifier>(kwarg->key)->val;
+                    JJ_DEBUG("  Binding parameter '%s' to default argument of type %s", param_name.c_str(), kwarg->val->type().c_str());
+                    macro_ctx.set_val(param_name, kwarg->val->execute(ctx));
+                } else {
+                    throw std::runtime_error("Not enough arguments provided to macro '" + name + "'");
+                }
+                //std::string param_name = cast_stmt<identifier>(default_args[i])->val;
+                //JJ_DEBUG("  Binding parameter '%s' to default", param_name.c_str());
+                //macro_ctx.var[param_name] = default_args[i]->execute(ctx);
+            }
+        }

        // execute macro body
        JJ_DEBUG("Executing macro '%s' body with %zu statements", name.c_str(), this->body.size());
@@ -755,46 +752,6 @@ value macro_statement::execute_impl(context & ctx) {
    return mk_val<value_undefined>();
 }

-value call_statement::execute_impl(context & ctx) {
-    auto call_expr = cast_stmt<call_expression>(this->call);
-    if (!call_expr) {
-        throw std::runtime_error("Call statement requires a valid call expression");
-    }
-
-    value callee_val = call_expr->callee->execute(ctx);
-    if (!is_val<value_func>(callee_val)) {
-        throw std::runtime_error("Callee is not a function: got " + callee_val->type());
-    }
-    auto * callee_func = cast_val<value_func>(callee_val);
-
-    context caller_ctx(ctx); // new scope for caller execution
-
-    const func_handler func = [this, caller_ctx = std::move(caller_ctx)](const func_args & args) -> value {
-        context block_ctx(caller_ctx); // new scope for block execution
-
-        bind_parameters("caller", this->caller_args, args, block_ctx);
-
-        JJ_DEBUG("Executing call body with %zu statements", this->body.size());
-        auto res = exec_statements(this->body, block_ctx);
-        JJ_DEBUG("Call body execution complete, result: %s", res->val_str.str().c_str());
-        return res;
-    };
-
-    context call_ctx(ctx);
-    call_ctx.set_val("caller", mk_val<value_func>("caller", func));
-
-    func_args args(call_ctx);
-
-    for (const auto & arg_expr : call_expr->args) {
-        auto arg_val = arg_expr->execute(ctx);
-        JJ_DEBUG("  Argument type: %s", arg_val->type().c_str());
-        args.push_back(arg_val);
-    }
-
-    JJ_DEBUG("Calling macro '%s' with %zu arguments", callee_func->name.c_str(), args.count());
-    return callee_func->invoke(args);
-}
-
 value member_expression::execute_impl(context & ctx) {
    value object = this->object->execute(ctx);

@@ -552,7 +552,6 @@ struct call_statement : public statement {
        for (const auto & arg : this->caller_args) chk_type<expression>(arg);
    }
    std::string type() const override { return "CallStatement"; }
-    value execute_impl(context & ctx) override;
 };

 struct ternary_expression : public expression {
@@ -0,0 +1,324 @@
+#include "json-partial.h"
+
+#include "log.h"
+
+#include <nlohmann/json.hpp>
+
+#include <string>
+#include <regex>
+
+using json = nlohmann::ordered_json;
+
+enum common_json_stack_element_type {
+    COMMON_JSON_STACK_ELEMENT_OBJECT, // pushed by start_object(), popped by end_object()
+    COMMON_JSON_STACK_ELEMENT_KEY,    // pushed by key(), popped once the key's value is complete
+    COMMON_JSON_STACK_ELEMENT_ARRAY,  // pushed by start_array(), popped by end_array()
+};
+
+struct common_json_stack_element { // one level of nesting tracked while SAX-parsing
+    common_json_stack_element_type type;
+    std::string key; // key text for KEY elements; empty string for OBJECT and ARRAY
+};
+
+bool common_json_parse(
+    const std::string & input,
+    const std::string & healing_marker,
+    common_json & out)
+{
+    // Whole-string convenience wrapper: delegate to the iterator-based overload.
+    auto cursor = input.cbegin();
+    return common_json_parse(cursor, input.cend(), healing_marker, out);
+}
+
+bool common_json_parse(
+    std::string::const_iterator & it,
+    const std::string::const_iterator & end,
+    const std::string & healing_marker,
+    common_json & out)
+{
+    // // https://json.nlohmann.me/features/parsing/sax_interface/
+    struct json_error_locator : public nlohmann::json_sax<json> {
+        std::size_t position;
+        bool found_error;
+        std::string last_token;
+        std::string exception_message;
+        std::vector<common_json_stack_element> stack;
+
+        json_error_locator() : position(0), found_error(false) {}
+
+        bool parse_error(std::size_t position, const std::string & last_token, const json::exception & ex) override { // NOLINT
+            this->position = position - 1;
+            this->found_error = true;
+            this->last_token = last_token;
+            this->exception_message = ex.what();
+            return false;
+        }
+        void close_value() {
+            if (!stack.empty() && (stack.back().type == COMMON_JSON_STACK_ELEMENT_KEY)) {
+                stack.pop_back();
+            }
+        }
+        bool null() override { // NOLINT
+            close_value();
+            return true;
+        }
+        bool boolean(bool) override { // NOLINT
+            close_value();
+            return true;
+        }
+        bool number_integer(number_integer_t) override { // NOLINT
+            close_value();
+            return true;
+        }
+        bool number_unsigned(number_unsigned_t) override { // NOLINT
+            close_value();
+            return true;
+        }
+        bool number_float(number_float_t, const string_t &) override { // NOLINT
+            close_value();
+            return true;
+        }
+        bool string(string_t &) override { // NOLINT
+            close_value();
+            return true;
+        }
+        bool binary(binary_t &) override { // NOLINT
+            close_value();
+            return true;
+        }
+        bool start_object(std::size_t) override { // NOLINT
+            stack.push_back({COMMON_JSON_STACK_ELEMENT_OBJECT, ""});
+            return true;
+        }
+        bool end_object() override {
+            GGML_ASSERT(!stack.empty() && stack.back().type == COMMON_JSON_STACK_ELEMENT_OBJECT);
+            stack.pop_back();
+            close_value();
+            return true;
+        }
+        bool key(string_t & key) override { // NOLINT
+            stack.push_back({COMMON_JSON_STACK_ELEMENT_KEY, key});
+            return true;
+        }
+        bool start_array(std::size_t) override { // NOLINT
+            stack.push_back({COMMON_JSON_STACK_ELEMENT_ARRAY, ""});
+            return true;
+        }
+        bool end_array() override {
+            GGML_ASSERT(!stack.empty() && stack.back().type == COMMON_JSON_STACK_ELEMENT_ARRAY);
+            stack.pop_back();
+            close_value();
+            return true;
+        }
+    };
+    json_error_locator err_loc;
+    auto start = it;
+    json::sax_parse(it, end, &err_loc);
+
+    if (err_loc.found_error) {
+        it = start;
+        auto temptative_end = it + err_loc.position;
+        // LOG_DBG("Error at position %zu (is_end = %s): %s\n", err_loc.position, temptative_end == end ? "true" : "false", err_loc.exception_message.c_str());
+
+        auto input = std::string(it, temptative_end);
+        try {
+            out.json = json::parse(input);
+            // out.json = json::parse(it, temptative_end);
+            it = temptative_end;
+            return true;
+        } catch (const std::exception & ex) {
+            // No, needs healing.
+            LOG_DBG("Failed to parse up to error: %s: <<<%s>>>\n", ex.what(), std::string(it, temptative_end).c_str());
+        }
+        auto can_parse = [](const std::string & str) {
+            try {
+                auto _ = json::parse(str); // NOLINT
+                return true;
+            } catch (const std::exception &) {
+                return false;
+            }
+        };
+        if (!healing_marker.empty() && !err_loc.stack.empty()) {
+            std::string str(it, temptative_end);
+            auto last_non_sp_pos = str.find_last_not_of(" \n\r\t");
+            if (last_non_sp_pos == std::string::npos) {
+                throw std::runtime_error("Cannot heal a truncated JSON that stopped in an unknown location");
+            }
+            auto last_non_sp_char = str[last_non_sp_pos];
+            // Used to detect stops on a number, which may not be complete.
+            auto was_maybe_number = [&]() {
+                if (!str.empty() && std::isspace(str.back())) {
+                    return false;
+                }
+                return std::isdigit(last_non_sp_char) ||
+                    last_non_sp_char == '.' ||
+                    last_non_sp_char == 'e' ||
+                    last_non_sp_char == 'E' ||
+                    last_non_sp_char == '-';
+            };
+
+            std::string closing;
+            for (size_t i = err_loc.stack.size(); i > 0; i--) {
+                auto & el = err_loc.stack[i - 1];
+                if (el.type == COMMON_JSON_STACK_ELEMENT_OBJECT) {
+                    closing += "}";
+                } else if (el.type == COMMON_JSON_STACK_ELEMENT_ARRAY) {
+                    closing += "]";
+                } else if (el.type != COMMON_JSON_STACK_ELEMENT_KEY) {
+                    throw std::runtime_error("Unexpected stack element type");
+                }
+            }
+
+            // Matches a potentially partial unicode escape sequence, e.g. \u, \uX, \uXX, \uXXX, \uXXXX
+            static const std::regex partial_unicode_regex(R"(\\u(?:[0-9a-fA-F](?:[0-9a-fA-F](?:[0-9a-fA-F](?:[0-9a-fA-F])?)?)?)?$)");
+
+            auto is_high_surrogate = [&](const std::string & s) {
+                // Check if a partial of a high surrogate (U+D800-U+DBFF)
+                return s.length() >= 4 &&
+                    s[0] == '\\' && s[1] == 'u' &&
+                    std::tolower(s[2]) == 'd' &&
+                    (s[3] == '8' || s[3] == '9' || std::tolower(s[3]) == 'a' || std::tolower(s[3]) == 'b');
+            };
+
+            // Initialize the unicode marker to a low surrogate to handle the edge case
+            // where a high surrogate (U+D800-U+DBFF) is immediately followed by a
+            // backslash (\)
+            std::string unicode_marker_padding = "udc00";
+            std::smatch last_unicode_seq;
+
+            if (std::regex_search(str, last_unicode_seq, partial_unicode_regex)) {
+                std::smatch second_last_seq;
+                std::string prelude = str.substr(0, last_unicode_seq.position());
+
+                // Pad the escape sequence with 0s until it forms a complete sequence of 6 characters
+                unicode_marker_padding = std::string(6 - last_unicode_seq.length(), '0');
+
+                if (is_high_surrogate(last_unicode_seq.str())) {
+                    // If the sequence is a partial match for a high surrogate, add a low surrogate (U+DC00-U+UDFF)
+                    unicode_marker_padding += "\\udc00";
+                } else if (std::regex_search(prelude, second_last_seq, partial_unicode_regex)) {
+                    if (is_high_surrogate(second_last_seq.str())) {
+                        // If this follows a high surrogate, pad it to be a low surrogate
+                        if (last_unicode_seq.length() == 2) {
+                            unicode_marker_padding = "dc00";
+                        } else if (last_unicode_seq.length() == 3) {
+                            unicode_marker_padding = "c00";
+                        } else {
+                            // The original unicode_marker_padding is already padded with 0s
+                        }
+                    }
+                }
+            }
+
+            const auto & magic_seed = out.healing_marker.marker = healing_marker;//"$llama.cpp.json$";
+
+            if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_KEY) {
+                // We're inside an object value
+                if (last_non_sp_char == ':' && can_parse(str + "1" + closing)) {
+                    // Was about to create an object value
+                    str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
+                } else if (can_parse(str + ": 1" + closing)) {
+                    str += (out.healing_marker.json_dump_marker = ":\"" + magic_seed) + "\"" + closing;
+                } else if (last_non_sp_char == '{' && can_parse(str + closing)) {
+                    // Was about to create an object
+                    str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\": 1" + closing;
+                } else if (can_parse(str + "\"" + closing)) {
+                    // Was inside an object value string
+                    str += (out.healing_marker.json_dump_marker = magic_seed) + "\"" + closing;
+                } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"" + closing)) {
+                    // Was inside an object value string after an escape
+                    str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"" + closing;
+                } else if (can_parse(str + unicode_marker_padding + "\"" + closing)) {
+                    // Was inside an object value string after a partial unicode escape
+                    str += (out.healing_marker.json_dump_marker = unicode_marker_padding + magic_seed) + "\"" + closing;
+                } else {
+                    // find last :
+                    auto last_pos = str.find_last_of(':');
+                    if (last_pos == std::string::npos) {
+                        throw std::runtime_error("Cannot heal a truncated JSON that stopped in an unknown location");
+                    }
+                    // Cutting back to opening : for object value
+                    str = str.substr(0, last_pos + 1) + (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
+                }
+            } else if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_ARRAY) {
+                if ((last_non_sp_char == ',' || last_non_sp_char == '[') && can_parse(str + "1" + closing)) {
+                    // Was about to create an array value
+                    str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
+                } else if (can_parse(str + "\"" + closing)) {
+                    // Was inside an array value string
+                    str += (out.healing_marker.json_dump_marker = magic_seed) + "\"" + closing;
+                } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"" + closing)) {
+                    // Was inside an array value string after an escape
+                    str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"" + closing;
+                } else if (can_parse(str + unicode_marker_padding + "\"" + closing)) {
+                    // Was inside an array value string after a partial unicode escape
+                    str += (out.healing_marker.json_dump_marker = unicode_marker_padding + magic_seed) + "\"" + closing;
+                } else if (!was_maybe_number() && can_parse(str + ", 1" + closing)) {
+                    // Had just finished a value
+                    str += (out.healing_marker.json_dump_marker = ",\"" + magic_seed) + "\"" + closing;
+                } else {
+                    auto last_pos = str.find_last_of("[,");
+                    if (last_pos == std::string::npos) {
+                        throw std::runtime_error("Cannot heal a truncated JSON array stopped in an unknown location");
+                    }
+                    // Cutting back to last [ or , for array value
+                    str = str.substr(0, last_pos + 1) + (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
+                }
+            } else if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_OBJECT) {
+                if ((last_non_sp_char == '{' && can_parse(str + closing)) ||
+                        (last_non_sp_char == ',' && can_parse(str + "\"\": 1" + closing))) {
+                    // Was about to create an object key+value
+                    str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\": 1" + closing;
+                } else if (!was_maybe_number() && can_parse(str + ",\"\": 1" + closing)) {
+                    // Was about to create an object key+value
+                    str += (out.healing_marker.json_dump_marker = ",\"" + magic_seed) + "\": 1" + closing;
+                } else if (can_parse(str + "\": 1" + closing)) {
+                    // Was inside an object key string
+                    str += (out.healing_marker.json_dump_marker = magic_seed) + "\": 1" + closing;
+                } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\": 1" + closing)) {
+                    // Was inside an object key string after an escape
+                    str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\": 1" + closing;
+                } else if (can_parse(str + unicode_marker_padding + "\": 1" + closing)) {
+                    // Was inside an object key string after a partial unicode escape
+                    str += (out.healing_marker.json_dump_marker = unicode_marker_padding + magic_seed) + "\": 1" + closing;
+                } else {
+                    auto last_pos = str.find_last_of(':');
+                    if (last_pos == std::string::npos) {
+                        throw std::runtime_error("Cannot heal a truncated JSON object stopped in an unknown location");
+                    }
+                    // fprintf(stderr, "Cutting back to last : for object key+value\n");
+                    str = str.substr(0, last_pos + 1) + (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
+                }
+            } else {
+                throw std::runtime_error("Cannot heal a truncated JSON object stopped in an unknown location");
+            }
+            // fprintf(stderr, "HEALED:\nSTRING <<<\n%s\n>>>\n\nmagic_cut: <<<\n%s\n>>>\n\n", str.c_str(), out.healing_marker.json_dump_marker.c_str());
+            out.json = json::parse(str);
+            it = temptative_end;
+            return true;
+        }
+        // handle unclosed top-level primitive
+        if (err_loc.position != 0 && !healing_marker.empty() && err_loc.stack.empty()) {
+            std::string str(it, temptative_end);
+            const auto & magic_seed = out.healing_marker.marker = healing_marker;
+            if (can_parse(str + "\"")) {
+                // Was inside a string
+                str += (out.healing_marker.json_dump_marker = magic_seed) + "\"";
+            } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"")) {
+                // Was inside a string after an escape
+                str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"";
+            } else {
+                // TODO: handle more unclosed top-level primitive if the stack was empty but we got an error (e.g. "tru", "\"", etc...)
+                // fprintf(stderr, "Closing: TODO\n");
+                return false;
+            }
+            out.json = json::parse(str);
+            it = temptative_end;
+            return true;
+        }
+        return false;
+    }
+    out.json = json::parse(it, end);
+    it = end;
+    return true;
+}
@@ -0,0 +1,39 @@
+#pragma once
+
+// TODO: use json_fwd.hpp when possible
+#include <nlohmann/json.hpp>
+
+// Healing marker (empty if the JSON was fully parsed / wasn't healed).
+struct common_healing_marker {
+    // Raw marker, i.e. the `healing_marker` string that was passed to common_json_parse.
+    std::string marker;
+
+    // The marker as it was spliced into the healed JSON text, possibly preceded by extra characters (e.g. a quote, `:` or `,`). Cutting the `common_json.json.dump()` string at the (only) occurrence of this fragment should yield the original partial JSON string (modulo spaces / if it had the same dump format).
+    std::string json_dump_marker;
+};
+
+// Result of common_json_parse: a parsed JSON value, with its optional healing marker (a JSON dump fragment that can be used to find the position of healing in the JSON dump string).
+struct common_json {
+    nlohmann::ordered_json json;  // parsed (and, for partial input, healed) JSON value
+
+    common_healing_marker healing_marker;  // how the input was healed; see common_healing_marker
+};
+
+// Parse the JSON string, healing (closing) any partial JSON if `healing_marker` is not empty.
+//
+// Healing completes partial JSON strings by adding a (possibly modified) healing marker, then whatever is needed to close the JSON.
+// This makes it possible to parse the resulting healed JSON string, yet still cut it again at the healing marker if needed.
+// (This is used when parsing JSON outputs from the models, then crafting partial JSONs for the partial tool calls in OAI format.)
+//
+// For instance, parsing `{` with a healing marker `foo` will produce a healed JSON `{"foo":1}`, w/ json_dump_marker = `"foo` (opening quote + marker, which can be used to break the JSON again).
+bool common_json_parse(
+    const std::string & input,
+    const std::string & healing_marker,
+    common_json & out);
+
+// Iterator variant of the overload above: parses the JSON found in [it, end) and, when the (potentially partial) parsing succeeds, advances `it` past the consumed input and returns true. Returns false if the input could not be parsed or healed; healing that stops at an unrecognized location throws std::runtime_error.
+bool common_json_parse(
+    std::string::const_iterator & it,
+    const std::string::const_iterator & end,
+    const std::string & healing_marker,
+    common_json & out);
@@ -233,27 +233,27 @@ struct BuiltinRule {
 };

 static std::unordered_map<std::string, BuiltinRule> PRIMITIVE_RULES = {
-    {"boolean", {"(\"true\" | \"false\")", {}}},
+    {"boolean", {"(\"true\" | \"false\") space", {}}},
    {"decimal-part", {"[0-9]{1,16}", {}}},
    {"integral-part", {"[0] | [1-9] [0-9]{0,15}", {}}},
-    {"number", {"(\"-\"? integral-part) (\".\" decimal-part)? ([eE] [-+]? integral-part)?", {"integral-part", "decimal-part"}}},
-    {"integer", {"(\"-\"? integral-part)", {"integral-part"}}},
+    {"number", {"(\"-\"? integral-part) (\".\" decimal-part)? ([eE] [-+]? integral-part)? space", {"integral-part", "decimal-part"}}},
+    {"integer", {"(\"-\"? integral-part) space", {"integral-part"}}},
    {"value", {"object | array | string | number | boolean | null", {"object", "array", "string", "number", "boolean", "null"}}},
-    {"object", {"\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? space \"}\"", {"string", "value"}}},
-    {"array", {"\"[\" space ( value (\",\" space value)* )? space \"]\"", {"value"}}},
-    {"uuid", {"\"\\\"\" [0-9a-fA-F]{8} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{12} \"\\\"\"", {}}},
+    {"object", {"\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? \"}\" space", {"string", "value"}}},
+    {"array", {"\"[\" space ( value (\",\" space value)* )? \"]\" space", {"value"}}},
+    {"uuid", {"\"\\\"\" [0-9a-fA-F]{8} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{12} \"\\\"\" space", {}}},
    {"char",   {"[^\"\\\\\\x7F\\x00-\\x1F] | [\\\\] ([\"\\\\bfnrt] | \"u\" [0-9a-fA-F]{4})", {}}},
-    {"string", {"\"\\\"\" char* \"\\\"\"", {"char"}}},
-    {"null", {"\"null\"", {}}},
+    {"string", {"\"\\\"\" char* \"\\\"\" space", {"char"}}},
+    {"null", {"\"null\" space", {}}},
 };

 static std::unordered_map<std::string, BuiltinRule> STRING_FORMAT_RULES = {
    {"date", {"[0-9]{4} \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )", {}}},
    {"time", {"([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9]{3} )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )", {}}},
    {"date-time", {"date \"T\" time", {"date", "time"}}},
-    {"date-string", {"\"\\\"\" date \"\\\"\"", {"date"}}},
-    {"time-string", {"\"\\\"\" time \"\\\"\"", {"time"}}},
-    {"date-time-string", {"\"\\\"\" date-time \"\\\"\"", {"date-time"}}}
+    {"date-string", {"\"\\\"\" date \"\\\"\" space", {"date"}}},
+    {"time-string", {"\"\\\"\" time \"\\\"\" space", {"time"}}},
+    {"date-time-string", {"\"\\\"\" date-time \"\\\"\" space", {"date-time"}}}
 };

 static bool is_reserved_name(const std::string & name) {
@@ -551,16 +551,16 @@ private:
            }
            return join_seq();
        };
-        return _add_rule(name, "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\"");
+        return _add_rule(name, "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\" space");
    }

    /*
        Returns a rule that matches a JSON string that is none of the provided strings

        not_strings({"a"})
-            -> ["] ( [a] char+ | [^"a] char* )? ["]
+            -> ["] ( [a] char+ | [^"a] char* )? ["] space
        not_strings({"and", "also"})
-            -> ["] ( [a] ([l] ([s] ([o] char+ | [^"o] char*) | [^"s] char*) | [n] ([d] char+ | [^"d] char*) | [^"ln] char*) | [^"a] char* )? ["]
+            -> ["] ( [a] ([l] ([s] ([o] char+ | [^"o] char*) | [^"s] char*) | [n] ([d] char+ | [^"d] char*) | [^"ln] char*) | [^"a] char* )? ["] space
    */
    std::string _not_strings(const std::vector<std::string> & strings) {

@@ -619,7 +619,7 @@ private:
        if (!trie.is_end_of_string) {
            out << "?";
        }
-        out << " [\"]";
+        out << " [\"] space";
        return out.str();
    }

@@ -725,7 +725,7 @@ private:
            rule += " )?";
        }

-        rule += " space \"}\"";
+        rule += " \"}\" space";

        return rule;
    }
@@ -858,14 +858,14 @@ public:
            return _add_rule(rule_name, _generate_union_rule(name, schema_types));
        }
        if (schema.contains("const")) {
-            return _add_rule(rule_name, _generate_constant_rule(schema["const"]));
+            return _add_rule(rule_name, _generate_constant_rule(schema["const"]) + " space");
        }
        if (schema.contains("enum")) {
            std::vector<std::string> enum_values;
            for (const auto & v : schema["enum"]) {
                enum_values.push_back(_generate_constant_rule(v));
            }
-            return _add_rule(rule_name, "(" + string_join(enum_values, " | ") + ")");
+            return _add_rule(rule_name, "(" + string_join(enum_values, " | ") + ") space");
        }
        if ((schema_type.is_null() || schema_type == "object")
                && (schema.contains("properties") ||
@@ -933,7 +933,7 @@ public:
                    }
                }
                if (!enum_intersection.empty()) {
-                    return _add_rule(rule_name, "(" + string_join(enum_intersection, " | ") + ")");
+                    return _add_rule(rule_name, "(" + string_join(enum_intersection, " | ") + ") space");
                }
            }
            return _add_rule(rule_name, _build_object_rule(properties, required, hybrid_name, json()));
@@ -948,7 +948,7 @@ public:
                    }
                    rule += visit(items[i], name + (name.empty() ? "" : "-") + "tuple-" + std::to_string(i));
                }
-                rule += " space \"]\"";
+                rule += " \"]\" space";
                return _add_rule(rule_name, rule);
            }
            std::string item_rule_name = visit(items, name + (name.empty() ? "" : "-") + "item");
@@ -956,7 +956,7 @@ public:
            json max_items_json = schema.contains("maxItems") ? schema["maxItems"] : json();
            int max_items = max_items_json.is_number_integer() ? max_items_json.get<int>() : std::numeric_limits<int>::max();

-            return _add_rule(rule_name, "\"[\" space " + build_repetition(item_rule_name, min_items, max_items, "\",\" space") + " space \"]\"");
+            return _add_rule(rule_name, "\"[\" space " + build_repetition(item_rule_name, min_items, max_items, "\",\" space") + " \"]\" space");
        }
        if ((schema_type.is_null() || schema_type == "string") && schema.contains("pattern")) {
            return _visit_pattern(schema["pattern"], rule_name);
@@ -972,7 +972,7 @@ public:
            std::string char_rule = _add_primitive("char", PRIMITIVE_RULES.at("char"));
            int min_len = schema.contains("minLength") ? schema["minLength"].get<int>() : 0;
            int max_len = schema.contains("maxLength") ? schema["maxLength"].get<int>() : std::numeric_limits<int>::max();
-            return _add_rule(rule_name, "\"\\\"\" " + build_repetition(char_rule, min_len, max_len) + " \"\\\"\"");
+            return _add_rule(rule_name, "\"\\\"\" " + build_repetition(char_rule, min_len, max_len) + " \"\\\"\" space");
        }
        if (schema_type == "integer" && (schema.contains("minimum") || schema.contains("exclusiveMinimum") || schema.contains("maximum") || schema.contains("exclusiveMaximum"))) {
            int64_t min_value = std::numeric_limits<int64_t>::min();
@@ -990,7 +990,7 @@ public:
            std::stringstream out;
            out << "(";
            build_min_max_int(min_value, max_value, out);
-            out << ")";
+            out << ") space";
            return _add_rule(rule_name, out.str());
        }
        if (schema.empty() || schema_type == "object") {
@@ -6,14 +6,13 @@
 #include "unicode.h"

 #include <algorithm>
-#include <deque>
 #include <initializer_list>
 #include <map>
 #include <memory>
 #include <nlohmann/json.hpp>
 #include <regex>
-#include <set>
 #include <stdexcept>
+#include <unordered_set>

 // Trick to catch missing branches
 template <typename T>
@@ -89,7 +88,40 @@ struct trie {
        return match_result{match_result::NO_MATCH};
    }

+    struct prefix_and_next {  // one entry per non-word trie node that has children
+        std::vector<uint32_t> prefix;      // child keys followed from node 0 to reach that node
+        std::vector<uint32_t> next_chars;  // keys of that node's children, i.e. the characters that can follow `prefix`
+    };
+
+    std::vector<prefix_and_next> collect_prefix_and_next() {  // returns every (prefix, next chars) pair found by walking the trie
+        std::vector<uint32_t>        prefix;  // scratch path, grown and shrunk by the recursive helper
+        std::vector<prefix_and_next> result;
+        collect_prefix_and_next(0, prefix, result);  // start the walk at node 0 with an empty path
+        return result;
+    }
+
  private:
+    void collect_prefix_and_next(size_t index, std::vector<uint32_t> & prefix, std::vector<prefix_and_next> & out) {  // depth-first helper; `prefix` holds the keys on the path to `index`
+        if (!nodes[index].is_word) {  // nodes that end a word produce no entry of their own
+            if (!nodes[index].children.empty()) {
+                std::vector<uint32_t> chars;
+                chars.reserve(nodes[index].children.size());
+                for (const auto & p : nodes[index].children) {
+                    chars.push_back(p.first);
+                }
+                out.emplace_back(prefix_and_next{prefix, chars});  // copy of the current path plus its possible continuations
+            }
+        }
+
+        for (const auto & p : nodes[index].children) {  // descend into every child, including below word nodes
+            uint32_t ch = p.first;
+            auto child = p.second;
+            prefix.push_back(ch);
+            collect_prefix_and_next(child, prefix, out);
+            prefix.pop_back();  // restore the path before visiting the next sibling
+        }
+    }
+
    size_t create_node() {
        size_t index = nodes.size();
        nodes.emplace_back();
@@ -121,65 +153,6 @@ struct trie {
    }
 };

-// Aho-Corasick automaton
-struct aho_corasick {
-    trie                t;
-    std::vector<size_t> fail;      // failure links
-    std::vector<size_t> order;     // states in BFS order
-    std::vector<bool>   terminal;  // match states (directly or via a suffix link)
-    std::set<uint32_t>  alphabet;  // every character with a transition
-
-    aho_corasick(const std::vector<std::string> & strings) : t(strings) {
-        const auto & nodes = t.nodes;
-        const size_t n = nodes.size();
-
-        fail.assign(n, 0);
-        order.reserve(n);
-
-        std::deque<size_t> queue{ 0 };
-        while (!queue.empty()) {
-            size_t u = queue.front();
-            queue.pop_front();
-            order.push_back(u);
-            for (const auto & [ch, v] : nodes[u].children) {
-                if (u != 0) {
-                    size_t f = fail[u];
-                    while (f && nodes[f].children.find(ch) == nodes[f].children.end()) {
-                        f = fail[f];
-                    }
-                    auto it = nodes[f].children.find(ch);
-                    fail[v] = (it != nodes[f].children.end() && it->second != v) ? it->second : 0;
-                }
-                queue.push_back(v);
-            }
-        }
-
-        terminal.assign(n, false);
-        for (size_t u : order) {
-            terminal[u] = nodes[u].is_word || (u != 0 && terminal[fail[u]]);
-        }
-
-        for (const auto & node : nodes) {
-            for (const auto & [ch, v] : node.children) {
-                alphabet.insert(ch);
-            }
-        }
-    }
-
-    size_t num_states()          const { return t.nodes.size(); }
-    bool   is_terminal(size_t s) const { return terminal[s]; }
-
-    // follow failure links until a transition on `ch` exists.
-    size_t next(size_t state, uint32_t ch) const {
-        const auto & nodes = t.nodes;
-        while (state && nodes[state].children.find(ch) == nodes[state].children.end()) {
-            state = fail[state];
-        }
-        auto it = nodes[state].children.find(ch);
-        return it != nodes[state].children.end() ? it->second : 0;
-    }
-};
-
 static std::pair<uint32_t, size_t> parse_hex_escape(const std::string & str, size_t pos, int hex_count) {
    if (pos + hex_count > str.length()) {
        return {0, 0};
@@ -921,10 +894,6 @@ struct parser_executor {
    common_peg_parse_result operator()(const common_peg_gbnf_parser & p) {
        return arena.parse(p.child, ctx, start_pos);
    }
-
-    common_peg_parse_result operator()(const common_peg_ac_parser & p) {
-        return arena.parse(p.child, ctx, start_pos);
-    }
 };

 common_peg_parse_result common_peg_arena::parse(common_peg_parse_context & ctx, size_t start) const {
@@ -993,8 +962,7 @@ void common_peg_arena::resolve_refs() {
                                 std::is_same_v<T, common_peg_not_parser> ||
                                 std::is_same_v<T, common_peg_tag_parser> ||
                                 std::is_same_v<T, common_peg_atomic_parser> ||
-                                 std::is_same_v<T, common_peg_gbnf_parser> ||
-                                 std::is_same_v<T, common_peg_ac_parser>) {
+                                 std::is_same_v<T, common_peg_gbnf_parser>) {
                p.child = resolve_ref(p.child);
            } else if constexpr (std::is_same_v<T, common_peg_rule_parser>) {
                p.child = resolve_ref(p.child);
@@ -1024,12 +992,12 @@ void common_peg_arena::resolve_refs() {
 }

 std::string common_peg_arena::dump(common_peg_parser_id id) const {
-    std::set<common_peg_parser_id> visited;
+    std::unordered_set<common_peg_parser_id> visited;
    return dump_impl(id, visited);
 }

 std::string common_peg_arena::dump_impl(common_peg_parser_id                       id,
-                                        std::set<common_peg_parser_id> & visited) const {
+                                        std::unordered_set<common_peg_parser_id> & visited) const {
    // Check for cycles
    if (visited.count(id)) {
        return "[cycle]";
@@ -1075,8 +1043,6 @@ std::string common_peg_arena::dump_impl(common_peg_parser_id
            return "Atomic(" + dump_impl(p.child, visited) + ")";
        } else if constexpr (std::is_same_v<T, common_peg_gbnf_parser>) {
            return "Gbnf(" + p.grammar + ", " + dump_impl(p.child, visited) + ")";
-        } else if constexpr (std::is_same_v<T, common_peg_ac_parser>) {
-            return "Ac(" + string_join(p.delimiters, " | ") + ", " + dump_impl(p.child, visited) + ")";
        } else if constexpr (std::is_same_v<T, common_peg_any_parser>) {
            return "Any";
        } else if constexpr (std::is_same_v<T, common_peg_space_parser>) {
@@ -1376,7 +1342,7 @@ common_peg_parser common_peg_parser_builder::json_object() {
 common_peg_parser common_peg_parser_builder::json_array() {
    return rule("json-array", [this]() {
        auto ws = space();
-        auto elements = sequence({json(), zero_or_more(sequence({ws, literal(","), ws, json()}))});
+        auto elements = sequence({json(), zero_or_more(sequence({literal(","), ws, json()}))});
        return sequence({
            literal("["),
            ws,
@@ -1486,13 +1452,6 @@ common_peg_parser common_peg_parser_builder::json_member(const std::string & key
    });
 }

-common_peg_parser common_peg_parser_builder::ac(const common_peg_parser & p, const std::vector<std::string> & delimiters) {
-    if (delimiters.empty()) {
-        throw std::runtime_error("ac parser requires at least one delimiter");
-    }
-    return add(common_peg_ac_parser{p, delimiters});
-}
-
 static std::string gbnf_escape_char_class(uint32_t c) {
    if (c == '-' || c == ']' || c == '[' || c == '\\') {
        return "\\" + std::string(1, (char) c);
@@ -1543,118 +1502,61 @@ static std::string gbnf_escape_char_class(uint32_t c) {
    return std::string(buf);
 }

-static std::string gbnf_char_class(const std::vector<uint32_t> & chars, bool negate) {
-    std::string s = negate ? "[^" : "[";
-    for (uint32_t ch : chars) {
-        s += gbnf_escape_char_class(ch);
-    }
-    return s + "]";
-}
+static std::string gbnf_excluding_pattern(const std::vector<std::string> & strings) {
+    trie matcher(strings);
+    auto pieces = matcher.collect_prefix_and_next();

-static std::string gbnf_ac_grammar(
-    const common_grammar_builder &   builder,
-    const std::string &              prefix,
-    const std::vector<std::string> & strings,
-    const std::function<std::string(const std::vector<uint32_t> &,
-                                    const std::map<size_t, std::vector<uint32_t>> &,
-                                    const std::vector<uint32_t> &,
-                                    const std::function<std::string(size_t)> &)> & build_rule) {
-    aho_corasick ac(strings);
-
-    auto state_name = [&](size_t s) -> std::string {
-        if (s == 0) {
-            return prefix;
-        }
-        std::string num = std::to_string(s);
-        num = num.size() == 1 ? ("0" + num) : num;
-        return prefix + "-" + num;
-    };
-
-    for (size_t q = 0; q < ac.num_states(); q++) {
-        if (ac.is_terminal(q)) {
-            continue; // match states
+    std::string pattern;
+    std::string trailing;  // optional proper-prefix of a delimiter, allowed only at the very end
+    for (size_t i = 0; i < pieces.size(); ++i) {
+        if (i > 0) {
+            pattern += " | ";
        }

-        std::map<size_t, std::vector<uint32_t>> buckets;
-        std::vector<uint32_t> completing;  // chars that complete a delimiter
-        std::vector<uint32_t> specific;    // chars with an explicit transition
-        for (uint32_t c : ac.alphabet) {
-            size_t d = ac.next(q, c);
-            if (ac.is_terminal(d)) {
-                completing.push_back(c);
-                specific.push_back(c);
-            } else if (d != 0) {
-                buckets[d].push_back(c); // specific non-root destination
-                specific.push_back(c);
-            }
+        const auto & pre = pieces[i].prefix;
+        const auto & chars = pieces[i].next_chars;
+
+        std::string cls;
+        cls.reserve(chars.size());
+        for (uint32_t ch : chars) {
+            cls += gbnf_escape_char_class(ch);
        }

-        builder.add_rule(state_name(q), build_rule(completing, buckets, specific, state_name));
+        if (!pre.empty()) {
+            std::string pre_literal = gbnf_format_literal(common_unicode_cpts_to_utf8(pre));
+            pattern += pre_literal + " [^" + cls + "]";
+            // Each interior alternative consumes a delimiter-prefix plus a disambiguating
+            // char, so the repetition alone cannot match a value that *ends* on a proper
+            // prefix of a delimiter (e.g. a trailing "\n" when the delimiter is
+            // "\n</parameter>\n"). The runtime until() (greedy first-match) accepts such
+            // values, so without this the grammar would reject input the parser accepts.
+            // Allow the value to terminate on any proper prefix as an optional tail.
+            // This makes the grammar a slight superset of the runtime language (a value
+            // may end on the longest prefix, which greedy first-match would not itself
+            // produce); harmless for constrained generation, which only needs to admit
+            // every runtime-valid string.
+            if (!trailing.empty()) {
+                trailing += " | ";
+            }
+            trailing += pre_literal;
+        } else {
+            pattern += "[^" + cls + "]";
+        }
    }

-    // An empty delimiter makes the start state terminal. Emit an entry rule
-    // that matches the empty string so the returned reference stays valid.
-    if (ac.is_terminal(0)) {
-        builder.add_rule(prefix, "|");
+    std::string result = "(" + pattern + ")*";
+    if (!trailing.empty()) {
+        result += " (" + trailing + ")?";
    }
-
-    return state_name(0);
+    return result;
 }

-// GBNF grammar matching strings that contain no string in `strings` as a
-// substring. Emits the complement of an Aho-Corasick automaton DFA and returns
-// the start state rule name.
-//
-// ref: https://github.com/ggml-org/llama.cpp/pull/24839
-static std::string gbnf_excluding_grammar(const common_grammar_builder & builder,
-                                          const std::string &            prefix,
-                                          const std::vector<std::string> & strings) {
-    return gbnf_ac_grammar(builder, prefix, strings,
-        [](const std::vector<uint32_t> & /*completing*/,
-           const std::map<size_t, std::vector<uint32_t>> & buckets,
-           const std::vector<uint32_t> & specific,
-           const std::function<std::string(size_t)> & state_name) {
-            // every state is accepting and completing chars get no
-            // alternative, so a forbidden string can never be matched
-            std::string rhs = "|";
-            for (const auto & [d, chars] : buckets) {
-                rhs += " " + gbnf_char_class(chars, false) + " " + state_name(d) + " |";
-            }
-            rhs += " " + gbnf_char_class(specific, true) + " " + state_name(0);
-            return rhs;
-        });
-}
-
-// GBNF grammar matching everything up to and including the first occurrence of
-// any string in `strings`. Emits the Aho-Corasick automaton DFA and returns
-// the start state rule name.
-static std::string gbnf_including_grammar(const common_grammar_builder & builder,
-                                          const std::string &            prefix,
-                                          const std::vector<std::string> & strings) {
-    return gbnf_ac_grammar(builder, prefix, strings,
-        [](const std::vector<uint32_t> & completing,
-           const std::map<size_t, std::vector<uint32_t>> & buckets,
-           const std::vector<uint32_t> & specific,
-           const std::function<std::string(size_t)> & state_name) {
-            std::vector<std::string> alts;
-            if (!completing.empty()) {
-                alts.push_back(gbnf_char_class(completing, false)); // terminate on match
-            }
-            for (const auto & [d, chars] : buckets) {
-                alts.push_back(gbnf_char_class(chars, false) + " " + state_name(d));
-            }
-            // every other character keeps scanning from the start state
-            alts.push_back(gbnf_char_class(specific, true) + " " + state_name(0));
-            return string_join(alts, " | ");
-        });
-}
-
-static std::set<std::string> collect_reachable_rules(
+static std::unordered_set<std::string> collect_reachable_rules(
    const common_peg_arena & arena,
    const common_peg_parser_id & rule
 ) {
-    std::set<std::string> reachable;
-    std::set<std::string> visited;
+    std::unordered_set<std::string> reachable;
+    std::unordered_set<std::string> visited;

    std::function<void(common_peg_parser_id)> visit = [&](common_peg_parser_id id) {
        const auto & parser = arena.get(id);
@@ -1686,7 +1588,6 @@ static std::set<std::string> collect_reachable_rules(
                                 std::is_same_v<T, common_peg_tag_parser> ||
                                 std::is_same_v<T, common_peg_atomic_parser> ||
                                 std::is_same_v<T, common_peg_gbnf_parser> ||
-                                 std::is_same_v<T, common_peg_ac_parser> ||
                                 std::is_same_v<T, common_peg_schema_parser>) {
                visit(p.child);
            } else if constexpr (std::is_same_v<T, common_peg_rule_parser>) {
@@ -1864,7 +1765,7 @@ void common_peg_arena::build_grammar(const common_grammar_builder & builder, boo
                if (p.delimiters.empty()) {
                    return ".*";
                }
-                return gbnf_excluding_grammar(builder, "until-" + std::to_string(id), p.delimiters);
+                return gbnf_excluding_pattern(p.delimiters);
            } else if constexpr (std::is_same_v<T, common_peg_schema_parser>) {
                if (schema_delegates(p)) {
                    return to_gbnf(p.child);
@@ -1881,8 +1782,6 @@ void common_peg_arena::build_grammar(const common_grammar_builder & builder, boo
                return to_gbnf(p.child);
            } else if constexpr (std::is_same_v<T, common_peg_gbnf_parser>) {
                return p.grammar;
-            } else if constexpr (std::is_same_v<T, common_peg_ac_parser>) {
-                return gbnf_including_grammar(builder, "ac-" + std::to_string(id), p.delimiters);
            } else {
                static_assert(is_always_false_v<T>);
            }
@@ -1890,7 +1789,7 @@ void common_peg_arena::build_grammar(const common_grammar_builder & builder, boo
    };

    // Collect reachable rules
-    std::set<std::string> reachable_rules;
+    std::unordered_set<std::string> reachable_rules;

    if (lazy) {
        // Collect rules reachable from trigger rules
@@ -2019,8 +1918,6 @@ static nlohmann::json serialize_parser_variant(const common_peg_parser_variant &
            };
        } else if constexpr (std::is_same_v<T, common_peg_gbnf_parser>) {
            return json{{"type", "gbnf"}, {"child", p.child}, {"grammar", p.grammar}};
-        } else if constexpr (std::is_same_v<T, common_peg_ac_parser>) {
-            return json{{"type", "ac"}, {"child", p.child}, {"delimiters", p.delimiters}};
        }
    }, variant);
 }
@@ -2193,16 +2090,6 @@ static common_peg_parser_variant deserialize_parser_variant(const nlohmann::json
        };
    }

-    if (type == "ac") {
-        if (!j.contains("child") || !j.contains("delimiters") || !j["delimiters"].is_array() || j["delimiters"].empty()) {
-            throw std::runtime_error("ac parser requires 'child' and a non-empty 'delimiters' array");
-        }
-        return common_peg_ac_parser{
-            j["child"].get<common_peg_parser_id>(),
-            j["delimiters"].get<std::vector<std::string>>(),
-        };
-    }
-
    throw std::runtime_error("Unknown parser type: " + type);
 }

@@ -3,8 +3,8 @@
 #include <nlohmann/json_fwd.hpp>

 #include <memory>
-#include <set>
 #include <unordered_map>
+#include <unordered_set>
 #include <string>
 #include <string_view>
 #include <functional>
@@ -275,11 +275,6 @@ struct common_peg_gbnf_parser {
    std::string grammar;
 };

-struct common_peg_ac_parser {
-    common_peg_parser_id child;
-    std::vector<std::string> delimiters;
-};
-
 // Variant holding all parser types
 using common_peg_parser_variant = std::variant<
    common_peg_epsilon_parser,
@@ -301,8 +296,7 @@ using common_peg_parser_variant = std::variant<
    common_peg_ref_parser,
    common_peg_atomic_parser,
    common_peg_tag_parser,
-    common_peg_gbnf_parser,
-    common_peg_ac_parser
+    common_peg_gbnf_parser
 >;

 class common_peg_arena {
@@ -341,7 +335,7 @@ class common_peg_arena {
    friend class common_peg_parser_builder;

  private:
-    std::string dump_impl(common_peg_parser_id id, std::set<common_peg_parser_id> & visited) const;
+    std::string dump_impl(common_peg_parser_id id, std::unordered_set<common_peg_parser_id> & visited) const;

    common_peg_parser_id add_parser(common_peg_parser_variant parser);
    void add_rule(const std::string & name, common_peg_parser_id id);
@@ -520,13 +514,6 @@ class common_peg_parser_builder {
    // the child's grammar. Parsing delegates entirely to the child.
    common_peg_parser gbnf(const common_peg_parser & p, const std::string & grammar) { return add(common_peg_gbnf_parser{p, grammar}); }

-    // Wraps a child parser but emits a GBNF grammar built from the Aho-Corasick
-    // automaton of `delimiters`, matching everything up to and including the
-    // first delimiter. Parsing delegates entirely to the child, which is
-    // responsible for consuming the delimiter (e.g. until(D) + literal(D)).
-    common_peg_parser ac(const common_peg_parser & p, const std::vector<std::string> & delimiters);
-    common_peg_parser ac(const common_peg_parser & p, const std::string & delimiter) { return ac(p, std::vector<std::string>{delimiter}); }
-
    void set_root(const common_peg_parser & p);

    common_peg_arena build();
@@ -16,6 +16,48 @@ static std::string rm_leading_dashes(const std::string & str) {
    return str.substr(pos);
 }

+// build the set of keys a remote preset may set; only a subset of args is allowed for security reasons
+// do not add more args unless absolutely necessary
+// args that output to files are strictly prohibited
+static std::set<std::string> get_remote_preset_whitelist(const std::map<std::string, common_arg> & key_to_opt) {
+    static const std::set<std::string> allowed_options = { // compared against the keys of key_to_opt below
+        "model-url",
+        "hf-repo",
+        "hf-repo-draft",
+        "hf-repo-v", // vocoder
+        "hf-file-v", // vocoder
+        "mmproj-url",
+        "pooling",
+        "jinja",
+        "batch-size",
+        "ubatch-size",
+        "cache-reuse",
+        "chat-template-kwargs",
+        "mmap",
+        // note: sampling params are automatically allowed by default
+        // negated args will be added automatically if the positive arg is specified above (presumably through opt.get_args(); confirm)
+    };
+
+    std::set<std::string> allowed_keys; // result: every accepted spelling of each allowed option
+
+    for (const auto & it : key_to_opt) {
+        const std::string & key = it.first;
+        const common_arg & opt = it.second;
+        if (allowed_options.find(key) != allowed_options.end() || opt.is_sampling) { // listed above, or flagged as a sampling option
+            allowed_keys.insert(key);
+            // also add variant keys (args without leading dashes and env vars)
+            for (const auto & arg : opt.get_args()) {
+                allowed_keys.insert(rm_leading_dashes(arg));
+            }
+            for (const auto & env : opt.get_env()) {
+                allowed_keys.insert(env);
+            }
+        }
+    }
+
+    return allowed_keys;
+}
+
 std::vector<std::string> common_preset::to_args(const std::string & bin_path) const {
    std::vector<std::string> args;

@@ -258,10 +300,16 @@ static std::string parse_bool_arg(const common_arg & arg, const std::string & ke
    return value;
 }

-common_preset_context::common_preset_context(llama_example ex)
+common_preset_context::common_preset_context(llama_example ex, bool only_remote_allowed)
        : ctx_params(common_params_parser_init(default_params, ex)) {
    common_params_add_preset_options(ctx_params.options);
    key_to_opt = get_map_key_opt(ctx_params);
+
+    // when only_remote_allowed is true, enable key filtering and load the remote-preset whitelist
+    if (only_remote_allowed) {
+        filter_allowed_keys = true;
+        allowed_keys = get_remote_preset_whitelist(key_to_opt); // computed from the key_to_opt map filled in above
+    }
 }

 common_presets common_preset_context::load_from_ini(const std::string & path, common_preset & global) const {
@@ -60,7 +60,7 @@ struct common_preset_context {
    std::set<std::string> allowed_keys;

    // if only_remote_allowed is true, only accept whitelisted keys
-    common_preset_context(llama_example ex);
+    common_preset_context(llama_example ex, bool only_remote_allowed = false);

    // load presets from INI file
    common_presets load_from_ini(const std::string & path, common_preset & global) const;
@@ -259,9 +259,6 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
             }
        }
    }
-    if (!grmr && !grammar_str.empty()) {
-        throw std::runtime_error("failed to parse grammar");
-    }

    // Compute prefill tokens from the generation prompt
    std::vector<llama_token> prefill_tokens;
@@ -161,10 +161,6 @@ struct common_speculative_impl {

    virtual void accept(llama_seq_id seq_id, uint16_t n_accepted, bool is_other) = 0;

-    // (optional) serialize/restore per-seq internal state (e.g. eagle3's deferred boundary).
-    virtual bool get_state(llama_seq_id /*seq_id*/, std::vector<uint8_t> & /*data*/) const { return false; }
-    virtual void set_state(llama_seq_id /*seq_id*/, const std::vector<uint8_t> & /*data*/) {}
-
    // true if this implementation requires the target context to extract post-norm embeddings
    virtual bool need_embd() const = 0;

@@ -845,49 +841,6 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
                    (size_t) n_embd_dec * sizeof(float));
    }

-    // we only need to stash the deferred boundary's g_embd row for recurrent/hybrid targets:
-    // their single-position checkpoints drop it on restore
-    bool need_boundary_stash() const {
-        const llama_model * model_tgt = llama_get_model(params.ctx_tgt);
-        return llama_model_is_recurrent(model_tgt) || llama_model_is_hybrid(model_tgt);
-    }
-
-    bool get_state(llama_seq_id seq_id, std::vector<uint8_t> & data) const override {
-        if (!need_boundary_stash()) {
-            return false;
-        }
-        if (seq_id < 0 || seq_id >= (llama_seq_id) n_seq || pending_pos_last[seq_id] < 0) {
-            return false;
-        }
-
-        const llama_pos          pos = pending_pos_last[seq_id];
-        const std::vector<float> & g = pending_g_last[seq_id];
-
-        data.resize(sizeof(llama_pos) + g.size() * sizeof(float));
-        std::memcpy(data.data(),                     &pos,     sizeof(llama_pos));
-        std::memcpy(data.data() + sizeof(llama_pos), g.data(), g.size() * sizeof(float));
-        return true;
-    }
-
-    void set_state(llama_seq_id seq_id, const std::vector<uint8_t> & data) override {
-        if (!need_boundary_stash()) {
-            return;
-        }
-        if (seq_id < 0 || seq_id >= (llama_seq_id) n_seq) {
-            return;
-        }
-        if (data.size() != sizeof(llama_pos) + (size_t) n_embd_dec * sizeof(float)) {
-            return;
-        }
-
-        llama_pos pos = -1;
-        std::memcpy(&pos, data.data(), sizeof(llama_pos));
-
-        pending_pos_last[seq_id] = pos;
-        pending_g_last[seq_id].resize(n_embd_dec);
-        std::memcpy(pending_g_last[seq_id].data(), data.data() + sizeof(llama_pos), (size_t) n_embd_dec * sizeof(float));
-    }
-
    bool need_embd() const override {
        return false;
    }
@@ -905,13 +858,7 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {

    int32_t n_embd = 0;

-    // One MTP draft driver, three modes (set once in the ctor):
-    //   is_mem_shared (gemma4): shares the target KV, runs all heads in one graph.
-    //   chain_heads (step35): n_mtp_layers trained heads, one per draft step.
-    //   neither (qwen35 / qwen35moe): a single trained MTP head.
-    int32_t n_mtp_layers  = 1;
-    bool    is_mem_shared = false;   // gemma4
-    bool    chain_heads   = false;   // derived in the ctor: n_mtp_layers > 1 && !is_mem_shared
+    bool is_mem_shared = false;

    // Per-sequence cross-batch carryover: pair (h_p, x_{p+1}) at MTP pos p+1.
    // The last h-row of one process() call needs the first token of the NEXT
@@ -926,8 +873,10 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
    std::vector<std::vector<float>> verify_h;
    std::vector<int32_t> verify_h_rows;

-    std::vector<int>                i_last;
-    std::vector<std::vector<float>> chain_h;
+    // Per-seq draft length from the last draft() call, used in accept() to
+    // roll back ctx_dft's recurrent state past the AR draft's redundant
+    // pre-advancement before process() mirrored the verify batch.
+    std::vector<uint16_t> last_n_drafted;

    common_speculative_impl_draft_mtp(const common_params_speculative & params, uint32_t n_seq)
        : common_speculative_impl(COMMON_SPECULATIVE_TYPE_DRAFT_MTP, n_seq)
@@ -940,7 +889,6 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
        n_embd = llama_model_n_embd_out(llama_get_model(ctx_dft));
        GGML_ASSERT(n_embd == llama_model_n_embd(llama_get_model(ctx_tgt)) &&
                "MTP input row width must match the target h_nextn width");
-        n_mtp_layers = std::max(1, (int) llama_model_n_layer_nextn(llama_get_model(ctx_dft)));

        LOG_INF("%s: adding speculative implementation 'draft-mtp'\n", __func__);
        LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%.2f, n_embd=%d, backend_sampling=%d\n", __func__, this->params.n_max, this->params.n_min, this->params.p_min, n_embd, (int) this->params.backend_sampling);
@@ -987,25 +935,16 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
        llama_set_embeddings_nextn(ctx_dft, true, /*masked*/ true);

        is_mem_shared = llama_get_ctx_other(ctx_dft) == ctx_tgt;
-        chain_heads   = n_mtp_layers > 1 && !is_mem_shared;
-
-        if (chain_heads) {
-            this->params.n_max = std::min(this->params.n_max, n_mtp_layers);
-
-            chain_h.assign(n_seq, {});
-            for (auto & c : chain_h) {
-                c.reserve((size_t) (this->params.n_max + 1) * n_embd);
-            }
-        }

        pending_h.assign(n_seq, std::vector<float>(n_embd, 0.0f));

-        i_last.assign(n_seq, -1);
        i_batch_beg.assign(n_seq, -1);
        i_batch_end.assign(n_seq, -1);

        verify_h.assign(n_seq, {});
        verify_h_rows.assign(n_seq, 0);
+
+        last_n_drafted.assign(n_seq, 0);
    }

    ~common_speculative_impl_draft_mtp() override {
@@ -1111,34 +1050,9 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
                set_h(i_batch_beg[seq_id], pending_h[seq_id].data());
            }

-            auto * mem_dft = llama_get_memory(ctx_dft);
-
-            bool ok = true;
-            for (int head = 0; head < n_mtp_layers; ++head) {
-                if (chain_heads) {
-                    // ref: https://github.com/ggml-org/llama.cpp/pull/24340/changes#r3413498544
-                    for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
-                        if (i_batch_beg[seq_id] < 0) {
-                            continue;
-                        }
-                        llama_memory_seq_rm(mem_dft, seq_id, batch_in.pos[i_batch_beg[seq_id]], -1);
-                    }
-                    llama_set_nextn_layer_offset(ctx_dft, head);
-                }
-
-                const int32_t rc = llama_decode(ctx_dft, batch);
-                if (rc != 0) {
-                    LOG_ERR("%s: llama_decode(ctx_dft) head=%d failed rc=%d (pos=%d)\n",
-                            __func__, head, (int) rc, (int) batch_in.pos[0]);
-                    ok = false;
-                    break;
-                }
-            }
-
-            if (chain_heads) {
-                llama_set_nextn_layer_offset(ctx_dft, 0); // restore default for non-draft decodes
-            }
-            if (!ok) {
+            const int32_t rc = llama_decode(ctx_dft, batch);
+            if (rc != 0) {
+                LOG_ERR("%s: llama_decode(ctx_dft) failed rc=%d (pos=%d)\n", __func__, (int) rc, (int) batch_in.pos[0]);
                return false;
            }
        }
@@ -1173,6 +1087,7 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
        int n_drafting = 0;
        std::vector<bool> drafting(n_seq);

+        const float * h_row = nullptr;
        const size_t row_bytes = (size_t) n_embd * sizeof(float);

        for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
@@ -1187,43 +1102,22 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
            common_sampler_reset(smpls[seq_id].get());

            common_batch_add(batch, dp.id_last, dp.n_past, { seq_id }, true);
-            std::memcpy(batch.embd + (size_t) (batch.n_tokens - 1) * n_embd, pending_h[seq_id].data(), row_bytes);

-            i_last[seq_id] = batch.n_tokens - 1;
+            h_row = pending_h[seq_id].data();
+            std::memcpy(batch.embd + n_embd*(batch.n_tokens - 1), h_row, row_bytes);
+        }

-            if (chain_heads) {
-                chain_h[seq_id].assign(pending_h[seq_id].begin(), pending_h[seq_id].end());
-            }
+        int ret = llama_decode(ctx_dft, batch);
+        if (ret != 0) {
+            LOG_WRN("%s: llama_decode returned %d\n", __func__, ret);
+            return;
        }

        int i = 0;

        while (n_drafting > 0) {
-            // each step decodes under a different head, i.e. a different decoder layer, and
-            // KV is per layer. process() filled this layer's KV only for positions < n_past
-            // (prompt + accepted prefix) — nothing in the draft region yet. so reset the
-            // draft region (the seq_rm lower bound is n_past, leaving the prompt KV intact)
-            // and select head i so it rebuilds its own layer's KV there; decoding just the
-            // latest token would leave its attention reading cells only another head wrote.
-            if (chain_heads) {
-                auto * mem_dft = llama_get_memory(ctx_dft);
-                for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
-                    if (drafting[seq_id]) {
-                        llama_memory_seq_rm(mem_dft, seq_id, dparams[seq_id].n_past, -1);
-                    }
-                }
-                llama_set_nextn_layer_offset(ctx_dft, i);
-            }
+            int i_batch = 0;

-            int ret = llama_decode(ctx_dft, batch);
-            if (ret != 0) {
-                LOG_WRN("%s: llama_decode[%d] returned %d\n", __func__, i, ret);
-                break;
-            }
-
-            // rebuild the batch for the next step: the growing-KV paths re-add only the
-            // new token (the KV already holds the prefix), while chained heads re-add the
-            // whole prefix at the next head. dropped sequences are simply not re-added.
            common_batch_clear(batch);

            for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
@@ -1233,8 +1127,9 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {

                auto * smpl = smpls[seq_id].get();

-                common_sampler_sample(smpl, ctx_dft, i_last[seq_id], true);
-                const float * h_row = llama_get_embeddings_nextn_ith(ctx_dft, i_last[seq_id]);
+                common_sampler_sample(smpl, ctx_dft, i_batch, true);
+                h_row = llama_get_embeddings_nextn_ith(ctx_dft, i_batch);
+                ++i_batch;

                const auto * cur_p = common_sampler_get_candidates(smpl, true);

@@ -1268,39 +1163,28 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
                    continue;
                }

-                if (chain_heads) {
-                    // ref: https://github.com/ggml-org/llama.cpp/pull/24340#discussion_r3448031546
-                    chain_h[seq_id].insert(chain_h[seq_id].end(), h_row, h_row + n_embd);
-
-                    const int n_rows = (int) result.size() + 1; // id_last + tokens drafted so far
-                    for (int t = 0; t < n_rows; ++t) {
-                        const llama_token tok = (t == 0) ? dp.id_last : result[t - 1];
-                        common_batch_add(batch, tok, dp.n_past + t, { seq_id }, t == n_rows - 1);
-                        std::memcpy(batch.embd + (size_t) (batch.n_tokens - 1) * n_embd,
-                                    chain_h[seq_id].data() + (size_t) t * n_embd, row_bytes);
-                    }
-                } else if (is_mem_shared) {
+                if (is_mem_shared) {
                    // note: with shared memory (e.g. Gemma4 assistants) we use the same position for all draft tokens
                    // ref: https://github.com/huggingface/transformers/blob/effde20942e3f82a1b97449f60b3a48c5ff96145/docs/source/en/model_doc/gemma4_assistant.md?plain=1#L36-L37
                    common_batch_add(batch, id, dp.n_past, { seq_id }, true);
-                    std::memcpy(batch.embd + (size_t) (batch.n_tokens - 1) * n_embd, h_row, row_bytes);
                } else {
                    common_batch_add(batch, id, dp.n_past + i + 1, { seq_id }, true);
-                    std::memcpy(batch.embd + (size_t) (batch.n_tokens - 1) * n_embd, h_row, row_bytes);
                }
-
-                i_last[seq_id] = batch.n_tokens - 1;
+                std::memcpy(batch.embd + n_embd*(batch.n_tokens - 1), h_row, row_bytes);
            }

            if (batch.n_tokens == 0) {
                break;
            }

-            ++i;
-        }
+            // evaluate the drafted tokens on the draft model
+            ret = llama_decode(ctx_dft, batch);
+            if (ret != 0) {
+                LOG_WRN("%s: llama_decode[%d] returned %d\n", __func__, i, ret);
+                break;
+            }

-        if (chain_heads) {
-            llama_set_nextn_layer_offset(ctx_dft, 0); // restore default for non-draft decodes
+            ++i;
        }

        for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
@@ -1312,6 +1196,8 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
            if (dp.result->size() < (size_t) params.n_min) {
                dp.result->clear();
            }
+
+            last_n_drafted[seq_id] = (uint16_t) dp.result->size();
        }
    }

@@ -1924,7 +1810,7 @@ common_speculative * common_speculative_init(common_params_speculative & params,

        bool has_draft_simple = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE));
        bool has_draft_eagle3 = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3)) && params.draft.ctx_dft != nullptr;
-        bool has_draft_mtp    = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_MTP))    && params.draft.ctx_dft != nullptr;
+        bool has_mtp = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_MTP)) && params.draft.ctx_dft != nullptr;



@@ -1962,7 +1848,7 @@ common_speculative * common_speculative_init(common_params_speculative & params,
        if (has_draft_eagle3) {
            configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3, params));
        }
-        if (has_draft_mtp) {
+        if (has_mtp) {
            configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_DRAFT_MTP, params));
        }
    }
@@ -2232,31 +2118,6 @@ void common_speculative_accept(common_speculative * spec, llama_seq_id seq_id, u
    }
 }

-// TODO: support the case of more than one speculative implementations having a state
-bool common_speculative_get_state(common_speculative * spec, llama_seq_id seq_id, std::vector<uint8_t> & data) {
-    if (spec == nullptr) {
-        return false;
-    }
-
-    for (auto & impl : spec->impls) {
-        if (impl->get_state(seq_id, data)) {
-            return true;
-        }
-    }
-
-    return false;
-}
-
-void common_speculative_set_state(common_speculative * spec, llama_seq_id seq_id, const std::vector<uint8_t> & data) {
-    if (spec == nullptr) {
-        return;
-    }
-
-    for (auto & impl : spec->impls) {
-        impl->set_state(seq_id, data);
-    }
-}
-
 void common_speculative_print_stats(const common_speculative * spec) {
    if (spec == nullptr) {
        return;
@@ -68,10 +68,6 @@ void common_speculative_draft(common_speculative * spec);
 // informs the speculative context that n_accepted tokens were accepted by the target model
 void common_speculative_accept(common_speculative * spec, llama_seq_id, uint16_t n_accepted);

-// (optional) get/set internal state
-bool common_speculative_get_state(common_speculative * spec, llama_seq_id seq_id, std::vector<uint8_t> & data);
-void common_speculative_set_state(common_speculative * spec, llama_seq_id seq_id, const std::vector<uint8_t> & data);
-
 // print statistics about the speculative decoding
 void common_speculative_print_stats(const common_speculative * spec);

@@ -46,7 +46,6 @@ TEXT_MODEL_MAP: dict[str, str] = {
    "DbrxForCausalLM": "dbrx",
    "DeciLMForCausalLM": "deci",
    "DeepseekForCausalLM": "deepseek",
-    "DeepseekOCRForCausalLM": "deepseek",
    "DeepseekV2ForCausalLM": "deepseek",
    "DeepseekV3ForCausalLM": "deepseek",
    "DeepseekV32ForCausalLM": "deepseek",
@@ -97,7 +96,6 @@ TEXT_MODEL_MAP: dict[str, str] = {
    "GraniteMoeHybridForCausalLM": "granite",
    "GraniteMoeSharedForCausalLM": "granite",
    "GraniteSpeechForConditionalGeneration": "granite",
-    "GraniteSpeechPlusForConditionalGeneration": "granite",
    "Grok1ForCausalLM": "grok",
    "GrokForCausalLM": "grok",
    "GroveMoeForCausalLM": "grovemoe",
@@ -125,7 +123,6 @@ TEXT_MODEL_MAP: dict[str, str] = {
    "LLaDAModelLM": "llada",
    "LLaMAForCausalLM": "llama",
    "Lfm25AudioTokenizer": "lfm2",
-    "Lfm2BidirectionalModel": "lfm2",
    "Lfm2ForCausalLM": "lfm2",
    "Lfm2Model": "lfm2",
    "Lfm2MoeForCausalLM": "lfm2",
@@ -136,7 +133,6 @@ TEXT_MODEL_MAP: dict[str, str] = {
    "LlamaModel": "llama",
    "Eagle3DraftModel": "llama",
    "Eagle3Speculator": "llama",
-    "Eagle3LlamaForCausalLM": "llama",
    "LlamaForCausalLMEagle3": "llama",
    "LlavaForConditionalGeneration": "llama",
    "LlavaStableLMEpochForCausalLM": "stablelm",
@@ -235,7 +231,6 @@ TEXT_MODEL_MAP: dict[str, str] = {
    "UMT5ForConditionalGeneration": "t5",
    "UMT5Model": "t5",
    "UltravoxModel": "ultravox",
-    "UnlimitedOCRForCausalLM": "deepseek",
    "VLlama3ForCausalLM": "llama",
    "VoxtralForConditionalGeneration": "llama",
    "WavTokenizerDec": "wavtokenizer",
@@ -266,7 +261,6 @@ MMPROJ_MODEL_MAP: dict[str, str] = {
    "GlmasrModel": "ultravox",
    "Granite4VisionForConditionalGeneration": "granite",
    "GraniteSpeechForConditionalGeneration": "granite",
-    "GraniteSpeechPlusForConditionalGeneration": "granite",
    "HunYuanVLForConditionalGeneration": "hunyuan",
    "Idefics3ForConditionalGeneration": "smolvlm",
    "InternVisionModel": "internvl",
@@ -302,7 +296,6 @@ MMPROJ_MODEL_MAP: dict[str, str] = {
    "StepVLForConditionalGeneration": "step3",
    "Step3p7ForConditionalGeneration": "step3",
    "UltravoxModel": "ultravox",
-    "UnlimitedOCRForCausalLM": "deepseek",
    "VoxtralForConditionalGeneration": "ultravox",
    "YoutuVLForConditionalGeneration": "youtuvl",
 }
@@ -126,7 +126,7 @@ class BailingMoeV2Model(TextModel):
        if (rope_dim := hparams.get("head_dim")) is None:
            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]

-        self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.rope_parameters.get("partial_rotary_factor", 0.5)))
+        self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
        self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
        self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
@@ -1119,10 +1119,8 @@ class TextModel(ModelBase):

        rope_theta = self.find_hparam(["global_rope_theta", "rope_global_theta", "rope_theta_global", "rope_theta", "rotary_emb_base"], optional=True)
        local_rope_theta = self.find_hparam(["local_rope_theta", "rope_local_theta", "rope_theta_local", "swa_rope_theta", "rope_local_base_freq"], optional=True)
-        partial_rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct", "rope_percent"], optional=True)
-        original_max_position_embeddings = self.find_hparam(["original_max_position_embeddings"], optional=True)

-        # Ensure global params are mirrored in rope_parameters
+        # Ensure "rope_theta" and "rope_type" are mirrored in rope_parameters
        if "full_attention" not in self.rope_parameters and "sliding_attention" not in self.rope_parameters:
            if local_rope_theta is not None:
                self.rope_parameters["sliding_attention"] = {"rope_theta": local_rope_theta}
@@ -1130,10 +1128,6 @@ class TextModel(ModelBase):
                self.rope_parameters["rope_theta"] = rope_theta
            if "rope_type" not in self.rope_parameters and (rope_type := self.rope_parameters.get("type")) is not None:
                self.rope_parameters["rope_type"] = rope_type
-            if "partial_rotary_factor" not in self.rope_parameters and partial_rotary_factor is not None:
-                self.rope_parameters["partial_rotary_factor"] = partial_rotary_factor
-            if "original_max_position_embeddings" not in self.rope_parameters and original_max_position_embeddings is not None:
-                self.rope_parameters["original_max_position_embeddings"] = original_max_position_embeddings

    @classmethod
    def __init_subclass__(cls):
@@ -148,7 +148,7 @@ class ChatGLMModel(TextModel):
            rope_dim = self.hparams["attention_dim"]
        else:
            rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
-        self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.rope_parameters.get("partial_rotary_factor", 0.5)))
+        self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
        self.gguf_writer.add_add_bos_token(False)
        rope_freq = 10000
        if "rope_ratio" in self.hparams:
@@ -161,7 +161,7 @@ class DeciModel(TextModel):
                factor = rope_params.get("factor", 8.0)
                low_freq_factor = rope_params.get("low_freq_factor", 1.0)
                high_freq_factor = rope_params.get("high_freq_factor", 4.0)
-                old_context_len = rope_params.get("original_max_position_embeddings", 8192)
+                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)

                low_freq_wavelen = old_context_len / low_freq_factor
                high_freq_wavelen = old_context_len / high_freq_factor
@@ -14,7 +14,7 @@ from .base import MmprojModel, ModelBase, TextModel, gguf, logger
 from .qwen import QwenModel


-@ModelBase.register("DeepseekOCRForCausalLM", "UnlimitedOCRForCausalLM")
+@ModelBase.register("DeepseekOCRForCausalLM")
 class DeepseekOCRVisionModel(MmprojModel):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
@@ -205,8 +205,6 @@ class DeepseekModel(TextModel):
@ModelBase.register(
    "DeepseekV2ForCausalLM",
    "DeepseekV3ForCausalLM",
-    "DeepseekOCRForCausalLM",
-    "UnlimitedOCRForCausalLM",
    "KimiVLForConditionalGeneration",
    "KimiK25ForConditionalGeneration",
    "YoutuForCausalLM",
@@ -226,7 +224,7 @@ class DeepseekV2Model(TextModel):
        self.origin_hf_arch = hparams.get('architectures', [None])[0]

        # special handling for Deepseek OCR
-        if self.origin_hf_arch in ("DeepseekOCRForCausalLM", "DeepseekOCR2ForCausalLM", "UnlimitedOCRForCausalLM"):
+        if self.origin_hf_arch in ("DeepseekOCRForCausalLM", "DeepseekOCR2ForCausalLM"):
            self.model_arch = gguf.MODEL_ARCH.DEEPSEEK2OCR
            self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch]
            self.gguf_writer.add_architecture()
@@ -352,12 +350,6 @@ class DeepseekV2Model(TextModel):

        self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])

-        # Unlimited-OCR sliding window; written for metadata, the decoder ignores it (full MHA)
-        if is_ocr:
-            sliding_window = hparams.get("sliding_window_size") or hparams.get("sliding_window")
-            if sliding_window:
-                self.gguf_writer.add_sliding_window(sliding_window)
-
        if (rope_mscale_all := self.rope_parameters.get("mscale_all_dim")) is not None:
            # [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
            # note: for legacy reasons, this is not consistent with the other usages of self.gguf_writer.add_rope_scaling_yarn_log_mul
@@ -24,7 +24,7 @@ class ExaoneModel(TextModel):

        assert (hparams["activation_function"] == "silu")

-        rotary_factor = self.rope_parameters.get("partial_rotary_factor")
+        rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"], optional=True)
        rotary_factor = rotary_factor if rotary_factor is not None else 1.0
        self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))

@@ -39,7 +39,7 @@ class ExaoneModel(TextModel):
                factor = rope_params.get("factor", 8.0)
                low_freq_factor = rope_params.get("low_freq_factor", 1.0)
                high_freq_factor = rope_params.get("high_freq_factor", 4.0)
-                old_context_len = rope_params.get("original_max_position_embeddings", 8192)
+                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)

                low_freq_wavelen = old_context_len / low_freq_factor
                high_freq_wavelen = old_context_len / high_freq_factor
@@ -104,7 +104,7 @@ class Exaone4Model(TextModel):
                factor = rope_params.get("factor", 16.0)
                low_freq_factor = rope_params.get("low_freq_factor", 1.0)
                high_freq_factor = rope_params.get("high_freq_factor", 4.0)
-                old_context_len = rope_params.get("original_max_position_embeddings", 8192)
+                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)

                low_freq_wavelen = old_context_len / low_freq_factor
                high_freq_wavelen = old_context_len / high_freq_factor
@@ -693,7 +693,7 @@ class Gemma4Model(Gemma3Model):
            self.gguf_writer.add_head_count_kv(value_arr)

        # handle n_rot differently for global vs swa layers
-        partial_rotary_factor_swa = self.rope_parameters.get("partial_rotary_factor", 1.0)
+        partial_rotary_factor_swa = self.hparams.get("partial_rotary_factor", 1.0)
        n_rot_full = int(head_dim_full) # "proportional" is used, see generate_extra_tensors
        n_rot_swa = int(head_dim_swa * partial_rotary_factor_swa)
        self.gguf_writer.add_rope_dimension_count(n_rot_full)
@@ -124,7 +124,7 @@ class Glm4MoeModel(TextModel):
                self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
            )
        self.gguf_writer.add_rope_dimension_count(
-            int(rope_dim * self.rope_parameters.get("partial_rotary_factor", 0.5))
+            int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5))
        )

        # MoE parameters - Use only routed expert count (shared experts handled separately)
@@ -226,7 +226,7 @@ class GlmMoeDsaModel(DeepseekV2Model):
        super().set_gguf_parameters()

        rope_dim = self.hparams["qk_rope_head_dim"]
-        partial_rotary_factor = self.rope_parameters.get("partial_rotary_factor", 1.0)
+        partial_rotary_factor = self.hparams.get("partial_rotary_factor", 1.0)
        self.gguf_writer.add_rope_dimension_count(int(rope_dim * partial_rotary_factor))

        # NextN/MTP prediction layers
@@ -348,34 +348,6 @@ class GraniteSpeechMmprojModel(MmprojModel):
        yield from super().modify_tensors(data_torch, name, bid)


-@ModelBase.register("GraniteSpeechPlusForConditionalGeneration")
-class GraniteSpeechPlusMmprojModel(GraniteSpeechMmprojModel):
-    """Conversion for GraniteSpeechPlus - extends GraniteSpeech with feature layer concatenation"""
-    has_vision_encoder = False
-    has_audio_encoder = True
-
-    def set_gguf_parameters(self):
-        assert self.hparams_audio is not None
-        super().set_gguf_parameters()
-
-        # Add feature_layer if present in encoder config
-        if feature_layers := self.hparams_audio.get("cat_hidden_layers"):
-            self.gguf_writer.add_audio_feature_layers(feature_layers)
-            logger.info(f"gguf: audio feature_layers = {feature_layers}")
-
-            # Validate projector dimension matches concatenated encoder output
-            hidden_dim = self.hparams_audio["hidden_dim"]
-            expected_dim = hidden_dim * (len(feature_layers) + 1)
-            projector_dim = self.global_config["projector_config"]["encoder_hidden_size"]
-
-            if projector_dim != expected_dim:
-                raise ValueError(
-                    f"Projector encoder_hidden_size ({projector_dim}) does not match "
-                    f"expected concatenated dimension ({expected_dim}). "
-                    f"Expected: hidden_dim ({hidden_dim}) * (len(feature_layers) + 1) = {expected_dim}"
-                )
-
-
@ModelBase.register("Granite4VisionForConditionalGeneration")
 class Granite4VisionMmprojModel(MmprojModel):
    has_vision_encoder = True
@@ -64,17 +64,11 @@ class LFM2Model(TextModel):
        yield from super().modify_tensors(data_torch, name, bid)


-@ModelBase.register("Lfm2Model", "Lfm2BidirectionalModel")
+@ModelBase.register("Lfm2Model")
 class LFM2ColBertModel(LFM2Model):
    model_arch = gguf.MODEL_ARCH.LFM2
    dense_tensor_name = "dense_2"

-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-        if self.hf_arch == "Lfm2BidirectionalModel":
-            self.gguf_writer.add_causal_attention(False)
-        self._try_set_pooling_type()
-
    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        if not name.startswith(self.dense_tensor_name):
            name = "model." + name
@@ -82,11 +76,10 @@ class LFM2ColBertModel(LFM2Model):
        yield from super().modify_tensors(data_torch, name, bid)

    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
-        # optional dense tensor is stored in a separate safetensors file
+        # dense tensor is stored in a separate safetensors file
        from safetensors.torch import load_file
        tensors_file = self.dir_model / "1_Dense" / "model.safetensors"
-        if not tensors_file.is_file():
-            return
+        assert tensors_file.is_file()
        tensor = load_file(tensors_file)["linear.weight"]
        self.gguf_writer.add_embedding_length_out(tensor.shape[0])
        yield f"{self.dense_tensor_name}.weight", tensor.clone()
@@ -23,7 +23,6 @@ from .base import ModelBase, TextModel, gguf, logger
    "LlavaForConditionalGeneration",
    "VoxtralForConditionalGeneration",
    "LlamaForCausalLMEagle3",
-    "Eagle3LlamaForCausalLM",
    "Eagle3Speculator",
    "Eagle3DraftModel",
    "IQuestCoderForCausalLM",
@@ -290,7 +289,7 @@ class LlamaModel(TextModel):
                factor = rope_params.get("factor", 8.0)
                low_freq_factor = rope_params.get("low_freq_factor", 1.0)
                high_freq_factor = rope_params.get("high_freq_factor", 4.0)
-                old_context_len = rope_params.get("original_max_position_embeddings", 8192)
+                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)

                low_freq_wavelen = old_context_len / low_freq_factor
                high_freq_wavelen = old_context_len / high_freq_factor
@@ -114,8 +114,7 @@ class Mamba2Model(TextModel):
            hparams["text_config"] = hparams["llm_config"]
        super().__init__(dir_model, *args, hparams=hparams, **kwargs)
        self.d_model = self.find_hparam(["hidden_size", "d_model", "dim"])
-        self.expand = self.find_hparam(["mamba_expand", "expand"], optional=True) or 2
-        self.d_inner = self.find_hparam(["mamba_d_ssm", "intermediate_size", "d_inner"], optional=True) or self.expand * self.d_model
+        self.d_inner = self.find_hparam(["mamba_d_ssm", "intermediate_size", "d_inner"], optional=True) or 2 * self.d_model
        self.n_group = self.find_hparam(["n_groups"], optional=True) or 1

    def set_vocab(self):
@@ -145,9 +144,11 @@ class Mamba2Model(TextModel):

        rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5

+        # Fail early for models which don't have a block expansion factor of 2
+        # TODO: does this really matter?
        # skip the assertion for FalconH1 Model
        if self.model_arch != gguf.MODEL_ARCH.FALCON_H1:
-            assert self.d_inner == self.expand * self.d_model
+            assert self.d_inner == 2 * self.d_model
            assert self.d_inner % head_dim == 0

        self.gguf_writer.add_context_length(2**20)  # arbitrary value; for those who use the default
@@ -154,7 +154,7 @@ class MimoV2Model(TextModel):
        self.gguf_writer.add_expert_count(self.hparams["n_routed_experts"])
        self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"])

-        rope_dim = int(self.hparams["head_dim"] * self.rope_parameters["partial_rotary_factor"])
+        rope_dim = int(self.hparams["head_dim"] * self.hparams["partial_rotary_factor"])
        self.gguf_writer.add_rope_dimension_count(rope_dim)

        self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("layernorm_epsilon", 1e-5))
@@ -32,9 +32,11 @@ class MiniCPMModel(TextModel):
    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
        rope_dims = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]

-        long_factors = self.rope_parameters.get('long_factor')
-        short_factors = self.rope_parameters.get('short_factor')
-        if long_factors or short_factors:
+        rope_scaling = self.find_hparam(['rope_scaling'], True)
+        if rope_scaling is not None:
+            long_factors = rope_scaling.get('long_factor', None)
+            short_factors = rope_scaling.get('short_factor', None)
+
            if long_factors is None or short_factors is None:
                raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')

@@ -83,11 +85,13 @@ class MiniCPM3Model(TextModel):
        self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])

    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
-        long_factors = self.rope_parameters.get('long_factor')
-        short_factors = self.rope_parameters.get('short_factor')
-        if long_factors or short_factors:
+        rope_scaling = self.find_hparam(['rope_scaling'], True)
+        if rope_scaling is not None:
            rope_dims = self.hparams["qk_rope_head_dim"]

+            long_factors = rope_scaling.get('long_factor', None)
+            short_factors = rope_scaling.get('short_factor', None)
+
            if long_factors is None or short_factors is None:
                raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')

@@ -125,18 +125,17 @@ class NemotronModel(TextModel):
        self.gguf_writer.add_layer_norm_eps(f_norm_eps)

        # * Partial RoPE
-        rot_pct = self.rope_parameters["partial_rotary_factor"]
+        rot_pct = self.find_hparam(["partial_rotary_factor", "rope_pct", "rope_percent"])
        n_embd = self.find_hparam(["hidden_size", "n_embd"])
        n_head = self.find_hparam(["num_attention_heads", "n_head"])
        self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)

        # * RopeScaling for Nemotron
-        factor = self.hparams.get("factor") or self.rope_parameters.get("factor")
-        if factor is None:
+        if "rope_scaling" not in self.hparams or self.hparams["rope_scaling"] is None:
            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
        else:
            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
-            self.gguf_writer.add_rope_scaling_factor(factor)
+            self.gguf_writer.add_rope_scaling_factor(self.hparams["factor"])

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        # * Adding +1 to LayerNorm's weights here to implement layernorm1p w/o changing anything on the GGML engine side
@@ -18,7 +18,7 @@ class Phi2Model(TextModel):
    model_arch = gguf.MODEL_ARCH.PHI2

    def set_gguf_parameters(self):
-        rot_pct = self.rope_parameters["partial_rotary_factor"]
+        rot_pct = self.find_hparam(["partial_rotary_factor"])
        n_embd = self.find_hparam(["hidden_size", "n_embd"])
        n_head = self.find_hparam(["num_attention_heads", "n_head"])

@@ -149,8 +149,8 @@ class Phi3MiniModel(TextModel):
        n_head_kv = self.find_hparam(["num_key_value_heads", "n_head_kv"])
        rms_eps = self.find_hparam(["rms_norm_eps"])
        max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
-        orig_max_pos_embds = self.rope_parameters["original_max_position_embeddings"]
-        rot_pct = self.rope_parameters.get("partial_rotary_factor", 1.0)
+        orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
+        rot_pct = self.hparams.get("partial_rotary_factor", 1.0)
        rope_dims = int(rot_pct * n_embd) // n_head

        self.gguf_writer.add_context_length(max_pos_embds)
@@ -174,19 +174,18 @@ class Phi3MiniModel(TextModel):
        n_embd = self.find_hparam(["hidden_size", "n_embd"])
        n_head = self.find_hparam(["num_attention_heads", "n_head"])
        max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
-        orig_max_pos_embds = self.rope_parameters["original_max_position_embeddings"]
-        rot_pct = self.rope_parameters.get("partial_rotary_factor", 1.0)
+        orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
+        rot_pct = self.hparams.get("partial_rotary_factor", 1.0)
        rope_dims = int(rot_pct * n_embd) // n_head

        # write rope scaling for long context (128k) model
-        long_factors = self.rope_parameters.get('long_factor')
-        short_factors = self.rope_parameters.get('short_factor')
-        if not long_factors:
+        rope_scaling = self.find_hparam(['rope_scaling'], True)
+        if rope_scaling is None:
            return

        scale = max_pos_embds / orig_max_pos_embds

-        rope_scaling_type = self.rope_parameters.get('rope_type', '').lower()
+        rope_scaling_type = rope_scaling.get('rope_type', rope_scaling.get('type', '')).lower()
        if len(rope_scaling_type) == 0:
            raise KeyError('Missing the required key rope_scaling.type')

@@ -199,6 +198,9 @@ class Phi3MiniModel(TextModel):

        self.gguf_writer.add_rope_scaling_attn_factors(attn_factor)

+        long_factors = rope_scaling.get('long_factor', None)
+        short_factors = rope_scaling.get('short_factor', None)
+
        if long_factors is None or short_factors is None:
            raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')

@@ -280,7 +280,7 @@ class Qwen3NextModel(Qwen2MoeModel):
        self.gguf_writer.add_full_attention_interval(self.hparams.get("full_attention_interval", 4))
        if (rope_dim := self.hparams.get("head_dim")) is None:
            rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
-        self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.rope_parameters.get("partial_rotary_factor", 0.25)))
+        self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.25)))

    @classmethod
    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
@@ -28,7 +28,7 @@ class StableLMModel(TextModel):
        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
        self.gguf_writer.add_block_count(self.block_count)
        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
-        rotary_factor = self.rope_parameters["partial_rotary_factor"]
+        rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"])
        self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
        self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"])
@@ -314,7 +314,7 @@ class Step35Model(TextModel):
        factor = float(rope_params.get("factor", 8.0))
        low_freq_factor = float(rope_params.get("low_freq_factor", 1.0))
        high_freq_factor = float(rope_params.get("high_freq_factor", 4.0))
-        old_context_len = int(rope_params.get("original_max_position_embeddings", 8192))
+        old_context_len = int(rope_params.get("original_max_position_embeddings", self.hparams.get("original_max_position_embeddings", 8192)))

        low_freq_wavelen = old_context_len / low_freq_factor
        high_freq_wavelen = old_context_len / high_freq_factor
@@ -29,7 +29,7 @@ With Termux, you can install and run `llama.cpp` as if the environment were Linu

 ```
 $ apt update && apt upgrade -y
-$ apt install git cmake libandroid-spawn
+$ apt install git cmake
 ```

 Then, follow the [build instructions](https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md), specifically for CMake.
@@ -237,8 +237,8 @@ chmod +x ubuntu-llamacpp-ov-install.sh
 # ============================================
 set -euo pipefail

-OPENVINO_VERSION_MAJOR="2026.2.1"
-OPENVINO_VERSION_FULL="2026.2.1.21919.ede283a88e3"
+OPENVINO_VERSION_MAJOR="2026.2"
+OPENVINO_VERSION_FULL="2026.2.0.21903.52ddc073857"

 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 OPENVINO_INSTALL_DIR="/opt/intel/openvino_${OPENVINO_VERSION_MAJOR}"
@@ -334,7 +334,7 @@ echo "  ./build/ReleaseOV/bin/llama-cli -m model.gguf"
 ```

 > [!NOTE]
-> The script pins OpenVINO `2026.2.1` via the `OPENVINO_VERSION_MAJOR` / `OPENVINO_VERSION_FULL` variables at the top — edit them to track a different release.
+> The script pins OpenVINO `2026.2` via the `OPENVINO_VERSION_MAJOR` / `OPENVINO_VERSION_FULL` variables at the top — edit them to track a different release.

 </details>

@@ -364,8 +364,8 @@ REM ============================================
 REM llama.cpp OpenVINO Build Script (Ninja)
 REM ============================================

-set "OPENVINO_VERSION_MAJOR=2026.2.1"
-set "OPENVINO_VERSION_FULL=2026.2.1.21919.ede283a88e3"
+set "OPENVINO_VERSION_MAJOR=2026.2"
+set "OPENVINO_VERSION_FULL=2026.2.0.21903.52ddc073857"

 set "SCRIPT_DIR=%~dp0"
 set "VCPKG_DIR=C:\vcpkg"
@@ -547,7 +547,7 @@ endlocal
 ```

 > [!NOTE]
-> The script pins OpenVINO `2026.2.1` via the `OPENVINO_VERSION_MAJOR` / `OPENVINO_VERSION_FULL` variables at the top — edit them to track a different release. From any new shell, source the matching `setupvars` script via the junction — `call "C:\Intel\openvino\setupvars.bat"` from `cmd`, or `& "C:\Intel\openvino\setupvars.ps1"` from PowerShell. If `winget` cannot register Visual Studio Build Tools on first run, install them once manually and re-run the script from an elevated **Developer Command Prompt for VS 2022**.
+> The script pins OpenVINO `2026.2` via the `OPENVINO_VERSION_MAJOR` / `OPENVINO_VERSION_FULL` variables at the top — edit them to track a different release. From any new shell, source the matching `setupvars` script via the junction — `call "C:\Intel\openvino\setupvars.bat"` from `cmd`, or `& "C:\Intel\openvino\setupvars.ps1"` from PowerShell. If `winget` cannot register Visual Studio Build Tools on first run, install them once manually and re-run the script from an elevated **Developer Command Prompt for VS 2022**.

 </details>

@@ -161,64 +161,6 @@ You could update your test result in it directly.

 Please refer to [Docker with SYCL](../docker.md#docker-with-sycl) for details.

-## Quick Development WOW
-
-This chapter is for quick development & try with SYCL backend on Intel GPU.
-
-You need to install following sofeware before development:
-   - Intel GPU driver
-   - oneAPI package
-   - other development tools.
-
-Please refer to [Linux](#linux) or [Windows](#windows-1) for above installation and resolve the trouble in usage. There are the detailed guide.
-
- Linux
-
-```
-## build from source code
-./examples/sycl/build.sh
-
-## run CONV_2D_DW unit test cases
-./build/bin/test-backend-ops -b SYCL0 -o CONV_2D_DW
-
-## run all unit test cases
-./build/bin/test-backend-ops -b SYCL0
-
-## run with LLM on the first GPU
-./examples/sycl/test.sh -mg 0 -m xxxx.gguf
-
-## run service with LLM on the first GPU
-export ONEAPI_DEVICE_SELECTOR="level_zero:0"
-./examples/sycl/start-svr.sh -m xxxx.gguf
-
-## update the docs/ops.md for new/update OPs
-./examples/sycl/update-ops-doc.sh
-```
-
- Windows
-
-```
-## build from source code
-examples\sycl\win-build-sycl.bat
-
-## run CONV_2D_DW unit test cases
-build\bin\test-backend-ops.exe -b SYCL0 -o CONV_2D_DW
-
-## run all unit test cases
-build\bin\test-backend-ops.exe -b SYCL0
-
-## run LLM on the first GPU
-examples\sycl\win-test.bat -mg 0 -m xxxx.gguf
-
-## run service with LLM on the first GPU
-set ONEAPI_DEVICE_SELECTOR="level_zero:0"
-examples\sycl\win-start-svr.bat -m xxxx.gguf
-
-## update the docs/ops.md for new/update OPs
-examples\sycl\win-update-ops-doc.bat
-```
-
-
 ## Linux

 ### I. Setup Environment
@@ -413,15 +355,6 @@ In two device selection modes, the default SYCL backend is level_zero, you can c
 |------------------|----------------------------------------|
 | Single device    | --split-mode none --main-gpu DEVICE_ID |
 | Multiple devices | --split-mode layer (default)           |
-| Multiple devices | --split-mode tensor (tensor parallelism) |
-
-`--split-mode tensor` (tensor parallelism) shards each layer across the selected
-GPUs. It requires flash attention, which is auto-enabled when `--flash-attn` is
-left at its default `auto`, so `--split-mode tensor` works out of the box.
-Passing `--flash-attn off` together with `--split-mode tensor` is rejected at
-context creation. The default `f16` KV cache is recommended. Tensor parallelism
-is currently optimized for 2 GPUs; other device counts fall back to a generic
-all-reduce.

 Examples:

@@ -724,15 +657,6 @@ In two device selection modes, the default SYCL backend is level_zero, you can c
 |------------------|----------------------------------------|
 | Single device    | --split-mode none --main-gpu DEVICE_ID |
 | Multiple devices | --split-mode layer (default)           |
-| Multiple devices | --split-mode tensor (tensor parallelism) |
-
-`--split-mode tensor` (tensor parallelism) shards each layer across the selected
-GPUs. It requires flash attention, which is auto-enabled when `--flash-attn` is
-left at its default `auto`, so `--split-mode tensor` works out of the box.
-Passing `--flash-attn off` together with `--split-mode tensor` is rejected at
-context creation. The default `f16` KV cache is recommended. Tensor parallelism
-is currently optimized for 2 GPUs; other device counts fall back to a generic
-all-reduce.

 Examples:

@@ -777,7 +701,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
 | GGML_SYCL_GRAPH    | ON *(default)* \|OFF *(Optional)*     | Enable build with [SYCL Graph extension](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc). |
 | GGML_SYCL_DNN      | ON *(default)* \|OFF *(Optional)*     | Enable build with oneDNN.                   |
 | GGML_SYCL_HOST_MEM_FALLBACK | ON *(default)* \|OFF *(Optional)* | Allow host memory fallback when device memory is full during quantized weight reorder. Enables inference to continue at reduced speed (reading over PCIe) instead of failing. Requires Linux kernel 6.8+. |
-| GGML_SYCL_SUPPORT_LEVEL_ZERO_API | ON *(default)* \|OFF *(Optional)* | Support to use Level Zero API for device memory allocation. Requires Level Zero headers/library at build time and Intel GPU driver (Level Zero runtime) at run time. Reduces system RAM usage during multi-GPU inference. SYCL backend always runs on Level Zero running time even if it's set as OFF (The SYCL api will be usage for memory allocation).|
+| GGML_SYCL_SUPPORT_LEVEL_ZERO | ON *(default)* \|OFF *(Optional)* | Enable Level Zero API for device memory allocation. Requires Level Zero headers/library at build time and Intel GPU driver (Level Zero runtime) at run time. Reduces system RAM usage during multi-GPU inference. |
 | CMAKE_C_COMPILER   | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path.      |
 | CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)*   | Set `icpx/icx` compiler for SYCL code path. |

@@ -788,11 +712,10 @@ use 1 SYCL GPUs: [0] with Max compute units:512
 | Name              | Value            | Function                                                                                                                  |
 |-------------------|------------------|---------------------------------------------------------------------------------------------------------------------------|
 | GGML_SYCL_DEBUG   | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG                                                                             |
-| GGML_SYCL_DEV2DEV_MEMCPY | 0 (default) or 1 | Choose the SYCL or L0 API in dev2dev memory copy.<br>Value: <br>*  0: SYCL API (default)<br>* 1: L0 API -- L0 API is found to lead to abnormal crash in some case. This debug flag is used to check the issue.|
 | GGML_SYCL_ENABLE_FLASH_ATTN | 1 (default) or 0| Enable Flash-Attention. It can reduce memory usage. The performance impact depends on the LLM.|
 | GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimization features for Intel GPUs. (Setting it to 1 is recommended for Intel devices older than Gen 10.) |
 | GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through the SYCL Graphs feature. Disabled by default because SYCL Graph is still under development and does not yet provide better performance. |
-| GGML_SYCL_USE_LEVEL_ZERO_API | 1 (default) or 0 | Use Level Zero API for device memory allocation instead of SYCL. Reduces system RAM usage on Intel dGPUs by avoiding DMA-buf/TTM host memory staging. Requires GGML_SYCL_SUPPORT_LEVEL_ZERO_API=ON at build time. SYCL backend always runs on Level Zero running time even if it's set as OFF (The SYCL api will be usage for memory allocation).|
+| GGML_SYCL_ENABLE_LEVEL_ZERO | 1 (default) or 0 | Use Level Zero API for device memory allocation instead of SYCL. Reduces system RAM usage on Intel dGPUs by avoiding DMA-buf/TTM host memory staging. Requires GGML_SYCL_SUPPORT_LEVEL_ZERO=ON at build time. |
 | GGML_SYCL_DISABLE_DNN | 0 (default) or 1 | Disable running computations through oneDNN and always use oneMKL. |
 | GGML_SYCL_ENABLE_VMM | 0 or 1 (default) | Enable the virtual-memory device pool. |
 | ZES_ENABLE_SYSMAN | 0 (default) or 1 | Enable querying the GPU's free memory via sycl::aspect::ext_intel_free_memory.<br>Recommended when --split-mode = layer |
@@ -808,7 +731,6 @@ Pass these via `CXXFLAGS` or add a one-off `#define` to enable a flag on the spo
 | DEBUG_SYCL_POOL | Enable device memory pool logging on teardown. Useful for profiling allocations. |
 | DEBUG_SYCL_MALLOC | Enable verbose per-call logging of device pool alloc/free operations. |

-
 ## Design Rule

 - Open to all contributors.
@@ -24,6 +24,7 @@
            "GGML_LLAMAFILE":   "OFF",
            "GGML_OPENCL":      "ON",
            "GGML_HEXAGON":     "ON",
+            "GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE": "128",
            "LLAMA_OPENSSL":    "OFF"
        }
    },
@@ -46,6 +47,7 @@
            "GGML_LLAMAFILE":   "OFF",
            "GGML_OPENCL":      "ON",
            "GGML_HEXAGON":     "ON",
+            "GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE": "128",
            "LLAMA_OPENSSL":    "OFF"
        }
    },
@@ -71,6 +73,7 @@
            "GGML_LLAMAFILE":   "OFF",
            "GGML_OPENCL":      "OFF",
            "GGML_HEXAGON":     "ON",
+            "GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE": "128",
            "LLAMA_OPENSSL":    "OFF"
        }
    },
@@ -1,11 +1,10 @@
 # Multimodal

 llama.cpp supports multimodal input via `libmtmd`. Currently, 2 tools support this feature:
- [llama-cli](../tools/cli/README.md)
+- [llama-mtmd-cli](../tools/mtmd/README.md)
 - [llama-server](../tools/server/README.md) via OpenAI-compatible `/chat/completions` API
- [llama-mtmd-cli](../tools/mtmd/README.md), for testing and development

-Currently, we support **image**, **audio** and **video** input.
+Currently, we support **image** and **audio** input. Audio is highly experimental and may have reduced quality.

 To enable it, you can use one of the 2 methods below:

@@ -27,11 +27,11 @@ Legend:
 |                        COL2IM_1D | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                           CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                             CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ | ❌ |
-|                          CONV_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
-|                       CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
-|                          CONV_3D | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
+|                          CONV_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ |
+|                       CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
+|                          CONV_3D | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
-|                CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
+|                CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
 |                              COS | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                      COUNT_EQUAL | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                              CPY | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
@@ -8,53 +8,55 @@ The INI preset feature, introduced in [PR#17859](https://github.com/ggml-org/lla

 When running multiple models on the server (router mode), INI preset files can be used to configure model-specific parameters. Please refer to the [server documentation](../tools/server/README.md) for more details.

-### Using a Hugging Face Preset
+### Using a Remote Preset

-> [!IMPORTANT]
+> [!NOTE]
 >
-> Please only use presets that you can trust! Unknown presets may be unsafe
+> This feature is currently only supported via the `-hf` option.

-You can push your preset to Hugging Face Hub and share with other users by:
-1. Creating an empty model repository on Hugging Face
-2. Creating a `preset.ini` file in the root directory of the repository
+For GGUF models hosted on Hugging Face, you can include a `preset.ini` file in the root directory of the repository to define specific configurations for that model.

-Example of a `preset.ini`:
+Example:

 ```ini
-[*]
-ctx-size             = 0
-mmap                 = 1
-kv-unified           = 1
-parallel             = 4
-spec-default         = 1
-
-[Qwen3.5-4B]
-hf                   = unsloth/Qwen3.5-4B-GGUF:Q4_K_M
-ctx-size             = 262144
-batch-size           = 2048
-ubatch-size          = 2048
-top-p                = 1.0
-top-k                = 0
-min-p                = 0.01
-temp                 = 1.0
-
-[gpt-oss-120b-hf]
-hf                   = ggml-org/gpt-oss-120b-GGUF
-ctx-size             = 262144
-batch-size           = 2048
-ubatch-size          = 2048
-top-p                = 1.0
-top-k                = 0
-min-p                = 0.01
-temp                 = 1.0
-chat-template-kwargs = {"reasoning_effort": "high"}
+hf-repo-draft = username/my-draft-model-GGUF
+temp = 0.5
+top-k = 20
+top-p = 0.95
 ```

-The preset will be loaded similarly to the `--models-preset` option. Therefore, you can also override certain params via CLI arguments:
+For security reasons, only certain options are allowed. Please refer to [preset.cpp](../common/preset.cpp) for the complete list of permitted options.
+
+Example usage:
+
+Assuming your repository `username/my-model-with-preset` contains a `preset.ini` with the configuration above:
+
+```sh
+llama-cli -hf username/my-model-with-preset
+
+# This is equivalent to:
+llama-cli -hf username/my-model-with-preset \
+  --hf-repo-draft username/my-draft-model-GGUF \
+  --temp 0.5 \
+  --top-k 20 \
+  --top-p 0.95
+```
+
+You can also override preset arguments by specifying them on the command line:

 ```sh
 # Force temp = 0.1, overriding the preset value
-llama-cli -hf username/my-preset --temp 0.1
+llama-cli -hf username/my-model-with-preset --temp 0.1
+```
+
+If you want to define multiple preset configurations for one or more GGUF models, you can create a blank HF repo for each preset. Each HF repo should contain a `preset.ini` file that references the actual model(s):
+
+```ini
+hf-repo = user/my-model-main
+hf-repo-draft = user/my-model-draft
+temp = 0.8
+ctx-size = 1024
+; (and other configurations)
 ```

 ### Named presets
@@ -13,45 +13,6 @@ The `llama-server` application supports several implementations of speculative d
 A much smaller model (called the _draft model_) generates drafts.
 A draft model is the most used approach in speculative decoding.

-### EAGLE-3 (`draft-eagle3`)
-
-EAGLE-3 uses a small draft model that reads the target model's hidden states to predict the next tokens, so it
-reaches higher acceptance than a standalone draft model of the same size. The draft is a one-layer transformer
-trained for a specific target model; it shares the target model's tokenizer and, optionally, uses a reduced draft
-vocabulary with its own `lm_head`, which is mapped back using a `d2t` table.
-
-Convert the EAGLE-3 checkpoint with `--target-model-dir` so it inherits the target's tokenizer and the layer
-indices to read. Both the SpecForge `LlamaForCausalLMEagle3` and the vLLM/AngelSlim `Eagle3LlamaForCausalLM`
-checkpoint formats are supported (for example [`AngelSlim/Qwen3-4B_eagle3`](https://huggingface.co/AngelSlim/Qwen3-4B_eagle3)
-for `Qwen/Qwen3-4B`):
-
-```bash
-python convert_hf_to_gguf.py AngelSlim/Qwen3-4B_eagle3 \
-    --target-model-dir Qwen/Qwen3-4B --outtype bf16 --outfile Qwen3-4B-eagle3.gguf
-
-llama-server -m Qwen3-4B.gguf -md Qwen3-4B-eagle3.gguf --spec-type draft-eagle3
-```
-
-Supported EAGLE-3 draft models include:
-
- [yuhuili/EAGLE3-LLaMA3.1-Instruct-8B](https://huggingface.co/yuhuili/EAGLE3-LLaMA3.1-Instruct-8B)
- [yuhuili/EAGLE3-LLaMA3.3-Instruct-70B](https://huggingface.co/yuhuili/EAGLE3-LLaMA3.3-Instruct-70B)
- [RedHatAI/gemma-4-31B-it-speculator.eagle3](https://huggingface.co/RedHatAI/gemma-4-31B-it-speculator.eagle3)
- [RedHatAI/gemma-4-26B-A4B-it-speculator.eagle3](https://huggingface.co/RedHatAI/gemma-4-26B-A4B-it-speculator.eagle3)
- [Tengyunw/qwen3_8b_eagle3](https://huggingface.co/Tengyunw/qwen3_8b_eagle3)
- [Tengyunw/qwen3_30b_moe_eagle3](https://huggingface.co/Tengyunw/qwen3_30b_moe_eagle3)
- [AngelSlim/Qwen3-1.7B_eagle3](https://huggingface.co/AngelSlim/Qwen3-1.7B_eagle3)
- [AngelSlim/Qwen3-4B_eagle3](https://huggingface.co/AngelSlim/Qwen3-4B_eagle3)
- [AngelSlim/Qwen3-8B_eagle3](https://huggingface.co/AngelSlim/Qwen3-8B_eagle3)
- [AngelSlim/Qwen3-14B_eagle3](https://huggingface.co/AngelSlim/Qwen3-14B_eagle3)
- [AngelSlim/Qwen3-32B_eagle3](https://huggingface.co/AngelSlim/Qwen3-32B_eagle3)
- [AngelSlim/Qwen3-a3B_eagle3](https://huggingface.co/AngelSlim/Qwen3-a3B_eagle3)
- [RedHatAI/gpt-oss-20b-speculator.eagle3](https://huggingface.co/RedHatAI/gpt-oss-20b-speculator.eagle3)
- [lmsys/EAGLE3-gpt-oss-120b-bf16](https://huggingface.co/lmsys/EAGLE3-gpt-oss-120b-bf16)
- [nvidia/gpt-oss-120b-Eagle3-long-context](https://huggingface.co/nvidia/gpt-oss-120b-Eagle3-long-context)
-
-For the full and up-to-date list of supported models, see #18039.
-
 ### n-gram Cache (`ngram-cache`)

 An n-gram is a sequence of n tokens. The n-gram cache implementation maintains statistics about short n-gram sequences.
@@ -147,7 +108,7 @@ If a draft model is combined with a draftless decoding the draftless decoding ha
 ### General Speculative Parameters

 ```
--spec-type [none|draft-simple|draft-eagle3|draft-mtp|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod]
+--spec-type [none|draft-simple|draft-mtp|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod]
                                        comma-separated list of types of speculative decoding to use
                                        (default: none)
                                        (env: LLAMA_ARG_SPEC_TYPE)
@@ -286,7 +247,6 @@ Specifies a comma-separated list of speculative decoding types to use.
 |------|-------------|
 | `none` | No speculative decoding (default) |
 | `draft-simple` | Use a simple draft model for speculation |
-| `draft-eagle3` | Use an EAGLE-3 draft model that reads the target's hidden states |
 | `draft-mtp` | Use Multi Token Prediction (MTP) heads from the main model |
 | `ngram-cache` | Use n-gram cache lookup |
 | `ngram-simple` | Use simple n-gram pattern matching |
@@ -198,18 +198,18 @@ class BuiltinRule:
 SPACE_RULE = '| " " | "\\n"{1,2} [ \\t]{0,20}'

 PRIMITIVE_RULES = {
-    'boolean'      : BuiltinRule('("true" | "false")', []),
+    'boolean'      : BuiltinRule('("true" | "false") space', []),
    'decimal-part' : BuiltinRule('[0-9]{1,16}', []),
    'integral-part': BuiltinRule('[0] | [1-9] [0-9]{0,15}', []),
-    'number'       : BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)?', ['integral-part', 'decimal-part']),
-    'integer'      : BuiltinRule('("-"? integral-part)', ['integral-part']),
+    'number'       : BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space', ['integral-part', 'decimal-part']),
+    'integer'      : BuiltinRule('("-"? integral-part) space', ['integral-part']),
    'value'        : BuiltinRule('object | array | string | number | boolean | null', ['object', 'array', 'string', 'number', 'boolean', 'null']),
-    'object'       : BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? space "}"', ['string', 'value']),
-    'array'        : BuiltinRule('"[" space ( value ("," space value)* )? space "]"', ['value']),
-    'uuid'         : BuiltinRule(r'"\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\""', []),
+    'object'       : BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']),
+    'array'        : BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']),
+    'uuid'         : BuiltinRule(r'"\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\"" space', []),
    'char'         : BuiltinRule(r'[^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})', []),
-    'string'       : BuiltinRule(r'"\"" char* "\""', ['char']),
-    'null'         : BuiltinRule('"null"', []),
+    'string'       : BuiltinRule(r'"\"" char* "\"" space', ['char']),
+    'null'         : BuiltinRule('"null" space', []),
 }

 # TODO: support "uri", "email" string formats
@@ -217,9 +217,9 @@ STRING_FORMAT_RULES = {
    'date'            : BuiltinRule('[0-9]{4} "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []),
    'time'            : BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9]{3} )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []),
    'date-time'       : BuiltinRule('date "T" time', ['date', 'time']),
-    'date-string'     : BuiltinRule('"\\"" date "\\""', ['date']),
-    'time-string'     : BuiltinRule('"\\"" time "\\""', ['time']),
-    'date-time-string': BuiltinRule('"\\"" date-time "\\""', ['date-time']),
+    'date-string'     : BuiltinRule('"\\"" date "\\"" space', ['date']),
+    'time-string'     : BuiltinRule('"\\"" time "\\"" space', ['time']),
+    'date-time-string': BuiltinRule('"\\"" date-time "\\"" space', ['date-time']),
 }

 DOTALL = '[\\U00000000-\\U0010FFFF]'
@@ -319,7 +319,7 @@ class SchemaConverter:
                out.append(f'[^"{"".join(rejects)}] {char_rule}*')
        visit(trie)

-        out.append(f' ){"" if trie.is_end_of_string else "?"} ["]')
+        out.append(f' ){"" if trie.is_end_of_string else "?"} ["] space')
        return ''.join(out)

    def _add_rule(self, name, rule):
@@ -549,7 +549,7 @@ class SchemaConverter:
        return self._add_rule(
            name,
            to_rule(transform()) if self._raw_pattern \
-                else "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\"")
+                else "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\" space")


    def _resolve_ref(self, ref):
@@ -580,10 +580,10 @@ class SchemaConverter:
            return self._add_rule(rule_name, self._generate_union_rule(name, [{**schema, 'type': t} for t in schema_type]))

        elif 'const' in schema:
-            return self._add_rule(rule_name, self._generate_constant_rule(schema['const']))
+            return self._add_rule(rule_name, self._generate_constant_rule(schema['const']) + ' space')

        elif 'enum' in schema:
-            rule = '(' + ' | '.join((self._generate_constant_rule(v) for v in schema['enum'])) + ')'
+            rule = '(' + ' | '.join((self._generate_constant_rule(v) for v in schema['enum'])) + ') space'
            return self._add_rule(rule_name, rule)

        elif schema_type in (None, 'object') and \
@@ -624,7 +624,7 @@ class SchemaConverter:
                    enum_intersection &= s

                if enum_intersection:
-                    rule = '(' + ' | '.join((self._generate_constant_rule(v) for v in sorted(enum_intersection))) + ')'
+                    rule = '(' + ' | '.join((self._generate_constant_rule(v) for v in sorted(enum_intersection))) + ') space'
                    return self._add_rule(rule_name, rule)

            return self._add_rule(rule_name, self._build_object_rule(properties, required, hybrid_name, additional_properties=None))
@@ -638,12 +638,12 @@ class SchemaConverter:
                    ' "," space '.join(
                        self.visit(item, f'{name}{"-" if name else ""}tuple-{i}')
                        for i, item in enumerate(items)) +
-                    ' space "]"')
+                    ' "]" space')
            else:
                item_rule_name = self.visit(items, f'{name}{"-" if name else ""}item')
                min_items = schema.get("minItems", 0)
                max_items = schema.get("maxItems")
-                return self._add_rule(rule_name, '"[" space ' + _build_repetition(item_rule_name, min_items, max_items, separator_rule='"," space') + ' space "]"')
+                return self._add_rule(rule_name, '"[" space ' + _build_repetition(item_rule_name, min_items, max_items, separator_rule='"," space') + ' "]" space')

        elif schema_type in (None, 'string') and 'pattern' in schema:
            return self._visit_pattern(schema['pattern'], rule_name)
@@ -663,7 +663,7 @@ class SchemaConverter:
            min_len = schema.get('minLength', 0)
            max_len = schema.get('maxLength')

-            return self._add_rule(rule_name, r'"\"" ' + _build_repetition(char_rule, min_len, max_len) + r' "\""')
+            return self._add_rule(rule_name, r'"\"" ' + _build_repetition(char_rule, min_len, max_len) + r' "\"" space')

        elif schema_type in (None, 'integer') and \
                ('minimum' in schema or 'exclusiveMinimum' in schema or 'maximum' in schema or 'exclusiveMaximum' in schema):
@@ -680,7 +680,7 @@ class SchemaConverter:

            out = ["("]
            _generate_min_max_int(min_value, max_value, out)
-            out.append(")")
+            out.append(") space")
            return self._add_rule(rule_name, ''.join(out))

        elif (schema_type == 'object') or (len(schema) == 0):
@@ -765,7 +765,7 @@ class SchemaConverter:
                rule += ' )'
            rule += ' )?'

-        rule += ' space "}"'
+        rule += ' "}" space'

        return rule

@@ -1,9 +0,0 @@
-#!/bin/bash
-
-#  MIT license
-#  Copyright (C) 2026 Intel Corporation
-#  SPDX-License-Identifier: MIT
-
-./build/bin/test-backend-ops support --output csv > docs/ops/SYCL.csv
-./scripts/create_ops_docs.py
-
@@ -1,8 +0,0 @@
-@echo off
-
-rem MIT license
-rem Copyright (C) 2026 Intel Corporation
-rem SPDX-License-Identifier: MIT
-
-build\bin\test-backend-ops support --output csv > docs\ops\SYCL.csv
-python scripts\create_ops_docs.py
@@ -5,7 +5,7 @@ project("ggml" C CXX ASM)
 ### GGML Version
 set(GGML_VERSION_MAJOR 0)
 set(GGML_VERSION_MINOR 15)
-set(GGML_VERSION_PATCH 3)
+set(GGML_VERSION_PATCH 1)
 set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")

 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
@@ -249,7 +249,7 @@ option(GGML_SYCL                            "ggml: use SYCL"
 option(GGML_SYCL_F16                        "ggml: use 16 bit floats for sycl calculations"   OFF)
 option(GGML_SYCL_GRAPH                      "ggml: enable graphs in the SYCL backend"         ON)
 option(GGML_SYCL_HOST_MEM_FALLBACK          "ggml: allow host memory fallback in SYCL reorder (requires kernel 6.8+)" ON)
-option(GGML_SYCL_SUPPORT_LEVEL_ZERO_API     "ggml: use Level Zero API in SYCL backend"  ON)
+option(GGML_SYCL_SUPPORT_LEVEL_ZERO         "ggml: use Level Zero API in SYCL backend"  ON)
 option(GGML_SYCL_DNN                        "ggml: enable oneDNN in the SYCL backend"         ON)
 set   (GGML_SYCL_TARGET "INTEL" CACHE STRING
                                            "ggml: sycl target device")
@@ -266,6 +266,7 @@ set   (GGML_OPENCL_TARGET_VERSION "300" CACHE STRING
                                            "ggml: OpenCL API version to target")

 option(GGML_HEXAGON                         "ggml: enable Hexagon backend"                    OFF)
+set(GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE 128 CACHE STRING "ggml: quantize group size (32, 64, or 128)")

 # toolchain for vulkan-shaders-gen
 set   (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN "" CACHE FILEPATH "ggml: toolchain file for vulkan-shaders-gen")
@@ -27,14 +27,6 @@ GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int de
 // split tensor buffer that splits matrices by rows across multiple devices
 GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);

-// Tensor parallelism (--split-mode tensor): comm_init/free/allreduce_tensor
-// trio queried by the meta-backend via ggml_backend_reg_get_proc_address.
-// See typedefs in ggml/include/ggml-backend.h. Mirrors the CUDA backend's
-// pattern (ggml_backend_cuda_comm_*).
-GGML_BACKEND_API void * ggml_backend_sycl_comm_init(ggml_backend_t * backends, size_t n_backends);
-GGML_BACKEND_API void   ggml_backend_sycl_comm_free(void * comm_ctx);
-GGML_BACKEND_API bool   ggml_backend_sycl_comm_allreduce_tensor(void * comm_ctx, struct ggml_tensor ** tensors);
-
 // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
 GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);

@@ -438,14 +438,7 @@ if (GGML_CPU_ALL_VARIANTS)
            ggml_add_cpu_backend_variant(power8_2       POWER8  VSX)
            ggml_add_cpu_backend_variant(power9         POWER9  VSX)
            ggml_add_cpu_backend_variant(power10        POWER10 VSX)
-            # POWER11 backend: only if compiler supports -mcpu=power11
-            check_cxx_compiler_flag("-mcpu=power11" GGML_CXX_SUPPORTS_POWER11)
-            if (GGML_CXX_SUPPORTS_POWER11)
-                message(STATUS "Compiler supports -mcpu=power11, enabling POWER11 backend")
-                ggml_add_cpu_backend_variant(power11 POWER11 VSX)
-            else()
-                message(STATUS "Skipping POWER11 backend: compiler does not support -mcpu=power11")
-            endif()
+            ggml_add_cpu_backend_variant(power11        POWER11 VSX)
        else()
            message(FATAL_ERROR "Unsupported PowerPC target OS: ${CMAKE_SYSTEM_NAME}")
        endif()
@@ -1551,8 +1551,6 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
        int split_backend_id = split->backend_id;
        ggml_backend_t split_backend = sched->backends[split_backend_id];

-        ggml_backend_synchronize(split_backend);
-
        // copy the input tensors to the split backend
        for (int input_id = 0; input_id < split->n_inputs; input_id++) {
            ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[input_id]);
@@ -1563,15 +1561,15 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
                // inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
                if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
                    ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
-                } else if (!split_backend->iface.cpy_tensor_async) {
+                } else {
                    ggml_backend_synchronize(split_backend);
                }
-                ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy);
+                ggml_backend_tensor_copy(input, input_cpy);
            } else {
                // wait for the split backend to finish using the input before overwriting it
                if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
                    ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
-                } else if (!split_backend->iface.cpy_tensor_async) {
+                } else {
                    ggml_backend_synchronize(split_backend);
                }

@@ -1676,8 +1674,6 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
            }
        }

-        ggml_backend_synchronize(split_backend);
-
        if (!sched->callback_eval) {
            enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
            if (ec != GGML_STATUS_SUCCESS) {
@@ -389,7 +389,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
            string(REGEX MATCHALL "POWER *([0-9]+)" MATCHED_STRING "${POWER10_M_UPPER}")
            string(REGEX REPLACE "POWER *([0-9]+)" "\\1" EXTRACTED_NUMBER "${MATCHED_STRING}")

-            if (EXTRACTED_NUMBER EQUAL 10 OR EXTRACTED_NUMBER EQUAL 11)
+            if (EXTRACTED_NUMBER GREATER_EQUAL 10)
                list(APPEND ARCH_FLAGS -mcpu=power10)
            elseif (EXTRACTED_NUMBER EQUAL 9)
                list(APPEND ARCH_FLAGS -mcpu=power9)
@@ -2417,14 +2417,15 @@ void ggml_backend_amx_mul_mat(const ggml_compute_params * params, struct ggml_te
            // Q4_K, Q5_K, Q6_K, IQ4_XS handles 8 TILE_K per blck_size
            GGML_ASSERT(TILE_K == blck_size || TILE_K * 8 == blck_size);

-            parallel_for_ggml(params, n_batch * M, [&](int begin, int end) {
-                for (int idx = begin; idx < end; ++idx) {
-                    int batch_idx = idx / M;
-                    int m         = idx % M;
+            parallel_for_ggml(params, n_batch, [&](int begin, int end) {
+                for (int batch_idx = begin; batch_idx < end; ++batch_idx) {
                    int64_t src1_offset = ggml_batch_offset(src1, batch_idx, ne2);
                    const float * A_data = (const float *)((const char *)src1->data + src1_offset);
                    char * wdata_batch = (char *)wdata + batch_idx * M * row_size_A;
-                    from_float<vec_dot_type>(A_data + m * K, wdata_batch + m * row_size_A, K);
+
+                    for (int m = 0; m < M; ++m) {
+                        from_float<vec_dot_type>(A_data + m * K, wdata_batch + m * row_size_A, K);
+                    }
                }
            });
        });
@@ -2345,7 +2345,7 @@ class tinyBLAS_Q0_PPC {
            else if (n_aligned % 16 == 0) nc = 16;
            else                          nc = 8;
        }
-        bool can_use_tiled = n_aligned > 0 && (m % mc == 0);
+        bool can_use_tiled = n_aligned > 0 && (m % mc == 0) && (k % kc == 0);
        if (can_use_tiled) {
            matmul_tiled(m, n_aligned, mc, nc, kc);
            if (n > n_aligned) {
@@ -3063,14 +3063,13 @@ class tinyBLAS_Q0_PPC {
            int64_t ii = (job / xtiles) * mc;
            int64_t jj = (job % xtiles) * nc;
            for (int64_t kk = 0; kk < k; kk += kc) {
-                int64_t k_cur = MIN(kc, k - kk);
                if constexpr(is_Ablock_q4) {
-                    packNormal_q4_fp16(A + ii * lda + kk, lda, mc, k_cur, (uint8_t *)A_pack);
+                    packNormal_q4_fp16(A + ii * lda + kk, lda, mc, kc, (uint8_t *)A_pack);
                } else {
-                    packNormal_q8_fp16(A + ii * lda + kk, lda, mc, k_cur, (uint8_t *)A_pack);
+                    packNormal_q8_fp16(A + ii * lda + kk, lda, mc, kc, (uint8_t *)A_pack);
                }
-                packNormal_q8_fp16(B + jj * ldb + kk, ldb, nc, k_cur, (uint8_t *)B_pack);
-                KERNEL_Q0(ii, jj, mc, nc, k_cur, kk, A_pack, B_pack);
+                packNormal_q8_fp16(B + jj * ldb + kk, ldb, nc, kc, (uint8_t *)B_pack);
+                KERNEL_Q0(ii, jj, mc, nc, kc, kk, A_pack, B_pack);
            }
        }
    }
@@ -3688,6 +3688,8 @@ static void ggml_compute_forward_norm_f32(

    GGML_ASSERT(ggml_are_same_shape(src0, dst));

+    GGML_ASSERT(src0->nb[0] == sizeof(float));
+
    const int ith = params->ith;
    const int nth = params->nth;

@@ -3701,49 +3703,25 @@ static void ggml_compute_forward_norm_f32(
    for (int64_t i03 = 0; i03 < ne03; i03++) {
        for (int64_t i02 = 0; i02 < ne02; i02++) {
            for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
-                const char * x = (const char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
-                char * y = (char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3;
+                const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);

-                if (nb00 == sizeof(float) && nb0 == sizeof(float)) {
-                    const float * xf = (const float *) x;
+                float sum = 0.0;
+                ggml_vec_sum_f32(ne00, &sum, x);
+                float mean = sum/ne00;

-                    float sum = 0.0;
-                    ggml_vec_sum_f32(ne00, &sum, xf);
-                    float mean = sum/ne00;
-
-                    float * yf = (float *) y;
-                    float variance = 0;
+                float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);
+                float variance = 0;

 #ifdef GGML_USE_ACCELERATE
-                    mean = -mean;
-                    vDSP_vsadd(xf, 1, &mean, yf, 1, ne00);
-                    vDSP_measqv(yf, 1, &variance, ne00);
+                mean = -mean;
+                vDSP_vsadd(x, 1, &mean, y, 1, ne00);
+                vDSP_measqv(y, 1, &variance, ne00);
 #else
-                    variance = ggml_vec_cvar_f32(ne00, yf, xf, mean);
+                variance = ggml_vec_cvar_f32(ne00, y, x, mean);
 #endif //GGML_USE_ACCELERATE

-                    const float scale = 1.0f/sqrtf(variance + eps);
-                    ggml_vec_scale_f32(ne00, yf, scale);
-                } else {
-                    float sum = 0.0;
-                    for (int64_t i00 = 0; i00 < ne00; i00++) {
-                        sum += *(const float *) (x + i00*nb00);
-                    }
-                    const float mean = sum/ne00;
-
-                    float variance = 0.0f;
-                    for (int64_t i00 = 0; i00 < ne00; i00++) {
-                        const float v = *(const float *) (x + i00*nb00) - mean;
-                        *(float *) (y + i00*nb0) = v;
-                        variance += v * v;
-                    }
-                    variance /= ne00;
-
-                    const float scale = 1.0f/sqrtf(variance + eps);
-                    for (int64_t i00 = 0; i00 < ne00; i00++) {
-                        *(float *) (y + i00*nb0) *= scale;
-                    }
-                }
+                const float scale = 1.0f/sqrtf(variance + eps);
+                ggml_vec_scale_f32(ne00, y, scale);
            }
        }
    }
@@ -4164,6 +4142,8 @@ static void ggml_compute_forward_l2_norm_f32(

    GGML_ASSERT(ggml_are_same_shape(src0, dst));

+    GGML_ASSERT(src0->nb[0] == sizeof(float));
+
    const int ith = params->ith;
    const int nth = params->nth;

@@ -4178,27 +4158,20 @@ static void ggml_compute_forward_l2_norm_f32(
    for (int64_t i03 = 0; i03 < ne03; i03++) {
        for (int64_t i02 = 0; i02 < ne02; i02++) {
            for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
-                const char * x = (const char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
+                const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);

                ggml_float sum = 0.0;
                for (int64_t i00 = 0; i00 < ne00; i00++) {
-                    const float xi = *(const float *) (x + i00*nb00);
-                    sum += (ggml_float)(xi * xi);
+                    sum += (ggml_float)(x[i00] * x[i00]);
                }

+                float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);
+
+                memcpy(y, x, ne00 * sizeof(float));
+
                const float scale = 1.0f/fmaxf(sqrtf(sum), eps);

-                char * y = (char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3;
-
-                if (nb00 == sizeof(float) && nb0 == sizeof(float)) {
-                    memcpy(y, x, ne00 * sizeof(float));
-                    ggml_vec_scale_f32(ne00, (float *) y, scale);
-                } else {
-                    for (int64_t i00 = 0; i00 < ne00; i00++) {
-                        const float xi = *(const float *) (x + i00*nb00);
-                        *(float *) (y + i00*nb0) = xi * scale;
-                    }
-                }
+                ggml_vec_scale_f32(ne00, y, scale);
            }
        }
    }
@@ -75,12 +75,12 @@ void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * G
            ay1 = GGML_F32_VEC_LOAD(y + i);
            sum1 = GGML_F32_VEC_FMA(sum1, ax1, ay1);
        }
-        // maximum number of leftover elements will be less that ggml_f32_epr. Apply predicated svmla on available elements only
+        // maximum number of leftover elements will be less than ggml_f32_epr. Apply predicated svmad on available elements only
        if (np2 < n) {
            svbool_t pg = svwhilelt_b32(np2, n);
            ax1 = svld1_f32(pg, x + np2);
            ay1 = svld1_f32(pg, y + np2);
-            sum1 = svmla_f32_m(pg, sum1, ax1, ay1);
+            sum1 = svmad_f32_m(pg, ax1, ay1, sum1);
        }
        // reduce sum1,sum2 to sum1
        GGML_F32_VEC_REDUCE(sumf, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8);
@@ -34,26 +34,26 @@ template <float (*bin_op)(const float, const float),
 static __global__ void k_bin_bcast(const src0_t *         src0,
                                   const src1_t *         src1,
                                   dst_t *                dst,
-                                   const uint32_t         ne0,
-                                   const uint32_t         ne1,
-                                   const uint32_t         ne2,
+                                   const int              ne0,
+                                   const int              ne1,
+                                   const int              ne2,
                                   const uint3            ne3,
                                   const uint3            ne10,
                                   const uint3            ne11,
                                   const uint3            ne12,
                                   const uint3            ne13,
-                                 /*const uint32_t         s0,*/
-                                   const uint32_t         s1,
-                                   const uint32_t         s2,
-                                   const uint32_t         s3,
-                                   const uint32_t         s00,
-                                   const uint32_t         s01,
-                                   const uint32_t         s02,
-                                   const uint32_t         s03,
-                                   const uint32_t         s10,
-                                   const uint32_t         s11,
-                                   const uint32_t         s12,
-                                   const uint32_t         s13,
+                                 /*const int              s0,*/
+                                   const int              s1,
+                                   const int              s2,
+                                   const int              s3,
+                                   const int              s00,
+                                   const int              s01,
+                                   const int              s02,
+                                   const int              s03,
+                                   const int              s10,
+                                   const int              s11,
+                                   const int              s12,
+                                   const int              s13,
                                   src1_ptrs... src1s) {
    ggml_cuda_pdl_lc();
    const uint32_t i0s = blockDim.x * blockIdx.x + threadIdx.x;
@@ -61,7 +61,7 @@ static __global__ void k_bin_bcast(const src0_t *         src0,
    const uint32_t i2  = fastdiv((blockDim.z * blockIdx.z + threadIdx.z), ne3);
    const uint32_t i3  = (blockDim.z * blockIdx.z + threadIdx.z) - (i2 * ne3.z);

-    if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3.z) {
+    if (i0s >= (uint32_t)ne0 || i1 >= (uint32_t)ne1 || i2 >= (uint32_t)ne2 || i3 >= ne3.z) {
        return;
    }

@@ -69,32 +69,25 @@ static __global__ void k_bin_bcast(const src0_t *         src0,
    const uint32_t i12 = fastmodulo(i2, ne12);
    const uint32_t i13 = fastmodulo(i3, ne13);

-    const size_t i_src0 = size_t( i3)*s03 + size_t( i2)*s02 + size_t( i1)*s01;
-    const size_t i_src1 = size_t(i13)*s13 + size_t(i12)*s12 + size_t(i11)*s11;
-    const size_t i_dst  = size_t( i3)*s3  + size_t( i2)*s2  + size_t( i1)*s1;
+    const size_t i_src0 =  i3*s03 +  i2*s02 +  i1*s01;
+    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
+    const size_t i_dst  =  i3*s3  +  i2*s2  +  i1*s1;

    const src0_t * src0_row = src0 ? (src0 + i_src0) : nullptr;
    dst_t * dst_row = dst + i_dst;

-    const uint32_t s0 = blockDim.x * gridDim.x;
-
    ggml_cuda_pdl_sync();
-    for (uint32_t i0 = i0s; i0 < ne0; i0 += s0) {
+    for (int i0 = i0s; i0 < ne0; i0 += blockDim.x * gridDim.x) {
        const uint32_t i10 = fastmodulo(i0, ne10);

-        float result = src0_row ? (float) src0_row[size_t(i0)*s00] : 0.0f;
+        float result = src0_row ? (float) src0_row[i0*s00] : 0.0f;
        if constexpr (sizeof...(src1_ptrs) > 0) {
-            result = (..., (result = bin_op(result, (float)src1s[i_src1 + size_t(i10)*s10])));
+            result = (..., (result = bin_op(result, (float)src1s[i_src1 + i10*s10])));
        } else {
-            result = bin_op(result, (float)src1[i_src1 + size_t(i10)*s10]);
+            result = bin_op(result, (float)src1[i_src1 + i10*s10]);
        }

        dst_row[i0] = (dst_t) result;
-
-        // protect i0 from overflow
-        if (ne0 - i0 <= s0) {
-           break;
-        }
    }
 }

@@ -117,19 +110,19 @@ static __global__ void k_bin_bcast_unravel(const src0_t *         src0,
                                           const uint3            ne12,
                                           const uint3            ne13,
                                         /*const int              s0,*/
-                                           const uint32_t         s1,
-                                           const uint32_t         s2,
-                                           const uint32_t         s3,
-                                           const uint32_t         s00,
-                                           const uint32_t         s01,
-                                           const uint32_t         s02,
-                                           const uint32_t         s03,
-                                           const uint32_t         s10,
-                                           const uint32_t         s11,
-                                           const uint32_t         s12,
-                                           const uint32_t         s13,
+                                           const int              s1,
+                                           const int              s2,
+                                           const int              s3,
+                                           const int              s00,
+                                           const int              s01,
+                                           const int              s02,
+                                           const int              s03,
+                                           const int              s10,
+                                           const int              s11,
+                                           const int              s12,
+                                           const int              s13,
                                           src1_ptrs... src1s) {
-    const uint32_t i  = blockDim.x*blockIdx.x + threadIdx.x;
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;

    const uint32_t i3 = fastdiv(i, prod_012);
    const uint32_t i2 = fastdiv(i - i3 * prod_012.z, prod_01);
@@ -140,25 +133,25 @@ static __global__ void k_bin_bcast_unravel(const src0_t *         src0,
        return;
    }

-    const uint32_t i11 = fastmodulo(i1, ne11);
-    const uint32_t i12 = fastmodulo(i2, ne12);
-    const uint32_t i13 = fastmodulo(i3, ne13);
+    const int i11 = fastmodulo(i1, ne11);
+    const int i12 = fastmodulo(i2, ne12);
+    const int i13 = fastmodulo(i3, ne13);

-    const size_t i_src0 = size_t( i3)*s03 + size_t( i2)*s02 + size_t( i1)*s01;
-    const size_t i_src1 = size_t(i13)*s13 + size_t(i12)*s12 + size_t(i11)*s11;
-    const size_t i_dst  = size_t( i3)*s3  + size_t( i2)*s2  + size_t( i1)*s1;
+    const size_t i_src0 =  i3*s03 +  i2*s02 +  i1*s01;
+    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
+    const size_t i_dst  =  i3*s3  +  i2*s2  +  i1*s1;

    const src0_t * src0_row = src0 ? (src0 + i_src0) : nullptr;
    dst_t * dst_row = dst + i_dst;

-    const uint32_t i10 = fastmodulo(i0, ne10);
+    const int i10 = fastmodulo(i0, ne10);

    ggml_cuda_pdl_sync();
-    float result = src0_row ? (float) src0_row[size_t(i0)*s00] : 0.0f;
+    float result = src0_row ? (float) src0_row[i0*s00] : 0.0f;
    if constexpr (sizeof...(src1_ptrs) > 0) {
-        result = (..., (result = bin_op(result, (float)src1s[i_src1 + size_t(i10)*s10])));
+        result = (..., (result = bin_op(result, (float)src1s[i_src1 + i10*s10])));
    } else {
-        result = bin_op(result, (float)src1[i_src1 + size_t(i10)*s10]);
+        result = bin_op(result, (float)src1[i_src1 + i10*s10]);
    }

    dst_row[i0] = (dst_t) result;
@@ -255,31 +248,6 @@ static void launch_bin_bcast_pack(const ggml_tensor * src0, const ggml_tensor *
        size_t s02 = nb02 / sizeof(src0_t);
        size_t s03 = nb03 / sizeof(src0_t);

-        GGML_ASSERT(ne0 <= std::numeric_limits<uint32_t>::max());
-        GGML_ASSERT(ne1 <= std::numeric_limits<uint32_t>::max());
-        GGML_ASSERT(ne2 <= std::numeric_limits<uint32_t>::max());
-        GGML_ASSERT(ne3 <= std::numeric_limits<uint32_t>::max());
-
-      //GGML_ASSERT(s0  <= std::numeric_limits<uint32_t>::max());
-        GGML_ASSERT(s1  <= std::numeric_limits<uint32_t>::max());
-        GGML_ASSERT(s2  <= std::numeric_limits<uint32_t>::max());
-        GGML_ASSERT(s3  <= std::numeric_limits<uint32_t>::max());
-
-        GGML_ASSERT(s00 <= std::numeric_limits<uint32_t>::max());
-        GGML_ASSERT(s01 <= std::numeric_limits<uint32_t>::max());
-        GGML_ASSERT(s02 <= std::numeric_limits<uint32_t>::max());
-        GGML_ASSERT(s03 <= std::numeric_limits<uint32_t>::max());
-
-        GGML_ASSERT(s10 <= std::numeric_limits<uint32_t>::max());
-        GGML_ASSERT(s11 <= std::numeric_limits<uint32_t>::max());
-        GGML_ASSERT(s12 <= std::numeric_limits<uint32_t>::max());
-        GGML_ASSERT(s13 <= std::numeric_limits<uint32_t>::max());
-
-        GGML_ASSERT(cne1[0] <= std::numeric_limits<uint32_t>::max());
-        GGML_ASSERT(cne1[1] <= std::numeric_limits<uint32_t>::max());
-        GGML_ASSERT(cne1[2] <= std::numeric_limits<uint32_t>::max());
-        GGML_ASSERT(cne1[3] <= std::numeric_limits<uint32_t>::max());
-
        GGML_ASSERT(nb0 % sizeof(dst_t) == 0);
        GGML_ASSERT(nb1 % sizeof(dst_t) == 0);
        GGML_ASSERT(nb2 % sizeof(dst_t) == 0);
@@ -295,8 +263,6 @@ static void launch_bin_bcast_pack(const ggml_tensor * src0, const ggml_tensor *
        GGML_ASSERT(nb12 % sizeof(src1_t) == 0);
        GGML_ASSERT(nb13 % sizeof(src1_t) == 0);

-        GGML_ASSERT(ne2 * ne3 <= std::numeric_limits<unsigned int>::max());
-
        const int block_size = 128;

        int64_t hne0 = std::max(ne0 / 2LL, 1LL);
@@ -315,13 +281,7 @@ static void launch_bin_bcast_pack(const ggml_tensor * src0, const ggml_tensor *
        const uint3 ne13 = init_fastdiv_values((uint32_t) cne1[3]);

        if (block_nums.z > 65535 || block_nums.y > 65535) {
-            int64_t     block_num   = (ne0 * ne1 * ne2 * ne3 + block_size - 1) / block_size;
-
-            GGML_ASSERT(block_num              <= std::numeric_limits<uint32_t>::max());
-            GGML_ASSERT(block_num * block_size <= std::numeric_limits<uint32_t>::max());
-            GGML_ASSERT(ne0 * ne1              <= std::numeric_limits<uint32_t>::max());
-            GGML_ASSERT(ne0 * ne1 * ne2        <= std::numeric_limits<uint32_t>::max());
-
+            int         block_num  = (ne0 * ne1 * ne2 * ne3 + block_size - 1) / block_size;
            const uint3 prod_012    = init_fastdiv_values((uint32_t) (ne0 * ne1 * ne2));
            const uint3 prod_01     = init_fastdiv_values((uint32_t) (ne0 * ne1));
            const uint3 ne0_fastdiv = init_fastdiv_values((uint32_t) ne0);
@@ -338,10 +298,6 @@ static void launch_bin_bcast_pack(const ggml_tensor * src0, const ggml_tensor *
                    s10, s11, s12, s13, (const src1_t *) dst->src[I + 1]->data...);
            }
        } else {
-            GGML_ASSERT(int64_t(block_nums.x) * block_dims.x <= std::numeric_limits<uint32_t>::max());
-            GGML_ASSERT(int64_t(block_nums.y) * block_dims.y <= std::numeric_limits<uint32_t>::max());
-            GGML_ASSERT(int64_t(block_nums.z) * block_dims.z <= std::numeric_limits<uint32_t>::max());
-
            const uint3 ne3_fastdiv = init_fastdiv_values((uint32_t) ne3);
            {
                const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, 0, stream);
@@ -1,81 +0,0 @@
-#include "col2im-1d.cuh"
-#include "convert.cuh"
-
-// col2im_1d: scatter-add GEMM columns to 1D signal (gather approach)
-// columns: [K*OC, T_in]  ->  output: [T_out, OC]
-// Supports F32, F16, BF16 data with F32 accumulator.
-
-template <typename T>
-static __global__ void col2im_1d_kernel(
-        const T * __restrict__ col,
-        T       * __restrict__ dst,
-        const int T_in, const uint3 T_out_fd,
-        const int OC, const int K, const int K_OC,
-        const int s0, const int p0, const int total) {
-
-    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
-    if (idx >= total) return;
-
-    // dst layout: [T_out, OC], ne[0]=T_out fastest
-    const uint2 qr  = fast_div_modulo((uint32_t)idx, T_out_fd);  // qr.x = idx / T_out, qr.y = idx % T_out
-    const int oc    = (int)qr.x;
-    const int t_out = (int)qr.y;
-    const int t_abs = t_out + p0;  // absolute position in uncropped signal
-
-    // Gather: find all (t_in, k) where t_in*s + k == t_abs, 0 <= k < K
-    int t_in_min = (t_abs - K + s0) / s0;  // ceil((t_abs - K + 1) / s)
-    if (t_in_min < 0) t_in_min = 0;
-    int t_in_max = t_abs / s0;
-    if (t_in_max >= T_in) t_in_max = T_in - 1;
-
-    float sum = 0.0f;
-    for (int t_in = t_in_min; t_in <= t_in_max; t_in++) {
-        const int k = t_abs - t_in * s0;
-        // col layout: [K*OC, T_in], column index = oc * K + k
-        sum += ggml_cuda_cast<float>(col[(oc * K + k) + t_in * K_OC]);
-    }
-
-    dst[idx] = ggml_cuda_cast<T>(sum);
-}
-
-void ggml_cuda_op_col2im_1d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(ggml_is_contiguous(src0));
-
-    const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
-    const int32_t OC = ((const int32_t *)(dst->op_params))[1];
-    const int32_t p0 = ((const int32_t *)(dst->op_params))[2];
-
-    const int K_OC = (int) src0->ne[0];
-    const int T_in = (int) src0->ne[1];
-    const int K    = K_OC / OC;
-    const int T_out = (int) dst->ne[0];
-
-    const uint3 T_out_fd = init_fastdiv_values((uint32_t)T_out);
-
-    const int total = T_out * OC;
-    const int block_size = 256;
-    const int num_blocks = (total + block_size - 1) / block_size;
-
-    switch (src0->type) {
-        case GGML_TYPE_F32: {
-            col2im_1d_kernel<<<num_blocks, block_size, 0, stream>>>(
-                (const float *)src0->data, (float *)dst->data,
-                T_in, T_out_fd, OC, K, K_OC, s0, p0, total);
-        } break;
-        case GGML_TYPE_F16: {
-            col2im_1d_kernel<<<num_blocks, block_size, 0, stream>>>(
-                (const half *)src0->data, (half *)dst->data,
-                T_in, T_out_fd, OC, K, K_OC, s0, p0, total);
-        } break;
-        case GGML_TYPE_BF16: {
-            col2im_1d_kernel<<<num_blocks, block_size, 0, stream>>>(
-                (const nv_bfloat16 *)src0->data, (nv_bfloat16 *)dst->data,
-                T_in, T_out_fd, OC, K, K_OC, s0, p0, total);
-        } break;
-        default:
-            GGML_ABORT("col2im_1d: unsupported type");
-    }
-}
@@ -1,3 +0,0 @@
-#include "common.cuh"
-
-void ggml_cuda_op_col2im_1d(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
@@ -53,10 +53,10 @@ static __global__ void cpy_scalar_transpose(const char * cx, char * cdst, const
    const int64_t nmat = ne / (ne00 * ne01);
    const int64_t n = ne00 * ne01;

-    const int64_t x  = (int64_t) blockIdx.x * CUDA_CPY_TILE_DIM_2D + threadIdx.x;
-    const int64_t y  = (int64_t) blockIdx.y * CUDA_CPY_TILE_DIM_2D + threadIdx.y;
-    const int64_t tx = (int64_t) blockIdx.y * CUDA_CPY_TILE_DIM_2D + threadIdx.x;  // transpose block offset
-    const int64_t ty = (int64_t) blockIdx.x * CUDA_CPY_TILE_DIM_2D + threadIdx.y;
+    const int x = blockIdx.x * CUDA_CPY_TILE_DIM_2D + threadIdx.x;
+    const int y = blockIdx.y * CUDA_CPY_TILE_DIM_2D + threadIdx.y;
+    const int tx = blockIdx.y * CUDA_CPY_TILE_DIM_2D + threadIdx.x;  // transpose block offset
+    const int ty = blockIdx.x * CUDA_CPY_TILE_DIM_2D + threadIdx.y;

    __shared__ float tile[2][CUDA_CPY_TILE_DIM_2D][CUDA_CPY_TILE_DIM_2D+1];
    int cur_tile_buf = 0;
@@ -197,7 +197,7 @@ static void ggml_cpy_scalar_contiguous_cuda(
 cudaStream_t stream) {

    const int64_t num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
-    GGML_ASSERT(num_blocks <= INT_MAX);
+    GGML_ASSERT(num_blocks < UINT_MAX);
    const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params((dim3)num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream);
    ggml_cuda_kernel_launch(cpy_scalar_contiguous<src_t, dst_t>, launch_params, cx, cdst, ne);
 }
@@ -208,14 +208,6 @@ static void ggml_cpy_scalar_cuda(
    const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
    const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream) {

-    const auto launch_scalar_generic = [&]() {
-        const int64_t num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
-        GGML_ASSERT(num_blocks <= INT_MAX);
-        const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params((dim3)num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream);
-        ggml_cuda_kernel_launch(cpy_scalar<cpy_1_scalar<src_t, dst_t>>, launch_params,
-            cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
-    };
-
    if (transposed) {
        GGML_ASSERT(ne == ne00*ne01*ne02);  // ne[3] is 1 assumed
        int64_t ne00n, ne01n, ne02n;
@@ -232,18 +224,20 @@ static void ggml_cpy_scalar_cuda(
        int64_t grid_x = (ne01n + CUDA_CPY_TILE_DIM_2D - 1) / CUDA_CPY_TILE_DIM_2D;
        int64_t grid_y = (ne00n + CUDA_CPY_TILE_DIM_2D - 1) / CUDA_CPY_TILE_DIM_2D;
        int64_t grid_z = (ne/(ne01n*ne00n) + CUDA_CPY_BLOCK_NM - 1) / CUDA_CPY_BLOCK_NM;
-        GGML_ASSERT(grid_x <= INT_MAX);
-        if (grid_y > USHRT_MAX || grid_z > USHRT_MAX) {
-            launch_scalar_generic();
-        } else {
-            dim3 dimGrid(grid_x, grid_y, grid_z);
-            dim3 dimBlock(CUDA_CPY_TILE_DIM_2D, CUDA_CPY_BLOCK_ROWS, 1);
-            const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(dimGrid, dimBlock, 0, stream);
-            ggml_cuda_kernel_launch(cpy_scalar_transpose<dst_t>, launch_params,
-                cx, cdst, ne, ne00n, ne01n, ne02n, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
-        }
+        GGML_ASSERT(grid_x < UINT_MAX);
+        GGML_ASSERT(grid_y < USHRT_MAX);
+        GGML_ASSERT(grid_z < USHRT_MAX);
+        dim3 dimGrid(grid_x, grid_y, grid_z);
+        dim3 dimBlock(CUDA_CPY_TILE_DIM_2D, CUDA_CPY_BLOCK_ROWS, 1);
+        const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(dimGrid, dimBlock, 0, stream);
+        ggml_cuda_kernel_launch(cpy_scalar_transpose<dst_t>, launch_params,
+            cx, cdst, ne, ne00n, ne01n, ne02n, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
    } else {
-        launch_scalar_generic();
+        const int64_t num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+        GGML_ASSERT(num_blocks < UINT_MAX);
+        const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params((dim3)num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream);
+        ggml_cuda_kernel_launch(cpy_scalar<cpy_1_scalar<src_t, dst_t>>, launch_params,
+            cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
    }
 }

@@ -254,7 +248,7 @@ static void ggml_cpy_f32_q8_0_cuda(

    GGML_ASSERT(ne % QK8_0 == 0);
    const int64_t num_blocks = ne / QK8_0;
-    GGML_ASSERT(num_blocks <= INT_MAX);
+    GGML_ASSERT(num_blocks < UINT_MAX);
    cpy_f32_q<cpy_blck_f32_q8_0, QK8_0><<<num_blocks, 1, 0, stream>>>
        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }
@@ -265,7 +259,7 @@ static void ggml_cpy_q8_0_f32_cuda(
    const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream) {

    const int64_t num_blocks = ne;
-    GGML_ASSERT(num_blocks <= INT_MAX);
+    GGML_ASSERT(num_blocks < UINT_MAX);
    cpy_q_f32<cpy_blck_q8_0_f32, QK8_0><<<num_blocks, 1, 0, stream>>>
        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }
@@ -277,7 +271,7 @@ static void ggml_cpy_f32_q4_0_cuda(

    GGML_ASSERT(ne % QK4_0 == 0);
    const int64_t num_blocks = ne / QK4_0;
-    GGML_ASSERT(num_blocks <= INT_MAX);
+    GGML_ASSERT(num_blocks < UINT_MAX);
    cpy_f32_q<cpy_blck_f32_q4_0, QK4_0><<<num_blocks, 1, 0, stream>>>
        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }
@@ -290,7 +284,7 @@ static void ggml_cpy_q4_0_f32_cuda(
    const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13,
    cudaStream_t stream) {
    const int64_t num_blocks = ne;
-    GGML_ASSERT(num_blocks <= INT_MAX);
+    GGML_ASSERT(num_blocks < UINT_MAX);
    cpy_q_f32<cpy_blck_q_f32<dequantize_q4_0, QK4_0>, QK4_0><<<num_blocks, 1, 0, stream>>>(
        cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
         ne10, ne11, ne12, nb10, nb11, nb12, nb13);
@@ -303,7 +297,7 @@ static void ggml_cpy_f32_q4_1_cuda(

    GGML_ASSERT(ne % QK4_1 == 0);
    const int64_t num_blocks = ne / QK4_1;
-    GGML_ASSERT(num_blocks <= INT_MAX);
+    GGML_ASSERT(num_blocks < UINT_MAX);
    cpy_f32_q<cpy_blck_f32_q4_1, QK4_1><<<num_blocks, 1, 0, stream>>>
        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }
@@ -316,7 +310,7 @@ static void ggml_cpy_q4_1_f32_cuda(
    const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13,
    cudaStream_t stream) {
    const int64_t num_blocks = ne;
-    GGML_ASSERT(num_blocks <= INT_MAX);
+    GGML_ASSERT(num_blocks < UINT_MAX);
    cpy_q_f32<cpy_blck_q_f32<dequantize_q4_1, QK4_1>, QK4_1><<<num_blocks, 1, 0, stream>>>(
        cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
         ne10, ne11, ne12, nb10, nb11, nb12, nb13);
@@ -329,7 +323,7 @@ static void ggml_cpy_f32_q5_0_cuda(

    GGML_ASSERT(ne % QK5_0 == 0);
    const int64_t num_blocks = ne / QK5_0;
-    GGML_ASSERT(num_blocks <= INT_MAX);
+    GGML_ASSERT(num_blocks < UINT_MAX);
    cpy_f32_q<cpy_blck_f32_q5_0, QK5_0><<<num_blocks, 1, 0, stream>>>
        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }
@@ -342,7 +336,7 @@ static void ggml_cpy_q5_0_f32_cuda(
    const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13,
    cudaStream_t stream) {
    const int64_t num_blocks = ne;
-    GGML_ASSERT(num_blocks <= INT_MAX);
+    GGML_ASSERT(num_blocks < UINT_MAX);
    cpy_q_f32<cpy_blck_q_f32<dequantize_q5_0, QK5_0>, QK5_0><<<num_blocks, 1, 0, stream>>>(
        cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
        ne10, ne11, ne12, nb10, nb11, nb12, nb13);
@@ -355,7 +349,7 @@ static void ggml_cpy_f32_q5_1_cuda(

    GGML_ASSERT(ne % QK5_1 == 0);
    const int64_t num_blocks = ne / QK5_1;
-    GGML_ASSERT(num_blocks <= INT_MAX);
+    GGML_ASSERT(num_blocks < UINT_MAX);
    cpy_f32_q<cpy_blck_f32_q5_1, QK5_1><<<num_blocks, 1, 0, stream>>>
        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }
@@ -368,7 +362,7 @@ static void ggml_cpy_q5_1_f32_cuda(
    const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13,
    cudaStream_t stream) {
    const int64_t num_blocks = ne;
-    GGML_ASSERT(num_blocks <= INT_MAX);
+    GGML_ASSERT(num_blocks < UINT_MAX);
    cpy_q_f32<cpy_blck_q_f32<dequantize_q5_1, QK5_1>, QK5_1><<<num_blocks, 1, 0, stream>>>(
        cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
        ne10, ne11, ne12, nb10, nb11, nb12, nb13);
@@ -381,51 +375,11 @@ static void ggml_cpy_f32_iq4_nl_cuda(

    GGML_ASSERT(ne % QK4_NL == 0);
    const int64_t num_blocks = ne / QK4_NL;
-    GGML_ASSERT(num_blocks <= INT_MAX);
+    GGML_ASSERT(num_blocks < UINT_MAX);
    cpy_f32_q<cpy_blck_f32_iq4_nl, QK4_NL><<<num_blocks, 1, 0, stream>>>
        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }

-// check if a same-type copy reduces to a 2D strided copy (height rows of width
-// contiguous bytes), so it can use cudaMemcpy2DAsync instead of the scalar kernel
-static bool ggml_cuda_cpy_as_memcpy_2d(const ggml_tensor * src0, const ggml_tensor * src1,
-        size_t & width, size_t & height, size_t & spitch, size_t & dpitch) {
-    // require matching shape: a reshaped copy maps elements by flat order, which the
-    // prefix walk below does not handle
-    if (src0->type != src1->type || !ggml_are_same_shape(src0, src1)) {
-        return false;
-    }
-
-    // grow the contiguous prefix block shared by both tensors
-    size_t block_nb = ggml_element_size(src0);
-    int d = 0;
-    for (; d < GGML_MAX_DIMS; ++d) {
-        if (src0->nb[d] != block_nb || src1->nb[d] != block_nb) {
-            break;
-        }
-        block_nb *= src0->ne[d];
-    }
-
-    // d == 0: nothing contiguous; d == GGML_MAX_DIMS: fully contiguous (handled by memcpy)
-    if (d == 0 || d == GGML_MAX_DIMS) {
-        return false;
-    }
-
-    // dim d carries the rows; everything above it must be a single element
-    for (int i = d + 1; i < GGML_MAX_DIMS; ++i) {
-        if (src0->ne[i] != 1) {
-            return false;
-        }
-    }
-
-    width  = block_nb;
-    height = src0->ne[d];
-    spitch = src0->nb[d];
-    dpitch = src1->nb[d];
-
-    return spitch >= width && dpitch >= width;
-}
-
 void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1) {
    const int64_t ne = ggml_nelements(src0);
    GGML_ASSERT(ne == ggml_nelements(src1));
@@ -461,8 +415,6 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
    const bool can_be_transposed = nb01 == (int64_t)ggml_element_size(src0) &&
        src0->ne[3] == 1 && nb02 == ne00 * ne01 * (int64_t)ggml_element_size(src0);

-    size_t mc_width = 0, mc_height = 0, mc_spitch = 0, mc_dpitch = 0;
-
    if (src0->type == src1->type && contiguous_srcs) {
        GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1));
 #if defined(GGML_USE_MUSA) && defined(GGML_MUSA_MUDNN_COPY)
@@ -473,9 +425,6 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
        {
            CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
        }
-    } else if (ggml_cuda_cpy_as_memcpy_2d(src0, src1, mc_width, mc_height, mc_spitch, mc_dpitch)) {
-        CUDA_CHECK(cudaMemcpy2DAsync(src1_ddc, mc_dpitch, src0_ddc, mc_spitch,
-                                     mc_width, mc_height, cudaMemcpyDeviceToDevice, main_stream));
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
        if (can_be_transposed) {
            ggml_cpy_scalar_cuda<float, float, true>
@@ -11,7 +11,6 @@
 #include "ggml-cuda/argsort.cuh"
 #include "ggml-cuda/binbcast.cuh"
 #include "ggml-cuda/clamp.cuh"
-#include "ggml-cuda/col2im-1d.cuh"
 #include "ggml-cuda/concat.cuh"
 #include "ggml-cuda/conv-transpose-1d.cuh"
 #include "ggml-cuda/conv2d.cuh"
@@ -3052,9 +3051,6 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
        case GGML_OP_CONV_TRANSPOSE_1D:
            ggml_cuda_op_conv_transpose_1d(ctx,dst);
            break;
-        case GGML_OP_COL2IM_1D:
-            ggml_cuda_op_col2im_1d(ctx, dst);
-            break;
        case GGML_OP_POOL_2D:
            ggml_cuda_op_pool2d(ctx, dst);
            break;
@@ -3192,24 +3188,11 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_
    ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
    ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;

-    // Enables async copies from CPU to CUDA, instead of only CUDA-to-CUDA
-    // Excluding this path for HIP and MUSA as a precaution.
-    // According to the summary in https://github.com/ggml-org/llama.cpp/pull/20793#issuecomment-4275794315, this change is not beneficial for hip anyways.
-    // Additionally, there is a lot of anectodal evidence that hip/musa stream behavior might not always 1:1 match CUDA behavior.
-    // e.g. https://github.com/ROCm/rocm-systems/issues/5109
-    // It thus makes sense to exclude this path for HIP and MUSA. This PR was not aimed these backends, the majority of testing happened on CUDA.
-    // This can be revisited in the future if enabling copy_from_host benefits hip/MUSA, and if the PR author can extensively test on these backends.
-#if defined(GGML_USE_HIP) || defined(GGML_USE_MUSA)
-    const bool copy_from_host = false;
-#else
-    const bool copy_from_host = ggml_backend_buffer_is_host(buf_src) && ggml_backend_dev_type(backend_src->device) == GGML_BACKEND_DEVICE_TYPE_CPU;
-#endif
-
-    if (!(copy_from_host || ggml_backend_is_cuda(backend_src)) || !ggml_backend_is_cuda(backend_dst)) {
+    if (!ggml_backend_is_cuda(backend_src) || !ggml_backend_is_cuda(backend_dst)) {
        return false;
    }

-    if (!(copy_from_host || ggml_backend_buffer_is_cuda(buf_src)) || !ggml_backend_buffer_is_cuda(buf_dst)) {
+    if (!ggml_backend_buffer_is_cuda(buf_src) || !ggml_backend_buffer_is_cuda(buf_dst)) {
        return false;
    }

@@ -3220,17 +3203,14 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_
    ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *) buf_src->context;
    ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *) buf_dst->context;

-    if ((copy_from_host && cuda_ctx_dst->device != buf_ctx_dst->device) ||
-        !copy_from_host && (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device)) {
+    if (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device) {
 #ifndef NDEBUG
        GGML_LOG_DEBUG("%s: backend and buffer devices do not match\n", __func__);
 #endif // NDEBUG
        return false;
    }

-    if (copy_from_host) {
-        CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyHostToDevice, cuda_ctx_dst->stream()));
-    } else if (backend_src != backend_dst) {
+    if (backend_src != backend_dst) {
        // copy on src stream
        if (cuda_ctx_src->device == cuda_ctx_dst->device) {
            CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));
@@ -5336,21 +5316,13 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
                }
                return false;
            } break;
-        case GGML_OP_COL2IM_1D:
-            {
-                ggml_type src0_type = op->src[0]->type;
-                return (src0_type == GGML_TYPE_F32 || src0_type == GGML_TYPE_F16 || src0_type == GGML_TYPE_BF16) &&
-                    op->type == src0_type &&
-                    ggml_is_contiguous(op->src[0]) &&
-                    ggml_is_contiguous(op);
-            } break;
        case GGML_OP_SILU_BACK:
            return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
            break;
        case GGML_OP_NORM:
        case GGML_OP_RMS_NORM:
        case GGML_OP_L2_NORM:
-            return ggml_is_contiguous_rows(op->src[0]);
+            return true;
        case GGML_OP_RMS_NORM_BACK:
            return ggml_is_contiguous(op->src[0]);
            break;
@@ -2,28 +2,6 @@

 #include <cstdint>

-static __global__ void k_compute_out_prod_ptrs(
-        const float * src0_d, const float * src1_d, float * dst_d,
-        const float ** ptrs_a, const float ** ptrs_b, float ** ptrs_c,
-        const int64_t ne2, const int64_t ne3,
-        const int64_t dps2, const int64_t dps3,
-        const size_t s02, const size_t s03,
-        const size_t s12, const size_t s13,
-        const size_t s2,  const size_t s3) {
-    const int64_t i2 = blockIdx.x*blockDim.x + threadIdx.x;
-    const int64_t i3 = blockIdx.y*blockDim.y + threadIdx.y;
-
-    if (i2 >= ne2 || i3 >= ne3) {
-        return;
-    }
-
-    const int64_t idx = i3*ne2 + i2;
-
-    ptrs_a[idx] = src0_d + (i3/dps3)*s03 + (i2/dps2)*s02;
-    ptrs_b[idx] = src1_d +  i3      *s13 +  i2      *s12;
-    ptrs_c[idx] = dst_d  +  i3      *s3  +  i2      *s2;
-}
-
 void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const ggml_tensor * src1 = dst->src[1];
@@ -89,39 +67,18 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
                        &beta,  dst_d  +  i3     *s3,  ldc, s2,
                        batch_count));
        }
-    } else if (ne2 > 1 || ne3 > 1) {
-        // dps2 > 1 (src0 broadcast along dim 2 with non-uniform stride) or multiple GEMMs
-        // along dim 3: compute per-GEMM pointers on the device and use a single batched GEMM.
-        GGML_ASSERT(ne3 > 0);
-        GGML_ASSERT(ne2 <= (int64_t) std::numeric_limits<int>::max() / ne3);
-        const int batch_count = (int) (ne2 * ne3);
-
-        ggml_cuda_pool_alloc<const float *> ptrs_a(ctx.pool(), batch_count);
-        ggml_cuda_pool_alloc<const float *> ptrs_b(ctx.pool(), batch_count);
-        ggml_cuda_pool_alloc<      float *> ptrs_c(ctx.pool(), batch_count);
-
-        const dim3 block_dims(16, 16);
-        const dim3 grid_dims((ne2 + block_dims.x - 1)/block_dims.x, (ne3 + block_dims.y - 1)/block_dims.y);
-        k_compute_out_prod_ptrs<<<grid_dims, block_dims, 0, stream>>>(
-            src0_d, src1_d, dst_d,
-            ptrs_a.get(), ptrs_b.get(), ptrs_c.get(),
-            ne2, ne3, dps2, dps3, s02, s03, s12, s13, s2, s3);
-        CUDA_CHECK(cudaGetLastError());
-
-        CUBLAS_CHECK(
-            cublasSgemmBatched(handle, CUBLAS_OP_N, src1_cublas_op,
-                    ne0, ne1, ne01,
-                    &alpha, ptrs_a.get(), lda,
-                            ptrs_b.get(), ldb,
-                    &beta,  ptrs_c.get(), ldc,
-                    batch_count));
    } else {
-        // ne2 == 1 && ne3 == 1: single GEMM
-        CUBLAS_CHECK(
-            cublasSgemm(handle, CUBLAS_OP_N, src1_cublas_op,
-                    ne0, ne1, ne01,
-                    &alpha, src0_d, lda,
-                            src1_d, ldb,
-                    &beta,  dst_d,  ldc));
+        // Fallback: ne2 == 1 (no batching benefit) or dps2 > 1 (src0 broadcast along dim 2
+        // with non-uniform stride; would need cublasSgemmBatched with pointer arrays).
+        for (int64_t i3 = 0; i3 < ne3; ++i3) {
+            for (int64_t i2 = 0; i2 < ne2; ++i2) {
+                CUBLAS_CHECK(
+                    cublasSgemm(handle, CUBLAS_OP_N, src1_cublas_op,
+                            ne0, ne1, ne01,
+                            &alpha, src0_d + (i3/dps3)*s03 + (i2/dps2)*s02, lda,
+                                    src1_d +  i3      *s13 +  i2      *s12, ldb,
+                            &beta,  dst_d  +  i3      *s3  +  i2      *s2,  ldc));
+            }
+        }
    }
 }
@@ -48,7 +48,6 @@
 #define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
 #define cublasSetStream hipblasSetStream
 #define cublasSgemm hipblasSgemm
-#define cublasSgemmBatched hipblasSgemmBatched
 #define cublasSgemmStridedBatched hipblasSgemmStridedBatched
 #define cublasStatus_t hipblasStatus_t
 #define cublasOperation_t hipblasOperation_t
@@ -32,7 +32,6 @@
 #define cublasSetMathMode mublasSetMathMode
 #define cublasSetStream mublasSetStream
 #define cublasSgemm mublasSgemm
-#define cublasSgemmBatched mublasSgemmBatched
 #define cublasSgemmStridedBatched mublasSgemmStridedBatched
 #define cublasStatus_t mublasStatus_t
 #define cublasOperation_t mublasOperation_t
@@ -25,6 +25,7 @@ include(ExternalProject)
 option(GGML_HEXAGON_HTP_DEBUG  "ggml-hexagon: enable HTP debug output" OFF)
 option(GGML_HEXAGON_FA_EXP2_HF "ggml-hexagon: use FP16 exp2 polynomial in FA softmax instead of F32 exp round-trip" OFF)
 set(GGML_HEXAGON_HTP_CERT  "$ENV{HEXAGON_HTP_CERT}" CACHE PATH "ggml-hexagon: enable HTP library signing using certificate")
+set(GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE 128 CACHE STRING "ggml-hexagon: quantize group size (32, 64, or 128)")

 add_library(htp_iface OBJECT
    ${CMAKE_CURRENT_BINARY_DIR}/htp_iface_stub.c)
@@ -71,12 +72,15 @@ function(build_htp_skel V)
            -DHEXAGON_SDK_ROOT=${HEXAGON_SDK_ROOT}
            -DHEXAGON_TOOLS_ROOT=${HEXAGON_TOOLS_ROOT}
            -DHEXAGON_HTP_DEBUG=${GGML_HEXAGON_HTP_DEBUG}
+            -DGGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE=${GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE}
            -DDSP_VERSION=${V}
            -DPREBUILT_LIB_DIR="toolv19_${V}")
    list(APPEND HTP_SKELS ${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-${V}.so)
    set(HTP_SKELS ${HTP_SKELS} PARENT_SCOPE)
 endfunction()

+build_htp_skel(v68)
+build_htp_skel(v69)
 build_htp_skel(v73)
 build_htp_skel(v75)
 build_htp_skel(v79)
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Ruben Ortlam	37db4fa4be	improve test	2026-06-17 17:42:56 +02:00
Ruben Ortlam	e804ed3fbe	tests: add backend copy test	2026-06-17 16:04:35 +02:00