wip

Merge branch 'master' into pr/23398
2026-06-30 17:47:40 +02:00 · 2026-06-06 16:30:41 +03:00 · 2026-06-06 10:48:36 +03:00 · 2026-06-05 17:47:19 +03:00 · 2026-06-05 14:39:03 +03:00 · 2026-06-05 14:38:41 +03:00
732 changed files with 30286 additions and 77315 deletions
@@ -13,20 +13,6 @@ ARG APP_REVISION=N/A
 # BUILD STAGE
 # Compile all binary files and libraries
 # ==============================================================================
-ARG NODE_VERSION=24
-
-FROM docker.io/node:$NODE_VERSION AS web
-
-ARG APP_VERSION
-
-WORKDIR /app/tools/ui
-
-COPY tools/ui/package.json tools/ui/package-lock.json ./
-RUN npm ci
-
-COPY tools/ui/ ./
-RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
-
 FROM ${CANN_BASE_IMAGE} AS build

 # -- Install build dependencies --
@@ -40,8 +26,6 @@ WORKDIR /app
 # -- Copy project files --
 COPY . .

-COPY --from=web /app/tools/ui/dist tools/ui/dist
-
 # -- Set CANN environment variables (required for compilation) --
 # Using ENV instead of `source` allows environment variables to persist across the entire image layer
 ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
@@ -145,7 +129,7 @@ ENTRYPOINT ["/app/tools.sh"]
 # ==============================================================================
 FROM base AS light

-COPY --from=build /app/full/llama /app/full/llama-cli /app/full/llama-completion /app
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app

 ENTRYPOINT [ "/app/llama-cli" ]

@@ -156,7 +140,7 @@ FROM base AS server

 ENV LLAMA_ARG_HOST=0.0.0.0

-COPY --from=build /app/full/llama /app/full/llama-server /app
+COPY --from=build /app/full/llama-server /app

 HEALTHCHECK --interval=5m CMD [ "curl", "-f", "http://localhost:8080/health" ]

@@ -3,21 +3,7 @@ ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
 ARG APP_REVISION=N/A

-ARG NODE_VERSION=24
-
-FROM docker.io/node:$NODE_VERSION AS web
-
-ARG APP_VERSION
-
-WORKDIR /app/tools/ui
-
-COPY tools/ui/package.json tools/ui/package-lock.json ./
-RUN npm ci
-
-COPY tools/ui/ ./
-RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
-
-FROM docker.io/ubuntu:$UBUNTU_VERSION AS build
+FROM ubuntu:$UBUNTU_VERSION AS build

 ARG TARGETARCH

@@ -30,8 +16,6 @@ WORKDIR /app

 COPY . .

-COPY --from=web /app/tools/ui/dist tools/ui/dist
-
 RUN if [ "$TARGETARCH" = "amd64" ] || [ "$TARGETARCH" = "arm64" ]; then \
        cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON; \
    else \
@@ -53,7 +37,7 @@ RUN mkdir -p /app/full \
    && cp .devops/tools.sh /app/full/tools.sh

 ## Base image
-FROM docker.io/ubuntu:$UBUNTU_VERSION AS base
+FROM ubuntu:$UBUNTU_VERSION AS base

 ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
@@ -69,7 +53,7 @@ LABEL org.opencontainers.image.created=$BUILD_DATE \
      org.opencontainers.image.source=$IMAGE_SOURCE

 RUN apt-get update \
-    && apt-get install -y libgomp1 curl ffmpeg \
+    && apt-get install -y libgomp1 curl \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
@@ -104,7 +88,7 @@ ENTRYPOINT ["/app/tools.sh"]
 ### Light, CLI only
 FROM base AS light

-COPY --from=build /app/full/llama /app/full/llama-cli /app/full/llama-completion /app
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app

 WORKDIR /app

@@ -115,7 +99,7 @@ FROM base AS server

 ENV LLAMA_ARG_HOST=0.0.0.0

-COPY --from=build /app/full/llama /app/full/llama-server /app
+COPY --from=build /app/full/llama-server /app

 WORKDIR /app

@@ -1,47 +1,29 @@
 ARG UBUNTU_VERSION=24.04
 # This needs to generally match the container host's environment.
 ARG CUDA_VERSION=12.8.1
-ARG GCC_VERSION=14
 # Target the CUDA build image
-ARG BASE_CUDA_DEV_CONTAINER=docker.io/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}

-ARG BASE_CUDA_RUN_CONTAINER=docker.io/nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
+ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}

 ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
 ARG APP_REVISION=N/A

-ARG NODE_VERSION=24
-
-FROM docker.io/node:$NODE_VERSION AS web
-
-ARG APP_VERSION
-
-WORKDIR /app/tools/ui
-
-COPY tools/ui/package.json tools/ui/package-lock.json ./
-RUN npm ci
-
-COPY tools/ui/ ./
-RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
-
 FROM ${BASE_CUDA_DEV_CONTAINER} AS build

-ARG GCC_VERSION
 # CUDA architecture to build for (defaults to all supported archs)
 ARG CUDA_DOCKER_ARCH=default

 RUN apt-get update && \
-    apt-get install -y gcc-${GCC_VERSION} g++-${GCC_VERSION} build-essential cmake python3 python3-pip git libssl-dev libgomp1
+    apt-get install -y gcc-14 g++-14 build-essential cmake python3 python3-pip git libssl-dev libgomp1

-ENV CC=gcc-${GCC_VERSION} CXX=g++-${GCC_VERSION} CUDAHOSTCXX=g++-${GCC_VERSION}
+ENV CC=gcc-14 CXX=g++-14 CUDAHOSTCXX=g++-14

 WORKDIR /app

 COPY . .

-COPY --from=web /app/tools/ui/dist tools/ui/dist
-
 RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
    export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
    fi && \
@@ -77,7 +59,7 @@ LABEL org.opencontainers.image.created=$BUILD_DATE \
      org.opencontainers.image.source=$IMAGE_SOURCE

 RUN apt-get update \
-    && apt-get install -y libgomp1 curl ffmpeg \
+    && apt-get install -y libgomp1 curl \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
@@ -113,7 +95,7 @@ ENTRYPOINT ["/app/tools.sh"]
 ### Light, CLI only
 FROM base AS light

-COPY --from=build /app/full/llama /app/full/llama-cli /app/full/llama-completion /app
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app

 WORKDIR /app

@@ -124,7 +106,7 @@ FROM base AS server

 ENV LLAMA_ARG_HOST=0.0.0.0

-COPY --from=build /app/full/llama /app/full/llama-server /app
+COPY --from=build /app/full/llama-server /app

 WORKDIR /app

@@ -5,23 +5,9 @@ ARG APP_REVISION=N/A

 ## Build Image

-ARG NODE_VERSION=24
+FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS build

-FROM docker.io/node:$NODE_VERSION AS web
-
-ARG APP_VERSION
-
-WORKDIR /app/tools/ui
-
-COPY tools/ui/package.json tools/ui/package-lock.json ./
-RUN npm ci
-
-COPY tools/ui/ ./
-RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
-
-FROM docker.io/intel/deep-learning-essentials:$ONEAPI_VERSION AS build
-
-ARG GGML_SYCL_F16=ON
+ARG GGML_SYCL_F16=OFF
 ARG LEVEL_ZERO_VERSION=1.28.2
 ARG LEVEL_ZERO_UBUNTU_VERSION=u24.04
 RUN apt-get update && \
@@ -36,12 +22,9 @@ WORKDIR /app

 COPY . .

-COPY --from=web /app/tools/ui/dist tools/ui/dist
-
 RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
        echo "GGML_SYCL_F16 is set" \
-        && export OPT_SYCL_F16="-DGGML_SYCL_F16=ON" \
-        && export SYCL_PROGRAM_COMPILE_OPTIONS="-cl-fp32-correctly-rounded-divide-sqrt"; \
+        && export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
    fi && \
    echo "Building with dynamic libs" && \
    cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_BUILD_TESTS=OFF ${OPT_SYCL_F16} && \
@@ -59,7 +42,7 @@ RUN mkdir -p /app/full \
    && cp requirements.txt /app/full \
    && cp .devops/tools.sh /app/full/tools.sh

-FROM docker.io/intel/deep-learning-essentials:$ONEAPI_VERSION AS base
+FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS base

 ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
@@ -74,21 +57,11 @@ LABEL org.opencontainers.image.created=$BUILD_DATE \
      org.opencontainers.image.url=$IMAGE_URL \
      org.opencontainers.image.source=$IMAGE_SOURCE

-#Following versions are for multiple GPUs, since 26.x has known issue:
-#   https://github.com/ggml-org/llama.cpp/issues/21747,
-#   https://github.com/intel/compute-runtime/issues/921.
-#ARG IGC_VERSION=v2.20.5
-#ARG IGC_VERSION_FULL=2_2.20.5+19972
-#ARG COMPUTE_RUNTIME_VERSION=25.40.35563.10
-#ARG COMPUTE_RUNTIME_VERSION_FULL=25.40.35563.10-0
-#ARG IGDGMM_VERSION=22.8.2
-
-
-ARG IGC_VERSION=v2.34.4
-ARG IGC_VERSION_FULL=2_2.34.4+21428
-ARG COMPUTE_RUNTIME_VERSION=26.18.38308.1
-ARG COMPUTE_RUNTIME_VERSION_FULL=26.18.38308.1-0
-ARG IGDGMM_VERSION=22.10.0
+ARG IGC_VERSION=v2.20.5
+ARG IGC_VERSION_FULL=2_2.20.5+19972
+ARG COMPUTE_RUNTIME_VERSION=25.40.35563.10
+ARG COMPUTE_RUNTIME_VERSION_FULL=25.40.35563.10-0
+ARG IGDGMM_VERSION=22.8.2
 RUN mkdir /tmp/neo/ && cd /tmp/neo/ \
  && wget https://github.com/intel/intel-graphics-compiler/releases/download/$IGC_VERSION/intel-igc-core-${IGC_VERSION_FULL}_amd64.deb \
  && wget https://github.com/intel/intel-graphics-compiler/releases/download/$IGC_VERSION/intel-igc-opencl-${IGC_VERSION_FULL}_amd64.deb \
@@ -102,7 +75,7 @@ RUN mkdir /tmp/neo/ && cd /tmp/neo/ \
  && dpkg --install *.deb

 RUN apt-get update \
-    && apt-get install -y libgomp1 curl ffmpeg \
+    && apt-get install -y libgomp1 curl \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
@@ -141,7 +114,7 @@ ENTRYPOINT ["/app/tools.sh"]
 FROM base AS light

 COPY --from=build /app/lib/ /app
-COPY --from=build /app/full/llama /app/full/llama-cli /app/full/llama-completion /app
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app

 WORKDIR /app

@@ -153,7 +126,7 @@ FROM base AS server
 ENV LLAMA_ARG_HOST=0.0.0.0

 COPY --from=build /app/lib/ /app
-COPY --from=build /app/full/llama /app/full/llama-server /app
+COPY --from=build /app/full/llama-server /app

 WORKDIR /app

@@ -3,7 +3,7 @@ ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
 ARG APP_REVISION=N/A

-FROM docker.io/ascendai/cann:$ASCEND_VERSION AS build
+FROM ascendai/cann:$ASCEND_VERSION AS build

 WORKDIR /app

@@ -30,7 +30,7 @@ RUN echo "Building with static libs" && \
    cmake --build build --config Release --target llama-completion

 # TODO: use image with NNRT
-FROM docker.io/ascendai/cann:$ASCEND_VERSION AS runtime
+FROM ascendai/cann:$ASCEND_VERSION AS runtime

 ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
@@ -2,28 +2,14 @@ ARG UBUNTU_VERSION=22.04
 # This needs to generally match the container host's environment.
 ARG MUSA_VERSION=rc4.3.0
 # Target the MUSA build image
-ARG BASE_MUSA_DEV_CONTAINER=docker.io/mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}-amd64
+ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}-amd64

-ARG BASE_MUSA_RUN_CONTAINER=docker.io/mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}-amd64
+ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}-amd64

 ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
 ARG APP_REVISION=N/A

-ARG NODE_VERSION=24
-
-FROM docker.io/node:$NODE_VERSION AS web
-
-ARG APP_VERSION
-
-WORKDIR /app/tools/ui
-
-COPY tools/ui/package.json tools/ui/package-lock.json ./
-RUN npm ci
-
-COPY tools/ui/ ./
-RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
-
 FROM ${BASE_MUSA_DEV_CONTAINER} AS build

 # MUSA architecture to build for (defaults to all supported archs)
@@ -43,8 +29,6 @@ WORKDIR /app

 COPY . .

-COPY --from=web /app/tools/ui/dist tools/ui/dist
-
 RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
        export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
    fi && \
@@ -80,7 +64,7 @@ LABEL org.opencontainers.image.created=$BUILD_DATE \
      org.opencontainers.image.source=$IMAGE_SOURCE

 RUN apt-get update \
-    && apt-get install -y libgomp1 curl ffmpeg \
+    && apt-get install -y libgomp1 curl \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
@@ -115,7 +99,7 @@ ENTRYPOINT ["/app/tools.sh"]
 ### Light, CLI only
 FROM base AS light

-COPY --from=build /app/full/llama /app/full/llama-cli /app/full/llama-completion /app
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app

 WORKDIR /app

@@ -126,7 +110,7 @@ FROM base AS server

 ENV LLAMA_ARG_HOST=0.0.0.0

-COPY --from=build /app/full/llama /app/full/llama-server /app
+COPY --from=build /app/full/llama-server /app

 WORKDIR /app

@@ -1,17 +1,17 @@
-ARG OPENVINO_VERSION_MAJOR=2026.2.1
-ARG OPENVINO_VERSION_FULL=2026.2.1.21919.ede283a88e3
+ARG OPENVINO_VERSION_MAJOR=2026.0
+ARG OPENVINO_VERSION_FULL=2026.0.0.20965.c6d6a13a886
 ARG UBUNTU_VERSION=24.04

 # Intel GPU driver versions. https://github.com/intel/compute-runtime/releases
-ARG IGC_VERSION=v2.36.3
-ARG IGC_VERSION_FULL=2_2.36.3+21719
-ARG COMPUTE_RUNTIME_VERSION=26.22.38646.4
-ARG COMPUTE_RUNTIME_VERSION_FULL=26.22.38646.4-0
-ARG IGDGMM_VERSION=22.10.0
+ARG IGC_VERSION=v2.30.1
+ARG IGC_VERSION_FULL=2_2.30.1+20950
+ARG COMPUTE_RUNTIME_VERSION=26.09.37435.1
+ARG COMPUTE_RUNTIME_VERSION_FULL=26.09.37435.1-0
+ARG IGDGMM_VERSION=22.9.0

 # Intel NPU driver versions. https://github.com/intel/linux-npu-driver/releases
-ARG NPU_DRIVER_VERSION=v1.33.0
-ARG NPU_DRIVER_FULL=v1.33.0.20260529-26625960453
+ARG NPU_DRIVER_VERSION=v1.32.0
+ARG NPU_DRIVER_FULL=v1.32.0.20260402-23905121947
 ARG LIBZE1_VERSION=1.27.0-1~24.04~ppa2

 # Optional proxy build arguments
@@ -22,22 +22,8 @@ ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
 ARG APP_REVISION=N/A

-ARG NODE_VERSION=24
-
-FROM docker.io/node:$NODE_VERSION AS web
-
-ARG APP_VERSION
-
-WORKDIR /app/tools/ui
-
-COPY tools/ui/package.json tools/ui/package-lock.json ./
-RUN npm ci
-
-COPY tools/ui/ ./
-RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
-
 ## Build Image
-FROM docker.io/ubuntu:${UBUNTU_VERSION} AS build
+FROM ubuntu:${UBUNTU_VERSION} AS build

 # Pass proxy args to build stage
 ARG http_proxy
@@ -60,18 +46,13 @@ RUN apt-get update && \
        intel-opencl-icd && \
    rm -rf /var/lib/apt/lists/*

-# OpenVINO toolkit and GPU/NPU drivers are cached via BuildKit cache mounts to avoid re-downloading on rebuilds.
-# Install OpenVINO for Ubuntu 24.04.
+# Install OpenVINO for Ubuntu 24.04
 ARG OPENVINO_VERSION_MAJOR
 ARG OPENVINO_VERSION_FULL
-RUN --mount=type=cache,target=/var/cache/openvino,sharing=locked \
-    mkdir -p /opt/intel && \
-    TGZ=/var/cache/openvino/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz && \
-    if [ ! -f "$TGZ" ]; then \
-        wget -O "$TGZ" https://storage.openvinotoolkit.org/repositories/openvino/packages/${OPENVINO_VERSION_MAJOR}/linux/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz; \
-    fi && \
-    tar -xf "$TGZ" -C /opt/intel/ && \
-    mv /opt/intel/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64 /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} && \
+RUN mkdir -p /opt/intel && \
+    wget https://storage.openvinotoolkit.org/repositories/openvino/packages/${OPENVINO_VERSION_MAJOR}/linux/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz && \
+    tar -xf openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz && \
+    mv openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64 /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} && \
    cd /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} && \
    echo "Y" | ./install_dependencies/install_openvino_dependencies.sh && \
    cd - && \
@@ -83,20 +64,18 @@ WORKDIR /app

 COPY . .

-COPY --from=web /app/tools/ui/dist tools/ui/dist
-
 # Build Stage
 RUN bash -c "source ${OpenVINO_DIR}/setupvars.sh && \
    cmake -B build/ReleaseOV -G Ninja \
        -DCMAKE_BUILD_TYPE=Release \
-        -DLLAMA_BUILD_TESTS=OFF \
        -DGGML_OPENVINO=ON && \
-    cmake --build build/ReleaseOV --parallel "
+    cmake --build build/ReleaseOV -j$(nproc)"

-# Copy all necessary libraries (build outputs + OpenVINO runtime libs)
+# Copy all necessary libraries
 RUN mkdir -p /app/lib && \
-    find build/ReleaseOV -name '*.so*' -exec cp -P {} /app/lib \; && \
-    find "${OpenVINO_DIR}/runtime/lib/intel64" -name '*.so*' -exec cp -P {} /app/lib \;
+    find build/ReleaseOV -name '*.so*' -exec cp {} /app/lib \; && \
+    find ${OpenVINO_DIR}/runtime/lib/intel64 -name '*.so*' -exec cp -P {} /app/lib \; 2>/dev/null || \
+    find ${OpenVINO_DIR}/lib/intel64 -name '*.so*' -exec cp -P {} /app/lib \;

 # Create runtime directories and copy binaries
 RUN mkdir -p /app/full \
@@ -109,7 +88,7 @@ RUN mkdir -p /app/full \
    && cp .devops/tools.sh /app/full/tools.sh

 ## Base Runtime Image
-FROM docker.io/ubuntu:${UBUNTU_VERSION} AS base
+FROM ubuntu:${UBUNTU_VERSION} AS base

 # Pass proxy args to runtime stage
 ARG http_proxy
@@ -128,7 +107,7 @@ LABEL org.opencontainers.image.created=$BUILD_DATE \
      org.opencontainers.image.source=$IMAGE_SOURCE

 RUN apt-get update \
-    && apt-get install -y libgomp1 libtbb12 curl wget ffmpeg ocl-icd-libopencl1 \
+    && apt-get install -y libgomp1 libtbb12 curl wget ocl-icd-libopencl1 \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
@@ -141,41 +120,33 @@ ARG IGC_VERSION_FULL
 ARG COMPUTE_RUNTIME_VERSION
 ARG COMPUTE_RUNTIME_VERSION_FULL
 ARG IGDGMM_VERSION
-RUN --mount=type=cache,target=/var/cache/intel-gpu,sharing=locked \
-    set -eux; \
-    cd /var/cache/intel-gpu; \
-    for url in \
-        https://github.com/intel/intel-graphics-compiler/releases/download/${IGC_VERSION}/intel-igc-core-${IGC_VERSION_FULL}_amd64.deb \
-        https://github.com/intel/intel-graphics-compiler/releases/download/${IGC_VERSION}/intel-igc-opencl-${IGC_VERSION_FULL}_amd64.deb \
-        https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-ocloc_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
-        https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-opencl-icd_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
-        https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libigdgmm12_${IGDGMM_VERSION}_amd64.deb \
-        https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libze-intel-gpu1_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb ; do \
-        f=$(basename "$url"); \
-        [ -f "$f" ] || wget -q -O "$f" "$url"; \
-    done; \
-    apt-get update; \
-    apt-get install -y --no-install-recommends ./*.deb; \
-    rm -rf /var/lib/apt/lists/*
+RUN mkdir /tmp/neo/ && cd /tmp/neo/ \
+    && wget https://github.com/intel/intel-graphics-compiler/releases/download/${IGC_VERSION}/intel-igc-core-${IGC_VERSION_FULL}_amd64.deb \
+    && wget https://github.com/intel/intel-graphics-compiler/releases/download/${IGC_VERSION}/intel-igc-opencl-${IGC_VERSION_FULL}_amd64.deb \
+    && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-ocloc-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
+    && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-ocloc_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
+    && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-opencl-icd-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
+    && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-opencl-icd_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
+    && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libigdgmm12_${IGDGMM_VERSION}_amd64.deb \
+    && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libze-intel-gpu1-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
+    && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libze-intel-gpu1_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
+    && dpkg --install *.deb \
+    && rm -rf /tmp/neo/

 # Install NPU drivers
 ARG NPU_DRIVER_VERSION
 ARG NPU_DRIVER_FULL
 ARG LIBZE1_VERSION
-RUN --mount=type=cache,target=/var/cache/intel-npu,sharing=locked \
-    set -eux; \
-    TGZ=/var/cache/intel-npu/linux-npu-driver-${NPU_DRIVER_FULL}-ubuntu2404.tar.gz; \
-    if [ ! -f "$TGZ" ]; then \
-        wget -q -O "$TGZ" https://github.com/intel/linux-npu-driver/releases/download/${NPU_DRIVER_VERSION}/linux-npu-driver-${NPU_DRIVER_FULL}-ubuntu2404.tar.gz; \
-    fi; \
-    DEB=/var/cache/intel-npu/libze1_${LIBZE1_VERSION}_amd64.deb; \
-    if [ ! -f "$DEB" ]; then \
-        wget -q -O "$DEB" https://snapshot.ppa.launchpadcontent.net/kobuk-team/intel-graphics/ubuntu/20260324T100000Z/pool/main/l/level-zero-loader/libze1_${LIBZE1_VERSION}_amd64.deb; \
-    fi; \
-    mkdir /tmp/npu/ && cd /tmp/npu/ && tar -xf "$TGZ" && cp "$DEB" .; \
-    apt-get update; \
-    apt-get install -y --no-install-recommends ./*.deb; \
-    rm -rf /tmp/npu/ /var/lib/apt/lists/*
+RUN mkdir /tmp/npu/ && cd /tmp/npu/ \
+    && wget https://github.com/intel/linux-npu-driver/releases/download/${NPU_DRIVER_VERSION}/linux-npu-driver-${NPU_DRIVER_FULL}-ubuntu2404.tar.gz \
+    && tar -xf linux-npu-driver-${NPU_DRIVER_FULL}-ubuntu2404.tar.gz \
+    && dpkg --install *.deb \
+    && rm -rf /tmp/npu/
+
+RUN cd /tmp \
+    && wget https://snapshot.ppa.launchpadcontent.net/kobuk-team/intel-graphics/ubuntu/20260324T100000Z/pool/main/l/level-zero-loader/libze1_${LIBZE1_VERSION}_amd64.deb \
+    && dpkg --install libze1_${LIBZE1_VERSION}_amd64.deb \
+    && rm libze1_${LIBZE1_VERSION}_amd64.deb

 COPY --from=build /app/lib/ /app/

@@ -195,26 +166,22 @@ RUN apt-get update && \
    python3 \
    python3-venv \
    python3-pip && \
-    python3 -m venv /openvino-venv && \
-    /openvino-venv/bin/pip install --no-cache-dir --upgrade pip setuptools wheel && \
-    /openvino-venv/bin/pip install --no-cache-dir -r requirements.txt && \
+    python3 -m venv /ov-venv && \
+    /ov-venv/bin/pip install --no-cache-dir --upgrade pip setuptools wheel && \
+    /ov-venv/bin/pip install --no-cache-dir -r requirements.txt && \
    apt-get autoremove -y && \
    apt-get clean && \
    rm -rf /tmp/* /var/tmp/* && \
    find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
    find /var/cache -type f -delete

-# Activate the venv
-ENV VIRTUAL_ENV=/openvino-venv \
-    PATH=/openvino-venv/bin:$PATH
-
-ENTRYPOINT ["/app/tools.sh"]
+ENTRYPOINT ["/bin/bash", "-c", "source /ov-venv/bin/activate && exec /app/tools.sh \"$@\"", "--"]


 ### Light, CLI only
 FROM base AS light

-COPY --from=build /app/full/llama /app/full/llama-cli /app/full/llama-completion /app/
+COPY --from=build /app/full/llama-cli /app/

 WORKDIR /app

@@ -225,7 +192,7 @@ FROM base AS server

 ENV LLAMA_ARG_HOST=0.0.0.0

-COPY --from=build /app/full/llama /app/full/llama-server /app/
+COPY --from=build /app/full/llama-server /app/

 WORKDIR /app

@@ -5,26 +5,12 @@ ARG ROCM_VERSION=7.2.1
 ARG AMDGPU_VERSION=7.2.1

 # Target the ROCm build image
-ARG BASE_ROCM_DEV_CONTAINER=docker.io/rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
+ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete

 ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
 ARG APP_REVISION=N/A

-ARG NODE_VERSION=24
-
-FROM docker.io/node:$NODE_VERSION AS web
-
-ARG APP_VERSION
-
-WORKDIR /app/tools/ui
-
-COPY tools/ui/package.json tools/ui/package-lock.json ./
-RUN npm ci
-
-COPY tools/ui/ ./
-RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
-
 ### Build image
 FROM ${BASE_ROCM_DEV_CONTAINER} AS build

@@ -52,8 +38,6 @@ WORKDIR /app

 COPY . .

-COPY --from=web /app/tools/ui/dist tools/ui/dist
-
 RUN HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
    cmake -S . -B build \
        -DGGML_HIP=ON \
@@ -92,7 +76,7 @@ LABEL org.opencontainers.image.created=$BUILD_DATE \
      org.opencontainers.image.source=$IMAGE_SOURCE

 RUN apt-get update \
-    && apt-get install -y libgomp1 curl ffmpeg \
+    && apt-get install -y libgomp1 curl \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
@@ -127,7 +111,7 @@ ENTRYPOINT ["/app/tools.sh"]
 ### Light, CLI only
 FROM base AS light

-COPY --from=build /app/full/llama /app/full/llama-cli /app/full/llama-completion /app
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app

 WORKDIR /app

@@ -138,7 +122,7 @@ FROM base AS server

 ENV LLAMA_ARG_HOST=0.0.0.0

-COPY --from=build /app/full/llama /app/full/llama-server /app
+COPY --from=build /app/full/llama-server /app

 WORKDIR /app

@@ -5,7 +5,7 @@ ARG APP_VERSION=N/A
 ARG APP_REVISION=N/A

 ### Build Llama.cpp stage
-FROM docker.io/gcc:${GCC_VERSION} AS build
+FROM gcc:${GCC_VERSION} AS build

 RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    --mount=type=cache,target=/var/lib/apt/lists,sharing=locked \
@@ -55,7 +55,7 @@ COPY --from=build /opt/llama.cpp/conversion /llama.cpp/conversion


 ### Base image
-FROM docker.io/ubuntu:${UBUNTU_VERSION} AS base
+FROM ubuntu:${UBUNTU_VERSION} AS base

 ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
@@ -124,7 +124,7 @@ WORKDIR /llama.cpp/bin

 # Copy llama.cpp binaries and libraries
 COPY --from=collector /llama.cpp/bin/*.so /llama.cpp/bin
-COPY --from=collector /llama.cpp/bin/llama /llama.cpp/bin/llama-cli /llama.cpp/bin/llama-completion /llama.cpp/bin
+COPY --from=collector /llama.cpp/bin/llama-cli /llama.cpp/bin/llama-completion /llama.cpp/bin

 ENTRYPOINT [ "/llama.cpp/bin/llama-cli" ]

@@ -138,7 +138,7 @@ WORKDIR /llama.cpp/bin

 # Copy llama.cpp binaries and libraries
 COPY --from=collector /llama.cpp/bin/*.so /llama.cpp/bin
-COPY --from=collector /llama.cpp/bin/llama /llama.cpp/bin/llama-server /llama.cpp/bin
+COPY --from=collector /llama.cpp/bin/llama-server /llama.cpp/bin

 EXPOSE 8080

@@ -3,21 +3,7 @@ ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
 ARG APP_REVISION=N/A

-ARG NODE_VERSION=24
-
-FROM docker.io/node:$NODE_VERSION AS web
-
-ARG APP_VERSION
-
-WORKDIR /app/tools/ui
-
-COPY tools/ui/package.json tools/ui/package-lock.json ./
-RUN npm ci
-
-COPY tools/ui/ ./
-RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
-
-FROM docker.io/ubuntu:$UBUNTU_VERSION AS build
+FROM ubuntu:$UBUNTU_VERSION AS build

 # Install build tools
 RUN apt update && apt install -y git build-essential cmake wget xz-utils
@@ -31,8 +17,6 @@ WORKDIR /app

 COPY . .

-COPY --from=web /app/tools/ui/dist tools/ui/dist
-
 RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=ON -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON && \
    cmake --build build --config Release -j$(nproc)

@@ -49,7 +33,7 @@ RUN mkdir -p /app/full \
    && cp .devops/tools.sh /app/full/tools.sh

 ## Base image
-FROM docker.io/ubuntu:$UBUNTU_VERSION AS base
+FROM ubuntu:$UBUNTU_VERSION AS base

 ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
@@ -65,7 +49,7 @@ LABEL org.opencontainers.image.created=$BUILD_DATE \
      org.opencontainers.image.source=$IMAGE_SOURCE

 RUN apt-get update \
-    && apt-get install -y libgomp1 curl ffmpeg libvulkan1 mesa-vulkan-drivers \
+    && apt-get install -y libgomp1 curl libvulkan1 mesa-vulkan-drivers \
    libglvnd0 libgl1 libglx0 libegl1 libgles2 \
    && apt autoremove -y \
    && apt clean -y \
@@ -107,7 +91,7 @@ ENTRYPOINT ["/app/tools.sh"]
 ### Light, CLI only
 FROM base AS light

-COPY --from=build /app/full/llama /app/full/llama-cli /app/full/llama-completion /app
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app

 WORKDIR /app

@@ -118,7 +102,7 @@ FROM base AS server

 ENV LLAMA_ARG_HOST=0.0.0.0

-COPY --from=build /app/full/llama /app/full/llama-server /app
+COPY --from=build /app/full/llama-server /app

 WORKDIR /app

@@ -3,21 +3,7 @@ ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
 ARG APP_REVISION=N/A

-ARG NODE_VERSION=24
-
-FROM docker.io/node:$NODE_VERSION AS web
-
-ARG APP_VERSION
-
-WORKDIR /app/tools/ui
-
-COPY tools/ui/package.json tools/ui/package-lock.json ./
-RUN npm ci
-
-COPY tools/ui/ ./
-RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
-
-FROM docker.io/ubuntu:$UBUNTU_VERSION AS build
+FROM ubuntu:$UBUNTU_VERSION AS build

 RUN apt-get update && \
    apt-get install -y gcc-13 g++-13 build-essential git cmake libssl-dev libomp-dev libnuma-dev python3 ca-certificates
@@ -28,8 +14,6 @@ WORKDIR /app

 COPY . .

-COPY --from=web /app/tools/ui/dist tools/ui/dist
-
 RUN cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_ZENDNN=ON && \
    cmake --build build -j $(nproc)

@@ -46,7 +30,7 @@ RUN mkdir -p /app/full \
    && cp .devops/tools.sh /app/full/tools.sh

 ## Base image
-FROM docker.io/ubuntu:$UBUNTU_VERSION AS base
+FROM ubuntu:$UBUNTU_VERSION AS base

 ARG BUILD_DATE=N/A
 ARG APP_VERSION=N/A
@@ -62,7 +46,7 @@ LABEL org.opencontainers.image.created=$BUILD_DATE \
      org.opencontainers.image.source=$IMAGE_SOURCE

 RUN apt-get update \
-    && apt-get install -y libgomp1 libnuma1 curl ffmpeg \
+    && apt-get install -y libgomp1 libnuma1 curl \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
@@ -97,7 +81,7 @@ ENTRYPOINT ["/app/tools.sh"]
 ### Light, CLI only
 FROM base AS light

-COPY --from=build /app/full/llama /app/full/llama-cli /app/full/llama-completion /app
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app

 WORKDIR /app

@@ -108,7 +92,7 @@ FROM base AS server

 ENV LLAMA_ARG_HOST=0.0.0.0

-COPY --from=build /app/full/llama /app/full/llama-server /app
+COPY --from=build /app/full/llama-server /app

 WORKDIR /app

@@ -10,8 +10,6 @@

 build*/

-tools/ui/node_modules/
-
 models/*

 /llama-cli
@@ -1,24 +0,0 @@
-name: "Windows - Setup OpenVINO Toolkit"
-description: "Setup OpenVINO Toolkit for Windows"
-inputs:
-  path:
-    description: "Installation path"
-    required: true
-  version_major:
-    description: "OpenVINO major version (e.g., 2026.2)"
-    required: true
-  version_full:
-    description: "OpenVINO full version"
-    required: true
-
-runs:
-  using: "composite"
-  steps:
-    - name: Download and extract OpenVINO Runtime
-      shell: powershell
-      run: |
-        $url = "https://storage.openvinotoolkit.org/repositories/openvino/packages/${{ inputs.version_major }}/windows/openvino_toolkit_windows_${{ inputs.version_full }}_x86_64.zip"
-        $out = "openvino.zip"
-        Invoke-WebRequest -Uri $url -OutFile $out
-        Expand-Archive -Path $out -DestinationPath ${{ inputs.path }} -Force
-        Remove-Item $out
@@ -12,7 +12,7 @@ SYCL:
            - ggml/src/ggml-sycl/**
            - docs/backend/SYCL.md
            - examples/sycl/**
-CUDA:
+Nvidia GPU:
    - changed-files:
        - any-glob-to-any-file:
            - ggml/include/ggml-cuda.h
@@ -35,20 +35,8 @@ AMD ZenDNN:
 documentation:
    - changed-files:
        - any-glob-to-any-file:
-            - "**/*.md"
            - docs/**
            - media/**
-examples:
-    - all:
-        - changed-files:
-            - any-glob-to-any-file:
-                - app/**
-                - examples/**
-                - tools/**
-            - all-globs-to-all-files:
-                - '!tools/server/**'
-                - '!tools/mtmd/**'
-                - '!tools/ui/**'
 testing:
    - changed-files:
        - any-glob-to-any-file:
@@ -59,12 +47,28 @@ build:
            - cmake/**
            - CMakeLists.txt
            - CMakePresets.json
+examples:
+    - changed-files:
+        - any-glob-to-any-file:
+            - examples/**
+            - tools/**
 devops:
    - changed-files:
        - any-glob-to-any-file:
            - .devops/**
            - .github/**
            - ci/**
+python:
+    - changed-files:
+        - any-glob-to-any-file:
+            - "**/*.py"
+            - requirements/**
+            - gguf-py/**
+            - .flake8
+script:
+    - changed-files:
+        - any-glob-to-any-file:
+            - scripts/**
 android:
    - changed-files:
        - any-glob-to-any-file:
@@ -77,20 +81,9 @@ server:
    - changed-files:
        - any-glob-to-any-file:
            - tools/server/**
-mtmd:
-    - changed-files:
-        - any-glob-to-any-file:
-            - tools/mtmd/**
-conversion:
-    - changed-files:
-        - any-glob-to-any-file:
-            - conversion/**
-            - convert_*.py
-            - gguf-py/**
-vendor:
-    - changed-files:
-        - any-glob-to-any-file:
-            - vendor/**
+
+
+
 ggml:
    - changed-files:
        - any-glob-to-any-file:
@@ -68,8 +68,8 @@ jobs:

    env:
      # Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
-      OPENVINO_VERSION_MAJOR: "2026.2.1"
-      OPENVINO_VERSION_FULL: "2026.2.1.21919.ede283a88e3"
+      OPENVINO_VERSION_MAJOR: "2026.0"
+      OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"

    steps:
      - name: Clone
@@ -91,34 +91,6 @@ jobs:
          version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
          version_full: ${{ env.OPENVINO_VERSION_FULL }}

-  windows-2022-openvino-cache:
-    runs-on: windows-2022
-
-    env:
-      # Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
-      OPENVINO_VERSION_MAJOR: "2026.2.1"
-      OPENVINO_VERSION_FULL: "2026.2.1.21919.ede283a88e3"
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Setup Cache
-        uses: actions/cache@v5
-        id: cache-openvino
-        with:
-          path: ./openvino_toolkit
-          key: cache-gha-openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
-
-      - name: Setup OpenVINO Toolkit
-        if: steps.cache-openvino.outputs.cache-hit != 'true'
-        uses: ./.github/actions/windows-setup-openvino
-        with:
-          path: ./openvino_toolkit
-          version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
-          version_full: ${{ env.OPENVINO_VERSION_FULL }}
-
  windows-2022-rocm-cache:
    runs-on: windows-2022

@@ -37,10 +37,14 @@ jobs:
  ubuntu-24-openvino:
    runs-on: [self-hosted, Linux, Intel, OpenVINO]

+    concurrency:
+      group: openvino-gpu-${{ github.head_ref || github.ref }}
+      cancel-in-progress: false
+
    env:
      # Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
-      OPENVINO_VERSION_MAJOR: "2026.2.1"
-      OPENVINO_VERSION_FULL: "2026.2.1.21919.ede283a88e3"
+      OPENVINO_VERSION_MAJOR: "2026.0"
+      OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"

    steps:
      - name: Clone
@@ -74,7 +78,7 @@ jobs:
          cmake -B build/ReleaseOV -G Ninja \
            -DCMAKE_BUILD_TYPE=Release \
            -DGGML_OPENVINO=ON
-          time cmake --build build/ReleaseOV --config Release --parallel
+          time cmake --build build/ReleaseOV --config Release -j $(nproc)

      - name: Test (CPU)
        id: cmake_test_cpu
@@ -89,81 +93,4 @@ jobs:
        run: |
          cd ${{ github.workspace }}
          export GGML_OPENVINO_DEVICE=GPU
-          ctest --test-dir build/ReleaseOV -L main -E "test-llama-archs" --verbose --timeout 3000
-
-  openvino-windows-2022:
-    runs-on: windows-2022
-
-    env:
-      # Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
-      OPENVINO_VERSION_MAJOR: "2026.2.1"
-      OPENVINO_VERSION_FULL: "2026.2.1.21919.ede283a88e3"
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: openvino-windows-2022
-          variant: ccache
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Setup Cache
-        uses: actions/cache@v5
-        id: cache-openvino
-        with:
-          path: ./openvino_toolkit
-          key: cache-gha-openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
-
-      - name: Setup OpenVINO Toolkit
-        if: steps.cache-openvino.outputs.cache-hit != 'true'
-        uses: ./.github/actions/windows-setup-openvino
-        with:
-          path: ./openvino_toolkit
-          version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
-          version_full: ${{ env.OPENVINO_VERSION_FULL }}
-
-      - name: Install OpenCL using vcpkg
-        shell: powershell
-        run: |
-          git clone https://github.com/microsoft/vcpkg C:\vcpkg
-          C:\vcpkg\bootstrap-vcpkg.bat
-          C:\vcpkg\vcpkg install opencl
-
-      - name: Build
-        id: cmake_build
-        shell: cmd
-        run: |
-          REM Find extracted OpenVINO folder dynamically
-          for /d %%i in (openvino_toolkit\*) do set OPENVINO_ROOT=%%i
-
-          if not exist "%OPENVINO_ROOT%\runtime\cmake\OpenVINOConfig.cmake" (
-              echo ERROR: OpenVINOConfig.cmake not found
-              exit /b 1
-          )
-
-          call "%OPENVINO_ROOT%\setupvars.bat"
-
-          cmake -B build\ReleaseOV -G "Visual Studio 17 2022" ^
-            -A x64 ^
-            -DCMAKE_BUILD_TYPE=Release ^
-            -DGGML_OPENVINO=ON ^
-            -DCMAKE_TOOLCHAIN_FILE=C:\vcpkg\scripts\buildsystems\vcpkg.cmake
-
-          cmake --build build\ReleaseOV --config Release -- /m
-
-      - name: Test (CPU)
-        id: cmake_test_cpu
-        shell: cmd
-        # TODO: fix and re-enable the `test-llama-archs` test below
-        run: |
-          REM Find extracted OpenVINO folder dynamically
-          for /d %%i in (openvino_toolkit\*) do set OPENVINO_ROOT=%%i
-          call "%OPENVINO_ROOT%\setupvars.bat"
-
-          cd build
-          ctest --test-dir ReleaseOV -L main -E "test-llama-archs" -C Release --verbose --timeout 3000
+          ctest --test-dir build/ReleaseOV -L main -E "test-llama-archs" --verbose --timeout 2000
@@ -264,10 +264,14 @@ jobs:
  gpu-openvino-low-perf:
    runs-on: [self-hosted, Linux, Intel, OpenVINO]

+    concurrency:
+      group: openvino-gpu-${{ github.head_ref || github.ref }}
+      cancel-in-progress: false
+
    env:
      # Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
-      OPENVINO_VERSION_MAJOR: "2026.2.1"
-      OPENVINO_VERSION_FULL: "2026.2.1.21919.ede283a88e3"
+      OPENVINO_VERSION_MAJOR: "2026.0"
+      OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"

    steps:
      - name: Clone
@@ -34,108 +34,129 @@ env:
  LLAMA_ARG_LOG_TIMESTAMPS: 1

 jobs:
-  ubuntu-24-sycl:
-    strategy:
-      matrix:
-        build: [fp32, fp16]
-        include:
-          - build: fp32
-            fp16: OFF
-          - build: fp16
-            fp16: ON

-    runs-on: ubuntu-24.04
+# TODO: this build is disabled to save GitHub Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
+#       in order to enable it again, we have to provision dedicated runners to run it
+#  ubuntu-24-sycl:
+#    strategy:
+#      matrix:
+#        build: [fp32]
+#        include:
+#          - build: fp32
+#            fp16: OFF
+#
+#    runs-on: ubuntu-24.04
+#
+#    env:
+#      ONEAPI_ROOT: /opt/intel/oneapi/
+#      ONEAPI_INSTALLER_VERSION: "2025.3.3"
+#      LEVEL_ZERO_VERSION: "1.28.2"
+#      LEVEL_ZERO_UBUNTU_VERSION: "u24.04"
+#
+#    continue-on-error: true
+#
+#    steps:
+#      - uses: actions/checkout@v6
+#
+#      - name: Use oneAPI Installation Cache
+#        uses: actions/cache@v5
+#        id: cache-sycl
+#        with:
+#          path: ${{ env.ONEAPI_ROOT }}
+#          key: cache-gha-oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
+#
+#      - name: Download & Install oneAPI
+#        shell: bash
+#        if: steps.cache-sycl.outputs.cache-hit != 'true'
+#        run: |
+#          cd /tmp
+#          wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
+#          sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
+#
+#      - name: Install Level Zero SDK
+#        shell: bash
+#        run: |
+#          cd /tmp
+#          wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
+#          wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
+#          sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
+#
+#      - name: Clone
+#        id: checkout
+#        uses: actions/checkout@v6
+#
+#      - name: ccache
+#        uses: ggml-org/ccache-action@v1.2.21
+#        with:
+#          key: sycl-ubuntu-24-${{ matrix.build }}
+#          evict-old-files: 1d
+#          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+#
+#      - name: Build
+#        id: cmake_build
+#        run: |
+#          source /opt/intel/oneapi/setvars.sh
+#          cmake -B build \
+#            -G "Ninja" \
+#            -DCMAKE_BUILD_TYPE=Release \
+#            -DGGML_SYCL=ON \
+#            -DCMAKE_C_COMPILER=icx \
+#            -DCMAKE_CXX_COMPILER=icpx \
+#            -DLLAMA_OPENSSL=OFF \
+#            -DGGML_NATIVE=OFF \
+#            -DGGML_SYCL_F16=${{ matrix.fp16 }}
+#          time cmake --build build --config Release -j $(nproc)

-    env:
-      ONEAPI_ROOT: /opt/intel/oneapi/
-      ONEAPI_INSTALLER_VERSION: "2025.3.3"
-      LEVEL_ZERO_VERSION: "1.28.2"
-      LEVEL_ZERO_UBUNTU_VERSION: "u24.04"
-
-    continue-on-error: true
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Download & Install oneAPI
-        shell: bash
-        run: |
-          cd /tmp
-          wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
-          sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
-
-      - name: Install Level Zero SDK
-        shell: bash
-        run: |
-          cd /tmp
-          wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
-          wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
-          sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: sycl-ubuntu-24-${{ matrix.build }}
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Build
-        id: cmake_build
-        run: |
-          source /opt/intel/oneapi/setvars.sh
-          cmake -B build \
-            -G "Ninja" \
-            -DCMAKE_BUILD_TYPE=Release \
-            -DGGML_SYCL=ON \
-            -DCMAKE_C_COMPILER=icx \
-            -DCMAKE_CXX_COMPILER=icpx \
-            -DLLAMA_OPENSSL=OFF \
-            -DGGML_NATIVE=OFF \
-            -DGGML_SYCL_F16=${{ matrix.fp16 }}
-          time cmake --build build --config Release -j $(nproc)
-
-  windows-latest-sycl:
-    runs-on: windows-2022
-
-    defaults:
-      run:
-        shell: bash
-
-    env:
-      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
-      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
-      LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
-      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
-      ONEAPI_INSTALLER_VERSION: "2025.3.3"
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Download & Install oneAPI
-        shell: bash
-        run: |
-          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
-
-      - name: Install Level Zero SDK
-        shell: pwsh
-        run: |
-          Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
-          Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
-          "LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: sycl-windows-latest
-          variant: ccache
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      # TODO: add ssl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
-
-      - name: Build
-        id: cmake_build
-        run:  examples/sycl/win-build-sycl.bat
+# TODO: this build is disabled to save GitHub Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
+#       in order to enable it again, we have to provision dedicated runners to run it
+#  windows-latest-sycl:
+#    runs-on: windows-2022
+#
+#    defaults:
+#      run:
+#        shell: bash
+#
+#    env:
+#      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
+#      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
+#      LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
+#      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
+#      ONEAPI_INSTALLER_VERSION: "2025.3.3"
+#    steps:
+#      - name: Clone
+#        id: checkout
+#        uses: actions/checkout@v6
+#
+#      - name: Use oneAPI Installation Cache
+#        uses: actions/cache@v5
+#        id: cache-sycl
+#        with:
+#          path: ${{ env.ONEAPI_ROOT }}
+#          key: cache-gha-oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
+#
+#      - name: Download & Install oneAPI
+#        shell: bash
+#        if: steps.cache-sycl.outputs.cache-hit != 'true'
+#        run: |
+#          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
+#
+#      - name: Install Level Zero SDK
+#        shell: pwsh
+#        run: |
+#          Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
+#          Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
+#          "LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
+#
+#      - name: ccache
+#        uses: ggml-org/ccache-action@v1.2.21
+#        with:
+#          key: sycl-windows-latest
+#          variant: ccache
+#          evict-old-files: 1d
+#          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+#
+#      # TODO: add ssl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
+#
+#      - name: Build
+#        id: cmake_build
+#        run:  examples/sycl/win-build-sycl.bat
@@ -35,29 +35,6 @@ env:
  LLAMA_ARG_LOG_TIMESTAMPS: 1

 jobs:
-  format:
-    runs-on: ubuntu-24.04
-
-    steps:
-      - name: Clone
-        uses: actions/checkout@v6
-
-      - name: Install clang-format 22
-        run: |
-          wget -qO- https://apt.llvm.org/llvm-snapshot.gpg.key |
-            sudo tee /etc/apt/trusted.gpg.d/apt.llvm.org.asc > /dev/null
-          sudo add-apt-repository -y \
-            "deb http://apt.llvm.org/noble/ llvm-toolchain-noble-22 main"
-          sudo apt-get update
-          sudo apt-get install -y clang-format-22
-
-      - name: Check formatting
-        run: |
-          find ggml/src/ggml-webgpu \
-            -type f \( -name '*.cpp' -o -name '*.hpp' -o -name '*.h' \) \
-            -print0 |
-            xargs -0 clang-format-22 --dry-run --Werror
-
  macos:
    runs-on: macos-latest

@@ -58,13 +58,6 @@ jobs:
          git tag ${{ steps.srctag.outputs.name }} || exit 0
          git push origin ${{ steps.srctag.outputs.name }} || exit 0

-  build_ui:
-    name: Build UI
-    needs: create_tag
-    uses: ./.github/workflows/ui-build.yml
-    with:
-      hf_ui_version: ${{ needs.create_tag.outputs.source_tag }}
-
  prepare_matrices:
    name: Prepare Docker matrices
    runs-on: ubuntu-24.04
@@ -86,11 +79,11 @@ jobs:
          [
            { "tag": "cpu", "dockerfile": ".devops/cpu.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04" },
            { "tag": "cpu", "dockerfile": ".devops/cpu.Dockerfile", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-arm" },
-            { "tag": "cpu", "dockerfile": ".devops/s390x.Dockerfile", "platforms": "linux/s390x", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-s390x", "prebuilt_ui": true },
+            { "tag": "cpu", "dockerfile": ".devops/s390x.Dockerfile", "platforms": "linux/s390x", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-s390x" },
            { "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "12.8.1", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
            { "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "12.8.1", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" },
-            { "tag": "cuda13", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "13.3.0", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
-            { "tag": "cuda13", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "13.3.0", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" },
+            { "tag": "cuda13", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "13.1.1", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
+            { "tag": "cuda13", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "13.1.1", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" },
            { "tag": "musa", "dockerfile": ".devops/musa.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
            { "tag": "intel", "dockerfile": ".devops/intel.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
            { "tag": "vulkan", "dockerfile": ".devops/vulkan.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04" },
@@ -142,7 +135,7 @@ jobs:

  push_to_registry:
    name: Push Docker image to Docker Registry
-    needs: [prepare_matrices, create_tag, build_ui]
+    needs: [prepare_matrices, create_tag]

    runs-on: ${{ matrix.config.runs_on }}
    strategy:
@@ -157,13 +150,6 @@ jobs:
          fetch-depth: 0
          ref: ${{ needs.create_tag.outputs.source_tag }}

-      - name: Download prebuilt UI
-        if: ${{ matrix.config.prebuilt_ui == true }}
-        uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8
-        with:
-          name: ui-build
-          path: tools/ui/dist
-
      - name: Set up QEMU
        if: ${{ contains(matrix.config.platforms, 'linux/amd64') }}
        uses: docker/setup-qemu-action@ce360397dd3f832beb865e1373c09c0e9f86d70a # v4
@@ -46,13 +46,11 @@ jobs:

    steps:
      - id: check
-        env:
-          COMMIT_MESSAGE: ${{ github.event.head_commit.message }}
        run: |
          if [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then
            echo "should_release=true" >> $GITHUB_OUTPUT
          elif [[ "${{ github.event_name }}" == "push" && "${{ github.ref }}" == "refs/heads/master" ]]; then
-            if echo "$COMMIT_MESSAGE" | grep -q '\[no release\]'; then
+            if echo "${{ github.event.head_commit.message }}" | grep -q '\[no release\]'; then
              echo "should_release=false" >> $GITHUB_OUTPUT
            else
              echo "should_release=true" >> $GITHUB_OUTPUT
@@ -61,31 +59,8 @@ jobs:
            echo "should_release=false" >> $GITHUB_OUTPUT
          fi

-  get-version:
-    runs-on: ubuntu-slim
-    outputs:
-      ui_version: ${{ steps.version.outputs.ui_version }}
-    steps:
-      - uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-      - id: version
-        run: |
-          # Resolve UI version: BUILD_NUMBER from cmake/build-info.cmake > git hash + epoch > fallback
-          version=""
-          if grep -q "BUILD_NUMBER" cmake/build-info.cmake; then
-            build_number=$(grep "set(BUILD_NUMBER" cmake/build-info.cmake | grep -oP '\d+')
-            if [ -n "$build_number" ] && [ "$build_number" -gt 0 ]; then
-              version="b${build_number}"
-            fi
-          fi
-          if [ -z "$version" ]; then
-            version=$(git rev-parse --short HEAD)-$(date +%s)
-          fi
-          echo "ui_version=${version}" >> $GITHUB_OUTPUT
-
  macos-cpu:
-    needs: [check-release, get-version]
+    needs: [check-release]
    if: ${{ needs.check-release.outputs.should_release == 'true' }}
    strategy:
      matrix:
@@ -141,7 +116,6 @@ jobs:
            -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
            -DLLAMA_FATAL_WARNINGS=ON \
            -DLLAMA_BUILD_BORINGSSL=ON \
-            -DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }} \
            ${{ env.CMAKE_ARGS }}
          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)

@@ -167,7 +141,7 @@ jobs:
          name: llama-bin-macos-${{ matrix.build }}.tar.gz

  ubuntu-cpu:
-    needs: [check-release, get-version]
+    needs: [check-release]
    if: ${{ needs.check-release.outputs.should_release == 'true' }}
    strategy:
      matrix:
@@ -227,7 +201,6 @@ jobs:
            -DGGML_NATIVE=OFF \
            -DGGML_CPU_ALL_VARIANTS=ON \
            -DLLAMA_FATAL_WARNINGS=ON \
-            -DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }} \
            ${{ env.CMAKE_ARGS }}
          cmake --build build --config Release -j $(nproc)

@@ -254,7 +227,7 @@ jobs:
          name: llama-bin-ubuntu-${{ matrix.build }}.tar.gz

  ubuntu-vulkan:
-    needs: [check-release, get-version]
+    needs: [check-release]
    if: ${{ needs.check-release.outputs.should_release == 'true' }}

    strategy:
@@ -314,7 +287,6 @@ jobs:
            -DGGML_NATIVE=OFF \
            -DGGML_CPU_ALL_VARIANTS=ON \
            -DGGML_VULKAN=ON \
-            -DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }} \
            ${{ env.CMAKE_ARGS }}
          cmake --build build --config Release -j $(nproc)

@@ -340,7 +312,7 @@ jobs:
          name: llama-bin-ubuntu-vulkan-${{ matrix.build }}.tar.gz

  android-arm64:
-    needs: [check-release, get-version]
+    needs: [check-release]
    if: ${{ needs.check-release.outputs.should_release == 'true' }}

    runs-on: ubuntu-latest
@@ -407,7 +379,6 @@ jobs:
            -DLLAMA_FATAL_WARNINGS=ON \
            -DGGML_OPENMP=OFF \
            -DLLAMA_BUILD_BORINGSSL=ON \
-            -DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }} \
            ${{ env.CMAKE_ARGS }}
          cmake --build build --config Release -j $(nproc)

@@ -433,7 +404,7 @@ jobs:
          name: llama-bin-android-arm64.tar.gz

  ubuntu-24-openvino:
-    needs: [check-release, get-version]
+    needs: [check-release]
    if: ${{ needs.check-release.outputs.should_release == 'true' }}

    runs-on: ubuntu-24.04
@@ -445,9 +416,9 @@ jobs:
      openvino_version: ${{ steps.openvino_version.outputs.value }}

    env:
-      # Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
-      OPENVINO_VERSION_MAJOR: "2026.2.1"
-      OPENVINO_VERSION_FULL: "2026.2.1.21919.ede283a88e3"
+      # Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
+      OPENVINO_VERSION_MAJOR: "2026.0"
+      OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"

    steps:
      - name: Set OpenVINO version output
@@ -505,12 +476,8 @@ jobs:
          source ./openvino_toolkit/setupvars.sh
          cmake -B build/ReleaseOV -G Ninja \
            -DCMAKE_BUILD_TYPE=Release \
-            -DGGML_OPENVINO=ON \
-            -DCMAKE_INSTALL_RPATH='$ORIGIN' \
-            -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
-            -DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }} \
-            ${{ env.CMAKE_ARGS }}
-          cmake --build build/ReleaseOV --config Release --parallel
+            -DGGML_OPENVINO=ON
+          cmake --build build/ReleaseOV --config Release -j $(nproc)

      - name: ccache-clear
        uses: ./.github/actions/ccache-clear
@@ -524,26 +491,8 @@ jobs:
      - name: Pack artifacts
        id: pack_artifacts
        run: |
-          dest=./build/ReleaseOV/bin
-          OPENVINO_ROOT=./openvino_toolkit
-          ov_lib="$OPENVINO_ROOT/runtime/lib/intel64"
-
-          # Bundle OpenVINO runtime libs + TBB. Binaries built with RPATH=$ORIGIN
-          # load these siblings without setupvars.sh / LD_LIBRARY_PATH.
-          cp -P "$ov_lib"/libopenvino.so* \
-                "$ov_lib"/libopenvino_c.so* \
-                "$ov_lib"/libopenvino_*_plugin.so \
-                "$ov_lib"/libopenvino_intel_npu_compiler*.so \
-                "$OPENVINO_ROOT"/runtime/3rdparty/tbb/lib/*.so* \
-                "$dest"
-          cp -P /usr/lib/x86_64-linux-gnu/libOpenCL.so.1* "$dest" 2>/dev/null || true
-          cp "$ov_lib"/cache.json "$dest" 2>/dev/null || true
-
-          # OpenVINO licensing
-          cp -r "$OPENVINO_ROOT"/docs/licensing "$dest"/openvino-licensing
-
-          cp LICENSE "$dest"
-          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C "$dest" .
+          cp LICENSE ./build/ReleaseOV/bin/
+          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/ReleaseOV/bin .

      - name: Upload artifacts
        uses: actions/upload-artifact@v6
@@ -551,140 +500,11 @@ jobs:
          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz
          name: llama-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz

-  windows-openvino:
-    needs: [check-release]
-    if: ${{ needs.check-release.outputs.should_release == 'true' }}
-
-    runs-on: windows-2022
-
-    outputs:
-      openvino_version: ${{ steps.openvino_version.outputs.value }}
-
-    env:
-      # Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
-      OPENVINO_VERSION_MAJOR: "2026.2.1"
-      OPENVINO_VERSION_FULL: "2026.2.1.21919.ede283a88e3"
-
-    steps:
-      - name: Set OpenVINO version output
-        id: openvino_version
-        shell: bash
-        run: echo "value=${{ env.OPENVINO_VERSION_MAJOR }}" >> $GITHUB_OUTPUT
-
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-        with:
-            fetch-depth: 0
-
-      - name: Setup Node.js
-        uses: actions/setup-node@v6
-        with:
-          node-version: "24"
-          cache: "npm"
-          cache-dependency-path: "tools/ui/package-lock.json"
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: release-windows-2022-openvino
-          variant: ccache
-          evict-old-files: 1d
-
-      - name: Setup Cache
-        uses: actions/cache@v5
-        id: cache-openvino
-        with:
-          path: ./openvino_toolkit
-          key: cache-gha-openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
-
-      - name: Setup OpenVINO Toolkit
-        if: steps.cache-openvino.outputs.cache-hit != 'true'
-        uses: ./.github/actions/windows-setup-openvino
-        with:
-          path: ./openvino_toolkit
-          version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
-          version_full: ${{ env.OPENVINO_VERSION_FULL }}
-
-      - name: Install OpenCL using vcpkg
-        shell: powershell
-        run: |
-          git clone https://github.com/microsoft/vcpkg C:\vcpkg
-          C:\vcpkg\bootstrap-vcpkg.bat
-          C:\vcpkg\vcpkg install opencl
-
-      - name: Build
-        id: cmake_build
-        shell: cmd
-        run: |
-          REM Find extracted OpenVINO folder dynamically
-          for /d %%i in (openvino_toolkit\*) do set OPENVINO_ROOT=%%i
-
-          if not exist "%OPENVINO_ROOT%\runtime\cmake\OpenVINOConfig.cmake" (
-              echo ERROR: OpenVINOConfig.cmake not found
-              exit /b 1
-          )
-
-          call "%OPENVINO_ROOT%\setupvars.bat"
-
-          cmake -B build\ReleaseOV -G "Visual Studio 17 2022" ^
-            -A x64 ^
-            -DCMAKE_BUILD_TYPE=Release ^
-            -DGGML_OPENVINO=ON ^
-            -DLLAMA_BUILD_BORINGSSL=ON ^
-            -DCMAKE_TOOLCHAIN_FILE=C:\vcpkg\scripts\buildsystems\vcpkg.cmake ^
-            ${{ env.CMAKE_ARGS }}
-
-          cmake --build build\ReleaseOV --config Release -- /m
-
-      - name: ccache-clear
-        uses: ./.github/actions/ccache-clear
-        with:
-          key: release-windows-2022-openvino
-
-      - name: Determine tag name
-        id: tag
-        uses: ./.github/actions/get-tag-name
-
-      - name: Pack artifacts
-        id: pack_artifacts
-        shell: powershell
-        run: |
-          # Locate the extracted OpenVINO toolkit root (same pattern as the Build step).
-          $OPENVINO_ROOT = (Get-ChildItem -Directory openvino_toolkit | Select-Object -First 1).FullName
-          if (-not $OPENVINO_ROOT) {
-            Write-Error "OpenVINO toolkit folder not found under .\openvino_toolkit"
-            exit 1
-          }
-
-          $dest = ".\build\ReleaseOV\bin\Release"
-
-          $ovBin = Join-Path $OPENVINO_ROOT 'runtime\bin\intel64\Release'
-          Copy-Item -Path (Join-Path $ovBin '*.dll')       -Destination $dest -Force
-          Copy-Item -Path (Join-Path $ovBin 'cache.json')  -Destination $dest -Force
-
-          $tbbBin = Join-Path $OPENVINO_ROOT 'runtime\3rdparty\tbb\bin'
-          Copy-Item -Path (Join-Path $tbbBin 'tbb*.dll') -Destination $dest -Force
-
-          # OpenVINO licensing
-          $licensingDest = Join-Path $dest 'openvino-licensing'
-          New-Item -ItemType Directory -Force -Path $licensingDest | Out-Null
-          Copy-Item -Path (Join-Path $OPENVINO_ROOT 'docs\licensing\*') -Destination $licensingDest -Recurse -Force
-
-          Copy-Item LICENSE $dest
-          7z a -snl llama-${{ steps.tag.outputs.name }}-bin-win-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.zip $dest\*
-
-      - name: Upload artifacts
-        uses: actions/upload-artifact@v6
-        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-win-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.zip
-          name: llama-bin-win-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.zip
-
  windows-cpu:
    needs: [check-release]
    if: ${{ needs.check-release.outputs.should_release == 'true' }}

-    runs-on: windows-2025-vs2026
+    runs-on: windows-2025

    permissions:
      actions: write
@@ -715,12 +535,12 @@ jobs:
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.21
        with:
-          key: release-windows-2025-vs2026-${{ matrix.arch }}-cpu
+          key: release-windows-2025-${{ matrix.arch }}-cpu

      - name: Build
        shell: cmd
        run: |
-          call "C:\Program Files\Microsoft Visual Studio\18\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" ${{ matrix.arch == 'x64' && 'x64' || 'amd64_arm64' }}
+          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" ${{ matrix.arch == 'x64' && 'x64' || 'amd64_arm64' }}
          cmake -S . -B build -G "Ninja Multi-Config" ^
            -D CMAKE_TOOLCHAIN_FILE=cmake/${{ matrix.arch }}-windows-llvm.cmake ^
            -DLLAMA_BUILD_BORINGSSL=ON ^
@@ -734,12 +554,12 @@ jobs:
      - name: ccache-clear
        uses: ./.github/actions/ccache-clear
        with:
-          key: release-windows-2025-vs2026-${{ matrix.arch }}-cpu
+          key: release-windows-2025-${{ matrix.arch }}-cpu

      - name: Pack artifacts
        id: pack_artifacts
        run: |
-          Copy-Item "C:\Program Files\Microsoft Visual Studio\18\Enterprise\VC\Redist\MSVC\14.51.36231\debug_nonredist\${{ matrix.arch }}\Microsoft.VC145.OpenMP.LLVM\libomp140.${{ matrix.arch == 'x64' && 'x86_64' || 'aarch64' }}.dll" .\build\bin\Release\
+          Copy-Item "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC\14.44.35112\debug_nonredist\${{ matrix.arch }}\Microsoft.VC143.OpenMP.LLVM\libomp140.${{ matrix.arch == 'x64' && 'x86_64' || 'aarch64' }}.dll" .\build\bin\Release\
          7z a -snl llama-bin-win-cpu-${{ matrix.arch }}.zip .\build\bin\Release\*

      - name: Upload artifacts
@@ -934,209 +754,213 @@ jobs:
          path: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip
          name: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip

-  windows-sycl:
-    needs: [check-release]
-    if: ${{ needs.check-release.outputs.should_release == 'true' }}
+# TODO: this build is disabled to save GitHub Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
+#       in order to enable it again, we have to provision dedicated runners to run it
+#  windows-sycl:
+#
+#    runs-on: windows-2022
+#
+#    defaults:
+#      run:
+#        shell: bash
+#
+#    env:
+#      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
+#      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
+#      LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
+#      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
+#      ONEAPI_INSTALLER_VERSION: "2025.3.3"
+#
+#    steps:
+#      - name: Clone
+#        id: checkout
+#        uses: actions/checkout@v6
+#
+#      - name: Use oneAPI Installation Cache
+#        uses: actions/cache@v5
+#        id: cache-sycl
+#        with:
+#          path: ${{ env.ONEAPI_ROOT }}
+#          key: cache-gha-oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
+#
+#      - name: Download & Install oneAPI
+#        shell: bash
+#        if: steps.cache-sycl.outputs.cache-hit != 'true'
+#        run: |
+#          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
+#
+#      - name: Install Level Zero SDK
+#        shell: pwsh
+#        run: |
+#          Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
+#          Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
+#          "LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
+#
+#      - name: Setup Node.js
+#        uses: actions/setup-node@v6
+#        with:
+#          node-version: "24"
+#          cache: "npm"
+#          cache-dependency-path: "tools/ui/package-lock.json"
+#
+#      - name: ccache
+#        uses: ggml-org/ccache-action@v1.2.21
+#        with:
+#          key: release-windows-2022-x64-sycl
+#
+#      - name: Build
+#        id: cmake_build
+#        shell: cmd
+#        run: |
+#          call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
+#          cmake -G "Ninja" -B build ^
+#            -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx ^
+#            -DCMAKE_BUILD_TYPE=Release ^
+#            -DGGML_BACKEND_DL=ON -DBUILD_SHARED_LIBS=ON ^
+#            -DGGML_CPU=OFF -DGGML_SYCL=ON ^
+#            -DLLAMA_BUILD_BORINGSSL=ON
+#          cmake --build build --target ggml-sycl -j
+#
+#      - name: Build the release package
+#        id: pack_artifacts
+#        run: |
+#          echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
+#
+#          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin
+#          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
+#          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin
+#
+#          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin
+#          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero_v2.dll" ./build/bin
+#          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin
+#          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin
+#          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin
+#          ZE_LOADER_DLL=$(find "${{ env.ONEAPI_ROOT }}" "$LEVEL_ZERO_V1_SDK_PATH" -iname ze_loader.dll -print -quit 2>/dev/null || true)
+#          if [ -n "$ZE_LOADER_DLL" ]; then
+#            echo "Using Level Zero loader: $ZE_LOADER_DLL"
+#            cp "$ZE_LOADER_DLL" ./build/bin
+#          else
+#            echo "Level Zero loader DLL not found in oneAPI or SDK; relying on system driver/runtime"
+#          fi
+#
+#          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin
+#          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
+#          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
+#          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
+#          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl-ls.exe" ./build/bin
+#          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-fallback-bfloat16.spv" ./build/bin
+#          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-native-bfloat16.spv" ./build/bin
+#
+#          cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin
+#          cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin
+#
+#          cp "${{ env.ONEAPI_ROOT }}/tcm/latest/bin/tcm.dll" ./build/bin
+#          cp "${{ env.ONEAPI_ROOT }}/tcm/latest/bin/libhwloc-15.dll" ./build/bin
+#          cp "${{ env.ONEAPI_ROOT }}/umf/latest/bin/umf.dll" ./build/bin
+#
+#          echo "cp oneAPI running time dll files to ./build/bin done"
+#          7z a -snl llama-bin-win-sycl-x64.zip ./build/bin/*
+#
+#      - name: Upload the release package
+#        uses: actions/upload-artifact@v6
+#        with:
+#          path: llama-bin-win-sycl-x64.zip
+#          name: llama-bin-win-sycl-x64.zip

-    runs-on: windows-2022
-
-    defaults:
-      run:
-        shell: bash
-
-    env:
-      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
-      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
-      LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
-      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
-      ONEAPI_INSTALLER_VERSION: "2025.3.3"
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Download & Install oneAPI
-        shell: bash
-        run: |
-          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
-
-      - name: Install Level Zero SDK
-        shell: pwsh
-        run: |
-          Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
-          Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
-          "LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
-
-      - name: Setup Node.js
-        uses: actions/setup-node@v6
-        with:
-          node-version: "24"
-          cache: "npm"
-          cache-dependency-path: "tools/ui/package-lock.json"
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: release-windows-2022-x64-sycl
-
-      - name: Build
-        id: cmake_build
-        shell: cmd
-        run: |
-          call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
-          cmake -G "Ninja" -B build ^
-            -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx ^
-            -DCMAKE_BUILD_TYPE=Release ^
-            -DGGML_BACKEND_DL=ON -DBUILD_SHARED_LIBS=ON ^
-            -DGGML_CPU=OFF -DGGML_SYCL=ON ^
-            -DLLAMA_BUILD_BORINGSSL=ON
-          cmake --build build --target ggml-sycl -j %NUMBER_OF_PROCESSORS%
-
-      - name: ccache-clear
-        uses: ./.github/actions/ccache-clear
-        with:
-          key: release-windows-2022-x64-sycl
-
-      - name: Build the release package
-        id: pack_artifacts
-        run: |
-          echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
-
-          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin
-
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero_v2.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin
-          ZE_LOADER_DLL=$(find "${{ env.ONEAPI_ROOT }}" "$LEVEL_ZERO_V1_SDK_PATH" -iname ze_loader.dll -print -quit 2>/dev/null || true)
-          if [ -n "$ZE_LOADER_DLL" ]; then
-            echo "Using Level Zero loader: $ZE_LOADER_DLL"
-            cp "$ZE_LOADER_DLL" ./build/bin
-          else
-            echo "Level Zero loader DLL not found in oneAPI or SDK; relying on system driver/runtime"
-          fi
-
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl-ls.exe" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-fallback-bfloat16.spv" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-native-bfloat16.spv" ./build/bin
-
-          cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin
-
-          cp "${{ env.ONEAPI_ROOT }}/tcm/latest/bin/tcm.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/tcm/latest/bin/libhwloc-15.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/umf/latest/bin/umf.dll" ./build/bin
-
-          echo "cp oneAPI running time dll files to ./build/bin done"
-          7z a -snl llama-bin-win-sycl-x64.zip ./build/bin/*
-
-      - name: Upload the release package
-        uses: actions/upload-artifact@v6
-        with:
-          path: llama-bin-win-sycl-x64.zip
-          name: llama-bin-win-sycl-x64.zip
-
-  ubuntu-24-sycl:
-    needs: [check-release]
-    if: ${{ needs.check-release.outputs.should_release == 'true' }}
-
-    strategy:
-      matrix:
-        build: [fp32, fp16]
-        include:
-          - build: fp32
-            fp16: OFF
-          - build: fp16
-            fp16: ON
-
-    runs-on: ubuntu-24.04
-
-    env:
-      ONEAPI_ROOT: /opt/intel/oneapi/
-      ONEAPI_INSTALLER_VERSION: "2025.3.3"
-      LEVEL_ZERO_VERSION: "1.28.2"
-      LEVEL_ZERO_UBUNTU_VERSION: "u24.04"
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-
-      - name: Download & Install oneAPI
-        shell: bash
-        run: |
-          cd /tmp
-          wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
-          sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
-
-      - name: Install Level Zero SDK
-        shell: bash
-        run: |
-          cd /tmp
-          wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
-          wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
-          sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
-
-      - name: Setup Node.js
-        uses: actions/setup-node@v6
-        with:
-          node-version: "24"
-          cache: "npm"
-          cache-dependency-path: "tools/ui/package-lock.json"
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: release-ubuntu-24.04-sycl-${{ matrix.build }}
-
-      - name: Build
-        id: cmake_build
-        run: |
-          source /opt/intel/oneapi/setvars.sh
-          cmake -B build \
-            -G "Ninja" \
-            -DCMAKE_BUILD_TYPE=Release \
-            -DGGML_SYCL=ON \
-            -DCMAKE_C_COMPILER=icx \
-            -DCMAKE_CXX_COMPILER=icpx \
-            -DLLAMA_OPENSSL=OFF \
-            -DGGML_NATIVE=OFF \
-            -DGGML_SYCL_F16=${{ matrix.fp16 }}
-          time cmake --build build --config Release -j $(nproc)
-
-      - name: ccache-clear
-        uses: ./.github/actions/ccache-clear
-        with:
-          key: release-ubuntu-24.04-sycl-${{ matrix.build }}
-
-      - name: Determine tag name
-        id: tag
-        uses: ./.github/actions/get-tag-name
-
-      - name: Pack artifacts
-        id: pack_artifacts
-        run: |
-          cp LICENSE ./build/bin/
-          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin .
-
-      - name: Upload artifacts
-        uses: actions/upload-artifact@v6
-        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
-          name: llama-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
+# TODO: this build is disabled to save GitHub Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
+#       in order to enable it again, we have to provision dedicated runners to run it
+#  ubuntu-24-sycl:
+#
+#    strategy:
+#      matrix:
+#        build: [fp32]
+#        include:
+#          - build: fp32
+#            fp16: OFF
+#
+#    runs-on: ubuntu-24.04
+#
+#    env:
+#      ONEAPI_ROOT: /opt/intel/oneapi/
+#      ONEAPI_INSTALLER_VERSION: "2025.3.3"
+#      LEVEL_ZERO_VERSION: "1.28.2"
+#      LEVEL_ZERO_UBUNTU_VERSION: "u24.04"
+#
+#    steps:
+#      - name: Clone
+#        id: checkout
+#        uses: actions/checkout@v6
+#        with:
+#          fetch-depth: 0
+#
+#      - name: Use oneAPI Installation Cache
+#        uses: actions/cache@v5
+#        id: cache-sycl
+#        with:
+#          path: ${{ env.ONEAPI_ROOT }}
+#          key: cache-gha-oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
+#
+#      - name: Download & Install oneAPI
+#        shell: bash
+#        if: steps.cache-sycl.outputs.cache-hit != 'true'
+#        run: |
+#          cd /tmp
+#          wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
+#          sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
+#
+#      - name: Install Level Zero SDK
+#        shell: bash
+#        run: |
+#          cd /tmp
+#          wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
+#          wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
+#          sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
+#
+#      - name: Setup Node.js
+#        uses: actions/setup-node@v6
+#        with:
+#          node-version: "24"
+#          cache: "npm"
+#          cache-dependency-path: "tools/ui/package-lock.json"
+#
+#      - name: ccache
+#        uses: ggml-org/ccache-action@v1.2.21
+#        with:
+#          key: release-ubuntu-24.04-sycl
+#
+#      - name: Build
+#        id: cmake_build
+#        run: |
+#          source /opt/intel/oneapi/setvars.sh
+#          cmake -B build \
+#            -G "Ninja" \
+#            -DCMAKE_BUILD_TYPE=Release \
+#            -DGGML_SYCL=ON \
+#            -DCMAKE_C_COMPILER=icx \
+#            -DCMAKE_CXX_COMPILER=icpx \
+#            -DLLAMA_OPENSSL=OFF \
+#            -DGGML_NATIVE=OFF \
+#            -DGGML_SYCL_F16=${{ matrix.fp16 }}
+#          time cmake --build build --config Release -j $(nproc)
+#
+#      - name: Determine tag name
+#        id: tag
+#        uses: ./.github/actions/get-tag-name
+#
+#      - name: Pack artifacts
+#        id: pack_artifacts
+#        run: |
+#          cp LICENSE ./build/bin/
+#          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin .
+#
+#      - name: Upload artifacts
+#        uses: actions/upload-artifact@v6
+#        with:
+#          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
+#          name: llama-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz

  ubuntu-22-rocm:
-    needs: [check-release, get-version]
+    needs: [check-release]
    if: ${{ needs.check-release.outputs.should_release == 'true' }}

    runs-on: ubuntu-22.04
@@ -1228,7 +1052,6 @@ jobs:
            -DGGML_HIP=ON \
            -DHIP_PLATFORM=amd \
            -DGGML_HIP_ROCWMMA_FATTN=ON \
-            -DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }} \
            ${{ env.CMAKE_ARGS }}
          cmake --build build --config Release -j $(nproc)

@@ -1257,7 +1080,7 @@ jobs:
          name: llama-bin-ubuntu-rocm-${{ env.ROCM_VERSION_SHORT }}-${{ matrix.build }}.tar.gz

  windows-hip:
-    needs: [check-release, get-version]
+    needs: [check-release]
    if: ${{ needs.check-release.outputs.should_release == 'true' }}

    runs-on: windows-2022
@@ -1353,7 +1176,6 @@ jobs:
            -DGPU_TARGETS="${{ matrix.gpu_targets }}" `
            -DGGML_HIP_ROCWMMA_FATTN=ON `
            -DGGML_HIP=ON `
-            -DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }} `
            -DLLAMA_BUILD_BORINGSSL=ON
          cmake --build build --target ggml-hip -j ${env:NUMBER_OF_PROCESSORS}
          md "build\bin\rocblas\library\"
@@ -1381,7 +1203,7 @@ jobs:
          name: llama-bin-win-hip-${{ matrix.name }}-x64.zip

  ios-xcode:
-    needs: [check-release, get-version]
+    needs: [check-release]
    if: ${{ needs.check-release.outputs.should_release == 'true' }}
    runs-on: macos-26

@@ -1410,8 +1232,7 @@ jobs:
            -DLLAMA_BUILD_SERVER=OFF \
            -DCMAKE_SYSTEM_NAME=iOS \
            -DCMAKE_OSX_DEPLOYMENT_TARGET=16.0 \
-            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml \
-            -DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }}
+            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO

      - name: xcodebuild for swift package
@@ -1531,12 +1352,10 @@ jobs:
 #          path: llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz
 #          name: llama-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz

-  ui-build:
-    needs: [check-release, get-version]
+  ui:
+    needs: [check-release]
    if: ${{ needs.check-release.outputs.should_release == 'true' }}
    uses: ./.github/workflows/ui-build.yml
-    with:
-      hf_ui_version: ${{ needs.get-version.outputs.ui_version }}

  release:
    if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
@@ -1549,13 +1368,11 @@ jobs:
    runs-on: ubuntu-slim

    needs:
-      - get-version
      - windows
      - windows-cpu
      - windows-cuda
      #- windows-sycl
      - windows-hip
-      - windows-openvino
      - ubuntu-22-rocm
      - ubuntu-cpu
      - ubuntu-vulkan
@@ -1565,7 +1382,7 @@ jobs:
      - macos-cpu
      - ios-xcode
      #- openEuler-cann
-      - ui-build
+      - ui

    outputs:
      tag_name: ${{ steps.tag.outputs.name }}
@@ -1665,8 +1482,7 @@ jobs:
            - [Ubuntu arm64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-arm64.tar.gz)
            - [Ubuntu x64 (ROCm 7.2)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-7.2-x64.tar.gz)
            - [Ubuntu x64 (OpenVINO)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ needs.ubuntu-24-openvino.outputs.openvino_version }}-x64.tar.gz)
-            - [Ubuntu x64 (SYCL FP32)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-fp32-x64.tar.gz)
-            - [Ubuntu x64 (SYCL FP16)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-fp16-x64.tar.gz)
+            - Ubuntu x64 (SYCL FP32) [DISABLED](https://github.com/ggml-org/llama.cpp/pull/23705)

            **Android:**
            - [Android arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-android-arm64.tar.gz)
@@ -1674,12 +1490,10 @@ jobs:
            **Windows:**
            - [Windows x64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-x64.zip)
            - [Windows arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-arm64.zip)
-            - [Windows arm64 (OpenCL Adreno)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-opencl-adreno-arm64.zip)
            - [Windows x64 (CUDA 12)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-12.4-x64.zip) - [CUDA 12.4 DLLs](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/cudart-llama-bin-win-cuda-12.4-x64.zip)
            - [Windows x64 (CUDA 13)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-13.3-x64.zip) - [CUDA 13.3 DLLs](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/cudart-llama-bin-win-cuda-13.3-x64.zip)
            - [Windows x64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-vulkan-x64.zip)
-            - [Windows x64 (OpenVINO)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-openvino-${{ needs.windows-openvino.outputs.openvino_version }}-x64.zip)
-            - [Windows x64 (SYCL)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip)
+            - Windows x64 (SYCL) [DISABLED](https://github.com/ggml-org/llama.cpp/pull/23705)
            - [Windows x64 (HIP)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-hip-radeon-x64.zip)

            **openEuler:**
@@ -28,6 +28,13 @@ jobs:
        run: npm run build
        working-directory: tools/ui

+      - name: Generate checksums
+        run: |
+          cd tools/ui/dist
+          for f in *; do
+            sha256sum "$f" | awk '{print $1, $2}' >> checksums.txt
+          done
+
      - name: Upload built UI
        uses: actions/upload-artifact@v6
        with:
@@ -2,11 +2,6 @@ name: UI Build

 on:
  workflow_call:
-    inputs:
-      hf_ui_version:
-        description: 'Version string for version.json (e.g. 12345)'
-        required: false
-        type: string

 jobs:
  build:
@@ -30,15 +25,15 @@ jobs:
        working-directory: tools/ui

      - name: Build application
-        env:
-          HF_UI_VERSION: ${{ inputs.hf_ui_version || '' }}
-          LLAMA_BUILD_NUMBER: ${{ inputs.hf_ui_version || 'b0000' }}
        run: npm run build
        working-directory: tools/ui

-      - name: Run PWA unit tests (versioned build output)
-        run: npx vitest --project=unit --run tests/unit/pwa.spec.ts
-        working-directory: tools/ui
+      - name: Generate checksums
+        run: |
+          cd tools/ui/dist
+          for f in *; do
+            sha256sum "$f" | awk '{print $1, $2}' >> checksums.txt
+          done

      - name: Upload built UI
        uses: actions/upload-artifact@v6
@@ -40,12 +40,6 @@ jobs:
          name: ui-build
          path: tools/ui/dist/

-      - name: Create distribution archive
-        run: |
-          tar -czf dist.tar.gz -C tools/ui/dist .
-          sha256sum dist.tar.gz > dist.tar.gz.sha256
-          mv dist.tar.gz dist.tar.gz.sha256 tools/ui/dist/
-
      - name: Install Hugging Face Hub CLI
        run: pip install -U huggingface_hub

@@ -1,8 +1,8 @@
 name: UI (self-hosted)

 # these are the same as ui.yml, but with self-hosted runners
-# the jobs are lighter because they don't need to install Node.js or Playwright browsers
-# the runner has pre-installed Playwright browsers for @playwright/test (1.56.1) at /ms-playwright/
+# the runners come with Playwright browsers (version 1.56.1) pre-installed,
+# so the jobs are much lighter: they don't need to install Node.js or the Playwright browsers

 on:
  workflow_dispatch:
@@ -61,12 +61,6 @@ jobs:
        run: npm ci
        working-directory: tools/ui

-      - name: Download built UI artifacts
-        uses: actions/download-artifact@v6
-        with:
-          name: ui-build
-          path: tools/ui/dist/
-
      - name: Run type checking
        if: ${{ always() && steps.setup.conclusion == 'success' }}
        run: npm run check
@@ -78,12 +72,12 @@ jobs:
        working-directory: tools/ui

      - name: Run Client tests
-        if: ${{ always() && steps.setup.conclusion == 'success' }}
+        if: ${{ always() }}
        run: npm run test:client
        working-directory: tools/ui

      - name: Run Unit tests
-        if: ${{ always() && steps.setup.conclusion == 'success' }}
+        if: ${{ always() }}
        run: npm run test:unit
        working-directory: tools/ui

@@ -103,23 +97,22 @@ jobs:
        run: npm ci
        working-directory: tools/ui

-      - name: Download built UI artifacts
-        uses: actions/download-artifact@v6
-        with:
-          name: ui-build
-          path: tools/ui/dist/
+      - name: Build application
+        if: ${{ always() && steps.setup.conclusion == 'success' }}
+        run: npm run build
+        working-directory: tools/ui

      - name: Build Storybook
-        if: ${{ always() && steps.setup.conclusion == 'success' }}
+        if: ${{ always() }}
        run: npm run build-storybook
        working-directory: tools/ui

      - name: Run UI tests
-        if: ${{ always() && steps.setup.conclusion == 'success' }}
+        if: ${{ always() }}
        run: npm run test:ui -- --testTimeout=60000
        working-directory: tools/ui

      - name: Run E2E tests
-        if: ${{ always() && steps.setup.conclusion == 'success' }}
+        if: ${{ always() }}
        run: npm run test:e2e
        working-directory: tools/ui
@@ -43,7 +43,7 @@ jobs:
  ui-checks:
    name: Checks
    needs: ui-build
-    runs-on: ubuntu-24.04
+    runs-on: ubuntu-latest
    continue-on-error: true
    steps:
      - name: Checkout code
@@ -60,12 +60,6 @@ jobs:
          cache: "npm"
          cache-dependency-path: "tools/ui/package-lock.json"

-      - name: Download built UI artifacts
-        uses: actions/download-artifact@v6
-        with:
-          name: ui-build
-          path: tools/ui/dist/
-
      - name: Install dependencies
        id: setup
        if: ${{ steps.node.conclusion == 'success' }}
@@ -93,7 +87,7 @@ jobs:
        run: npm run test:client
        working-directory: tools/ui

-      - name: Run Unit tests (uses pre-built dist/ from ui-build)
+      - name: Run Unit tests
        if: ${{ always() && steps.playwright.conclusion == 'success' }}
        run: npm run test:unit
        working-directory: tools/ui
@@ -101,7 +95,7 @@ jobs:
  e2e-tests:
    name: E2E Tests
    needs: ui-build
-    runs-on: ubuntu-24.04
+    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v6
@@ -123,11 +117,10 @@ jobs:
        run: npm ci
        working-directory: tools/ui

-      - name: Download built UI artifacts (reuses ui-build)
-        uses: actions/download-artifact@v6
-        with:
-          name: ui-build
-          path: tools/ui/dist/
+      - name: Build application
+        if: ${{ always() && steps.setup.conclusion == 'success' }}
+        run: npm run build
+        working-directory: tools/ui

      - name: Install Playwright browsers
        id: playwright
@@ -145,7 +138,7 @@ jobs:
        run: npm run test:ui -- --testTimeout=60000
        working-directory: tools/ui

-      - name: Run E2E tests (uses pre-built dist/ from ui-build)
+      - name: Run E2E tests
        if: ${{ always() && steps.playwright.conclusion == 'success' }}
        run: npm run test:e2e
        working-directory: tools/ui
@@ -17,7 +17,7 @@ jobs:

      - name: Install komac
        run: |
-          cargo binstall komac@2.16.0 -y
+          cargo binstall komac@2.15.0 -y

      - name: Find latest release
        id: find_latest_release
@@ -92,6 +92,13 @@
 !/examples/sycl/*.bat
 !/examples/sycl/*.sh

+# Server Web UI temporary files (+ legacy directory)
+
+/tools/server/webui/node_modules
+/tools/server/webui/dist
+/tools/ui/node_modules
+/tools/ui/dist
+
 # Python

 /.venv
@@ -25,3 +25,13 @@ Commits:
 - Do not explicitly set the git author in commits - rely on the default git config
 - Always use `--no-gpg-sign` when committing
 - Never `git push` without explicit confirmation from the user
+
+Resources (read on demand):
+- [CONTRIBUTING.md](CONTRIBUTING.md)
+- [Build documentation](docs/build.md)
+- [Server usage documentation](tools/server/README.md)
+- [Server development documentation](tools/server/README-dev.md)
+- [PEG parser](docs/development/parsing.md)
+- [Auto parser](docs/autoparser.md)
+- [Jinja engine](common/jinja/README.md)
+- [PR template](.github/pull_request_template.md)
@@ -222,16 +222,6 @@ if (LLAMA_BUILD_APP)
    add_subdirectory(app)
 endif()

-# Standalone libmtmd build without pulling in the rest of the tools/ tree.
-# Useful when packaging just the mtmd library for language bindings (e.g. an
-# Apple XCFramework, or a WASM build). When the full tools build is enabled,
-# mtmd is already built by the tools/ subdirectory above; this hook only fires
-# when LLAMA_BUILD_TOOLS is OFF to avoid double-adding the target.
-option(LLAMA_BUILD_MTMD "llama: build tools/mtmd library standalone" OFF)
-if (LLAMA_BUILD_MTMD AND NOT (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TOOLS))
-    add_subdirectory(tools/mtmd)
-endif()
-
 #
 # install
 #
@@ -10,7 +10,7 @@
 # ggml-org/ggml-rpc         : rgerganov
 # ggml-org/ggml-sycl        : arthw
 # ggml-org/ggml-vulkan      : 0cc4m, jeffbolznv
-# ggml-org/ggml-webgpu      : reeselevine, yomaytk
+# ggml-org/ggml-webgpu      : reeselevine
 # ggml-org/ggml-zdnn        : taronaeo
 # ggml-org/llama-common     : ggerganov, aldehir, angt, danbev, ngxson, pwilkin
 # ggml-org/llama-mtmd       : ngxson
@@ -1,6 +1,6 @@
 # llama.cpp

-![llama](https://raw.githubusercontent.com/ggml-org/llama.brand/refs/heads/master/cover/llama-cpp/cover-llama-cpp-dark.svg)
+![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)

 [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
 [![Release](https://img.shields.io/github/v/release/ggml-org/llama.cpp)](https://github.com/ggml-org/llama.cpp/releases)
@@ -37,7 +37,7 @@ LLM inference in C/C++

 Getting started with llama.cpp is straightforward. Here are several ways to install it on your machine:

- Install `llama.cpp` using [brew, nix, winget, or conda-forge](docs/install.md)
+- Install `llama.cpp` using [brew, nix or winget](docs/install.md)
 - Run with Docker - see our [Docker documentation](docs/docker.md)
 - Download pre-built binaries from the [releases page](https://github.com/ggml-org/llama.cpp/releases)
 - Build from source by cloning this repository - check out [our build guide](docs/build.md)
@@ -142,9 +142,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [x] [GigaChat-20B-A3B](https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct)
 - [X] [Trillion-7B-preview](https://huggingface.co/trillionlabs/Trillion-7B-preview)
 - [x] [Ling models](https://huggingface.co/collections/inclusionAI/ling-67c51c85b34a7ea0aba94c32)
- [x] [Liquid LFM2 models](https://huggingface.co/collections/LiquidAI/lfm2)
- [x] [Liquid LFM2.5 models](https://huggingface.co/collections/LiquidAI/lfm25)
- [x] [Liquid Nanos](https://huggingface.co/collections/LiquidAI/liquid-nanos)
+- [x] [LFM2 models](https://huggingface.co/collections/LiquidAI/lfm2-686d721927015b2ad73eaa38)
 - [x] [Hunyuan models](https://huggingface.co/collections/tencent/hunyuan-dense-model-6890632cda26b19119c9c5e7)
 - [x] [BailingMoeV2 (Ring/Ling 2.0) models](https://huggingface.co/collections/inclusionAI/ling-v2-68bf1dd2fc34c306c1fa6f86)
 - [x] [Mellum models](https://huggingface.co/JetBrains/models?search=mellum)
@@ -80,7 +80,7 @@ To protect sensitive data from potential leaks or unauthorized access, it is cru
 ### Untrusted environments or networks

 If you can't run your models in a secure and isolated environment or if it must be exposed to an untrusted network, make sure to take the following security precautions:
-* Do not use the RPC backend, [ggml-rpc-server](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) and [llama-server](https://github.com/ggml-org/llama.cpp/tree/master/tools/server) functionality (see https://github.com/ggml-org/llama.cpp/pull/13061).
+* Do not use the RPC backend, [rpc-server](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) and [llama-server](https://github.com/ggml-org/llama.cpp/tree/master/tools/server) functionality (see https://github.com/ggml-org/llama.cpp/pull/13061).
 * Confirm the hash of any downloaded artifact (e.g. pre-trained model weights) matches a known-good value.
 * Encrypt your data if sending it over the network.

@@ -1,6 +1,6 @@
 set(TARGET llama-app)

-add_executable(${TARGET} llama.cpp download.cpp)
+add_executable(${TARGET} llama.cpp)
 set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama)

 target_link_libraries(${TARGET} PRIVATE
@@ -1,71 +0,0 @@
-#include "arg.h"
-#include "common.h"
-#include "download.h"
-#include "log.h"
-
-#include <cstdio>
-#include <filesystem>
-
-static void print_usage(int /*argc*/, char ** argv) {
-    printf(
-        "\nexamples:\n"
-        "  %s -hf ggml-org/gemma-3-4b-it-qat-GGUF\n"
-        "  %s -hf ggml-org/gemma-3-4b-it-qat-GGUF:Q4_K_M\n"
-        "  %s -hf ggml-org/models -hff model.gguf\n"
-        "  %s -mu https://example.com/model.gguf -m model.gguf\n"
-        "\n",
-        argv[0], argv[0], argv[0], argv[0]
-    );
-}
-
-int llama_download(int argc, char ** argv);
-
-int llama_download(int argc, char ** argv) {
-    common_init();
-
-    common_params params;
-    params.verbosity = LOG_LEVEL_ERROR;
-
-    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_DOWNLOAD, print_usage)) {
-        return 1;
-    }
-
-    const bool has_source = !params.model.hf_repo.empty() || !params.model.url.empty() ||
-                            !params.model.path.empty()    || !params.model.docker_repo.empty();
-    if (!has_source) {
-        fprintf(stderr, "error: no model source specified (use --hf-repo, --model-url, --model or --docker-repo)\n");
-        return 1;
-    }
-
-    try {
-        common_models_handler handler = common_models_handler_init(params, LLAMA_EXAMPLE_DOWNLOAD);
-        common_models_handler_apply(handler, params);
-    } catch (const std::exception & e) {
-        fprintf(stderr, "error: %s\n", e.what());
-        return 1;
-    }
-
-    if (!params.models_preset.empty()) {
-        // -hf pointed at a preset repo: print the preset path and stop
-        printf("%s\n", params.models_preset.c_str());
-        return 0;
-    }
-    if (params.model.path.empty()) {
-        fprintf(stderr, "error: model download failed\n");
-        return 1;
-    }
-    if (!std::filesystem::exists(params.model.path)) {
-        fprintf(stderr, "error: model file does not exist: %s\n", params.model.path.c_str());
-        return 1;
-    }
-
-    printf("%s\n", params.model.path.c_str());
-    if (!params.mmproj.path.empty()) {
-        printf("%s\n", params.mmproj.path.c_str());
-    }
-    if (!params.speculative.draft.mparams.path.empty()) {
-        printf("%s\n", params.speculative.draft.mparams.path.c_str());
-    }
-
-    return 0;
-}
@@ -19,23 +19,17 @@ int llama_batched_bench(int argc, char ** argv);
 int llama_fit_params(int argc, char ** argv);
 int llama_quantize(int argc, char ** argv);
 int llama_perplexity(int argc, char ** argv);
-int llama_download(int argc, char ** argv);

-// Self-update is only supported for binaries built with llama-install.sh
+// hands the update over to the install script, which downloads and swaps the binary
 static int llama_update(int argc, char ** argv) {
    (void) argc;
    (void) argv;

-#ifdef LLAMA_INSTALL_BUILD
 #if defined(_WIN32)
    return system("powershell -NoProfile -ExecutionPolicy Bypass -Command \"irm https://llama.app/install.ps1 | iex\"");
 #else
    return system("curl -fsSL https://llama.app/install.sh | sh");
 #endif
-#else
-    printf("Updates are available only when installed from https://llama.app\n");
-    return 1;
-#endif
 }

 static const char * progname;
@@ -50,33 +44,23 @@ struct command {
    std::vector<std::string> aliases;
    bool hidden;
    int (*func)(int, char **);
-    bool flags = false; // allow --name
 };

-#ifdef LLAMA_INSTALL_BUILD
-#define UPDATE_HIDDEN false
-#else
-#define UPDATE_HIDDEN true
-#endif
-
 static const command cmds[] = {
-    {"serve",         "HTTP API server",                                    {"server"},   false,         llama_server       },
-    {"cli",           "Command-line interactive interface",                 {"client"},   false,         llama_cli          },
-    {"update",        "Update llama to the latest release",                 {},           UPDATE_HIDDEN, llama_update       },
-    {"download",      "Download a model",                                   {"get"},      false,         llama_download     },
-    {"completion",    "Text completion",                                    {"complete"}, true,          llama_completion   },
-    {"bench",         "Benchmark prompt processing and text generation",    {},           true,          llama_bench        },
-    {"batched-bench", "Benchmark batched decoding performance",             {},           true,          llama_batched_bench},
-    {"fit-params",    "Compute parameters to fit a model in device memory", {},           true,          llama_fit_params   },
-    {"quantize",      "Quantize a model",                                   {},           true,          llama_quantize     },
-    {"perplexity",    "Compute model perplexity and KL divergence",         {},           true,          llama_perplexity   },
-    {"version",       "Show version",                                       {},           false,         version,           true },
-    {"licenses",      "Show third-party licenses",                          {"credits"},  false,         licenses,          true },
-    {"help",          "Show available commands",                            {},           false,         help,              true },
+    {"serve",         "HTTP API server",                                    {"server"},   false, llama_server       },
+    {"cli",           "Command-line interactive interface",                 {"client"},   false, llama_cli          },
+    {"update",        "Update llama to the latest release",                 {},           false, llama_update       },
+    {"completion",    "Text completion",                                    {"complete"}, true,  llama_completion   },
+    {"bench",         "Benchmark prompt processing and text generation",    {},           true,  llama_bench        },
+    {"batched-bench", "Benchmark batched decoding performance",             {},           true,  llama_batched_bench},
+    {"fit-params",    "Compute parameters to fit a model in device memory", {},           true,  llama_fit_params   },
+    {"quantize",      "Quantize a model",                                   {},           true,  llama_quantize     },
+    {"perplexity",    "Compute model perplexity and KL divergence",         {},           true,  llama_perplexity   },
+    {"version",       "Show version",                                       {},           false, version            },
+    {"licenses",      "Show third-party licenses",                          {"credits"},  false, licenses           },
+    {"help",          "Show available commands",                            {},           false, help               },
 };

-#undef UPDATE_HIDDEN
-
 static int version(int argc, char ** argv) {
    printf("%s\n", llama_build_info());
    return 0;
@@ -109,10 +93,7 @@ static int help(int argc, char ** argv) {
    return 0;
 }

-static bool matches(std::string arg, const command & cmd) {
-    if (cmd.flags && arg.size() > 2 && arg[0] == '-' && arg[1] == '-') {
-        arg.erase(0, 2);
-    }
+static bool matches(const std::string & arg, const command & cmd) {
    if (arg == cmd.name) {
        return true;
    }
@@ -13,7 +13,6 @@ LLAMA_BUILD_EXAMPLES=OFF
 LLAMA_BUILD_TOOLS=OFF
 LLAMA_BUILD_TESTS=OFF
 LLAMA_BUILD_SERVER=OFF
-LLAMA_BUILD_MTMD=ON
 GGML_METAL=ON
 GGML_METAL_EMBED_LIBRARY=ON
 GGML_BLAS_DEFAULT=ON
@@ -40,7 +39,6 @@ COMMON_CMAKE_ARGS=(
    -DLLAMA_BUILD_TOOLS=${LLAMA_BUILD_TOOLS}
    -DLLAMA_BUILD_TESTS=${LLAMA_BUILD_TESTS}
    -DLLAMA_BUILD_SERVER=${LLAMA_BUILD_SERVER}
-    -DLLAMA_BUILD_MTMD=${LLAMA_BUILD_MTMD}
    -DGGML_METAL_EMBED_LIBRARY=${GGML_METAL_EMBED_LIBRARY}
    -DGGML_BLAS_DEFAULT=${GGML_BLAS_DEFAULT}
    -DGGML_METAL=${GGML_METAL}
@@ -128,8 +126,6 @@ setup_framework_structure() {
    cp ggml/include/ggml-cpu.h     ${header_path}
    cp ggml/include/ggml-blas.h    ${header_path}
    cp ggml/include/gguf.h         ${header_path}
-    cp tools/mtmd/mtmd.h           ${header_path}
-    cp tools/mtmd/mtmd-helper.h    ${header_path}

    # Create module map (common for all platforms)
    cat > ${module_path}module.modulemap << EOF
@@ -251,7 +247,6 @@ combine_static_libraries() {
        "${base_dir}/${build_dir}/ggml/src/${release_dir}/libggml-cpu.a"
        "${base_dir}/${build_dir}/ggml/src/ggml-metal/${release_dir}/libggml-metal.a"
        "${base_dir}/${build_dir}/ggml/src/ggml-blas/${release_dir}/libggml-blas.a"
-        "${base_dir}/${build_dir}/tools/mtmd/${release_dir}/libmtmd.a"
    )

    # Create temporary directory for processing
@@ -415,7 +410,6 @@ cmake -B build-ios-sim -G Xcode \
    -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
    -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
    -DLLAMA_OPENSSL=OFF \
-    -DMTMD_VIDEO=OFF \
    -S .
 cmake --build build-ios-sim --config Release -j $(sysctl -n hw.logicalcpu) -- -quiet

@@ -430,7 +424,6 @@ cmake -B build-ios-device -G Xcode \
    -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
    -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
    -DLLAMA_OPENSSL=OFF \
-    -DMTMD_VIDEO=OFF \
    -S .
 cmake --build build-ios-device --config Release -j $(sysctl -n hw.logicalcpu) -- -quiet

@@ -457,7 +450,6 @@ cmake -B build-visionos -G Xcode \
    -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
    -DLLAMA_OPENSSL=OFF \
    -DLLAMA_BUILD_SERVER=OFF \
-    -DMTMD_VIDEO=OFF \
    -S .
 cmake --build build-visionos --config Release -j $(sysctl -n hw.logicalcpu) -- -quiet

@@ -473,7 +465,6 @@ cmake -B build-visionos-sim -G Xcode \
    -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
    -DLLAMA_OPENSSL=OFF \
    -DLLAMA_BUILD_SERVER=OFF \
-    -DMTMD_VIDEO=OFF \
    -S .
 cmake --build build-visionos-sim --config Release -j $(sysctl -n hw.logicalcpu) -- -quiet

@@ -490,7 +481,6 @@ cmake -B build-tvos-sim -G Xcode \
    -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
    -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
    -DLLAMA_OPENSSL=OFF \
-    -DMTMD_VIDEO=OFF \
    -S .
 cmake --build build-tvos-sim --config Release -j $(sysctl -n hw.logicalcpu) -- -quiet

@@ -506,7 +496,6 @@ cmake -B build-tvos-device -G Xcode \
    -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
    -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
    -DLLAMA_OPENSSL=OFF \
-    -DMTMD_VIDEO=OFF \
    -S .
 cmake --build build-tvos-device --config Release -j $(sysctl -n hw.logicalcpu) -- -quiet

@@ -80,6 +80,8 @@ add_library(${TARGET}
    http.h
    imatrix-loader.cpp
    imatrix-loader.h
+    json-partial.cpp
+    json-partial.h
    json-schema-to-grammar.cpp
    llguidance.cpp
    log.cpp
@@ -94,8 +96,10 @@ add_library(${TARGET}
    peg-parser.h
    preset.cpp
    preset.h
+    regex-partial.cpp
    reasoning-budget.cpp
    reasoning-budget.h
+    regex-partial.h
    sampling.cpp
    sampling.h
    speculative.cpp
@@ -17,7 +17,6 @@
 #   define NOMINMAX
 #endif
 #include <windows.h>
-#include <shellapi.h>
 #endif

 #define JSON_ASSERT GGML_ASSERT
@@ -286,17 +285,108 @@ static std::string clean_file_name(const std::string & fname) {
    return clean_fname;
 }

+// Try to fetch an optional preset.ini from the Hugging Face repo named by
+// params.model.hf_repo and apply the section selected by the repo tag.
+//
+// returns true if a preset file was fetched and applied, false if none was found
+// throws std::runtime_error if the file was fetched but has no section for the tag
+static bool common_params_handle_remote_preset(common_params & params, llama_example ex) {
+    GGML_ASSERT(!params.model.hf_repo.empty());
+
+    // the returned hf_repo is without tag
+    auto [hf_repo, hf_tag] = common_download_split_repo_tag(params.model.hf_repo);
+
+    // "latest" tag (default if not specified) is translated to "default" preset
+    if (hf_tag == "latest") {
+        hf_tag = "default";
+    }
+
+    const std::string endpoint   = common_get_model_endpoint();
+    const auto        preset_url = endpoint + hf_repo + "/resolve/main/preset.ini";
+
+    // the downloaded preset is cached locally under a sanitized, per-repo file name
+    const auto preset_path = fs_get_cache_file(clean_file_name(hf_repo + "_preset.ini"));
+
+    common_download_opts dl_opts;
+    dl_opts.bearer_token = params.hf_token;
+    dl_opts.offline      = params.offline;
+
+    LOG_TRC("%s: looking for remote preset at %s\n", __func__, preset_url.c_str());
+    const int status_code = common_download_file_single(preset_url, preset_path, dl_opts);
+
+    // remote preset is optional, so a status outside [200, 400) is not an error
+    if (status_code < 200 || status_code >= 400) {
+        LOG_TRC("%s: no remote preset found, skipping\n", __func__);
+        return false;
+    }
+
+    LOG_TRC("%s: applying remote preset from %s\n", __func__, preset_url.c_str());
+    common_preset_context ctx(ex, /* only_remote_allowed */ true);
+    common_preset global;
+    auto remote_presets = ctx.load_from_ini(preset_path, global);
+    remote_presets = ctx.cascade(global, remote_presets);
+
+    const auto it = remote_presets.find(hf_tag);
+    if (it == remote_presets.end()) {
+        throw std::runtime_error("Remote preset.ini does not contain [" + std::string(hf_tag) + "] section");
+    }
+
+    common_preset preset = it->second;
+    LOG_INF("\n%s", preset.to_ini().c_str()); // to_ini already added trailing newline
+    preset.apply_to_params(params);
+
+    return true;
+}
+
 struct handle_model_result {
    bool found_mmproj = false;
    common_params_model mmproj;

    bool found_mtp = false;
    common_params_model mtp;
-
-    bool found_preset = false;
-    std::string preset_path;
 };

+// Resolve one model spec to a local file path.
+//
+// At most one source is used, checked in this order: docker_repo, hf_repo, url.
+// A model with none of them set is left untouched. model.path is updated in place;
+// model.name is additionally set for the docker and Hugging Face sources.
+//
+// returns the mmproj / MTP files reported alongside a Hugging Face download, if any
+// throws std::runtime_error when a Hugging Face or URL download yields no model path
+static handle_model_result common_params_handle_model(struct common_params_model & model,
+                                                      const common_download_opts & opts) {
+    handle_model_result result;
+
+    if (!model.docker_repo.empty()) {
+        model.path = common_docker_resolve_model(model.docker_repo);
+        model.name = model.docker_repo;
+    } else if (!model.hf_repo.empty()) {
+        // If -m was used with -hf, treat the model "path" as the hf_file to download
+        if (model.hf_file.empty() && !model.path.empty()) {
+            model.hf_file = model.path;
+            model.path = "";
+        }
+
+        // opts is passed through as-is, same as in the URL branch below
+        auto download_result = common_download_model(model, opts);
+
+        if (download_result.model_path.empty()) {
+            throw std::runtime_error("failed to download model from Hugging Face");
+        }
+
+        model.name = model.hf_repo;
+        model.path = download_result.model_path;
+
+        if (!download_result.mmproj_path.empty()) {
+            result.found_mmproj = true;
+            result.mmproj.path  = download_result.mmproj_path;
+        }
+
+        if (!download_result.mtp_path.empty()) {
+            result.found_mtp = true;
+            result.mtp.path  = download_result.mtp_path;
+        }
+    } else if (!model.url.empty()) {
+        if (model.path.empty()) {
+            // default local path: the last URL path segment (fragment and query
+            // stripped), placed in the cache directory
+            auto f = string_split<std::string>(model.url, '#').front();
+            f = string_split<std::string>(f, '?').front();
+            model.path = fs_get_cache_file(string_split<std::string>(f, '/').back());
+        }
+
+        auto download_result = common_download_model(model, opts);
+        if (download_result.model_path.empty()) {
+            throw std::runtime_error("failed to download model from " + model.url);
+        }
+    }
+
+    return result;
+}
+
 const std::vector<ggml_type> kv_cache_types = {
    GGML_TYPE_F32,
    GGML_TYPE_F16,
@@ -340,242 +430,62 @@ static bool parse_bool_value(const std::string & value) {
    throw std::invalid_argument("the argument has been removed. " + msg);
 }

-//
-// common_models_handler
-//
-
-static std::string get_default_local_path(const std::string & url) {
-    auto f = string_split<std::string>(url, '#').front();
-    f = string_split<std::string>(f, '?').front();
-    return fs_get_cache_file(string_split<std::string>(f, '/').back());
-}
-
-common_models_handler common_models_handler_init(const common_params & params, llama_example curr_ex) {
-    common_download_hf_plan plan;
-    common_download_hf_plan plan_spec;
-    common_download_hf_plan plan_voc;
-    common_download_opts opts;
-
-    const bool spec_type_draft_mtp = std::find(params.speculative.types.begin(),
-                                        params.speculative.types.end(),
-                                        COMMON_SPECULATIVE_TYPE_DRAFT_MTP) != params.speculative.types.end();
-
-    // only download mmproj if the current example is using it
-    bool use_mmproj = false;
-    for (const auto & ex : mmproj_examples) {
-        if (curr_ex == ex) {
-            use_mmproj = true;
-            break;
-        }
-    }
-
-    opts.bearer_token    = params.hf_token;
-    opts.offline         = params.offline;
-    opts.download_mtp    = spec_type_draft_mtp;
-    opts.download_mmproj = use_mmproj && !params.no_mmproj
-                        && params.mmproj.path.empty() && params.mmproj.url.empty();
-
-    if (!params.model.hf_repo.empty()) {
-        plan = common_download_get_hf_plan(params.model, opts);
-    }
-
-    if (!params.speculative.draft.mparams.hf_repo.empty()) {
-        plan_spec = common_download_get_hf_plan(params.speculative.draft.mparams, opts);
-    }
-
-    if (!params.vocoder.model.hf_repo.empty()) {
-        plan_voc = common_download_get_hf_plan(params.vocoder.model, opts);
-    }
-
-    return common_models_handler{plan, plan_spec, plan_voc, opts};
-}
-
-bool common_models_handler_is_preset_repo(const common_models_handler & handler) {
-    return !handler.plan.preset.url.empty();
-}
-
-static std::vector<common_download_task> build_url_tasks(const common_params_model & model, common_download_opts opts) {
-    auto parts = common_download_get_all_parts(model.url);
-    std::vector<common_download_task> tasks;
-
-    // single-part: download straight to model.path if the user gave one (-m), else the cache default
-    if (parts.size() == 1) {
-        common_download_task task;
-        task.url        = parts[0];
-        task.local_path = model.path.empty() ? get_default_local_path(parts[0]) : model.path;
-        task.opts       = opts;
-        tasks.push_back(std::move(task));
-        return tasks;
-    }
-
-    // multi-part: place each part under the user's -m directory (if given), else the cache default
-    std::string base_dir;
-    if (!model.path.empty()) {
-        auto pos = model.path.rfind('/');
-        base_dir = pos == std::string::npos ? std::string(".") : model.path.substr(0, pos);
-    }
-
-    for (const auto & part : parts) {
-        common_download_task task;
-        task.url  = part;
-        task.opts = opts;
-
-        std::string local = get_default_local_path(part);
-        if (!base_dir.empty()) {
-            auto pos = local.rfind('/');
-            std::string name = pos == std::string::npos ? local : local.substr(pos + 1);
-            local = base_dir + "/" + name;
-        }
-        task.local_path = local;
-        tasks.push_back(std::move(task));
-    }
-    return tasks;
-}
-
-void common_models_handler_apply(common_models_handler & handler, common_params & params, common_download_callback * callback) {
-    std::vector<common_download_task> tasks;
-
-    auto & plan      = handler.plan;
-    auto & plan_spec = handler.plan_spec;
-    auto & plan_voc  = handler.plan_voc;
-
-    auto opts = handler.opts; // copy
-    opts.callback = callback;
-
-    // handle plain "url" if needed
-    auto handle_url = [&](common_params_model & model) {
-        if (!model.url.empty()) {
-            if (model.path.empty()) {
-                model.path = get_default_local_path(model.url);
-            }
-        }
-    };
-    handle_url(params.model);
-    handle_url(params.mmproj);
-    handle_url(params.vocoder.model);
-    handle_url(params.speculative.draft.mparams);
-
-    // optionally, if docker repo is set, resolve it
-    if (!params.model.docker_repo.empty()) {
-        params.model.url  = common_docker_resolve_model(params.model.docker_repo);
-        params.model.path = get_default_local_path(params.model.url);
-    }
-
-    // handle plain "url" tasks (non-hf)
-    if (!params.model.url.empty()) {
-        auto url_tasks = build_url_tasks(params.model, opts);
-        // the first part is what gets loaded, so point params.model.path at it
-        if (!url_tasks.empty()) {
-            std::string first_path = url_tasks.front().local_path;
-            url_tasks.front().on_done = [&, first_path]() { params.model.path = first_path; };
-        }
-        for (auto & task : url_tasks) {
-            tasks.push_back(std::move(task));
-        }
-    }
-    if (!params.mmproj.url.empty()) {
-        common_download_task task;
-        task.url        = params.mmproj.url;
-        task.local_path = params.mmproj.path;
-        task.opts       = opts;
-        tasks.push_back(task);
-    }
-    if (!params.vocoder.model.url.empty()) {
-        common_download_task task;
-        task.url        = params.vocoder.model.url;
-        task.local_path = params.vocoder.model.path;
-        task.opts       = opts;
-        tasks.push_back(task);
-    }
-    if (!params.speculative.draft.mparams.url.empty()) {
-        common_download_task task;
-        task.url        = params.speculative.draft.mparams.url;
-        task.local_path = params.speculative.draft.mparams.path;
-        task.opts       = opts;
-        tasks.push_back(task);
-    }
-
-    // handle hf_plan tasks
-    auto add_tasks = [&opts, &tasks](const hf_cache::hf_files & model_files, common_params_model & model) {
-        for (size_t i = 0; i < model_files.size(); ++i) {
-            auto & model_file = model_files[i];
-            bool is_first = (i == 0);
-            tasks.emplace_back(model_file, opts, [&, is_first]() {
-                if (is_first) {
-                    // only use first part as model path
-                    model.path = hf_cache::finalize_file(model_file);
-                } else {
-                    hf_cache::finalize_file(model_file);
-                }
-            });
-        }
-    };
-    if (!plan.model_files.empty()) {
-        add_tasks(plan.model_files, params.model);
-    }
-    if (!plan.mmproj.local_path.empty()) {
-        tasks.emplace_back(plan.mmproj, opts, [&]() {
-            params.mmproj.path = hf_cache::finalize_file(plan.mmproj);
-        });
-    }
-    if (!plan.mtp.local_path.empty()) {
-        tasks.emplace_back(plan.mtp, opts, [&]() {
-            // only fall back to the discovered MTP head when no draft was explicitly provided
-            if (params.speculative.draft.mparams.empty()) {
-                params.speculative.draft.mparams.path = hf_cache::finalize_file(plan.mtp);
-            } else {
-                hf_cache::finalize_file(plan.mtp);
-            }
-        });
-    }
-    if (!plan.preset.local_path.empty()) {
-        tasks.emplace_back(plan.preset, opts, [&]() {
-            // if HF repo is a preset repo, we simply run server in router mode with the preset.ini file
-            params.models_preset_hf = params.model.hf_repo; // only for showing a warning
-            params.models_preset    = hf_cache::finalize_file(plan.preset);
-            params.model = common_params_model{}; // make sure to clear model, so server starts in router mode
-        });
-    }
-
-    // handle plan_spec (e.g. --spec-draft-hf)
-    if (!plan_spec.model_files.empty()) {
-        add_tasks(plan_spec.model_files, params.speculative.draft.mparams);
-    }
-
-    // handle vocoder plan (e.g. --hf-repo-v)
-    if (!plan_voc.model_files.empty()) {
-        add_tasks(plan_voc.model_files, params.vocoder.model);
-    }
-
-    // run all tasks in parallel
-    if (!params.offline) {
-        // if duplicated files are found, only download once (but still call on_done for each task)
-        std::unordered_map<std::string, common_download_task *> unique_tasks;
-        for (auto & task : tasks) {
-            auto it = unique_tasks.find(task.local_path);
-            if (it == unique_tasks.end()) {
-                unique_tasks[task.local_path] = &task;
-            }
-        }
-        std::vector<common_download_task> unique_tasks_vec;
-        for (auto & pair : unique_tasks) {
-            unique_tasks_vec.push_back(*pair.second);
-        }
-        common_download_run_tasks(unique_tasks_vec);
-    }
-
-    // download successful, update params with the downloaded paths
-    for (const auto & task : tasks) {
-        if (task.on_done) {
-            task.on_done();
-        }
-    }
-}
-
 //
 // CLI argument parsing functions
 //

+// Resolve every model referenced by params: the main model first, then mmproj
+// (only for examples listed in mmproj_examples), the speculative draft model and
+// the vocoder model. The corresponding entries of params are updated in place.
+//
+// returns true when all models were handled, false when a
+//         common_skip_download_exception was raised along the way
+// any other exception propagates to the caller unchanged
+bool common_params_handle_models(common_params & params, llama_example curr_ex) {
+    const bool spec_type_draft_mtp = std::find(params.speculative.types.begin(),
+                                         params.speculative.types.end(),
+                                         COMMON_SPECULATIVE_TYPE_DRAFT_MTP) != params.speculative.types.end();
+
+    common_download_opts opts;
+    opts.bearer_token    = params.hf_token;
+    opts.offline         = params.offline;
+    opts.skip_download   = params.skip_download;
+    opts.download_mtp    = spec_type_draft_mtp;
+    opts.download_mmproj = !params.no_mmproj;
+
+    // sub-models (draft, mmproj, vocoder) are explicitly specified by the user,
+    // so we should not auto-discover mtp/mmproj siblings for them
+    common_download_opts sub_opts = opts;
+    sub_opts.download_mtp    = false;
+    sub_opts.download_mmproj = false;
+
+    try {
+        auto res = common_params_handle_model(params.model, opts);
+        if (params.no_mmproj) {
+            params.mmproj = {};
+        } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
+            // optionally, handle mmproj model when -hf is specified
+            params.mmproj = res.mmproj;
+        }
+        // only download mmproj if the current example is using it
+        for (const auto & ex : mmproj_examples) {
+            if (curr_ex == ex) {
+                common_params_handle_model(params.mmproj, sub_opts);
+                break;
+            }
+        }
+
+        // when --spec-type mtp is set and no draft model was provided explicitly,
+        // fall back to the MTP head discovered alongside the -hf model
+        if (spec_type_draft_mtp && res.found_mtp &&
+            params.speculative.draft.mparams.path.empty() &&
+            params.speculative.draft.mparams.hf_repo.empty() &&
+            params.speculative.draft.mparams.url.empty()) {
+            params.speculative.draft.mparams.path = res.mtp.path;
+        }
+        common_params_handle_model(params.speculative.draft.mparams, sub_opts);
+        common_params_handle_model(params.vocoder.model,             sub_opts);
+        return true;
+    } catch (const common_skip_download_exception &) {
+        // a skipped download is reported through the return value, not as an error
+        return false;
+    }
+}
+
 static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) {
    common_params & params = ctx_arg.params;

@@ -691,6 +601,30 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
    // parse the first time to get -hf option (used for remote preset)
    parse_cli_args();

+    // export_graph_ops loads only metadata
+    const bool skip_model_download = ctx_arg.ex == LLAMA_EXAMPLE_EXPORT_GRAPH_OPS;
+
+    // maybe handle remote preset
+    if (!params.model.hf_repo.empty() && !skip_model_download) {
+        std::string cli_hf_repo = params.model.hf_repo;
+        bool has_preset = common_params_handle_remote_preset(params, ctx_arg.ex);
+
+        // special case: if hf_repo explicitly set by preset, we need to preserve it (ignore CLI value)
+        // this is useful when we have one HF repo pointing to other HF repos (one model - multiple GGUFs)
+        std::string preset_hf_repo = params.model.hf_repo;
+        bool preset_has_hf_repo = preset_hf_repo != cli_hf_repo;
+
+        if (has_preset) {
+            // re-parse CLI args to override preset values
+            parse_cli_args();
+        }
+
+        // preserve hf_repo from preset if needed
+        if (preset_has_hf_repo) {
+            params.model.hf_repo = preset_hf_repo;
+        }
+    }
+
    postprocess_cpu_params(params.cpuparams,       nullptr);
    postprocess_cpu_params(params.cpuparams_batch, &params.cpuparams);

@@ -701,26 +635,15 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
        throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
    }

-    const bool skip_model_download =
-        // server will call common_params_handle_models() later, so we skip it here
-        ctx_arg.ex == LLAMA_EXAMPLE_SERVER ||
-        // download calls common_params_handle_models() itself and prints the paths
-        ctx_arg.ex == LLAMA_EXAMPLE_DOWNLOAD ||
-        // export_graph_ops loads only metadata
-        ctx_arg.ex == LLAMA_EXAMPLE_EXPORT_GRAPH_OPS;
-
+    // handle model and download
    if (!skip_model_download) {
-        // handle model and download
-        common_models_handler handler = common_models_handler_init(params, ctx_arg.ex);
-        common_models_handler_apply(handler, params);
+        common_params_handle_models(params, ctx_arg.ex);
+    }

-        // model is required (except for server)
-        // TODO @ngxson : maybe show a list of available models in CLI in this case
-        if (params.model.path.empty()
-                && !params.usage
-                && !params.completion) {
-            throw std::invalid_argument("error: --model is required\n");
-        }
+    // model is required (except for server)
+    // TODO @ngxson : maybe show a list of available models in CLI in this case
+    if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER && !skip_model_download && !params.usage && !params.completion) {
+        throw std::invalid_argument("error: --model is required\n");
    }

    if (params.escape) {
@@ -784,19 +707,15 @@ static void common_params_print_usage(common_params_context & ctx_arg) {
            common_options.push_back(&opt);
        }
    }
-    bool first = true;
-    auto print_section = [&](const char * header, std::vector<common_arg *> & options) {
-        if (options.empty()) {
-            return;
-        }
-        printf("%s----- %s -----\n\n", first ? "" : "\n\n", header);
-        first = false;
-        print_options(options);
-    };
-    print_section("common params",           common_options);
-    print_section("sampling params",         sampling_options);
-    print_section("speculative params",      spec_options);
-    print_section("example-specific params", specific_options);
+    printf("----- common params -----\n\n");
+    print_options(common_options);
+    printf("\n\n----- sampling params -----\n\n");
+    print_options(sampling_options);
+    printf("\n\n----- speculative params -----\n\n");
+    print_options(spec_options);
+    // TODO: maybe convert enum llama_example to string
+    printf("\n\n----- example-specific params -----\n\n");
+    print_options(specific_options);
 }

 static void common_params_print_completion(common_params_context & ctx_arg) {
@@ -1018,44 +937,7 @@ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<com
    return true;
 }

-#ifdef _WIN32
-struct utf8_argv {
-    std::vector<std::string> buf;
-    std::vector<char*> ptrs;
-};
-
-static utf8_argv make_utf8_argv() {
-    utf8_argv out;
-    int wargc = 0;
-    LPWSTR* wargv = CommandLineToArgvW(GetCommandLineW(), &wargc);
-    if (!wargv) return out;
-
-    out.buf.reserve(wargc);
-    for (int i = 0; i < wargc; ++i) {
-        int n = WideCharToMultiByte(CP_UTF8, WC_ERR_INVALID_CHARS, wargv[i], -1, nullptr, 0, nullptr, nullptr);
-        if (n <= 0) { out.buf.emplace_back(); continue; }
-        auto& s = out.buf.emplace_back();
-        s.resize(static_cast<size_t>(n - 1));
-        (void)WideCharToMultiByte(CP_UTF8, 0, wargv[i], -1, s.data(), n, nullptr, nullptr);
-    }
-    LocalFree(wargv);
-
-    out.ptrs.reserve(out.buf.size() + 1);
-    for (auto& s : out.buf) out.ptrs.push_back(s.data());
-    out.ptrs.push_back(nullptr);
-    return out;
-}
-#endif
-
 bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
-#ifdef _WIN32
-    auto utf8 = make_utf8_argv();
-    // repair argv only when it matches the process command line
-    if (static_cast<int>(utf8.buf.size()) == argc) {
-        argv = utf8.ptrs.data();
-    }
-#endif
-
    auto ctx_arg = common_params_parser_init(params, ex, print_usage);
    const common_params params_org = ctx_arg.params; // the example can modify the default params

@@ -1196,9 +1078,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     * - if both {LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_*,} are set, we will prioritize the LLAMA_EXAMPLE_* matching current example
     */
    auto add_opt = [&](common_arg arg) {
-        // download only exposes the handful of args explicitly tagged for it
-        const bool inherit_common = ex != LLAMA_EXAMPLE_DOWNLOAD;
-        if ((arg.in_example(ex) || (inherit_common && arg.in_example(LLAMA_EXAMPLE_COMMON))) && !arg.is_exclude(ex)) {
+        if ((arg.in_example(ex) || arg.in_example(LLAMA_EXAMPLE_COMMON)) && !arg.is_exclude(ex)) {
            ctx_arg.options.push_back(std::move(arg));
        }
    };
@@ -1209,7 +1089,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params) {
            params.usage = true;
        }
-    ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_DOWNLOAD}));
+    ));
    add_opt(common_arg(
        {"--version"},
        "show version and build info",
@@ -1480,7 +1360,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    add_opt(common_arg(
        {"--cache-idle-slots"},
        {"--no-cache-idle-slots"},
-        "save idle slots to the prompt cache on new task, and clear them when using unified KV (default: enabled, requires cache-ram)",
+        "save and clear idle slots on new task (default: enabled, requires unified KV and cache-ram)",
        [](common_params & params, bool value) {
            params.cache_idle_slots = value;
        }
@@ -1735,7 +1615,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        string_format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()),
        [](common_params & params, const std::string & value) {
            const auto sampler_names = string_split<std::string>(value, ';');
-            params.sampling.samplers = common_sampler_types_from_names(sampler_names);
+            params.sampling.samplers = common_sampler_types_from_names(sampler_names, true);
            params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_SAMPLERS;
        }
    ).set_sampling());
@@ -2331,7 +2211,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params, bool value) {
            params.no_mmproj = !value;
        }
-    ).set_examples({LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DOWNLOAD}).set_env("LLAMA_ARG_MMPROJ_AUTO"));
+    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_AUTO"));
    add_opt(common_arg(
        {"--mmproj-offload"},
        {"--no-mmproj-offload"},
@@ -2341,8 +2221,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        }
    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_OFFLOAD"));
    add_opt(common_arg(
-        {"--image", "--audio", "--video"}, "FILE",
-        "path to an image, audio, or video file. use with multimodal models, use comma-separated values for multiple files\n",
+        {"--image", "--audio"}, "FILE",
+        "path to an image or audio file. use with multimodal models, use comma-separated values for multiple files\n",
        [](common_params & params, const std::string & value) {
            for (const auto & item : parse_csv_row(value)) {
                params.image.emplace_back(item);
@@ -2363,13 +2243,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.image_max_tokens = value;
        }
    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_IMAGE_MAX_TOKENS"));
-    add_opt(common_arg(
-        {"--mtmd-batch-max-tokens"}, "N",
-        string_format("maximum number of image tokens per batch when encoding images (default: %d)", params.mtmd_batch_max_tokens),
-        [](common_params & params, int value) {
-            params.mtmd_batch_max_tokens = value;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MTMD_BATCH_MAX_TOKENS"));
    if (llama_supports_rpc()) {
        add_opt(common_arg(
            {"--rpc"}, "SERVERS",
@@ -2730,14 +2603,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params, const std::string & value) {
            params.model.path = value;
        }
-    ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_DOWNLOAD}).set_env("LLAMA_ARG_MODEL"));
+    ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL"));
    add_opt(common_arg(
        {"-mu", "--model-url"}, "MODEL_URL",
        "model download url (default: unused)",
        [](common_params & params, const std::string & value) {
            params.model.url = value;
        }
-    ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_DOWNLOAD}).set_env("LLAMA_ARG_MODEL_URL"));
+    ).set_env("LLAMA_ARG_MODEL_URL"));
    add_opt(common_arg(
        { "-dr", "--docker-repo" }, "[<repo>/]<model>[:quant]",
        "Docker Hub model repository. repo is optional, default to ai/. quant is optional, default to :latest.\n"
@@ -2746,7 +2619,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params, const std::string & value) {
            params.model.docker_repo = value;
        }
-    ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_DOWNLOAD}).set_env("LLAMA_ARG_DOCKER_REPO"));
+    ).set_env("LLAMA_ARG_DOCKER_REPO"));
    add_opt(common_arg(
        {"-hf", "-hfr", "--hf-repo"}, "<user>/<model>[:quant]",
        "Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n"
@@ -2756,14 +2629,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params, const std::string & value) {
            params.model.hf_repo = value;
        }
-    ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_DOWNLOAD}).set_env("LLAMA_ARG_HF_REPO"));
+    ).set_env("LLAMA_ARG_HF_REPO"));
    add_opt(common_arg(
        {"-hff", "--hf-file"}, "FILE",
        "Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)",
        [](common_params & params, const std::string & value) {
            params.model.hf_file = value;
        }
-    ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_DOWNLOAD}).set_env("LLAMA_ARG_HF_FILE"));
+    ).set_env("LLAMA_ARG_HF_FILE"));
    add_opt(common_arg(
        {"-hfv", "-hfrv", "--hf-repo-v"}, "<user>/<model>[:quant]",
        "Hugging Face model repository for the vocoder model (default: unused)",
@@ -2784,14 +2657,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params, const std::string & value) {
            params.hf_token = value;
        }
-    ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_DOWNLOAD}).set_env("HF_TOKEN"));
-    add_opt(common_arg(
-        {"--mtp"},
-        "also download the multi-token prediction (MTP) head, if available (default: unused)",
-        [](common_params & params) {
-            params.speculative.types.push_back(COMMON_SPECULATIVE_TYPE_DRAFT_MTP);
-        }
-    ).set_examples({LLAMA_EXAMPLE_DOWNLOAD}));
+    ).set_env("HF_TOKEN"));
    add_opt(common_arg(
        {"--context-file"}, "FNAME",
        "file to load context from (use comma-separated values to specify multiple files)",
@@ -3001,26 +2867,62 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.api_prefix = value;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX"));
+    // Deprecated: use --ui-config instead (kept for backward compat)
    add_opt(common_arg(
-        {"--ui-config", "--webui-config"}, "JSON",
+        {"--webui-config"}, "JSON",
+        "[DEPRECATED: use --ui-config] JSON that provides default WebUI settings (overrides WebUI defaults)",
+        [](common_params & params, const std::string & value) {
+            params.ui_config_json = value;
+            params.webui_config_json = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG"));
+
+    add_opt(common_arg(
+        {"--ui-config"}, "JSON",
        "JSON that provides default UI settings (overrides UI defaults)",
        [](common_params & params, const std::string & value) {
            params.ui_config_json = value;
+            params.webui_config_json = value;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_UI_CONFIG"));
+
+    // Deprecated: use --ui-config-file instead (kept for backward compat)
    add_opt(common_arg(
-        {"--ui-config-file", "--webui-config-file"}, "PATH",
+        {"--webui-config-file"}, "PATH",
+        "[DEPRECATED: use --ui-config-file] JSON file that provides default WebUI settings (overrides WebUI defaults)",
+        [](common_params & params, const std::string & value) {
+            params.ui_config_json = read_file(value);
+            params.webui_config_json = params.ui_config_json;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG_FILE"));
+
+    add_opt(common_arg(
+        {"--ui-config-file"}, "PATH",
        "JSON file that provides default UI settings (overrides UI defaults)",
        [](common_params & params, const std::string & value) {
            params.ui_config_json = read_file(value);
+            params.webui_config_json = params.ui_config_json;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_UI_CONFIG_FILE"));
+
+    // Deprecated: use --ui-mcp-proxy instead (kept for backward compat)
    add_opt(common_arg(
-        {"--ui-mcp-proxy", "--webui-mcp-proxy"},
-        {"--no-ui-mcp-proxy", "--no-webui-mcp-proxy"},
+        {"--webui-mcp-proxy"},
+        {"--no-webui-mcp-proxy"},
+        "[DEPRECATED: use --ui-mcp-proxy/--no-ui-mcp-proxy] experimental: whether to enable MCP CORS proxy",
+        [](common_params & params, bool value) {
+            params.ui_mcp_proxy = value;
+            params.webui_mcp_proxy = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_MCP_PROXY"));
+
+    add_opt(common_arg(
+        {"--ui-mcp-proxy"},
+        {"--no-ui-mcp-proxy"},
        "experimental: whether to enable MCP CORS proxy - do not enable in untrusted environments (default: disabled)",
        [](common_params & params, bool value) {
            params.ui_mcp_proxy = value;
+            params.webui_mcp_proxy = value;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_UI_MCP_PROXY"));
    add_opt(common_arg(
@@ -3032,26 +2934,24 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.server_tools = parse_csv_row(value);
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TOOLS"));
+    // Deprecated: use --ui/--no-ui instead (kept for backward compat)
    add_opt(common_arg(
-        {"-ag", "--agent"},
-        {"-no-ag", "--no-agent"},
-        "whether to enable CORS proxy and all built-in tools - do not enable in untrusted environments (default: disabled)",
+        {"--webui"},
+        {"--no-webui"},
+        "[DEPRECATED: use --ui/--no-ui] whether to enable the Web UI",
        [](common_params & params, bool value) {
-            if (value) {
-                params.server_tools = {"all"};
-                params.ui_mcp_proxy = true;
-            } else {
-                params.server_tools.clear();
-                params.ui_mcp_proxy = false;
-            }
+            params.ui = value;
+            params.webui = value;
        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_AGENT"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI"));
+
    add_opt(common_arg(
-        {"--ui", "--webui"},
-        {"--no-ui", "--no-webui"},
+        {"--ui"},
+        {"--no-ui"},
        string_format("whether to enable the Web UI (default: %s)", params.ui ? "enabled" : "disabled"),
        [](common_params & params, bool value) {
            params.ui = value;
+            params.webui = value;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_UI"));
    add_opt(common_arg(
@@ -3082,7 +2982,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_API_KEY"));
    add_opt(common_arg(
        {"--api-key-file"}, "FNAME",
-        "path to file containing API keys, one per line; lines starting with a hash are treated as comments (default: none)",
+        "path to file containing API keys (default: none)",
        [](common_params & params, const std::string & value) {
            std::ifstream key_file(value);
            if (!key_file) {
@@ -3090,7 +2990,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            }
            std::string key;
            while (std::getline(key_file, key)) {
-                if (!key.empty() && key[0] != '#') {
+                if (!key.empty()) {
                    params.api_keys.push_back(key);
                }
            }
@@ -3296,20 +3196,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.sampling.reasoning_budget_message = value;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK_BUDGET_MESSAGE"));
-    add_opt(common_arg(
-        {"--reasoning-preserve"},
-        {"--no-reasoning-preserve"},
-        "preserve reasoning trace in the full history, not just the last assistant message (default: template default)\n"
-        "compatible with certain templates having 'supports_preserve_reasoning' capability\n"
-        "example: https://docs.z.ai/guides/capabilities/thinking-mode#preserved-thinking",
-        [](common_params & params, bool value) {
-            if (value) {
-                params.default_template_kwargs["preserve_reasoning"] = "true";
-            } else {
-                params.default_template_kwargs["preserve_reasoning"] = "false";
-            }
-        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_REASONING_PRESERVE"));
    add_opt(common_arg(
        {"--chat-template"}, "JINJA_TEMPLATE",
        string_format(
@@ -3447,13 +3333,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            common_log_set_file(common_log_main(), value.c_str());
        }
    ).set_env("LLAMA_ARG_LOG_FILE"));
-    add_opt(common_arg(
-        {"--log-prompts-dir"}, "PATH",
-        "Log prompts to directory (only used for debugging, default: disabled)",
-        [](common_params & params, const std::string & value) {
-            params.path_prompts_log_dir = value;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
    add_opt(common_arg(
        {"--log-colors"}, "[on|off|auto]",
        "Set colored logging ('on', 'off', or 'auto', default: 'auto')\n"
@@ -3485,7 +3364,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params) {
            params.offline = true;
        }
-    ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_DOWNLOAD}).set_env("LLAMA_ARG_OFFLINE"));
+    ).set_env("LLAMA_ARG_OFFLINE"));
    add_opt(common_arg(
        {"-lv", "--verbosity", "--log-verbosity"}, "N",
        string_format("Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:\n"
@@ -3762,7 +3641,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        "draft model for speculative decoding (default: unused)",
        [](common_params & params, const std::string & value) {
            params.speculative.draft.mparams.path = value;
-            params.speculative.draft.mparams.hf_file = value; // will be used if --spec-draft-hf is set
        }
    ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_DRAFT_MODEL"));
    add_opt(common_arg(
@@ -1,14 +1,12 @@
 #pragma once

 #include "common.h"
-#include "download.h"

 #include <set>
 #include <map>
 #include <string>
 #include <vector>
 #include <cstring>
-#include <memory>

 // pseudo-env variable to identify preset-only arguments
 #define COMMON_ARG_PRESET_LOAD_ON_STARTUP "__PRESET_LOAD_ON_STARTUP"
@@ -131,21 +129,11 @@ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<com
 // see: https://github.com/ggml-org/llama.cpp/issues/18163
 void common_params_add_preset_options(std::vector<common_arg> & args);

-struct common_models_handler {
-    common_download_hf_plan plan;
-    common_download_hf_plan plan_spec;
-    common_download_hf_plan plan_voc;
-    common_download_opts opts;
-};
-
-// initialize downloading opts and hf_plan if needed, but does not download anything yet
-common_models_handler common_models_handler_init(const common_params & params, llama_example curr_ex);
-
-// check if the model is a preset repo (i.e. has a preset file)
-bool common_models_handler_is_preset_repo(const common_models_handler & handler);
-
-// download and update params with the downloaded model path
-void common_models_handler_apply(common_models_handler & handler, common_params & params, common_download_callback * callback = nullptr);
+// populate model paths (main model, mmproj, etc) from -hf if necessary
+// return true if the model is ready to use
+// throw an exception if there is an error that prevents the model from being used (e.g. network error, model not found, etc)
+// if params.skip_download is true, no downloads will be attempted. return false if the model is invalid or missing (e.g. ETag check failed)
+bool common_params_handle_models(common_params & params, llama_example curr_ex);

 // initialize argument parser context - used by test-arg-parser and preset
 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
@@ -103,10 +103,6 @@ common_chat_params peg_generator::generate_parser(const common_chat_template &
            data.grammar_triggers = {
                { COMMON_GRAMMAR_TRIGGER_TYPE_WORD, trigger_marker }
            };
-            if (autoparser.tools.format.openai_wrapper_trigger) {
-                // model emits the OpenAI function wrapper, trigger on it
-                data.grammar_triggers.push_back({ COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "{\"type\": \"function\"," });
-            }
        }
    }

@@ -138,7 +134,7 @@ common_peg_arena autoparser::build_parser(const generation_params & inputs, cons
            auto response_format = p.rule("response-format", p.content(p.schema(p.json(), "response-format-schema", inputs.json_schema)));
            parser = ctx.reasoning_parser + p.space() + p.choice({
                p.literal("```json") + p.space() + response_format + p.space() + p.literal("```"),
-                p.space() + response_format  + p.space()
+                response_format
            }) + p.end();
            pure_content = false;
        } else if (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && jinja_caps.supports_tool_calls) {
@@ -228,13 +224,13 @@ common_peg_parser analyze_tools::build_tool_parser_json_native(parser_build_cont
        auto single_tool_parser = p.standard_json_tools(
            format.per_call_start, format.per_call_end, inputs.tools, inputs.parallel_tool_calls,
            inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED, name_field, args_field, format.tools_array_wrapped,
-            format.fun_name_is_key, format.id_field, format.gen_id_field, format.parameter_order, format.openai_wrapper_trigger);
+            format.fun_name_is_key, format.id_field, format.gen_id_field, format.parameter_order);
        tools_parser = p.trigger_rule("tool-calls", p.one_or_more(single_tool_parser + p.space()));
    } else {
        tools_parser = p.standard_json_tools(
            format.section_start, format.section_end, inputs.tools, inputs.parallel_tool_calls,
            inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED, name_field, args_field, format.tools_array_wrapped,
-            format.fun_name_is_key, format.id_field, format.gen_id_field, format.parameter_order, format.openai_wrapper_trigger);
+            format.fun_name_is_key, format.id_field, format.gen_id_field, format.parameter_order);
    }

    // Handle content wrappers if present
@@ -395,11 +391,11 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte
                                           arguments.name_suffix) +
                           arguments.value_prefix +
                           (schema_info.resolves_to_string(param_schema) ?
-                                p.ac(p.tool_arg_string_value(until_suffix) +
-                                    p.tool_arg_close(p.literal(arguments.value_suffix)), arguments.value_suffix) :
-                                (p.tool_arg_json_value(p.schema(
+                                p.tool_arg_string_value(until_suffix) :
+                                p.tool_arg_json_value(p.schema(
                                    p.json(), "tool-" + name + "-arg-" + param_name + "-schema", param_schema, false)) +
-                                    p.tool_arg_close(p.literal(arguments.value_suffix)))));
+                                    p.space()) +
+                           p.tool_arg_close(p.literal(arguments.value_suffix)));

            auto named_arg = p.rule("tool-" + name + "-arg-" + param_name, arg);
            if (is_required) {
@@ -181,7 +181,6 @@ struct tool_format_analysis {

    bool fun_name_is_key = false;       // In JSON format function name is JSON key, i.e. { "<funname>": { ... arguments ... } }
    bool tools_array_wrapped = false;   // Tool calls wrapped in JSON array [...]
-    bool openai_wrapper_trigger = false;  // model emits the OpenAI function wrapper, trigger on it

    std::string              function_field = "function";
    std::string              name_field     = "name";
@@ -165,14 +165,6 @@ static std::vector<std::function<void(const common_chat_template & tmpl, autopar
              LOG_DBG(ANSI_ORANGE "[Patch: Apriel 1.6]\n" ANSI_RESET);
          }
      },
-      // template uses the JSON {name, parameters} tool instruction, emits the OpenAI function wrapper
-      [](const common_chat_template & tmpl, autoparser & analysis) -> void {
-          if (tmpl.src.find("Respond in the format {\"name\": function name") != std::string::npos &&
-              tmpl.src.find("Do not use variables.") != std::string::npos) {
-              analysis.tools.format.openai_wrapper_trigger = true;
-              LOG_DBG(ANSI_ORANGE "[Patch: JSON name/parameters tool instruction]\n" ANSI_RESET);
-          }
-      },

    });

@@ -1237,8 +1229,8 @@ void analyze_tools::extract_argument_name_markers() {
            left_result.tags["pre"] == right_result.tags["pre"] &&
            left_result.tags["suffix"] == right_result.tags["suffix"]) {
            // Name is inside a structure (e.g., JSON key): prefix is the shared wrapper
-            arguments.name_prefix = left_result.tags["pre"];
-            arguments.name_suffix = left_result.tags["suffix"];
+            arguments.name_prefix = trim_whitespace(left_result.tags["pre"]);
+            arguments.name_suffix = trim_leading_whitespace(left_result.tags["suffix"]);
        } else if (diff.left.substr(0, ARG_FIRST.length()) == ARG_FIRST && diff.right.substr(0, ARG_SECOND.length()) == ARG_SECOND) {
            // Name is directly in the diff: prefix comes from last marker in diff.prefix
            auto pre_parser = build_tagged_peg_parser([&](common_peg_parser_builder & p) {
@@ -1323,7 +1315,8 @@ void analyze_tools::extract_argument_value_markers() {
                value_suffix = value_suffix.substr(0, end_marker_pos);
            }
        }
-        if (!trim_whitespace(value_suffix).empty()) {
+        value_suffix = trim_leading_whitespace(value_suffix);
+        if (!value_suffix.empty()) {
            arguments.value_suffix = value_suffix;
        }
    }
@@ -87,8 +87,6 @@ static std::string normalize_quotes_to_json(const std::string & input) {
    bool in_single_quoted = false;
    bool in_double_quoted = false;

-    auto is_word_char = [](char ch) { return std::isalnum(static_cast<unsigned char>(ch)) || ch == '_'; };
-
    for (size_t i = 0; i < input.size(); ++i) {
        char c = input[i];

@@ -153,29 +151,6 @@ static std::string normalize_quotes_to_json(const std::string & input) {
                in_single_quoted = true;
                result += '"';
            }
-        } else if (!in_single_quoted && !in_double_quoted && (c == 'T' || c == 'F' || c == 'N') &&
-                   (i == 0 || !is_word_char(input[i - 1]))) {
-            // Python literals -> JSON; prefix match keeps streamed partials monotonic.
-            static constexpr std::pair<std::string_view, std::string_view> literals[] = {
-                { "True", "true" }, { "False", "false" }, { "None", "null" },
-            };
-            size_t n = 0;
-            while (i + n < input.size() && is_word_char(input[i + n])) {
-                ++n;
-            }
-            std::string_view token(input.data() + i, n);
-            bool matched = false;
-            for (const auto & [py, js] : literals) {
-                if (py.substr(0, n) == token) {
-                    result += js.substr(0, n);
-                    i += n - 1;
-                    matched = true;
-                    break;
-                }
-            }
-            if (!matched) {
-                result += c;
-            }
        } else {
            result += c;
        }
@@ -363,7 +338,7 @@ void common_chat_peg_mapper::map(const common_peg_ast_node & node) {
    }

    if ((is_arg_value || is_arg_string_value) && current_tool) {
-        std::string value_content = std::string(node.text);
+        std::string value_content = std::string(trim_trailing_space(trim_leading_space(node.text, 1), 1));

        std::string value_to_add;
        if (value_content.empty() && is_arg_string_value) {
@@ -378,8 +353,12 @@ void common_chat_peg_mapper::map(const common_peg_ast_node & node) {
            }
            value_to_add += escape_json_string_inner(value_content);
        } else if (!value_content.empty()) {
-            // Pythonic scalars/containers -> JSON.
-            value_to_add += normalize_container_value(value_content);
+            // For potential containers, normalize Python-style single quotes to JSON double quotes
+            bool is_potential_container = value_content[0] == '[' || value_content[0] == '{';
+            if (is_potential_container) {
+                value_content = normalize_container_value(value_content);
+            }
+            value_to_add += value_content;
        }

        args_target() += value_to_add;
@@ -487,34 +466,11 @@ common_peg_parser common_chat_peg_builder::standard_constructed_tools(
    return force_tool_calls ? section : optional(section);
 }

-// Like python_value(), but the leaf also accepts JSON-cased true/false/null, used by LFM2/LFM2.5
-common_peg_parser common_chat_peg_builder::python_or_json_value() {
-    return rule("python-or-json-value", [this]() {
-        auto ws    = space();
-        auto value = python_or_json_value();
-
-        auto member  = sequence({ python_string(), ws, literal(":"), ws, value });
-        auto members = sequence({ member, zero_or_more(sequence({ ws, literal(","), ws, member })) });
-        auto dict    = rule("python-or-json-dict", [&]() {
-            return sequence({ literal("{"), ws, choice({ literal("}"), sequence({ members, ws, literal("}") }) }), ws });
-        });
-
-        auto elements = sequence({ value, zero_or_more(sequence({ literal(","), ws, value })) });
-        auto array    = rule("python-or-json-array", [&]() {
-            return sequence({ literal("["), ws, choice({ literal("]"), sequence({ elements, ws, literal("]") }) }), ws });
-        });
-
-        return choice({ dict, array, python_string(), python_number(),
-                        python_bool(), python_null(), json_bool(), json_null() });
-    });
-}
-
 // Python-style tool calls: name(arg1="value1", arg2=123)
 // Used only by LFM2 for now, so we don't merge it into autoparser
 common_peg_parser common_chat_peg_builder::python_style_tool_calls(
    const ordered_json & tools,
-    bool                 parallel_tool_calls,
-    bool                 allow_json_literals) {
+    bool                 parallel_tool_calls) {
    if (!tools.is_array() || tools.empty()) {
        return eps();
    }
@@ -540,16 +496,15 @@ common_peg_parser common_chat_peg_builder::python_style_tool_calls(
                auto arg_name_parser = literal(prop_name);

                common_peg_parser arg_value_parser = eps();
-                // Quoted literal as a value: normalize_quotes_to_json preserves escapes.
-                auto string_value_parser = tool_arg_value(choice({
-                    literal("\"") + string_content('"') + literal("\""),
-                    literal("'") + string_content('\'') + literal("'")
-                }));
+                auto string_value_parser = choice({
+                    literal("\"") + tool_arg_string_value(string_content('"')) + literal("\""),
+                    literal("'") + tool_arg_string_value(string_content('\'')) + literal("'")
+                });

                if (is_string_type) {
                    arg_value_parser = string_value_parser;
                } else {
-                    arg_value_parser = tool_arg_value(allow_json_literals ? python_or_json_value() : python_value());
+                    arg_value_parser = tool_arg_value(python_value());
                }

                // Full argument: name="value" or name=value
@@ -746,8 +701,7 @@ common_peg_parser common_chat_peg_builder::build_json_tools_flat_keys(
    const std::string &              effective_args_key,
    const std::string &              call_id_key,
    const std::string &              gen_call_id_key,
-    const std::vector<std::string> & parameters_order,
-    bool                             accept_openai_wrapper) {
+    const std::vector<std::string> & parameters_order) {

    auto tool_choices    = choice();
    auto name_key_parser = literal("\"" + effective_name_key + "\"");
@@ -809,13 +763,7 @@ common_peg_parser common_chat_peg_builder::build_json_tools_flat_keys(
                return idx_a < idx_b;
            });

-        // accept an optional leading "type": "function" field when the model emits the OpenAI wrapper
-        common_peg_parser type_field = eps();
-        if (accept_openai_wrapper) {
-            type_field = optional(literal("\"type\"") + space() + literal(":") + space() +
-                                  literal("\"function\"") + space() + literal(",") + space());
-        }
-        auto ordered_body = tool_open(literal("{")) + space() + type_field;
+        auto ordered_body = tool_open(literal("{")) + space();
        for (size_t i = 0; i < parser_pairs.size(); i++) {
            ordered_body = ordered_body + parser_pairs[i].first;
            if (i < parser_pairs.size() - 1) {
@@ -878,8 +826,7 @@ common_peg_parser common_chat_peg_builder::standard_json_tools(
                                                       bool                             function_is_key,
                                                       const std::string &              call_id_key,
                                                       const std::string &              gen_call_id_key,
-                                                       const std::vector<std::string> & parameters_order,
-                                                       bool                             accept_openai_wrapper) {
+                                                       const std::vector<std::string> & parameters_order) {
    if (!tools.is_array() || tools.empty()) {
        return eps();
    }
@@ -897,7 +844,7 @@ common_peg_parser common_chat_peg_builder::standard_json_tools(
        if (!name_spec.first.empty() || !args_spec.first.empty()) {
            tool_choices = build_json_tools_nested_keys(tools, effective_name_key, effective_args_key, call_id_key, gen_call_id_key);
        } else {
-            tool_choices = build_json_tools_flat_keys(tools, effective_name_key, effective_args_key, call_id_key, gen_call_id_key, parameters_order, accept_openai_wrapper);
+            tool_choices = build_json_tools_flat_keys(tools, effective_name_key, effective_args_key, call_id_key, gen_call_id_key, parameters_order);
        }
    }

@@ -120,8 +120,7 @@ class common_chat_peg_builder : public common_peg_parser_builder {
                                          bool                             function_is_key = false,
                                          const std::string &              call_id_key = "",
                                          const std::string &              gen_call_id_key = "",
-                                          const std::vector<std::string> & parameters_order = {},
-                                          bool                             accept_openai_wrapper = false);
+                                          const std::vector<std::string> & parameters_order = {});

    // Legacy-compatible helper for building XML/tagged style tool calls
    // Used by tests and manual parsers
@@ -133,13 +132,9 @@ class common_chat_peg_builder : public common_peg_parser_builder {
    // Helper for Python-style function call format: name(arg1="value1", arg2=123)
    // Used by LFM2 and similar templates
    common_peg_parser python_style_tool_calls(const nlohmann::ordered_json & tools,
-                                              bool                           parallel_tool_calls,
-                                              bool                           allow_json_literals);
+                                              bool                           parallel_tool_calls);

  private:
-    // Python values plus JSON true/false/null.
-    common_peg_parser python_or_json_value();
-
    // Implementation helpers for standard_json_tools — one per JSON tool call layout mode
    common_peg_parser build_json_tools_function_is_key(const nlohmann::ordered_json & tools,
                                                       const std::string &            args_key,
@@ -158,8 +153,7 @@ class common_chat_peg_builder : public common_peg_parser_builder {
                                                 const std::string &              effective_args_key,
                                                 const std::string &              call_id_key,
                                                 const std::string &              gen_call_id_key,
-                                                 const std::vector<std::string> & parameters_order,
-                                                 bool                             accept_openai_wrapper);
+                                                 const std::vector<std::string> & parameters_order);
 };

 inline common_peg_arena build_chat_peg_parser(
@@ -201,3 +195,4 @@ struct tagged_peg_parser {

 tagged_peg_parser build_tagged_peg_parser(
    const std::function<common_peg_parser(common_peg_parser_builder & builder)> & fn);
+
@@ -90,93 +90,41 @@ std::string common_chat_msg::render_content(const std::string & delimiter) const
    return text;
 }

-common_chat_role common_chat_role_from_string(const std::string & role) {
-    if (role == "system")    { return COMMON_CHAT_ROLE_SYSTEM;    }
-    if (role == "assistant") { return COMMON_CHAT_ROLE_ASSISTANT; }
-    if (role == "user")      { return COMMON_CHAT_ROLE_USER;      }
-    if (role == "tool")      { return COMMON_CHAT_ROLE_TOOL;      }
-    return COMMON_CHAT_ROLE_UNKNOWN;
-}
-
-const char * common_chat_role_to_string(common_chat_role role) {
-    switch (role) {
-        case COMMON_CHAT_ROLE_SYSTEM:    return "system";
-        case COMMON_CHAT_ROLE_ASSISTANT: return "assistant";
-        case COMMON_CHAT_ROLE_USER:      return "user";
-        case COMMON_CHAT_ROLE_TOOL:      return "tool";
-        case COMMON_CHAT_ROLE_UNKNOWN:   return "";
-    }
-    return "";
-}
-
-json common_chat_msg_delimiters::to_json() const {
-    json result = json::array();
-    for (const auto & d : delimiters) {
-        result.push_back({
-            { "role",      common_chat_role_to_string(d.role) },
-            { "delimiter", d.delimiter                        },
-        });
-    }
-    return result;
-}
-
-common_chat_msg_delimiters common_chat_msg_delimiters_parse(const json & delimiters) {
-    common_chat_msg_delimiters result;
-
-    if (!delimiters.is_array()) {
-        return result;
+std::vector<common_chat_msg_span> common_chat_split_by_role(const std::string & prompt, const std::vector<common_chat_msg_delimiter> & delims) { // split `prompt` into role-tagged spans, one per delimiter-introduced message
+    if (delims.empty() || prompt.empty()) { // nothing to split: return no spans
+        return {};
    }

-    result.delimiters.reserve(delimiters.size());
-    for (const auto & d : delimiters) {
-        if (!d.is_object()) {
-            continue;
+    auto parser = build_peg_parser([&](common_peg_parser_builder & p) { // parser is built per call from the given delimiters
+        std::vector<std::string>       all_delims;
+        std::vector<common_peg_parser> tagged_messages;
+
+        all_delims.reserve(delims.size());
+        tagged_messages.reserve(delims.size());
+        for (const auto & d : delims) {
+            all_delims.push_back(d.delimiter); // collect every role's delimiter string for until_one_of() below
        }
-        result.delimiters.push_back({
-            common_chat_role_from_string(d.value("role", std::string())),
-            d.value("delimiter", std::string()),
-        });
-    }

-    return result;
-}
-
-void common_chat_msg_delimiters::tokenize(const llama_vocab * vocab) {
-    for (auto & d : delimiters) {
-        d.tokens = common_tokenize(vocab, d.delimiter, false, true);
-    }
-}
-
-common_chat_msg_spans common_chat_msg_delimiters::split(const llama_tokens & tokens, const std::map<size_t, size_t> & skips) const {
-    std::vector<std::pair<common_chat_role, size_t>> matches;
-
-    auto skip = skips.begin();
-    for (size_t i = 0; i < tokens.size();) {
-        if (skip != skips.end() && i == skip->first) {
-            i += skip->second;
-            ++skip;
-            continue;
+        auto any_delim = p.until_one_of(all_delims); // presumably: text up to the next delimiter of any role - confirm until_one_of() semantics
+        for (const auto & d : delims) {
+            tagged_messages.push_back(p.tag(d.role, p.literal(d.delimiter) + any_delim)); // one message = its delimiter + following text, tagged with d.role
        }
-        for (const auto & d : delimiters) {
-            if (i + d.tokens.size() > tokens.size()) {
-                continue;
-            }
-            if (std::equal(d.tokens.begin(), d.tokens.end(), tokens.begin() + i)) {
-                matches.emplace_back(d.role, i);
-                break;
-            }
+
+        return any_delim + p.zero_or_more(p.choice(tagged_messages)) + p.end(); // the leading any_delim match carries no tag, so it produces no span
+    });
+
+    common_peg_parse_context ctx(prompt);
+    const auto result = parser.parse(ctx);
+    if (!result.success()) { // a failed parse yields no spans rather than partial ones
+        return {};
+    }
+
+    std::vector<common_chat_msg_span> spans;
+    ctx.ast.visit(result, [&](const common_peg_ast_node & node) {
+        if (!node.tag.empty()) { // only tagged (role) nodes become spans
+            spans.push_back({ node.tag, node.start, node.end - node.start }); // tag, start, size (end - start)
        }
-        i++;
-    }
-
-    matches.emplace_back(COMMON_CHAT_ROLE_UNKNOWN, tokens.size());
-
-    common_chat_msg_spans spans;
-    for (size_t i = 0; i + 1 < matches.size(); i++) {
-        const auto & curr = matches[i];
-        const auto & next = matches[i + 1];
-        spans.add(curr.first, curr.second, next.second - curr.second);
-    }
+    });

    return spans;
 }
@@ -912,10 +860,6 @@ static std::string common_chat_template_direct_apply_impl(
    if (inputs.add_generation_prompt) {
        inp["add_generation_prompt"] = true;
    }
-    if (inp.contains("preserve_reasoning") && inp["preserve_reasoning"].is_boolean()) {
-        bool enabled = inp["preserve_reasoning"].get<bool>();
-        jinja::caps_apply_preserve_reasoning(ctx, enabled);
-    }

    jinja::global_from_json(ctx, inp, inputs.mark_input);

@@ -1137,13 +1081,13 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp

    data.prompt            = prompt;
    data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs, /* messages_override= */ adjusted_messages);
-    data.message_delimiters = {
-        { COMMON_CHAT_ROLE_ASSISTANT, "<|start|>assistant" },
-        { COMMON_CHAT_ROLE_USER,      "<|start|>user"      },
-        { COMMON_CHAT_ROLE_SYSTEM,    "<|start|>developer" },
-        { COMMON_CHAT_ROLE_SYSTEM,    "<|start|>system"    },
-        { COMMON_CHAT_ROLE_TOOL,      "<|start|>functions" },
-    };
+    data.message_spans = common_chat_split_by_role(prompt, {
+        { "assistant", "<|start|>assistant" },
+        { "user",      "<|start|>user"      },
+        { "system",    "<|start|>developer" },
+        { "system",    "<|start|>system"    },
+        { "tool",      "<|start|>functions" },
+    });

    data.format            = COMMON_CHAT_FORMAT_PEG_NATIVE;
    data.supports_thinking = true;
@@ -1284,10 +1228,10 @@ static common_chat_params common_chat_params_init_gemma4(const common_chat_templ
        data.prompt += data.generation_prompt;
    }

-    data.message_delimiters = {
-        { COMMON_CHAT_ROLE_USER,      "<|turn>user"  },
-        { COMMON_CHAT_ROLE_ASSISTANT, "<|turn>model" },
-    };
+    data.message_spans = common_chat_split_by_role(data.prompt, {
+        { "user",      "<|turn>user\n"  },
+        { "assistant", "<|turn>model\n" },
+    });

    data.format            = COMMON_CHAT_FORMAT_PEG_GEMMA4;
    data.supports_thinking  = true;
@@ -1664,52 +1608,42 @@ static common_chat_params common_chat_params_init_kimi_k2(const common_chat_temp
    return data;
 }

-// LFM2/LFM2.5 parser. Tool calls are almost Python-style and parallel-capable
-// (except dotted names and JSON literals true/false/null).
-// Always wrapped in <|tool_call_start|>[name(args)]<|tool_call_end|> with optional <think> reasoning.
-// tool_list_tokens preserves LFM2 system tool-list markers.
-static common_chat_params common_chat_params_init_lfm2(const common_chat_template &          tmpl,
-                                                       const autoparser::generation_params & inputs,
-                                                       bool tool_list_tokens) {
+// LFM2 format: uses <|tool_list_start|>[...]<|tool_list_end|> in system prompt
+// and <|tool_call_start|>[name(arg="val")]<|tool_call_end|> for tool calls.
+// - Reasoning: <think>{reasoning}</think> (optional)
+// - Content: text before a tool call (optional)
+// - Tool calls: Python-style, e.g. [function_name(arg1="value1", arg2="value2")]
+//   Tool calls can appear multiple times (parallel tool calls supported)
+static common_chat_params common_chat_params_init_lfm2(const common_chat_template &    tmpl,
+                                                       const autoparser::generation_params & inputs) {
    common_chat_params data;

+    data.prompt            = common_chat_template_direct_apply_impl(tmpl, inputs);
+    data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs);
+    data.format            = COMMON_CHAT_FORMAT_PEG_NATIVE;
+    data.supports_thinking = true;
+    data.preserved_tokens  = {
+        "<|tool_list_start|>",
+        "<|tool_list_end|>",
+        "<|tool_call_start|>",
+        "<|tool_call_end|>",
+        "<think>",
+        "</think>",
+    };
+
+    auto has_tools         = inputs.tools.is_array() && !inputs.tools.empty();
+    auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
+    auto include_grammar   = has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE;
+
    const std::string TOOL_CALL_START = "<|tool_call_start|>";
    const std::string TOOL_CALL_END   = "<|tool_call_end|>";
-    const std::string TOOL_LIST_START = "<|tool_list_start|>";
-    const std::string TOOL_LIST_END   = "<|tool_list_end|>";
    const std::string THINK_START     = "<think>";
    const std::string THINK_END       = "</think>";
    const std::string GEN_PROMPT      = "<|im_start|>assistant\n";

-    // Copy reasoning to the "thinking" field the template expects
-    auto adjusted_messages = json::array();
-    for (auto msg : inputs.messages) {
-        if (msg.contains("reasoning_content") && msg.at("reasoning_content").is_string()) {
-            msg["thinking"] = msg.at("reasoning_content");
-        }
-        adjusted_messages.push_back(msg);
-    }
-
-    data.prompt            = common_chat_template_direct_apply_impl(tmpl, inputs, adjusted_messages);
-    data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs, adjusted_messages);
-    data.format            = COMMON_CHAT_FORMAT_PEG_NATIVE;
-    data.supports_thinking = true;
-    data.preserved_tokens  = { TOOL_CALL_START, TOOL_CALL_END, THINK_START, THINK_END };
-    if (tool_list_tokens) {
-        data.preserved_tokens.push_back(TOOL_LIST_START);
-        data.preserved_tokens.push_back(TOOL_LIST_END);
-    }
-
    data.thinking_start_tag = THINK_START;
    data.thinking_end_tag   = THINK_END;

-    auto has_tools           = inputs.tools.is_array() && !inputs.tools.empty();
-    auto has_response_format = !inputs.json_schema.is_null() && inputs.json_schema.is_object();
-    // Gate by reasoning format and whether the template supports <think>
-    auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE &&
-                             tmpl.source().find(THINK_START) != std::string::npos;
-    auto include_grammar   = has_response_format || (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE);
-
    if (inputs.has_continuation()) {
        const auto & msg = inputs.continue_msg;

@@ -1726,21 +1660,17 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat
        auto end = p.end();

        auto reasoning = p.eps();
-        if (extract_reasoning) {
+        if (extract_reasoning && inputs.enable_thinking) {
            reasoning = p.optional(THINK_START + p.reasoning(p.until(THINK_END)) + THINK_END);
        }

        if (!has_tools || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
-            if (has_response_format) {
-                auto response_format = p.content(p.schema(p.json(), "response-format-schema", inputs.json_schema));
-                return generation_prompt + reasoning + response_format + end;
-            }
            return generation_prompt + reasoning + p.content(p.rest()) + end;
        }
        auto tool_calls = p.rule("tool-calls",
            p.trigger_rule("tool-call",
                p.literal(TOOL_CALL_START) +
-                p.python_style_tool_calls(inputs.tools, inputs.parallel_tool_calls, /* allow_json_literals = */ true) +
+                p.python_style_tool_calls(inputs.tools, inputs.parallel_tool_calls) +
                p.literal(TOOL_CALL_END)
            )
        );
@@ -1753,17 +1683,13 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat
    data.parser = parser.save();

    if (include_grammar) {
-        data.grammar_lazy = !(has_response_format || (has_tools && inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED));
+        data.grammar_lazy = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO;
        data.grammar      = build_grammar([&](const common_grammar_builder & builder) {
            foreach_function(inputs.tools, [&](const json & tool) {
                const auto & function = tool.at("function");
                auto         schema   = function.at("parameters");
                builder.resolve_refs(schema);
            });
-            if (has_response_format) {
-                auto schema = inputs.json_schema;
-                builder.resolve_refs(schema);
-            }
            parser.build_grammar(builder, data.grammar_lazy);
        });

@@ -1771,6 +1697,93 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat
            { COMMON_GRAMMAR_TRIGGER_TYPE_WORD, TOOL_CALL_START }
        };
    }
+    return data;
+}
+
+// LFM2.5 format: uses plain "List of tools: [...]" in the system prompt, with no wrapper tokens.
+// Tool calls are bare [name(arg="val")], though the model may optionally emit <|tool_call_start|> first.
+// - Reasoning: <think>{reasoning}</think> (optional; parsed only when reasoning extraction and thinking are both enabled)
+// - Content: text before a tool call (optional; with tools enabled it ends at the first "<|tool_call_start|>" or "[")
+// - Tool calls: Python-style, e.g. [function_name(arg1="value1", arg2="value2")]
+//   Tool calls can appear multiple times (parallel tool calls supported)
+static common_chat_params common_chat_params_init_lfm2_5(const common_chat_template &    tmpl,
+                                                         const autoparser::generation_params & inputs) {
+    common_chat_params data;
+
+    data.prompt            = common_chat_template_direct_apply_impl(tmpl, inputs);
+    data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs);
+    data.format            = COMMON_CHAT_FORMAT_PEG_NATIVE;
+    data.supports_thinking = true;
+    data.preserved_tokens  = {
+        "<|tool_call_start|>",
+        "<|tool_call_end|>",
+        "<think>",
+        "</think>",
+    };
+
+    auto has_tools         = inputs.tools.is_array() && !inputs.tools.empty();
+    auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
+    auto include_grammar   = has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE; // grammar is built only when a tool call is possible
+
+    const std::string THINK_START     = "<think>";
+    const std::string THINK_END       = "</think>";
+    const std::string GEN_PROMPT      = "<|im_start|>assistant\n";
+
+    data.thinking_start_tag = THINK_START;
+    data.thinking_end_tag   = THINK_END;
+
+    if (inputs.has_continuation()) { // continuing a partial assistant message: rebuild the generation prompt by hand
+        const auto & msg = inputs.continue_msg;
+
+        data.generation_prompt = GEN_PROMPT + THINK_START + msg.reasoning_content; // replaces the template-derived generation prompt set above
+        if (inputs.continue_final_message == COMMON_CHAT_CONTINUATION_CONTENT) { // continuing content: close the think block, then append the rendered content
+            data.generation_prompt += THINK_END + msg.render_content();
+        }
+
+        data.prompt += data.generation_prompt;
+    }
+
+    auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
+        auto generation_prompt = p.literal(GEN_PROMPT);
+        auto end = p.end();
+
+        auto reasoning = p.eps();
+        if (extract_reasoning && inputs.enable_thinking) { // otherwise `reasoning` stays eps() and no <think> block is parsed here
+            reasoning = p.optional(THINK_START + p.reasoning(p.until(THINK_END)) + THINK_END);
+        }
+
+        if (!has_tools || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) { // no tool calls possible: everything after reasoning is content
+            return generation_prompt + reasoning + p.content(p.rest()) + end;
+        }
+
+        auto tool_calls = p.rule("tool-calls",
+            p.trigger_rule("tool-call",
+                p.python_style_tool_calls(inputs.tools, inputs.parallel_tool_calls)
+            )
+        );
+
+        auto content = p.content(p.until_one_of({"<|tool_call_start|>", "["})); // stop set is the optional wrapper token or a "["
+        auto maybe_start = p.optional(p.literal("<|tool_call_start|>")); // wrapper token is accepted but not required
+        return generation_prompt + reasoning + content + maybe_start + tool_calls + end;
+    });
+
+    data.parser = parser.save();
+
+    if (include_grammar) {
+        data.grammar_lazy = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO; // lazy only when tool_choice is AUTO
+        data.grammar      = build_grammar([&](const common_grammar_builder & builder) {
+            foreach_function(inputs.tools, [&](const json & tool) {
+                const auto & function = tool.at("function");
+                auto         schema   = function.at("parameters"); // by-value copy; resolve_refs() is given the copy, not inputs.tools
+                builder.resolve_refs(schema);
+            });
+            parser.build_grammar(builder, data.grammar_lazy);
+        });
+        foreach_function(inputs.tools, [&](const json & tool) {
+            const std::string name = tool.at("function").at("name");
+            data.grammar_triggers.push_back({ COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "[" + name + "(" }); // one word trigger per tool: "[<name>("
+        });
+    }

    return data;
 }
@@ -2035,146 +2048,6 @@ static common_chat_params common_chat_params_init_deepseek_v3_2(const common_cha
    return data;
 }

-// Cohere2 MoE (a.k.a. "North Code") parser.
-//
-// The assistant turn is fully marker-wrapped:
-//   <|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>
-//     <|START_THINKING|>{reasoning}<|END_THINKING|>
-//     then EITHER content:    <|START_TEXT|>{content}<|END_TEXT|>
-//          OR     tool calls: <|START_ACTION|>[
-//                                 {"tool_call_id": "0", "tool_name": "f", "parameters": {...}}, ...
-//                             ]<|END_ACTION|>
-//   <|END_OF_TURN_TOKEN|>
-//
-// The generation prompt forces a leading <|START_THINKING|> (when reasoning is enabled, which is
-// the template default), so the model's output continues from *inside* the thinking block. The
-// parser literal therefore only covers the stable <|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|> prefix
-// and the reasoning rule consumes the <|START_THINKING|> ... <|END_THINKING|> markers itself,
-// regardless of whether they came from the generation prompt or the generated text.
-static common_chat_params common_chat_params_init_cohere2moe(const common_chat_template &          tmpl,
-                                                              const autoparser::generation_params & inputs) {
-    common_chat_params data;
-
-    const std::string TURN_START    = "<|START_OF_TURN_TOKEN|>";
-    const std::string TURN_END      = "<|END_OF_TURN_TOKEN|>";
-    const std::string CHATBOT       = "<|CHATBOT_TOKEN|>";
-    const std::string USER          = "<|USER_TOKEN|>";
-    const std::string SYSTEM        = "<|SYSTEM_TOKEN|>";
-    const std::string THINK_START   = "<|START_THINKING|>";
-    const std::string THINK_END     = "<|END_THINKING|>";
-    const std::string TEXT_START    = "<|START_TEXT|>";
-    const std::string TEXT_END      = "<|END_TEXT|>";
-    const std::string ACTION_START  = "<|START_ACTION|>";
-    const std::string ACTION_END    = "<|END_ACTION|>";
-    const std::string RESULT_START  = "<|START_TOOL_RESULT|>";
-    const std::string RESULT_END    = "<|END_TOOL_RESULT|>";
-
-    // Stable prefix of the generation prompt that precedes the (forced) <|START_THINKING|> marker.
-    const std::string GEN_PREFIX = TURN_START + CHATBOT;
-
-    data.prompt             = common_chat_template_direct_apply_impl(tmpl, inputs);
-    data.generation_prompt  = common_chat_template_generation_prompt_impl(tmpl, inputs);
-    data.format             = COMMON_CHAT_FORMAT_PEG_NATIVE;
-    data.supports_thinking  = true;
-    data.thinking_start_tag = THINK_START;
-    data.thinking_end_tag   = THINK_END;
-    data.preserved_tokens   = {
-        TURN_START, TURN_END, CHATBOT, USER, SYSTEM,
-        THINK_START, THINK_END,
-        TEXT_START, TEXT_END,
-        ACTION_START, ACTION_END,
-        RESULT_START, RESULT_END,
-    };
-
-    // Declare per-role message delimiters. Tool results are rendered with the
-    // system token followed by <|START_TOOL_RESULT|>, so the "tool" delimiter must be listed before
-    // the plain "system" one (it is a strict superset, and the role split tries delimiters in order).
-    data.message_delimiters = {
-        { COMMON_CHAT_ROLE_ASSISTANT, GEN_PREFIX },
-        { COMMON_CHAT_ROLE_USER,      TURN_START + USER },
-        { COMMON_CHAT_ROLE_TOOL,      TURN_START + SYSTEM + RESULT_START },
-        { COMMON_CHAT_ROLE_SYSTEM,    TURN_START + SYSTEM },
-    };
-
-    auto has_tools         = inputs.tools.is_array() && !inputs.tools.empty();
-    auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
-    auto include_grammar   = has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE;
-
-    if (inputs.has_continuation()) {
-        const auto & msg = inputs.continue_msg;
-
-        data.generation_prompt = GEN_PREFIX + THINK_START + msg.reasoning_content;
-        if (inputs.continue_final_message == COMMON_CHAT_CONTINUATION_CONTENT) {
-            data.generation_prompt += THINK_END + TEXT_START + msg.render_content();
-        }
-
-        data.prompt += data.generation_prompt;
-    }
-
-    auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
-        auto generation_prompt = p.literal(GEN_PREFIX);
-        auto end               = p.end();
-
-        // The thinking block is always present (the generation prompt forces <|START_THINKING|>).
-        // When extracting reasoning, capture its body; otherwise keep the whole block (markers
-        // included) inline as content, matching reasoning_format=NONE conventions.
-        common_peg_parser reasoning = p.eps();
-        if (extract_reasoning) {
-            reasoning = p.optional(p.literal(THINK_START) +
-                                   p.reasoning(p.until_one_of({ THINK_END, TEXT_START, ACTION_START })) +
-                                   p.optional(p.literal(THINK_END)));
-        } else {
-            reasoning = p.optional(p.content(p.literal(THINK_START) +
-                                             p.until_one_of({ THINK_END, TEXT_START, ACTION_START }) +
-                                             p.optional(p.literal(THINK_END))));
-        }
-
-        auto text_content = p.literal(TEXT_START) + p.content(p.until(TEXT_END)) + p.optional(p.literal(TEXT_END));
-
-        if (!has_tools || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
-            return generation_prompt + reasoning + text_content + p.optional(p.literal(TURN_END)) + end;
-        }
-
-        auto require_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;
-
-        // <|START_ACTION|>[ {"tool_call_id": "0", "tool_name": "f", "parameters": {...}}, ... ]<|END_ACTION|>
-        auto tool_calls = p.standard_json_tools(ACTION_START, ACTION_END, inputs.tools, inputs.parallel_tool_calls,
-                                                /* force_tool_calls = */ true,
-                                                /* name_key         = */ "tool_name",
-                                                /* args_key         = */ "parameters",
-                                                /* array_wrapped    = */ true,
-                                                /* function_is_key  = */ false,
-                                                /* call_id_key      = */ "",
-                                                /* gen_call_id_key  = */ "tool_call_id",
-                                                /* parameters_order = */ { "tool_call_id", "tool_name", "parameters" });
-
-        // Content and tool calls are mutually exclusive in this format.
-        common_peg_parser body = require_tools ? tool_calls : p.choice({ tool_calls, text_content });
-
-        return generation_prompt + reasoning + body + p.optional(p.literal(TURN_END)) + end;
-    });
-
-    data.parser = parser.save();
-
-    if (include_grammar) {
-        data.grammar_lazy = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO;
-        data.grammar      = build_grammar([&](const common_grammar_builder & builder) {
-            foreach_function(inputs.tools, [&](const json & tool) {
-                const auto & function = tool.at("function");
-                auto         schema   = function.at("parameters");
-                builder.resolve_refs(schema);
-            });
-            parser.build_grammar(builder, data.grammar_lazy);
-        });
-
-        data.grammar_triggers = {
-            { COMMON_GRAMMAR_TRIGGER_TYPE_WORD, ACTION_START }
-        };
-    }
-
-    return data;
-}
-
 namespace workaround {

 static void map_developer_role_to_system(json & messages) {
@@ -2380,149 +2253,6 @@ static void func_args_not_string(json & messages) {

 }

-// MiniCPM5 format:
-// - Reasoning: <think>{reasoning}</think> (optional)
-// - Tool calls: <function name="foo"><param name="bar">value</param></function>
-static common_chat_params common_chat_params_init_minicpm5(const common_chat_template &          tmpl,
-                                                           const autoparser::generation_params & inputs) {
-    common_chat_params data;
-
-    data.prompt            = common_chat_template_direct_apply_impl(tmpl, inputs);
-    data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs);
-    data.format            = COMMON_CHAT_FORMAT_PEG_NATIVE;
-    data.supports_thinking = true;
-    data.preserved_tokens  = {
-        "<function",
-        "<param",
-        "</function>",
-        "</param>",
-        "<think>",
-        "</think>",
-    };
-
-    data.thinking_start_tag = "<think>";
-    data.thinking_end_tag   = "</think>";
-
-    data.message_delimiters = {
-        { COMMON_CHAT_ROLE_ASSISTANT, "<|im_start|>assistant"             },
-        { COMMON_CHAT_ROLE_TOOL,      "<|im_start|>user\n<tool_response>" },
-        { COMMON_CHAT_ROLE_USER,      "<|im_start|>user"                  },
-        { COMMON_CHAT_ROLE_SYSTEM,    "<|im_start|>system"                },
-    };
-
-    auto has_tools           = inputs.tools.is_array() && !inputs.tools.empty();
-    auto has_response_format = inputs.json_schema.is_object() && !inputs.json_schema.empty();
-    auto extract_reasoning   = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
-    auto include_grammar     = has_response_format || (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE);
-
-    if (inputs.has_continuation()) {
-        const auto & msg = inputs.continue_msg;
-
-        data.generation_prompt = "<|im_start|>assistant\n<think>\n" + msg.reasoning_content;
-        if (inputs.continue_final_message == COMMON_CHAT_CONTINUATION_CONTENT) {
-            data.generation_prompt += "\n</think>\n\n" + msg.render_content();
-        }
-
-        data.prompt += data.generation_prompt;
-    }
-
-    auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
-        auto generation_prompt = p.literal("<|im_start|>assistant\n");
-
-        auto reasoning = p.eps();
-        if (extract_reasoning) {
-            reasoning = ("<think>" << p.reasoning(p.until("</think>")) << "</think>") + p.space();
-        }
-
-        // Response format parser
-        if (has_response_format) {
-            return generation_prompt + reasoning + p.content(p.schema(p.json(), "response-format", inputs.json_schema));
-        }
-
-        if (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE) {
-            // CDATA lets a value carry characters that would otherwise close the tag (e.g.
-            // </param>); capture the inner text only, excluding the CDATA markers.
-            auto string_value = p.choice({
-                p.literal("<![CDATA[") + p.ac(p.tool_arg_string_value(p.until("]]>")) + p.literal("]]>"), "]]>") + p.tool_arg_close(p.literal("</param>")),
-                p.negate(p.literal("<![CDATA[")) + p.ac(p.tool_arg_string_value(p.until("</param>")) + p.tool_arg_close(p.literal("</param>")), "</param>")
-            });
-
-            auto tool_choice = p.choice();
-            foreach_function(inputs.tools, [&](const json & tool) {
-                const auto &      function = tool.at("function");
-                const std::string name     = function.at("name");
-                auto              params   = function.contains("parameters") ? function.at("parameters") : json::object();
-
-                auto args = p.eps();
-                if (params.contains("properties") && params.at("properties").is_object() && !params.at("properties").empty()) {
-                    auto schema_info = common_schema_info();
-                    schema_info.resolve_refs(params);
-
-                    auto arg_choice = p.choice();
-                    for (const auto & [prop_name, prop_schema] : params.at("properties").items()) {
-                        auto value_parser = p.eps();
-                        if (schema_info.resolves_to_string(prop_schema)) {
-                            value_parser = string_value;
-                        } else {
-                            value_parser = p.tool_arg_json_value(
-                                    p.schema(p.json(), "tool-" + name + "-arg-" + prop_name + "-schema", prop_schema, false)
-                                ) + p.tool_arg_close(p.literal("</param>"));
-                        }
-
-                        auto arg_rule = p.tool_arg(
-                            p.tool_arg_open(p.literal("<param name=\"") + p.tool_arg_name(p.literal(prop_name)) + p.literal("\">")) +
-                            value_parser
-                        );
-
-                        arg_choice |= arg_rule;
-                    }
-                    args = p.zero_or_more(arg_choice + p.space());
-                }
-
-                auto tool_parser = p.tool(
-                    p.tool_open(p.literal("<function name=\"") + p.tool_name(p.literal(name)) + p.literal("\">"))
-                    << p.tool_args(args)
-                    << p.tool_close(p.literal("</function>")));
-
-                tool_choice |= p.rule("tool-" + name, tool_parser);
-            });
-
-            auto max_calls  = inputs.parallel_tool_calls ? -1 : 1;
-            auto tool_calls = p.trigger_rule("tool-call", p.repeat(tool_choice + p.space(), 1, max_calls));
-
-            auto content = p.content(p.until("<function"));
-
-            return generation_prompt + reasoning + content + tool_calls + p.end();
-        }
-
-        return generation_prompt + reasoning + p.content(p.rest()) + p.end();
-    });
-
-    data.parser = parser.save();
-
-    if (include_grammar) {
-        data.grammar_lazy = !(has_response_format || (has_tools && inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED));
-        data.grammar      = build_grammar([&](const common_grammar_builder & builder) {
-            foreach_function(inputs.tools, [&](const json & tool) {
-                const auto & function = tool.at("function");
-                auto         schema   = function.contains("parameters") ? function.at("parameters") : json::object();
-                builder.resolve_refs(schema);
-            });
-            if (has_response_format) {
-                auto schema = inputs.json_schema;
-                builder.resolve_refs(schema);
-            }
-            parser.build_grammar(builder, data.grammar_lazy);
-        });
-
-        data.grammar_triggers = {
-            { COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<function" },
-        };
-    }
-
-    return data;
-}
-
 static json common_chat_extra_context() {
    json ctx = json::object();
    std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
@@ -2566,25 +2296,16 @@ std::optional<common_chat_params> common_chat_try_specialized_template(
        return common_chat_params_init_kimi_k2(tmpl, params);
    }

-    // Cohere2 MoE / North Code - marker-wrapped format with <|START_TEXT|> content and
-    // <|START_ACTION|> JSON tool calls. <|START_TEXT|> is unique to this template (the older
-    // Command-R templates use <|START_RESPONSE|>).
-    if (src.find("<|START_TEXT|>") != std::string::npos &&
-        src.find("<|START_ACTION|>") != std::string::npos) {
-        LOG_DBG("Using specialized template: Cohere2 MoE\n");
-        return common_chat_params_init_cohere2moe(tmpl, params);
-    }
-
    if (is_lfm2_template(src)) {
        LOG_DBG("Using specialized template: LFM2\n");
-        return common_chat_params_init_lfm2(tmpl, params, /* tool_list_tokens = */ true);
+        return common_chat_params_init_lfm2(tmpl, params);
    }

    // LFM2.5 format detection: template uses plain "List of tools: [...]" with no special tokens
    if (src.find("List of tools: [") != std::string::npos &&
        src.find("<|tool_list_start|>") == std::string::npos) {
        LOG_DBG("Using specialized template: LFM2.5\n");
-        return common_chat_params_init_lfm2(tmpl, params, /* tool_list_tokens = */ false);
+        return common_chat_params_init_lfm2_5(tmpl, params);
    }

    // GigaChatV3 format detection
@@ -2615,14 +2336,6 @@ std::optional<common_chat_params> common_chat_try_specialized_template(
        return common_chat_params_init_gemma4(tmpl, params);
    }

-    // MiniCPM5 - XML tool calls with <function name="..."><param name="...">...</param></function>
-    if (src.find("Tool usage guidelines:") != std::string::npos &&
-        src.find("<function name=\"") != std::string::npos &&
-        src.find("<param name=\"") != std::string::npos) {
-        LOG_DBG("Using specialized template: MiniCPM5\n");
-        return common_chat_params_init_minicpm5(tmpl, params);
-    }
-
    return std::nullopt;
 }

@@ -2733,15 +2446,17 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
        autoparser.analyze_template(tmpl);
        auto auto_params = autoparser::peg_generator::generate_parser(tmpl, params, autoparser);

-        common_chat_msg_delimiters delimiters;
+        std::vector<common_chat_msg_delimiter> delimiters;
        if (!autoparser.assistant_start.empty()) {
-            delimiters.add(COMMON_CHAT_ROLE_ASSISTANT, autoparser.assistant_start);
+            delimiters.push_back({ "assistant", autoparser.assistant_start });
        }
        if (!autoparser.user_start.empty()) {
-            delimiters.add(COMMON_CHAT_ROLE_USER, autoparser.user_start);
+            delimiters.push_back({ "user", autoparser.user_start });
        }

-        auto_params.message_delimiters = std::move(delimiters);
+        if (!delimiters.empty()) {
+            auto_params.message_spans = common_chat_split_by_role(auto_params.prompt, delimiters);
+        }

        auto_params.supports_thinking = autoparser.reasoning.mode != autoparser::reasoning_mode::NONE;
        if (auto_params.supports_thinking) {
@@ -2883,9 +2598,8 @@ common_chat_msg common_chat_peg_parse(const common_peg_arena &          src_pars
            }
            return msg;
        }
-        LOG_WRN("%s: unparsed %s output: %s\n", __func__, common_chat_format_name(params.format), effective_input.substr(result.end).c_str());
-        LOG_DBG("%s: full %s output triggering error:\n=== BEGIN ===\n%s\n=== END ===\n", __func__, common_chat_format_name(params.format), effective_input.c_str());
-        throw std::runtime_error(std::string("The model produced output that does not match the expected ") + common_chat_format_name(params.format) + " format");
+        throw std::runtime_error(std::string("Failed to parse input at pos ") + std::to_string(result.end) + ": " +
+                                 effective_input.substr(result.end));
    }

    common_chat_msg msg;
@@ -2913,9 +2627,5 @@ common_chat_msg common_chat_peg_parse(const common_peg_arena &          src_pars
 std::map<std::string, bool> common_chat_templates_get_caps(const common_chat_templates * chat_templates) {
    GGML_ASSERT(chat_templates != nullptr);
    GGML_ASSERT(chat_templates->template_default != nullptr);
-    if (chat_templates->template_tool_use != nullptr) {
-        // take the more expressive template when available
-        return chat_templates->template_tool_use->caps.to_map();
-    }
    return chat_templates->template_default->caps.to_map();
 }
@@ -143,75 +143,15 @@ struct common_chat_msg_diff {
    }
 };

-enum common_chat_role {
-    COMMON_CHAT_ROLE_UNKNOWN,
-    COMMON_CHAT_ROLE_SYSTEM,
-    COMMON_CHAT_ROLE_ASSISTANT,
-    COMMON_CHAT_ROLE_USER,
-    COMMON_CHAT_ROLE_TOOL
-};
-
-common_chat_role common_chat_role_from_string(const std::string & role);
-const char *     common_chat_role_to_string(common_chat_role role);
-
 struct common_chat_msg_span {
-    common_chat_role role = COMMON_CHAT_ROLE_UNKNOWN;
+    std::string role;
    std::size_t pos = 0;
    std::size_t len = 0;
-
-    bool valid() const {
-        return role != COMMON_CHAT_ROLE_UNKNOWN;
-    }
-};
-
-struct common_chat_msg_spans {
-    std::vector<common_chat_msg_span> spans;
-
-    void add(common_chat_role role, size_t pos, size_t len) {
-        spans.push_back({ role, pos, len });
-    }
-
-    bool is_user_start(int32_t pos) const {
-        for (auto it = spans.begin(); it != spans.end(); ++it) {
-            if (it->role == COMMON_CHAT_ROLE_USER && pos == (int32_t) it->pos) {
-                return true;
-            }
-        }
-        return false;
-    }
-
-    int32_t last_user_message_pos() const {
-        for (auto it = spans.rbegin(); it != spans.rend(); ++it) {
-            if (it->role == COMMON_CHAT_ROLE_USER) {
-                return (int32_t) it->pos;
-            }
-        }
-        return -1;
-    }
 };

 struct common_chat_msg_delimiter {
-    common_chat_role role = COMMON_CHAT_ROLE_UNKNOWN;
-    std::string      delimiter;
-    llama_tokens     tokens = {};
-};
-
-struct common_chat_msg_delimiters {
-    std::vector<common_chat_msg_delimiter> delimiters;
-
-    common_chat_msg_delimiters() = default;
-    common_chat_msg_delimiters(std::initializer_list<common_chat_msg_delimiter> delims) : delimiters(delims) {}
-
-    void add(common_chat_role role, const std::string & delimiter) {
-        delimiters.push_back({ role, delimiter });
-    }
-
-    void tokenize(const llama_vocab * vocab);
-
-    // split tokens into message spans. skips maps a start index to a length of a region to jump over without matching
-    common_chat_msg_spans split(const llama_tokens & tokens, const std::map<size_t, size_t> & skips = {}) const;
-
-    nlohmann::ordered_json to_json() const;
+    std::string role;
+    std::string delimiter;
 };

 struct common_chat_tool {
@@ -279,7 +219,7 @@ struct common_chat_params {
    std::vector<std::string>            preserved_tokens;
    std::vector<std::string>            additional_stops;
    std::string                         parser;
-    common_chat_msg_delimiters          message_delimiters;
+    std::vector<common_chat_msg_span>   message_spans;
 };

 // per-message parsing syntax
@@ -385,4 +325,5 @@ struct common_chat_prompt_preset {

 common_chat_prompt_preset common_chat_get_asr_prompt(const common_chat_templates * chat_templates);

-common_chat_msg_delimiters common_chat_msg_delimiters_parse(const nlohmann::ordered_json & delimiters);
+std::vector<common_chat_msg_span> common_chat_split_by_role(const std::string & prompt, const std::vector<common_chat_msg_delimiter> & delims);
+
@@ -225,7 +225,7 @@ bool set_process_priority(enum ggml_sched_priority prio) {
    }

    if (!SetPriorityClass(GetCurrentProcess(), p)) {
-        COM_WRN("failed to set process priority class %d : (%d)\n", prio, (int) GetLastError());
+        LOG_WRN("failed to set process priority class %d : (%d)\n", prio, (int) GetLastError());
        return false;
    }

@@ -251,7 +251,7 @@ bool set_process_priority(enum ggml_sched_priority prio) {
    }

    if (setpriority(PRIO_PROCESS, 0, p) != 0) {
-        COM_WRN("failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno);
+        LOG_WRN("failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno);
        return false;
    }
    return true;
@@ -284,14 +284,14 @@ void postprocess_cpu_params(common_cpu_params & cpuparams, const common_cpu_para

    if (n_set && n_set < cpuparams.n_threads) {
        // Not enough set bits, may experience performance issues.
-        COM_WRN("Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads);
+        LOG_WRN("Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads);
    }
 }

 bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THREADS]) {
    size_t dash_loc = range.find('-');
    if (dash_loc == std::string::npos) {
-        COM_ERR("%s", "Format of CPU range is invalid! Expected [<start>]-[<end>].\n");
+        LOG_ERR("Format of CPU range is invalid! Expected [<start>]-[<end>].\n");
        return false;
    }

@@ -303,7 +303,7 @@ bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THRE
    } else {
        start_i = std::stoull(range.substr(0, dash_loc));
        if (start_i >= GGML_MAX_N_THREADS) {
-            COM_ERR("%s", "Start index out of bounds!\n");
+            LOG_ERR("Start index out of bounds!\n");
            return false;
        }
    }
@@ -313,7 +313,7 @@ bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THRE
    } else {
        end_i = std::stoull(range.substr(dash_loc + 1));
        if (end_i >= GGML_MAX_N_THREADS) {
-            COM_ERR("%s", "End index out of bounds!\n");
+            LOG_ERR("End index out of bounds!\n");
            return false;
        }
    }
@@ -333,7 +333,7 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD
    }

    size_t num_digits = mask.length() - start_i;
-    num_digits = std::min<size_t>(num_digits, 128);
+    if (num_digits > 128) num_digits = 128;

    size_t end_i = num_digits + start_i;

@@ -348,7 +348,7 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD
        } else if (c >= 'A' && c <= 'F') {
            id -= 'A' - 10;
        } else {
-            COM_ERR("Invalid hex character '%c' at position %d\n", c, int32_t(i));
+            LOG_ERR("Invalid hex character '%c' at position %d\n", c, int32_t(i));
            return false;
        }

@@ -379,21 +379,21 @@ void common_params_print_info(const common_params & params, bool print_devices)
 #else
    const char * build_type = " (debug)";
 #endif
-    COM_TRC("%s: build %d (%s) with %s for %s%s\n", __func__, llama_build_number(), llama_commit(), llama_compiler(), llama_build_target(), build_type);
+    LOG_TRC("%s: build %d (%s) with %s for %s%s\n", __func__, llama_build_number(), llama_commit(), llama_compiler(), llama_build_target(), build_type);

-    COM_INF("%s: verbosity = %d (adjust with the `-lv N` CLI arg)\n", __func__, common_log_get_verbosity_thold());
+    LOG_INF("log_info: verbosity = %d (adjust with the `-lv N` CLI arg)\n", common_log_get_verbosity_thold());

    // device enumeration creates a primary context on CUDA backends, skip it when the caller does not own any device
    if (print_devices) {
-        COM_TRC("%s", "device_info:\n");
+        LOG_INF("device_info:\n");
        for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
            auto * dev = ggml_backend_dev_get(i);
            size_t free, total;
            ggml_backend_dev_memory(dev, &free, &total);
-            COM_TRC("  - %-8s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
+            LOG_INF("  - %-8s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
        }
    }
-    COM_TRC("%s\n", common_params_get_system_info(params).c_str());
+    LOG_INF("%s\n", common_params_get_system_info(params).c_str());
 }

 std::string common_params_get_system_info(const common_params & params) {
@@ -660,7 +660,7 @@ void string_process_escapes(std::string & input) {
 bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
    const char * sep = strchr(data, '=');
    if (sep == nullptr || sep - data >= 128) {
-        COM_ERR("%s: malformed KV override '%s'\n", __func__, data);
+        LOG_ERR("%s: malformed KV override '%s'\n", __func__, data);
        return false;
    }
    llama_model_kv_override kvo;
@@ -683,20 +683,20 @@ bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_over
        } else if (std::strcmp(sep, "false") == 0) {
            kvo.val_bool = false;
        } else {
-            COM_ERR("%s: invalid boolean value for KV override '%s'\n", __func__, data);
+            LOG_ERR("%s: invalid boolean value for KV override '%s'\n", __func__, data);
            return false;
        }
    } else if (strncmp(sep, "str:", 4) == 0) {
        sep += 4;
        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
        if (strlen(sep) > 127) {
-            COM_ERR("%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data);
+            LOG_ERR("%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data);
            return false;
        }
        strncpy(kvo.val_str, sep, 127);
        kvo.val_str[127] = '\0';
    } else {
-        COM_ERR("%s: invalid type for KV override '%s'\n", __func__, data);
+        LOG_ERR("%s: invalid type for KV override '%s'\n", __func__, data);
        return false;
    }
    overrides.emplace_back(std::move(kvo));
@@ -1074,18 +1074,6 @@ std::vector<common_file_info> fs_list(const std::string & path, bool include_dir
    return files;
 }

-std::ifstream fs_open_ifstream(const std::string & fname, std::ios_base::openmode mode) {
-#ifdef _WIN32
-    int wlen = MultiByteToWideChar(CP_UTF8, 0, fname.c_str(), -1, NULL, 0);
-    if (!wlen) { return std::ifstream(); }
-    std::vector<wchar_t> wfname(wlen);
-    (void)MultiByteToWideChar(CP_UTF8, 0, fname.c_str(), -1, wfname.data(), wlen);
-    return std::ifstream(wfname.data(), mode);
-#else
-    return std::ifstream(fname, mode);
-#endif
-}
-
 //
 // TTY utils
 //
@@ -1160,7 +1148,7 @@ static void common_init_sampler_from_model(
        if (llama_model_meta_val_str(model, llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE), buf, sizeof(buf)) > 0) {
            const std::vector<std::string> sampler_names = string_split<std::string>(std::string(buf), ';');
            if (!sampler_names.empty()) {
-                sparams.samplers = common_sampler_types_from_names(sampler_names);
+                sparams.samplers = common_sampler_types_from_names(sampler_names, true);
            }
        }
    }
@@ -1199,8 +1187,8 @@ common_init_result::common_init_result(common_params & params, bool model_only)
    auto cparams = common_context_params_to_llama(params);

    if (params.fit_params) {
-        COM_TRC("%s", "fitting params to device memory ...\n");
-        COM_TRC("%s", "(for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on)\n");
+        LOG_INF("%s: fitting params to device memory ...\n", __func__);
+        LOG_INF("%s: (for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on)\n", __func__);
        common_fit_params(params.model.path.c_str(), &mparams, &cparams,
            params.tensor_split,
            params.tensor_buft_overrides.data(),
@@ -1227,7 +1215,7 @@ common_init_result::common_init_result(common_params & params, bool model_only)
        llama_adapter_lora_ptr lora;
        lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
        if (lora == nullptr) {
-            COM_ERR("failed to load lora adapter '%s'\n", la.path.c_str());
+            LOG_ERR("%s: failed to load lora adapter '%s'\n", __func__, la.path.c_str());
            pimpl->model.reset(model);
            return;
        }
@@ -1246,14 +1234,14 @@ common_init_result::common_init_result(common_params & params, bool model_only)
    common_init_sampler_from_model(model, params.sampling);

    if (params.sampling.ignore_eos && llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
-        COM_WRN("%s", "vocab does not have an EOS token, ignoring --ignore-eos\n");
+        LOG_WRN("%s: warning: vocab does not have an EOS token, ignoring --ignore-eos\n", __func__);
        params.sampling.ignore_eos = false;
    }

    // initialize once
    for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
        if (llama_vocab_is_eog(vocab, i)) {
-            COM_TRC("added %s logit bias = %f\n", common_token_to_piece(vocab, i).c_str(), -INFINITY);
+            LOG_TRC("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(vocab, i).c_str(), -INFINITY);
            params.sampling.logit_bias_eog.push_back({i, -INFINITY});
        }
    }
@@ -1291,7 +1279,7 @@ common_init_result::common_init_result(common_params & params, bool model_only)

    llama_context * lctx = llama_init_from_model(model, cparams);
    if (lctx == NULL) {
-        COM_ERR("failed to create context with model '%s'\n", params.model.path.c_str());
+        LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
        return;
    }

@@ -1328,7 +1316,7 @@ common_init_result_ptr common_init_from_params(common_params & params, bool mode

    llama_model * model = res->model();
    if (model == NULL) {
-        COM_ERR("failed to load model '%s'\n", params.model.path.c_str());
+        LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str());
        return res;
    }

@@ -1338,14 +1326,14 @@ common_init_result_ptr common_init_from_params(common_params & params, bool mode

    llama_context * lctx = res->context();
    if (lctx == NULL) {
-        COM_ERR("failed to create context with model '%s'\n", params.model.path.c_str());
+        LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
        return res;
    }

    const llama_vocab * vocab = llama_model_get_vocab(model);

    if (params.ctx_shift && !llama_memory_can_shift(llama_get_memory(lctx))) {
-        COM_WRN("%s", "KV cache shifting is not supported for this context, disabling KV cache shifting\n");
+        LOG_WRN("%s: KV cache shifting is not supported for this context, disabling KV cache shifting\n", __func__);
        params.ctx_shift = false;
    }

@@ -1374,7 +1362,7 @@ common_init_result_ptr common_init_from_params(common_params & params, bool mode
        bool ok = true;

        if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
-            COM_WRN("%s", "vocab does not have a  BOS token, reranking will not work\n");
+            LOG_WRN("%s: warning: vocab does not have a  BOS token, reranking will not work\n", __func__);
            ok = false;
        }

@@ -1383,10 +1371,10 @@ common_init_result_ptr common_init_from_params(common_params & params, bool mode
        bool has_rerank_prompt = llama_model_chat_template(model, "rerank") != NULL;

        if (!has_eos && !has_sep && !has_rerank_prompt) {
-            COM_WRN("%s", "vocab does not have an EOS token, SEP token, or rerank prompt. Reranking will not work\n");
+            LOG_WRN("%s: warning: vocab does not have an EOS token, SEP token, or rerank prompt. Reranking will not work\n", __func__);
            ok = false;
        } else if (!has_eos) {
-            COM_WRN("%s", "vocab does not have an EOS token, using SEP token as fallback\n");
+            LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
        }

        if (!ok) {
@@ -1399,7 +1387,7 @@ common_init_result_ptr common_init_from_params(common_params & params, bool mode
    }

    if (params.warmup) {
-        COM_TRC("%s", "warming up the model with an empty run - please wait ... (--no-warmup to disable)\n");
+        LOG_INF("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);

        std::vector<llama_token> tmp;
        llama_token bos = llama_vocab_bos(vocab);
@@ -1473,20 +1461,20 @@ common_context_seq_rm_type common_context_can_seq_rm(llama_context * ctx) {

    int ret = llama_decode(ctx, llama_batch_get_one(tmp.data(), tmp.size()));
    if (ret != 0) {
-        COM_ERR("llama_decode() failed: %d\n", ret);
+        LOG_ERR("%s: llama_decode() failed: %d\n", __func__, ret);
        res = COMMON_CONTEXT_SEQ_RM_TYPE_NO;
        goto done;
    }

    if (llama_n_rs_seq(ctx) > 0) {
-        COM_TRC("%s", "the context supports bounded partial sequence removal\n");
+        LOG_INF("%s: the context supports bounded partial sequence removal\n", __func__);
        res = COMMON_CONTEXT_SEQ_RM_TYPE_RS;
        goto done;
    }

    // try to remove the last tokens
    if (!llama_memory_seq_rm(mem, 0, 1, -1)) {
-        COM_TRC("%s", "the context does not support partial sequence removal\n");
+        LOG_TRC("%s: the context does not support partial sequence removal\n", __func__);
        res = COMMON_CONTEXT_SEQ_RM_TYPE_FULL;
        goto done;
    }
@@ -1803,13 +1791,13 @@ static common_control_vector_data common_control_vector_load_one(const common_co
    };
    struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params);
    if (!ctx_gguf) {
-        COM_ERR("failed to load control vector file from %s\n", load_info.fname.c_str());
+        LOG_ERR("%s: failed to load control vector file from %s\n", __func__, load_info.fname.c_str());
        return result;
    }

    int32_t n_tensors = gguf_get_n_tensors(ctx_gguf);
    if (n_tensors == 0) {
-        COM_WRN("no direction tensors found in %s\n", load_info.fname.c_str());
+        LOG_WRN("%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str());
    }

    for (int i = 0; i < n_tensors; i++) {
@@ -1827,23 +1815,23 @@ static common_control_vector_data common_control_vector_load_one(const common_co
            }
        }
        if (layer_idx < 0) {
-            COM_ERR("invalid/unparsable direction tensor layer index in %s\n", load_info.fname.c_str());
+            LOG_ERR("%s: invalid/unparsable direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
            result.n_embd = -1;
            break;
        } else if (layer_idx == 0) {
-            COM_ERR("invalid (zero) direction tensor layer index in %s\n", load_info.fname.c_str());
+            LOG_ERR("%s: invalid (zero) direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
            result.n_embd = -1;
            break;
        }

        struct ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());
        if (tensor->type != GGML_TYPE_F32) {
-            COM_ERR("invalid (non-F32) direction tensor type in %s\n", load_info.fname.c_str());
+            LOG_ERR("%s: invalid (non-F32) direction tensor type in %s\n", __func__, load_info.fname.c_str());
            result.n_embd = -1;
            break;
        }
        if (ggml_n_dims(tensor) != 1) {
-            COM_ERR("invalid (non-1D) direction tensor shape in %s\n", load_info.fname.c_str());
+            LOG_ERR("%s: invalid (non-1D) direction tensor shape in %s\n", __func__, load_info.fname.c_str());
            result.n_embd = -1;
            break;
        }
@@ -1851,7 +1839,7 @@ static common_control_vector_data common_control_vector_load_one(const common_co
        if (result.n_embd == -1) {
            result.n_embd = ggml_nelements(tensor);
        } else if (ggml_nelements(tensor) != result.n_embd) {
-            COM_ERR("direction tensor in %s does not match previous dimensions\n", load_info.fname.c_str());
+            LOG_ERR("%s: direction tensor in %s does not match previous dimensions\n", __func__, load_info.fname.c_str());
            result.n_embd = -1;
            break;
        }
@@ -1868,7 +1856,7 @@ static common_control_vector_data common_control_vector_load_one(const common_co
    }

    if (result.n_embd == -1) {
-        COM_WRN("skipping %s due to invalid direction tensors\n", load_info.fname.c_str());
+        LOG_WRN("%s: skipping %s due to invalid direction tensors\n", __func__, load_info.fname.c_str());
        result.data.clear();
    }

@@ -1889,7 +1877,7 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
            break;
        }
        if (result.n_embd != -1 && result.n_embd != cur.n_embd) {
-            COM_ERR("control vectors in %s does not match previous dimensions\n", info.fname.c_str());
+            LOG_ERR("%s: control vectors in %s does not match previous dimensions\n", __func__, info.fname.c_str());
            result.n_embd = -1;
            break;
        }
@@ -1905,7 +1893,7 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
    }

    if (result.n_embd == -1) {
-        COM_ERR("%s", "no valid control vector files passed\n");
+        LOG_ERR("%s: no valid control vector files passed\n", __func__);
        result.data.clear();
    }

@@ -2016,13 +2004,13 @@ bool common_prompt_batch_decode(
        // memory, so we can't just remove the last token from the memory and replay the last token which
        // is the reason for this logic.
        if (llama_decode(ctx, llama_batch_get_one(const_cast<llama_token*>(all_tokens.data() + offset), n_tokens_before_last))) {
-            COM_ERR("%s", "failed to eval\n");
+            LOG_ERR("%s : failed to eval\n", __func__);
            return false;
        }
        n_past += n_tokens_before_last;

        llama_state_save_file(ctx, state_path.data(), all_tokens.data(), all_tokens.size());
-        COM_INF("saved session before last token to %s, n_new = %zu\n", state_path.data(), all_tokens.size());
+        LOG_INF("saved session before last token to %s, n_new = %zu\n", state_path.data(), all_tokens.size());

        llama_token last_token = all_tokens.back();
        llama_batch batch = llama_batch_get_one(&last_token, 1);
@@ -2030,13 +2018,13 @@ bool common_prompt_batch_decode(
        batch.pos = &pos;

        if (llama_decode(ctx, batch)) {
-            COM_ERR("%s", "failed to eval last token\n");
+            LOG_ERR("%s : failed to eval last token\n", __func__);
            return false;
        }
        n_past++;
    } else {
        if (llama_decode(ctx, llama_batch_get_one(const_cast<llama_token*>(all_tokens.data() + offset), n_new))) {
-            COM_ERR("%s", "failed to eval\n");
+            LOG_ERR("%s : failed to eval\n", __func__);
            return false;
        }
        n_past += n_new;
@@ -2046,7 +2034,7 @@ bool common_prompt_batch_decode(
 }

 size_t common_prompt_checkpoint::size() const {
-    return data_tgt.size() + data_dft.size() + data_spec.size();
+    return data_tgt.size() + data_dft.size();
 }

 bool common_prompt_checkpoint::empty() const {
@@ -2061,7 +2049,6 @@ void common_prompt_checkpoint::clear() {

    data_tgt.clear();
    data_dft.clear();
-    data_spec.clear();
 }

 void common_prompt_checkpoint::update_pos(
@@ -2151,5 +2138,4 @@ void common_prompt_checkpoint::clear_tgt() {

 void common_prompt_checkpoint::clear_dft() {
    data_dft.clear();
-    data_spec.clear();
 }
@@ -25,13 +25,6 @@
 #define DIRECTORY_SEPARATOR '/'
 #endif // _WIN32

-#define COM_DBG(fmt, ...) LOG_DBG("cmn  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
-#define COM_TRC(fmt, ...) LOG_TRC("cmn  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
-#define COM_INF(fmt, ...) LOG_INF("cmn  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
-#define COM_WRN(fmt, ...) LOG_WRN("cmn  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
-#define COM_ERR(fmt, ...) LOG_ERR("cmn  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
-#define COM_CNT(fmt, ...) LOG_CNT(""              fmt,               __VA_ARGS__)
-
 #define die(msg)          do { fputs("error: " msg "\n", stderr);                exit(1); } while (0)
 #define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)

@@ -103,7 +96,6 @@ enum llama_example {
    LLAMA_EXAMPLE_FIT_PARAMS,
    LLAMA_EXAMPLE_RESULTS,
    LLAMA_EXAMPLE_EXPORT_GRAPH_OPS,
-    LLAMA_EXAMPLE_DOWNLOAD,

    LLAMA_EXAMPLE_COUNT,
 };
@@ -169,7 +161,6 @@ enum common_speculative_type {
    COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE,  // standalone draft model speculative decoding
    COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3,  // Eagle3 speculative decoding
    COMMON_SPECULATIVE_TYPE_DRAFT_MTP,     // Multi-token prediction
-    COMMON_SPECULATIVE_TYPE_DRAFT_DFLASH,  // DFlash speculative decoding
    COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE,  // simple self-speculative decoding based on n-grams
    COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K,   // self-speculative decoding with n-gram keys only
    COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, // self-speculative decoding with n-gram keys and 4 m-gram values
@@ -299,25 +290,12 @@ struct common_params_sampling {
 };

 struct common_params_model {
-    std::string path        = ""; // model local path
-    std::string url         = ""; // model url to download
-    std::string hf_repo     = ""; // HF repo
-    std::string hf_file     = ""; // HF file
-    std::string docker_repo = ""; // Docker repo
-
-    std::string get_name() const {
-        if (!hf_repo.empty()) {
-            return hf_repo;
-        }
-        if (!docker_repo.empty()) {
-            return docker_repo;
-        }
-        return path;
-    }
-
-    bool empty() const {
-        return get_name().empty();
-    }
+    std::string path        = ""; // model local path                                       // NOLINT
+    std::string url         = ""; // model url to download                                  // NOLINT
+    std::string hf_repo     = ""; // HF repo                                                // NOLINT
+    std::string hf_file     = ""; // HF file                                                // NOLINT
+    std::string docker_repo = ""; // Docker repo                                            // NOLINT
+    std::string name        = ""; // in format <user>/<model>[:<tag>] (tag is optional)     // NOLINT
 };

 // draft-model-based speculative decoding parameters
@@ -380,12 +358,12 @@ struct common_params_speculative {
    common_params_speculative_ngram_cache ngram_cache;

    bool has_dft() const {
-        return !draft.mparams.empty();
+        return !draft.mparams.path.empty() || !draft.mparams.hf_repo.empty();
    }

    uint32_t need_n_rs_seq() const {
        bool needs_rs_seq = std::any_of(types.begin(), types.end(), [&](auto t) {
-            return t == COMMON_SPECULATIVE_TYPE_DRAFT_MTP || t == COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3 || t == COMMON_SPECULATIVE_TYPE_DRAFT_DFLASH;
+            return t == COMMON_SPECULATIVE_TYPE_DRAFT_MTP;
        });

        return needs_rs_seq ? draft.n_max : 0u;
@@ -511,7 +489,6 @@ struct common_params {
    std::string input_prefix         = ""; // string to prefix user inputs with                             // NOLINT
    std::string input_suffix         = ""; // string to suffix user inputs with                             // NOLINT
    std::string logits_file          = ""; // file for saving *all* logits                                  // NOLINT
-    std::string path_prompts_log_dir = ""; // directory with logged prompts                                 // NOLINT

    // llama-debug specific options
    std::string logits_output_dir = "data"; // directory for saving logits output files                     // NOLINT
@@ -532,6 +509,7 @@ struct common_params {
    int32_t control_vector_layer_start = -1; // layer range for control vector
    int32_t control_vector_layer_end   = -1; // layer range for control vector
    bool    offline                    = false;
+    bool    skip_download              = false; // skip model file downloading

    int32_t ppl_stride      = 0;     // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
    int32_t ppl_output_type = 0;     // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
@@ -593,10 +571,9 @@ struct common_params {
    struct common_params_model mmproj;
    bool mmproj_use_gpu = true;     // use GPU for multimodal model
    bool no_mmproj = false;         // explicitly disable multimodal model
-    std::vector<std::string> image; // path to image file(s) ; TODO: change the name to "media"
+    std::vector<std::string> image; // path to image file(s)
    int image_min_tokens = -1;
    int image_max_tokens = -1;
-    int mtmd_batch_max_tokens = 1024;

    // finetune
    struct lr_opt lr;
@@ -621,7 +598,7 @@ struct common_params {
    bool    cache_prompt        = true;  // whether to enable prompt caching
    bool    cache_idle_slots    = true;  // save and clear idle slots upon starting a new task
    int32_t n_ctx_checkpoints   = 32;    // max number of context checkpoints per slot
-    int32_t checkpoint_min_step = 8192;  // minimum spacing between context checkpoints
+    int32_t checkpoint_min_step = 256;   // minimum spacing between context checkpoints
    int32_t cache_ram_mib       = 8192;  // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.

    std::string hostname      = "127.0.0.1";
@@ -645,6 +622,12 @@ struct common_params {

    // UI configs
    bool ui = true;
+
+    // Deprecated: use ui, ui_mcp_proxy, ui_config_json instead
+    bool webui = ui;
+    bool webui_mcp_proxy = false;
+    std::string webui_config_json;
+
    bool ui_mcp_proxy = false;
    std::string ui_config_json;

@@ -657,11 +640,10 @@ struct common_params {
    std::vector<std::string> server_tools;

    // router server configs
-    std::string models_dir    = "";     // directory containing models for the router server
-    std::string models_preset = "";     // directory containing model presets for the router server
-    int models_max = 4;                 // maximum number of models to load simultaneously
-    bool models_autoload = true;        // automatically load models when requested via the router server
-    std::string models_preset_hf = "";  // show a warning about remote presets on router loaded (if not empty)
+    std::string models_dir    = ""; // directory containing models for the router server
+    std::string models_preset = ""; // directory containing model presets for the router server
+    int models_max = 4;             // maximum number of models to load simultaneously
+    bool models_autoload = true;    // automatically load models when requested via the router server

    bool log_json = false;

@@ -863,9 +845,6 @@ struct common_file_info {
 };
 std::vector<common_file_info> fs_list(const std::string & path, bool include_directories);

-// fs open, also handle UTF8 on Windows
-std::ifstream fs_open_ifstream(const std::string & fname, std::ios_base::openmode mode);
-
 //
 // TTY utils
 //
@@ -1083,10 +1062,6 @@ struct common_prompt_checkpoint {
    std::vector<uint8_t> data_tgt;
    std::vector<uint8_t> data_dft;

-    // (optional) speculative-decoding implementation state stashed with the checkpoint
-    // (e.g. eagle3's deferred-boundary g_embd row)
-    std::vector<uint8_t> data_spec;
-
    size_t size() const;

    bool empty() const;
@@ -292,6 +292,10 @@ static int common_download_file_single_online(const std::string & url,

    const bool file_exists = std::filesystem::exists(path);

+    if (!file_exists && opts.skip_download) {
+        return -2; // file is missing and download is disabled
+    }
+
    if (file_exists && skip_etag) {
        LOG_DBG("%s: using cached file: %s\n", __func__, path.c_str());
        return 304; // 304 Not Modified - fake cached response
@@ -358,6 +362,9 @@ static int common_download_file_single_online(const std::string & url,
            return 304; // 304 Not Modified - fake cached response
        }
        // pass this point, the file exists but is different from the server version, so we need to redownload it
+        if (opts.skip_download) {
+            return -2; // special code to indicate that the download was skipped due to etag mismatch
+        }
        if (remove(path.c_str()) != 0) {
            LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
            return -1;
@@ -684,8 +691,18 @@ static void list_available_gguf_files(const hf_cache::hf_files & files) {
    }
 }

-common_download_hf_plan common_download_get_hf_plan(const common_params_model & model, const common_download_opts & opts) {
-    common_download_hf_plan plan;
+struct hf_plan { // files picked from a HF repo by get_hf_plan; left default-constructed on the early-return paths
+    hf_cache::hf_file primary;      // the selected model file
+    hf_cache::hf_files model_files; // result of get_split_files(all, primary); these are the model files that get downloaded
+    hf_cache::hf_file mmproj;       // result of find_best_mmproj(); only looked up when download_mmproj is true
+    hf_cache::hf_file mtp;          // result of find_best_mtp(); only looked up when download_mtp is true
+};
+
+static hf_plan get_hf_plan(const common_params_model  & model,
+                           const common_download_opts & opts,
+                           bool download_mmproj,
+                           bool download_mtp) {
+    hf_plan plan;
    hf_cache::hf_files all;

    auto [repo, tag] = common_download_split_repo_tag(model.hf_repo);
@@ -700,14 +717,6 @@ common_download_hf_plan common_download_get_hf_plan(const common_params_model &
        return plan;
    }

-    // if preset.ini exists in the repo root, download only that file
-    for (const auto & f : all) {
-        if (f.path == "preset.ini") {
-            plan.preset = f;
-            return plan;
-        }
-    }
-
    hf_cache::hf_file primary;

    if (!model.hf_file.empty()) {
@@ -734,49 +743,115 @@ common_download_hf_plan common_download_get_hf_plan(const common_params_model &
    plan.primary = primary;
    plan.model_files = get_split_files(all, primary);

-    if (opts.download_mmproj) {
+    if (download_mmproj) {
        plan.mmproj = find_best_mmproj(all, primary.path);
    }
-    if (opts.download_mtp) {
+
+    if (download_mtp) {
        plan.mtp = find_best_mtp(all, primary.path);
    }

    return plan;
 }

-void common_download_run_tasks(const std::vector<common_download_task> & tasks) {
+struct download_task { // one file to fetch; both fields are handed to common_download_file_single
+    std::string url;  // remote location of the file
+    std::string path; // local path the file is downloaded to
+};
+
+static std::vector<download_task> get_url_tasks(const common_params_model & model) { // build the download list for model.url: a single task, or one task per split part
+    auto split = get_gguf_split_info(model.url);
+
+    if (split.count <= 1) { // no multi-part split detected: download the URL as-is to model.path
+        return {{model.url, model.path}};
+    }
+
+    auto filename = split.prefix; // local name prefix = URL prefix with everything up to the last '/' removed
+    if (auto pos = split.prefix.rfind('/'); pos != std::string::npos) {
+        filename = split.prefix.substr(pos + 1);
+    }
+
+    auto parent_path = std::filesystem::path(model.path).parent_path(); // parts are stored in the same directory as model.path
+    auto prefix_path = (parent_path / filename).string();
+
+    std::vector<download_task> tasks;
+    for (int i = 1; i <= split.count; i++) { // part indices are 1-based
+        auto suffix = string_format("-%05d-of-%05d.gguf", i, split.count); // same suffix is appended to both the URL prefix and the local prefix
+        tasks.push_back({split.prefix + suffix, prefix_path + suffix});
+    }
+    return tasks;
+}
+
+common_download_model_result common_download_model(const common_params_model  & model,
+                                                   const common_download_opts & opts) {
+    common_download_model_result result;
+    std::vector<download_task> tasks;
+    hf_plan hf;
+
+    bool download_mmproj = opts.download_mmproj;
+    bool download_mtp = opts.download_mtp;
+    bool is_hf = !model.hf_repo.empty();
+
+    if (is_hf) {
+        hf = get_hf_plan(model, opts, download_mmproj, download_mtp);
+        for (const auto & f : hf.model_files) {
+            tasks.push_back({f.url, f.local_path});
+        }
+        if (!hf.mmproj.path.empty()) {
+            tasks.push_back({hf.mmproj.url, hf.mmproj.local_path});
+        }
+        if (!hf.mtp.path.empty()) {
+            tasks.push_back({hf.mtp.url, hf.mtp.local_path});
+        }
+    } else if (!model.url.empty()) {
+        tasks = get_url_tasks(model);
+    } else {
+        result.model_path = model.path;
+        return result;
+    }
+
+    if (tasks.empty()) {
+        return result;
+    }
+
    std::vector<std::future<int>> futures;
    for (const auto & task : tasks) {
        futures.push_back(std::async(std::launch::async,
-            [&task]() {
-                return common_download_file_single(task.url, task.local_path, task.opts, task.is_hf);
+            [&task, &opts, is_hf]() {
+                return common_download_file_single(task.url, task.path, opts, is_hf);
            }
        ));
    }

-    for (size_t i = 0; i < futures.size(); ++i) {
-        std::string url = tasks[i].url;
-        int status = futures[i].get();
+    for (auto & f : futures) {
+        int status = f.get();
+        if (status == -2 && opts.skip_download) {
+            throw common_skip_download_exception();
+        }
        bool is_ok = is_http_status_ok(status);
        if (!is_ok) {
-            throw std::runtime_error(string_format("Download '%s' failed with status code: %d", url.c_str(), status));
+            return {};
        }
    }
-}

-std::vector<std::string> common_download_get_all_parts(const std::string & url) {
-    auto split = get_gguf_split_info(url);
+    if (is_hf) {
+        for (const auto & f : hf.model_files) {
+            hf_cache::finalize_file(f);
+        }
+        result.model_path = hf.primary.final_path;

-    if (split.count <= 1) {
-        return {url};
+        if (!hf.mmproj.path.empty()) {
+            result.mmproj_path = hf_cache::finalize_file(hf.mmproj);
+        }
+
+        if (!hf.mtp.path.empty()) {
+            result.mtp_path = hf_cache::finalize_file(hf.mtp);
+        }
+    } else {
+        result.model_path = model.path;
    }

-    std::vector<std::string> parts;
-    for (int i = 1; i <= split.count; i++) {
-        auto suffix = string_format("-%05d-of-%05d.gguf", i, split.count);
-        parts.push_back(split.prefix + suffix);
-    }
-    return parts;
+    return result;
 }

 //
@@ -922,87 +997,3 @@ std::vector<common_cached_model_info> common_list_cached_models() {

    return result;
 }
-
-bool common_download_remove(const std::string & hf_repo_with_tag) {
-    namespace fs = std::filesystem;
-
-    auto [repo_id, tag] = common_download_split_repo_tag(hf_repo_with_tag);
-
-    if (tag.empty()) {
-        return hf_cache::remove_cached_repo(repo_id);
-    }
-
-    std::string tag_upper = tag;
-    for (char & c : tag_upper) {
-        c = (char) std::toupper((unsigned char) c);
-    }
-
-    auto files = hf_cache::get_cached_files(repo_id);
-    if (files.empty()) {
-        return false;
-    }
-
-    // collect snapshot entries whose tag matches
-    std::vector<fs::path> to_remove;
-    for (const auto & f : files) {
-        auto split = get_gguf_split_info(f.path);
-        if (split.tag == tag_upper) {
-            to_remove.emplace_back(f.local_path);
-        }
-    }
-
-    if (to_remove.empty()) {
-        return false;
-    }
-
-    // resolve blob paths from symlinks before deleting snapshot entries
-    std::vector<fs::path> blobs_to_check;
-    for (const auto & p : to_remove) {
-        std::error_code ec;
-        if (fs::is_symlink(p, ec)) {
-            auto target = fs::read_symlink(p, ec);
-            if (!ec) {
-                blobs_to_check.push_back((p.parent_path() / target).lexically_normal());
-            }
-        }
-    }
-
-    // remove snapshot entries
-    for (const auto & p : to_remove) {
-        std::error_code ec;
-        fs::remove(p, ec);
-        if (ec) {
-            LOG_WRN("%s: failed to remove %s: %s\n", __func__, p.string().c_str(), ec.message().c_str());
-        }
-    }
-
-    if (blobs_to_check.empty()) {
-        return true;
-    }
-
-    // collect blobs still referenced by remaining snapshot entries
-    std::unordered_set<std::string> still_referenced;
-    for (const auto & f : hf_cache::get_cached_files(repo_id)) {
-        fs::path p(f.local_path);
-        std::error_code ec;
-        if (fs::is_symlink(p, ec)) {
-            auto target = fs::read_symlink(p, ec);
-            if (!ec) {
-                still_referenced.insert((p.parent_path() / target).lexically_normal().string());
-            }
-        }
-    }
-
-    // remove orphaned blobs
-    for (const auto & blob : blobs_to_check) {
-        if (still_referenced.find(blob.string()) == still_referenced.end()) {
-            std::error_code ec;
-            fs::remove(blob, ec);
-            if (ec) {
-                LOG_WRN("%s: failed to remove blob %s: %s\n", __func__, blob.string().c_str(), ec.message().c_str());
-            }
-        }
-    }
-
-    return true;
-}
@@ -1,10 +1,7 @@
 #pragma once

-#include "hf-cache.h"
-
 #include <string>
 #include <vector>
-#include <functional>

 struct common_params_model;

@@ -50,40 +47,65 @@ struct common_cached_model_info {
    }
 };

-// Options for common_download_file_single
+// Options for common_download_model and common_download_file_single
 struct common_download_opts {
    std::string bearer_token;
    common_header_list headers;
    bool offline = false;
+    bool skip_download = false; // if true, only validation is performed, common_skip_download_exception may be thrown if the file is missing or invalid
    bool download_mmproj = false;
    bool download_mtp = false;
    common_download_callback * callback = nullptr;
 };

-struct common_download_task {
-    common_download_opts opts;
-    std::string url;
-    std::string local_path;
-    std::function<void()> on_done;
-    bool is_hf = false;
-
-    common_download_task() = default;
-    common_download_task(hf_cache::hf_file f,
-            const common_download_opts & opts,
-            std::function<void()> on_done = nullptr)
-        : opts(opts), url(f.url), local_path(f.local_path), on_done(on_done), is_hf(true) {}
+// Result of common_download_model
+struct common_download_model_result { // all fields are empty when nothing was found or the download failed
+    std::string model_path;  // HF: final path of the primary model file; URL / local: model.path
+    std::string mmproj_path; // HF only: finalized path of the mmproj file, if one was selected
+    std::string mtp_path;    // HF only: finalized path of the MTP file, if one was selected
+};

-void common_download_run_tasks(const std::vector<common_download_task> & tasks);
+// thrown when skip_download is set and a file is missing or outdated (e.g. ETag check failed)
+struct common_skip_download_exception : public std::runtime_error {
+    common_skip_download_exception() : std::runtime_error("skip download") {}
+};

-// if url is a multi-part GGUF file, returns all parts, otherwise returns the single file
-std::vector<std::string> common_download_get_all_parts(const std::string & url);
+// Download model from HuggingFace repo or URL
+//
+// input (via model struct):
+// - model.hf_repo: HF repo with optional tag, see common_download_split_repo_tag
+// - model.hf_file: specific file in the repo (requires hf_repo)
+// - model.url: simple download (used if hf_repo is empty)
+// - model.path: local file path
+//
+// tag matching (for HF repos without model.hf_file):
+// - if tag is specified, searches for GGUF matching that quantization
+// - if no tag, searches for Q4_K_M, then Q4_0, then first available GGUF
+//
+// split GGUF: multi-part files like "model-00001-of-00003.gguf" are automatically
+// detected and all parts are downloaded
+//
+// caching:
+// - HF repos: uses HuggingFace cache
+// - URLs: uses ETag-based caching
+//
+// when opts.offline=true, no network requests are made
+// when opts.download_mmproj=true, searches for an mmproj file in the same directory as the model or any parent directory,
+// preferring the one with the closest quantization bits
+// when opts.download_mtp=true, applies the same sibling search for an MTP-head GGUF
+//
+// returns result with model_path, mmproj_path and mtp_path (empty when not found / on failure)
+common_download_model_result common_download_model(
+    const common_params_model & model,
+    const common_download_opts & opts = {}
+);

 // returns list of cached models
 std::vector<common_cached_model_info> common_list_cached_models();

 // download single file from url to local path
 // returns status code or -1 on error
+// returns -2 if skip_download=true and the file is missing or outdated (ETag mismatch); nothing is downloaded in that case
 // skip_etag: if true, don't read/write .etag files (for HF cache where filename is the hash)
 int common_download_file_single(const std::string & url,
                                const std::string & path,
@@ -93,19 +115,3 @@ int common_download_file_single(const std::string & url,
 // resolve and download model from Docker registry
 // return local path to downloaded model file
 std::string common_docker_resolve_model(const std::string & docker);
-
-// Remove a cached model from disk
-// input format: "user/model" or "user/model:tag"
-// - if tag is omitted, removes the entire repo cache directory
-// - if tag is present, removes only files matching that tag (and orphaned blobs)
-// returns true if anything was removed
-bool common_download_remove(const std::string & hf_repo_with_tag);
-
-struct common_download_hf_plan {
-    hf_cache::hf_file primary;
-    hf_cache::hf_files model_files;
-    hf_cache::hf_file mmproj;
-    hf_cache::hf_file mtp;
-    hf_cache::hf_file preset; // if set, only this file is downloaded
-};
-common_download_hf_plan common_download_get_hf_plan(const common_params_model & model, const common_download_opts & opts);
@@ -26,7 +26,7 @@ class common_params_fit_exception : public std::runtime_error {
    using std::runtime_error::runtime_error;
 };

-static std::vector<llama_device_memory_data> common_get_device_memory_data_impl(
+std::vector<llama_device_memory_data> common_get_device_memory_data(
        const char * path_model,
        const llama_model_params * mparams,
        const llama_context_params * cparams,
@@ -150,29 +150,6 @@ static std::vector<llama_device_memory_data> common_get_device_memory_data_impl(
    return ret;
 }

-common_device_memory_data_vec common_get_device_memory_data(
-        const char * path_model,
-        const llama_model_params * mparams,
-        const llama_context_params * cparams,
-        std::vector<ggml_backend_dev_t> & devs,
-        uint32_t & hp_ngl,
-        uint32_t & hp_n_ctx_train,
-        uint32_t & hp_n_expert,
-        ggml_log_level log_level) {
-    std::vector<llama_device_memory_data> impl = common_get_device_memory_data_impl(
-            path_model, mparams, cparams, devs, hp_ngl, hp_n_ctx_train, hp_n_expert, log_level);
-
-    common_device_memory_data_vec ret(impl.size());
-    for (size_t i = 0; i < impl.size(); i++) {
-        ret[i].total   = impl[i].total;
-        ret[i].free    = impl[i].free;
-        ret[i].model   = impl[i].mb.model;
-        ret[i].context = impl[i].mb.context;
-        ret[i].compute = impl[i].mb.compute;
-    }
-    return ret;
-}
-
 static void common_params_fit_impl(
        const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
        float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
@@ -192,7 +169,7 @@ static void common_params_fit_impl(
    // step 1: get data for default parameters and check whether any changes are necessary in the first place

    LOG_TRC("%s: getting device memory data for initial parameters:\n", __func__);
-    const dmds_t dmds_full = common_get_device_memory_data_impl(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
+    const dmds_t dmds_full = common_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
    const size_t nd = devs.size(); // number of devices

    std::vector<int64_t> margins; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
@@ -233,7 +210,7 @@ static void common_params_fit_impl(
        sum_projected_used = dmds_full.back().mb.total();
        sum_free           = dmds_full.back().total;
        sum_projected_free = sum_free - sum_projected_used;
-        LOG_TRC("%s: projected to use %" PRId64 " MiB of host memory vs. %" PRId64 " MiB of total host memory\n",
+        LOG_INF("%s: projected to use %" PRId64 " MiB of host memory vs. %" PRId64 " MiB of total host memory\n",
            __func__, sum_projected_used/MiB, sum_free/MiB);
        if (sum_projected_free >= margins[0]) {
            LOG_TRC("%s: will leave %" PRId64 " >= %" PRId64 " MiB of system memory, no changes needed\n",
@@ -327,7 +304,7 @@ static void common_params_fit_impl(

                    int64_t sum_projected_used_min_ctx = 0;
                    cparams->n_ctx = n_ctx_min;
-                    const dmds_t dmds_min_ctx = common_get_device_memory_data_impl(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
+                    const dmds_t dmds_min_ctx = common_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
                    if (nd == 0) {
                        sum_projected_used_min_ctx = dmds_min_ctx.back().mb.total();
                    } else {
@@ -505,7 +482,7 @@ static void common_params_fit_impl(
        llama_model_params mparams_copy = *mparams;
        set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, mparams_copy);

-        const dmds_t dmd_nl = common_get_device_memory_data_impl(
+        const dmds_t dmd_nl = common_get_device_memory_data(
            path_model, &mparams_copy, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);

        LOG_TRC("%s: memory for test allocation by device:\n", func_name);
@@ -533,7 +510,7 @@ static void common_params_fit_impl(
        mparams->tensor_buft_overrides = tensor_buft_overrides;

        LOG_TRC("%s: getting device memory data with all MoE tensors moved to system memory:\n", __func__);
-        const dmds_t dmds_cpu_moe = common_get_device_memory_data_impl(
+        const dmds_t dmds_cpu_moe = common_get_device_memory_data(
            path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);

        for (size_t id = 0; id < nd; id++) {
@@ -963,7 +940,7 @@ void common_fit_print(
    uint32_t hp_nct = 0; // hparams.n_ctx_train
    uint32_t hp_nex = 0; // hparams.n_expert

-    auto dmd = common_get_device_memory_data_impl(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, GGML_LOG_LEVEL_ERROR);
+    auto dmd = common_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, GGML_LOG_LEVEL_ERROR);
    GGML_ASSERT(dmd.size() == devs.size() + 1);

    for (size_t id = 0; id < devs.size(); id++) {
@@ -1,7 +1,9 @@
 #pragma once

 #include "ggml.h"
+#include "ggml-backend.h"
 #include "llama.h"
+#include "../src/llama-ext.h"

 #include <vector>

@@ -16,41 +18,31 @@ enum common_params_fit_status {
 //   - this function is NOT thread safe because it modifies the global llama logger state
 //   - only parameters that have the same value as in llama_default_model_params are modified
 //     with the exception of the context size which is modified if and only if equal to 0
-common_params_fit_status common_fit_params(
-                         const char * path_model,
-                 llama_model_params * mparams,
-               llama_context_params * cparams,
-                              float * tensor_split,          // writable buffer for tensor split, needs at least llama_max_devices elements
-   llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements
-                             size_t * margins,               // margins of memory to leave per device in bytes
-                           uint32_t   n_ctx_min,             // minimum context size to set when trying to reduce memory use
-                     ggml_log_level   log_level);            // minimum log level to print during fitting, lower levels go to debug log
+enum common_params_fit_status common_fit_params(
+                               const char   * path_model,
+                struct llama_model_params   * mparams,
+                struct llama_context_params * cparams,
+                                      float * tensor_split,          // writable buffer for tensor split, needs at least llama_max_devices elements
+    struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements
+                                     size_t * margins,               // margins of memory to leave per device in bytes
+                                   uint32_t   n_ctx_min,             // minimum context size to set when trying to reduce memory use
+                        enum ggml_log_level   log_level);            // minimum log level to print during fitting, lower levels go to debug log

 // print estimated memory to stdout
 void common_fit_print(
-                         const char * path_model,
-                 llama_model_params * mparams,
-               llama_context_params * cparams);
+                               const char   * path_model,
+                struct llama_model_params   * mparams,
+                struct llama_context_params * cparams);

-void common_memory_breakdown_print(const llama_context * ctx);
-
-struct common_device_memory_data {
-    int64_t total;
-    int64_t free;
-    size_t  model;
-    size_t  context;
-    size_t  compute;
-};
-
-using common_device_memory_data_vec = std::vector<common_device_memory_data>;
+void common_memory_breakdown_print(const struct llama_context * ctx);

 // Load a model + context with no_alloc and return the per-device memory breakdown.
-common_device_memory_data_vec common_get_device_memory_data(
-                         const char * path_model,
-           const llama_model_params * mparams,
-         const llama_context_params * cparams,
-    std::vector<ggml_backend_dev_t> & devs,
-                           uint32_t & hp_ngl,
-                           uint32_t & hp_n_ctx_train,
-                           uint32_t & hp_n_expert,
-                     ggml_log_level   log_level);
+std::vector<llama_device_memory_data> common_get_device_memory_data(
+                                  const char   * path_model,
+        const struct llama_model_params         * mparams,
+        const struct llama_context_params       * cparams,
+        std::vector<ggml_backend_dev_t>         & devs,
+                                      uint32_t  & hp_ngl,
+                                      uint32_t  & hp_n_ctx_train,
+                                      uint32_t  & hp_n_expert,
+                           enum ggml_log_level    log_level);
@@ -495,19 +495,4 @@ std::string finalize_file(const hf_file & file) {
    return file.final_path;
 }

-bool remove_cached_repo(const std::string & repo_id) {
-    if (!is_valid_repo_id(repo_id)) {
-        LOG_WRN("%s: invalid repository: %s\n", __func__, repo_id.c_str());
-        return false;
-    }
-    fs::path repo_path = get_repo_path(repo_id);
-    std::error_code ec;
-    auto removed = fs::remove_all(repo_path, ec);
-    if (ec) {
-        LOG_ERR("%s: failed to remove repo cache %s: %s\n", __func__, repo_path.string().c_str(), ec.message().c_str());
-        return false;
-    }
-    return removed > 0;
-}
-
 } // namespace hf_cache
@@ -29,7 +29,4 @@ hf_files get_cached_files(const std::string & repo_id = {});
 // Create snapshot path (link or move/copy) and return it
 std::string finalize_file(const hf_file & file);

-// Remove the entire cached directory for a repo, returns true if removed
-bool remove_cached_repo(const std::string & repo_id);
-
 } // namespace hf_cache
@@ -16,34 +16,22 @@ using json = nlohmann::ordered_json;
 namespace jinja {

 using caps_json_fn = std::function<json()>;
-using caps_ctx_fn = std::function<void(context &)>;
-using caps_analyze_fn = std::function<void(bool, value &, value &, const std::string &)>;
-
-void caps_apply_preserve_reasoning(jinja::context & ctx, bool enabled) {
-    ctx.set_val("preserve_thinking",         mk_val<value_bool>(enabled));
-    ctx.set_val("clear_thinking",            mk_val<value_bool>(!enabled));
-    ctx.set_val("truncate_history_thinking", mk_val<value_bool>(!enabled));
-}
+using caps_analyze_fn = std::function<void(bool, value &, value &)>;

 static void caps_try_execute(jinja::program & prog,
                             const caps_json_fn & messages_fn,
-                             const caps_ctx_fn & ctx_fn,
                             const caps_json_fn & tools_fn,
                             const caps_analyze_fn & analyze_fn) {
    context ctx;
    ctx.is_get_stats = true;
    jinja::global_from_json(ctx, json{
        {"messages", messages_fn()},
-        {"tools", tools_fn ? tools_fn() : json::array()},
+        {"tools", tools_fn()},
        {"bos_token", ""},
        {"eos_token", ""},
        {"add_generation_prompt", true}
    }, true);

-    if (ctx_fn) {
-        ctx_fn(ctx);
-    }
-
    auto messages = ctx.get_val("messages");
    auto tools = ctx.get_val("tools");

@@ -61,7 +49,7 @@ static void caps_try_execute(jinja::program & prog,
        // ignore exceptions during capability analysis
    }

-    analyze_fn(success, messages, tools, result);
+    analyze_fn(success, messages, tools);
 }

 // for debugging only
@@ -121,9 +109,11 @@ caps caps_get(jinja::program & prog) {
                }
            });
        },
-        nullptr, // ctx_fn
-        nullptr, // tools_fn
-        [&](bool success, value & messages, value &, const std::string &) {
+        [&]() {
+            // tools
+            return json{nullptr};
+        },
+        [&](bool success, value & messages, value &) {
            auto & content = messages->at(0)->at("content");
            caps_print_stats(content, "messages[0].content");
            if (has_op(content, "selectattr") || has_op(content, "array_access")) {
@@ -155,9 +145,11 @@ caps caps_get(jinja::program & prog) {
                },
            });
        },
-        nullptr, // ctx_fn
-        nullptr, // tools_fn
-        [&](bool, value & messages, value &, const std::string &) {
+        [&]() {
+            // tools
+            return json::array();
+        },
+        [&](bool, value & messages, value &) {
            auto & content = messages->at(0)->at("content");
            caps_print_stats(content, "messages[0].content");
            if (!content->stats.used) {
@@ -209,7 +201,6 @@ caps caps_get(jinja::program & prog) {
                },
            });
        },
-        nullptr, // ctx_fn
        [&]() {
            // tools
            return json::array({
@@ -233,7 +224,7 @@ caps caps_get(jinja::program & prog) {
                },
            });
        },
-        [&](bool success, value & messages, value & tools, const std::string &) {
+        [&](bool success, value & messages, value & tools) {
            if (!success) {
                return; // Nothing can be inferred
            }
@@ -302,7 +293,6 @@ caps caps_get(jinja::program & prog) {
                    },
                });
            },
-            nullptr, // ctx_fn
            [&]() {
                // tools
                return json::array({
@@ -326,7 +316,7 @@ caps caps_get(jinja::program & prog) {
                    },
                });
            },
-            [&](bool success, value & messages, value & tools, const std::string &) {
+            [&](bool success, value & messages, value & tools) {
                if (!success) {
                    result.supports_tool_calls = false;
                    result.supports_tools = false;
@@ -404,7 +394,6 @@ caps caps_get(jinja::program & prog) {
                },
            });
        },
-        nullptr, // ctx_fn
        [&]() {
            // tools
            return json::array({
@@ -428,7 +417,7 @@ caps caps_get(jinja::program & prog) {
                },
            });
        },
-        [&](bool success, value & messages, value &, const std::string &) {
+        [&](bool success, value & messages, value & /*tools*/) {
            if (!success) {
                result.supports_parallel_tool_calls = false;
                return;
@@ -449,22 +438,11 @@ caps caps_get(jinja::program & prog) {
    JJ_DEBUG("%s\n", ">>> Running capability check: preserve reasoning");

    // case: preserve reasoning content in chat history
-    const std::string reasoning_placeholder = "<REASONING_CONTENT_PLACEHOLDER>";
    caps_try_execute(
        prog,
        [&]() {
            // messages
            return json::array({
-                {
-                    {"role", "user"},
-                    {"content", "User message"}
-                },
-                {
-                    {"role", "assistant"},
-                    {"content", "Assistant message"},
-                    // check of reasoning_content deeper in the history, not just the last assistant message
-                    {"reasoning_content", reasoning_placeholder}
-                },
                {
                    {"role", "user"},
                    {"content", "User message"}
@@ -480,13 +458,14 @@ caps caps_get(jinja::program & prog) {
                },
            });
        },
-        [&](context & ctx) {
-            caps_apply_preserve_reasoning(ctx, true);
+        [&]() {
+            // tools
+            return json::array();
        },
-        nullptr, // tools_fn
-        [&](bool, value &, value &, const std::string & output) {
-            // note: we cannot use stats here because the reasoning_content may be used for "if" condition test, but not actually outputted in the final result
-            if (output.find(reasoning_placeholder) != std::string::npos) {
+        [&](bool, value & messages, value &) {
+            auto & content = messages->at(1)->at("reasoning_content");
+            caps_print_stats(content, "messages[1].reasoning_content");
+            if (content->stats.used) {
                result.supports_preserve_reasoning = true;
            }
        }
@@ -12,9 +12,7 @@ struct caps {
    bool supports_tool_calls = true;
    bool supports_system_role = true;
    bool supports_parallel_tool_calls = true;
-
-    // supports preserve reasoning trace in the full history, not just the last assistant message
-    bool supports_preserve_reasoning = false;
+    bool supports_preserve_reasoning = false; // support assistant message with reasoning_content

    // one of the 2 content capabilities must be true
    bool supports_string_content = true;
@@ -31,6 +29,4 @@ struct caps {

 caps caps_get(jinja::program & prog);

-void caps_apply_preserve_reasoning(jinja::context & ctx, bool enabled);
-
 } // namespace jinja
@@ -316,22 +316,12 @@ value filter_expression::execute_impl(context & ctx) {

    JJ_DEBUG("Applying filter to %s", input->type().c_str());

-    auto set_filter_alias = [](auto & filter_id) {
-        if (filter_id == "count") {
-            filter_id = "length";
-        } else if (filter_id == "d") {
-            filter_id = "default";
-        } else if (filter_id == "e") {
-            filter_id = "escape";
-        } else if (filter_id == "trim") {
-            filter_id = "strip";
-        }
-    };
-
    if (is_stmt<identifier>(filter)) {
        auto filter_id = cast_stmt<identifier>(filter)->val;

-        set_filter_alias(filter_id);
+        if (filter_id == "trim") {
+            filter_id = "strip"; // alias
+        }
        JJ_DEBUG("Applying filter '%s' to %s", filter_id.c_str(), input->type().c_str());
        // TODO: Refactor filters so this coercion can be done automatically
        if (!input->is_undefined() && !is_val<value_string>(input) && (
@@ -355,7 +345,9 @@ value filter_expression::execute_impl(context & ctx) {
        }
        auto filter_id = cast_stmt<identifier>(call->callee)->val;

-        set_filter_alias(filter_id);
+        if (filter_id == "trim") {
+            filter_id = "strip"; // alias
+        }
        JJ_DEBUG("Applying filter '%s' with arguments to %s", filter_id.c_str(), input->type().c_str());
        func_args args(ctx);
        for (const auto & arg_expr : call->args) {
@@ -686,62 +678,59 @@ value set_statement::execute_impl(context & ctx) {
    return mk_val<value_undefined>();
 }

-static inline void bind_parameters(const std::string & name, const statements & this_args, const func_args & args, context & ctx) {
-    const size_t expected_count = this_args.size();
-    const size_t input_count = args.count();
-
-    JJ_DEBUG("Invoking '%s' with %zu input arguments (expected %zu)", name.c_str(), input_count, expected_count);
-    for (size_t i = 0; i < expected_count; ++i) {
-        if (i < input_count) {
-            if (is_stmt<identifier>(this_args[i])) {
-                // normal parameter
-                std::string param_name = cast_stmt<identifier>(this_args[i])->val;
-                value param_value = args.get_kwarg_or_pos(param_name, i);
-                JJ_DEBUG("  Binding parameter '%s' to argument of type %s", param_name.c_str(), param_value->type().c_str());
-                ctx.set_val(param_name, param_value);
-            } else if (is_stmt<keyword_argument_expression>(this_args[i])) {
-                // default argument used as normal parameter
-                auto kwarg = cast_stmt<keyword_argument_expression>(this_args[i]);
-                if (!is_stmt<identifier>(kwarg->key)) {
-                    throw std::runtime_error("Keyword argument key must be an identifier in '" + name + "'");
-                }
-                std::string param_name = cast_stmt<identifier>(kwarg->key)->val;
-                value param_value = args.get_kwarg_or_pos(param_name, i);
-                JJ_DEBUG("  Binding parameter '%s' to argument of type %s", param_name.c_str(), param_value->type().c_str());
-                ctx.set_val(param_name, param_value);
-            } else {
-                throw std::runtime_error("Invalid parameter type in '" + name + "'");
-            }
-        } else {
-            auto & default_arg = this_args[i];
-            if (is_stmt<keyword_argument_expression>(default_arg)) {
-                auto kwarg = cast_stmt<keyword_argument_expression>(default_arg);
-                if (!is_stmt<identifier>(kwarg->key)) {
-                    throw std::runtime_error("Keyword argument key must be an identifier in '" + name + "'");
-                }
-                std::string param_name = cast_stmt<identifier>(kwarg->key)->val;
-                JJ_DEBUG("  Binding parameter '%s' to default argument of type %s", param_name.c_str(), kwarg->val->type().c_str());
-                ctx.set_val(param_name, kwarg->val->execute(args.ctx));
-            } else {
-                throw std::runtime_error("Not enough arguments provided to '" + name + "'");
-            }
-            //std::string param_name = cast_stmt<identifier>(default_args[i])->val;
-            //JJ_DEBUG("  Binding parameter '%s' to default", param_name.c_str());
-            //ctx.var[param_name] = default_args[i]->execute(ctx);
-        }
-    }
-}
-
 value macro_statement::execute_impl(context & ctx) {
    if (!is_stmt<identifier>(this->name)) {
        throw std::runtime_error("Macro name must be an identifier");
    }
    std::string name = cast_stmt<identifier>(this->name)->val;

-    const func_handler func = [this, name](const func_args & args) -> value {
-        context macro_ctx(args.ctx); // new scope for macro execution
+    const func_handler func = [this, name, &ctx](const func_args & args) -> value {
+        size_t expected_count = this->args.size();
+        size_t input_count = args.count();

-        bind_parameters(name, this->args, args, macro_ctx);
+        JJ_DEBUG("Invoking macro '%s' with %zu input arguments (expected %zu)", name.c_str(), input_count, expected_count);
+        context macro_ctx(ctx); // new scope for macro execution
+
+        // bind parameters
+        for (size_t i = 0; i < expected_count; ++i) {
+            if (i < input_count) {
+                if (is_stmt<identifier>(this->args[i])) {
+                    // normal parameter
+                    std::string param_name = cast_stmt<identifier>(this->args[i])->val;
+                    value param_value = args.get_kwarg_or_pos(param_name, i);
+                    JJ_DEBUG("  Binding parameter '%s' to argument of type %s", param_name.c_str(), param_value->type().c_str());
+                    macro_ctx.set_val(param_name, param_value);
+                } else if (is_stmt<keyword_argument_expression>(this->args[i])) {
+                    // default argument used as normal parameter
+                    auto kwarg = cast_stmt<keyword_argument_expression>(this->args[i]);
+                    if (!is_stmt<identifier>(kwarg->key)) {
+                        throw std::runtime_error("Keyword argument key must be an identifier in macro '" + name + "'");
+                    }
+                    std::string param_name = cast_stmt<identifier>(kwarg->key)->val;
+                    value param_value = args.get_kwarg_or_pos(param_name, i);
+                    JJ_DEBUG("  Binding parameter '%s' to argument of type %s", param_name.c_str(), param_value->type().c_str());
+                    macro_ctx.set_val(param_name, param_value);
+                } else {
+                    throw std::runtime_error("Invalid parameter type in macro '" + name + "'");
+                }
+            } else {
+                auto & default_arg = this->args[i];
+                if (is_stmt<keyword_argument_expression>(default_arg)) {
+                    auto kwarg = cast_stmt<keyword_argument_expression>(default_arg);
+                    if (!is_stmt<identifier>(kwarg->key)) {
+                        throw std::runtime_error("Keyword argument key must be an identifier in macro '" + name + "'");
+                    }
+                    std::string param_name = cast_stmt<identifier>(kwarg->key)->val;
+                    JJ_DEBUG("  Binding parameter '%s' to default argument of type %s", param_name.c_str(), kwarg->val->type().c_str());
+                    macro_ctx.set_val(param_name, kwarg->val->execute(ctx));
+                } else {
+                    throw std::runtime_error("Not enough arguments provided to macro '" + name + "'");
+                }
+                //std::string param_name = cast_stmt<identifier>(default_args[i])->val;
+                //JJ_DEBUG("  Binding parameter '%s' to default", param_name.c_str());
+                //macro_ctx.var[param_name] = default_args[i]->execute(ctx);
+            }
+        }

        // execute macro body
        JJ_DEBUG("Executing macro '%s' body with %zu statements", name.c_str(), this->body.size());
@@ -755,46 +744,6 @@ value macro_statement::execute_impl(context & ctx) {
    return mk_val<value_undefined>();
 }

-value call_statement::execute_impl(context & ctx) {
-    auto call_expr = cast_stmt<call_expression>(this->call);
-    if (!call_expr) {
-        throw std::runtime_error("Call statement requires a valid call expression");
-    }
-
-    value callee_val = call_expr->callee->execute(ctx);
-    if (!is_val<value_func>(callee_val)) {
-        throw std::runtime_error("Callee is not a function: got " + callee_val->type());
-    }
-    auto * callee_func = cast_val<value_func>(callee_val);
-
-    context caller_ctx(ctx); // new scope for caller execution
-
-    const func_handler func = [this, caller_ctx = std::move(caller_ctx)](const func_args & args) -> value {
-        context block_ctx(caller_ctx); // new scope for block execution
-
-        bind_parameters("caller", this->caller_args, args, block_ctx);
-
-        JJ_DEBUG("Executing call body with %zu statements", this->body.size());
-        auto res = exec_statements(this->body, block_ctx);
-        JJ_DEBUG("Call body execution complete, result: %s", res->val_str.str().c_str());
-        return res;
-    };
-
-    context call_ctx(ctx);
-    call_ctx.set_val("caller", mk_val<value_func>("caller", func));
-
-    func_args args(call_ctx);
-
-    for (const auto & arg_expr : call_expr->args) {
-        auto arg_val = arg_expr->execute(ctx);
-        JJ_DEBUG("  Argument type: %s", arg_val->type().c_str());
-        args.push_back(arg_val);
-    }
-
-    JJ_DEBUG("Calling macro '%s' with %zu arguments", callee_func->name.c_str(), args.count());
-    return callee_func->invoke(args);
-}
-
 value member_expression::execute_impl(context & ctx) {
    value object = this->object->execute(ctx);

@@ -812,9 +761,9 @@ value member_expression::execute_impl(context & ctx) {

        if (is_stmt<slice_expression>(this->property)) {
            auto s = cast_stmt<slice_expression>(this->property);
+            value start_val = s->start_expr ? s->start_expr->execute(ctx) : mk_val<value_int>(0);
+            value stop_val  = s->stop_expr  ? s->stop_expr->execute(ctx)  : mk_val<value_int>(arr_size);
            value step_val  = s->step_expr  ? s->step_expr->execute(ctx)  : mk_val<value_int>(1);
-            value start_val = s->start_expr ? s->start_expr->execute(ctx) : (step_val->as_int() < 0 ? mk_val<value_int>(arr_size - 1) : mk_val<value_int>(0));
-            value stop_val  = s->stop_expr  ? s->stop_expr->execute(ctx)  : (step_val->as_int() < 0 ? mk_val<value_int>(-1) : mk_val<value_int>(arr_size));

            // translate to function call: obj.slice(start, stop, step)
            JJ_DEBUG("Member expression is a slice: start %s, stop %s, step %s",
@@ -954,50 +903,4 @@ value keyword_argument_expression::execute_impl(context & ctx) {
    return mk_val<value_kwarg>(k, v);
 }

-std::string runtime::debug_dump_program(const program & prog, const std::string & src) {
-    std::ostringstream oss;
-    size_t lvl = 0;
-    context ctx;
-    ctx.src.reset(new std::string(src));
-
-    auto indent = [](size_t lvl) -> std::string {
-        return std::string(lvl * 2, ' ');
-    };
-
-    ctx.visitor = [&](bool is_leaf, statement * node, std::vector<visitor_pair> children) {
-        oss << indent(lvl) << node->type() << ":\n";
-        lvl++;
-        if (is_leaf) {
-            const auto & pos = node->pos;
-            oss << indent(lvl) << "(leaf) at " << get_line_col(src, pos) << " in source:\n";
-            std::string snippet = peak_source(src, pos);
-            string_replace_all(snippet, "\n", "\n" + indent(lvl));
-            oss << indent(lvl) << snippet << "\n";
-        } else {
-            for (auto & [label, children_vec] : children) {
-                oss << indent(lvl) << label << ":\n";
-                lvl++;
-                if (children_vec.empty()) {
-                    oss << indent(lvl) << "<empty>\n\n";
-                } else {
-                    for (auto * child : children_vec) {
-                        if (!child) {
-                            continue;
-                        }
-                        child->visit(ctx);
-                    }
-                }
-                lvl--;
-            }
-        }
-        lvl--;
-    };
-
-    for (const auto & stmt : prog.body) {
-        stmt->visit(ctx);
-    }
-
-    return oss.str();
-}
-
 } // namespace jinja
@@ -47,19 +47,12 @@ const T * cast_stmt(const statement_ptr & ptr) {
 // not thread-safe
 void enable_debug(bool enable);

-// for visiting AST nodes
-// function signature: void(bool is_leaf, statement * node, pair of <label, children>)
-using visitor_pair = std::pair<std::string, std::vector<statement *>>;
-using visitor_fn = std::function<void(bool, statement *, std::vector<visitor_pair>)>;
-
 struct context {
    std::shared_ptr<std::string> src; // for debugging; use shared_ptr to avoid copying on scope creation
    std::time_t current_time; // for functions that need current time

    bool is_get_stats = false; // whether to collect stats

-    visitor_fn visitor;
-
    // src is optional, used for error reporting
    context(std::string src = "") : src(std::make_shared<std::string>(std::move(src))) {
        env = mk_val<value_object>();
@@ -106,15 +99,6 @@ private:
    value_object env;
 };

-// utils for visiting AST nodes
-static std::vector<statement *> stmts_to_ptr(const statements & stmts) {
-    std::vector<statement *> children;
-    for (const auto & stmt : stmts) {
-        children.push_back(stmt.get());
-    }
-    return children;
-}
-
 /**
 * Base class for all nodes in the AST.
 */
@@ -122,7 +106,6 @@ struct statement {
    size_t pos; // position in source, for debugging
    virtual ~statement() = default;
    virtual std::string type() const { return "Statement"; }
-    virtual void visit(context & ctx) { ctx.visitor(true, this, {}); }

    // execute_impl must be overridden by derived classes
    virtual value execute_impl(context &) { throw_exec_error(); }
@@ -183,13 +166,6 @@ struct if_statement : public statement {

    std::string type() const override { return "If"; }
    value execute_impl(context & ctx) override;
-    void visit(context & ctx) override {
-        ctx.visitor(false, this, {
-            {"test", {test.get()}},
-            {"body", stmts_to_ptr(body)},
-            {"alternate", stmts_to_ptr(alternate)}
-        });
-    }
 };

 struct identifier;
@@ -214,14 +190,6 @@ struct for_statement : public statement {

    std::string type() const override { return "For"; }
    value execute_impl(context & ctx) override;
-    void visit(context & ctx) override {
-        ctx.visitor(false, this, {
-            {"loopvar", {loopvar.get()}},
-            {"iterable", {iterable.get()}},
-            {"body", stmts_to_ptr(body)},
-            {"default_block", stmts_to_ptr(default_block)}
-        });
-    }
 };

 struct break_statement : public statement {
@@ -273,13 +241,6 @@ struct set_statement : public statement {

    std::string type() const override { return "Set"; }
    value execute_impl(context & ctx) override;
-    void visit(context & ctx) override {
-        ctx.visitor(false, this, {
-            {"assignee", {assignee.get()}},
-            {"value", {val.get()}},
-            {"body", stmts_to_ptr(body)}
-        });
-    }
 };

 struct macro_statement : public statement {
@@ -295,13 +256,6 @@ struct macro_statement : public statement {

    std::string type() const override { return "Macro"; }
    value execute_impl(context & ctx) override;
-    void visit(context & ctx) override {
-        ctx.visitor(false, this, {
-            {"name", {name.get()}},
-            {"args", stmts_to_ptr(args)},
-            {"body", stmts_to_ptr(body)}
-        });
-    }
 };

 struct comment_statement : public statement {
@@ -335,12 +289,6 @@ struct member_expression : public expression {
    }
    std::string type() const override { return "MemberExpression"; }
    value execute_impl(context & ctx) override;
-    void visit(context & ctx) override {
-        ctx.visitor(false, this, {
-            {"object", {object.get()}},
-            {"property", {property.get()}}
-        });
-    }
 };

 struct call_expression : public expression {
@@ -354,12 +302,6 @@ struct call_expression : public expression {
    }
    std::string type() const override { return "CallExpression"; }
    value execute_impl(context & ctx) override;
-    void visit(context & ctx) override {
-        ctx.visitor(false, this, {
-            {"callee", {callee.get()}},
-            {"args", stmts_to_ptr(args)}
-        });
-    }
 };

 /**
@@ -463,12 +405,6 @@ struct binary_expression : public expression {
    }
    std::string type() const override { return "BinaryExpression"; }
    value execute_impl(context & ctx) override;
-    void visit(context & ctx) override {
-        ctx.visitor(false, this, {
-            {"left", {left.get()}},
-            {"right", {right.get()}}
-        });
-    }
 };

 /**
@@ -495,12 +431,6 @@ struct filter_expression : public expression {

    std::string type() const override { return "FilterExpression"; }
    value execute_impl(context & ctx) override;
-    void visit(context & ctx) override {
-        ctx.visitor(false, this, {
-            {"operand", {operand.get()}},
-            {"filter", {filter.get()}}
-        });
-    }
 };

 struct filter_statement : public statement {
@@ -513,12 +443,6 @@ struct filter_statement : public statement {
    }
    std::string type() const override { return "FilterStatement"; }
    value execute_impl(context & ctx) override;
-    void visit(context & ctx) override {
-        ctx.visitor(false, this, {
-            {"filter", {filter.get()}},
-            {"body", stmts_to_ptr(body)}
-        });
-    }
 };

 /**
@@ -544,12 +468,6 @@ struct select_expression : public expression {
        }
        return lhs->execute_impl(ctx);
    }
-    void visit(context & ctx) override {
-        ctx.visitor(false, this, {
-            {"lhs", {lhs.get()}},
-            {"test", {test.get()}}
-        });
-    }
 };

 /**
@@ -568,12 +486,6 @@ struct test_expression : public expression {
    }
    std::string type() const override { return "TestExpression"; }
    value execute_impl(context & ctx) override;
-    void visit(context & ctx) override {
-        ctx.visitor(false, this, {
-            {"operand", {operand.get()}},
-            {"test", {test.get()}}
-        });
-    }
 };

 /**
@@ -589,11 +501,6 @@ struct unary_expression : public expression {
    }
    std::string type() const override { return "UnaryExpression"; }
    value execute_impl(context & ctx) override;
-    void visit(context & ctx) override {
-        ctx.visitor(false, this, {
-            {"argument", {argument.get()}}
-        });
-    }
 };

 struct slice_expression : public expression {
@@ -611,13 +518,6 @@ struct slice_expression : public expression {
    [[noreturn]] value execute_impl(context &) override {
        throw std::runtime_error("must be handled by MemberExpression");
    }
-    void visit(context & ctx) override {
-        ctx.visitor(false, this, {
-            {"start_expr", {start_expr.get()}},
-            {"stop_expr", {stop_expr.get()}},
-            {"step_expr", {step_expr.get()}}
-        });
-    }
 };

 struct keyword_argument_expression : public expression {
@@ -631,12 +531,6 @@ struct keyword_argument_expression : public expression {
    }
    std::string type() const override { return "KeywordArgumentExpression"; }
    value execute_impl(context & ctx) override;
-    void visit(context & ctx) override {
-        ctx.visitor(false, this, {
-            {"key", {key.get()}},
-            {"val", {val.get()}}
-        });
-    }
 };

 struct spread_expression : public expression {
@@ -645,11 +539,6 @@ struct spread_expression : public expression {
        chk_type<expression>(this->argument);
    }
    std::string type() const override { return "SpreadExpression"; }
-    void visit(context & ctx) override {
-        ctx.visitor(false, this, {
-            {"argument", {argument.get()}}
-        });
-    }
 };

 struct call_statement : public statement {
@@ -663,14 +552,6 @@ struct call_statement : public statement {
        for (const auto & arg : this->caller_args) chk_type<expression>(arg);
    }
    std::string type() const override { return "CallStatement"; }
-    value execute_impl(context & ctx) override;
-    void visit(context & ctx) override {
-        ctx.visitor(false, this, {
-            {"call", {call.get()}},
-            {"caller_args", stmts_to_ptr(caller_args)},
-            {"body", stmts_to_ptr(body)}
-        });
-    }
 };

 struct ternary_expression : public expression {
@@ -693,13 +574,6 @@ struct ternary_expression : public expression {
            return false_expr->execute(ctx);
        }
    }
-    void visit(context & ctx) override {
-        ctx.visitor(false, this, {
-            {"condition", {condition.get()}},
-            {"true_expr", {true_expr.get()}},
-            {"false_expr", {false_expr.get()}}
-        });
-    }
 };

 struct raised_exception : public std::exception {
@@ -773,8 +647,6 @@ struct runtime {
        }
        return parts;
    }
-
-    static std::string debug_dump_program(const program & prog, const std::string & src);
 };

 } // namespace jinja
@@ -90,14 +90,14 @@ static T slice(const T & array, int64_t start, int64_t stop, int64_t step = 1) {
            stop_val = std::min(stop_val, len);
        }
    } else {
-        start_val = start;
+        start_val = len - 1;
        if (start_val < 0) {
-            start_val = std::max(len + start_val, (int64_t)0);
+            start_val = std::max(len + start_val, (int64_t)-1);
        } else {
            start_val = std::min(start_val, len - 1);
        }

-        stop_val = stop;
+        stop_val = -1;
        if (stop_val < -1) {
            stop_val = std::max(len + stop_val, (int64_t)-1);
        } else {
@@ -673,9 +673,6 @@ const func_builtins & value_string_t::get_builtins() const {
            std::string str = val_input->as_string().str();
            // FIXME: Support non-specified delimiter (split on consecutive (no leading or trailing) whitespace)
            std::string delim = (args.count() > 1) ? args.get_pos(1)->as_string().str() : " ";
-            if (delim.empty()) {
-                throw raised_exception("empty separator");
-            }
            int64_t maxsplit = (args.count() > 2) ? args.get_pos(2)->as_int() : -1;
            auto result = mk_val<value_array>();
            size_t pos = 0;
@@ -700,9 +697,6 @@ const func_builtins & value_string_t::get_builtins() const {
            std::string str = val_input->as_string().str();
            // FIXME: Support non-specified delimiter (split on consecutive (no leading or trailing) whitespace)
            std::string delim = (args.count() > 1) ? args.get_pos(1)->as_string().str() : " ";
-            if (delim.empty()) {
-                throw raised_exception("empty separator");
-            }
            int64_t maxsplit = (args.count() > 2) ? args.get_pos(2)->as_int() : -1;
            auto result = mk_val<value_array>();
            size_t pos = 0;
@@ -728,23 +722,10 @@ const func_builtins & value_string_t::get_builtins() const {
            if (count > 0) {
                throw not_implemented_exception("String replace with count argument not implemented");
            }
-            if (old_str != new_str) {
-                size_t pos = 0;
-                if (old_str.empty()) {
-                    std::string new_res;
-                    new_res.reserve(str.length() + new_str.length() * (str.length() + 1));
-                    new_res += new_str;
-                    for (const char c : str) {
-                        new_res.push_back(c);
-                        new_res += new_str;
-                    }
-                    str = new_res;
-                } else {
-                    while ((pos = str.find(old_str, pos)) != std::string::npos) {
-                        str.replace(pos, old_str.length(), new_str);
-                        pos += new_str.length();
-                    }
-                }
+            size_t pos = 0;
+            while ((pos = str.find(old_str, pos)) != std::string::npos) {
+                str.replace(pos, old_str.length(), new_str);
+                pos += new_str.length();
            }
            auto res = mk_val<value_string>(str);
            res->val_str.mark_input_based_on(args.get_pos(0)->val_str);
@@ -1108,50 +1089,6 @@ const func_builtins & value_array_t::get_builtins() const {
            std::reverse(arr.begin(), arr.end());
            return is_val<value_tuple>(val) ? mk_val<value_tuple>(std::move(arr)) : mk_val<value_array>(std::move(arr));
        }},
-        {"min", [](const func_args & args) -> value {
-            args.ensure_count(1, 4);
-            args.ensure_vals<value_array>();
-            value val_case    = args.get_kwarg_or_pos("case_sensitive", 1);
-            value attribute   = args.get_kwarg_or_pos("attribute",      2);
-            if (!attribute->is_undefined()) {
-                throw not_implemented_exception("min: attribute not implemented");
-            }
-            // FIXME: min is currently always case sensitive
-            (void) val_case;
-            const auto & arr = args.get_pos(0)->as_array();
-            if (arr.empty()) {
-                return mk_val<value_undefined>();
-            }
-            value result = arr[0];
-            for (size_t i = 1; i < arr.size(); ++i) {
-                if (value_compare(arr[i], result, value_compare_op::lt)) {
-                    result = arr[i];
-                }
-            }
-            return result;
-        }},
-        {"max", [](const func_args & args) -> value {
-            args.ensure_count(1, 4);
-            args.ensure_vals<value_array>();
-            value val_case    = args.get_kwarg_or_pos("case_sensitive", 1);
-            value attribute   = args.get_kwarg_or_pos("attribute",      2);
-            if (!attribute->is_undefined()) {
-                throw not_implemented_exception("max: attribute not implemented");
-            }
-            // FIXME: max is currently always case sensitive
-            (void) val_case;
-            const auto & arr = args.get_pos(0)->as_array();
-            if (arr.empty()) {
-                return mk_val<value_undefined>();
-            }
-            value result = arr[0];
-            for (size_t i = 1; i < arr.size(); ++i) {
-                if (value_compare(arr[i], result, value_compare_op::gt)) {
-                    result = arr[i];
-                }
-            }
-            return result;
-        }},
        {"unique", array_unique_not_implemented},
    };
    return builtins;
@@ -0,0 +1,324 @@
+#include "json-partial.h"
+
+#include "log.h"
+
+#include <nlohmann/json.hpp>
+
+#include <string>
+#include <regex>
+
+using json = nlohmann::ordered_json;
+
+enum common_json_stack_element_type {
+    COMMON_JSON_STACK_ELEMENT_OBJECT,
+    COMMON_JSON_STACK_ELEMENT_KEY,
+    COMMON_JSON_STACK_ELEMENT_ARRAY,
+};
+
+struct common_json_stack_element {
+    common_json_stack_element_type type;
+    std::string key;
+};
+
+bool common_json_parse(
+    const std::string & input,
+    const std::string & healing_marker,
+    common_json & out)
+{
+    std::string::const_iterator it = input.begin();
+    const auto end = input.end();
+    return common_json_parse(it, end, healing_marker, out);
+}
+
+bool common_json_parse(
+    std::string::const_iterator & it,
+    const std::string::const_iterator & end,
+    const std::string & healing_marker,
+    common_json & out)
+{
+    // SAX handler that records where parsing failed; see https://json.nlohmann.me/features/parsing/sax_interface/
+    struct json_error_locator : public nlohmann::json_sax<json> {
+        std::size_t position;
+        bool found_error;
+        std::string last_token;
+        std::string exception_message;
+        std::vector<common_json_stack_element> stack;
+
+        json_error_locator() : position(0), found_error(false) {}
+
+        bool parse_error(std::size_t position, const std::string & last_token, const json::exception & ex) override { // NOLINT
+            this->position = position - 1;
+            this->found_error = true;
+            this->last_token = last_token;
+            this->exception_message = ex.what();
+            return false;
+        }
+        void close_value() {
+            if (!stack.empty() && (stack.back().type == COMMON_JSON_STACK_ELEMENT_KEY)) {
+                stack.pop_back();
+            }
+        }
+        bool null() override { // NOLINT
+            close_value();
+            return true;
+        }
+        bool boolean(bool) override { // NOLINT
+            close_value();
+            return true;
+        }
+        bool number_integer(number_integer_t) override { // NOLINT
+            close_value();
+            return true;
+        }
+        bool number_unsigned(number_unsigned_t) override { // NOLINT
+            close_value();
+            return true;
+        }
+        bool number_float(number_float_t, const string_t &) override { // NOLINT
+            close_value();
+            return true;
+        }
+        bool string(string_t &) override { // NOLINT
+            close_value();
+            return true;
+        }
+        bool binary(binary_t &) override { // NOLINT
+            close_value();
+            return true;
+        }
+        bool start_object(std::size_t) override { // NOLINT
+            stack.push_back({COMMON_JSON_STACK_ELEMENT_OBJECT, ""});
+            return true;
+        }
+        bool end_object() override {
+            GGML_ASSERT(!stack.empty() && stack.back().type == COMMON_JSON_STACK_ELEMENT_OBJECT);
+            stack.pop_back();
+            close_value();
+            return true;
+        }
+        bool key(string_t & key) override { // NOLINT
+            stack.push_back({COMMON_JSON_STACK_ELEMENT_KEY, key});
+            return true;
+        }
+        bool start_array(std::size_t) override { // NOLINT
+            stack.push_back({COMMON_JSON_STACK_ELEMENT_ARRAY, ""});
+            return true;
+        }
+        bool end_array() override {
+            GGML_ASSERT(!stack.empty() && stack.back().type == COMMON_JSON_STACK_ELEMENT_ARRAY);
+            stack.pop_back();
+            close_value();
+            return true;
+        }
+    };
+    json_error_locator err_loc;
+    auto start = it;
+    json::sax_parse(it, end, &err_loc);
+
+    if (err_loc.found_error) {
+        it = start;
+        auto temptative_end = it + err_loc.position;
+        // LOG_DBG("Error at position %zu (is_end = %s): %s\n", err_loc.position, temptative_end == end ? "true" : "false", err_loc.exception_message.c_str());
+
+        auto input = std::string(it, temptative_end);
+        try {
+            out.json = json::parse(input);
+            // out.json = json::parse(it, temptative_end);
+            it = temptative_end;
+            return true;
+        } catch (const std::exception & ex) {
+            // No, needs healing.
+            LOG_DBG("Failed to parse up to error: %s: <<<%s>>>\n", ex.what(), std::string(it, temptative_end).c_str());
+        }
+        auto can_parse = [](const std::string & str) {
+            try {
+                auto _ = json::parse(str); // NOLINT
+                return true;
+            } catch (const std::exception &) {
+                return false;
+            }
+        };
+        if (!healing_marker.empty() && !err_loc.stack.empty()) {
+            std::string str(it, temptative_end);
+            auto last_non_sp_pos = str.find_last_not_of(" \n\r\t");
+            if (last_non_sp_pos == std::string::npos) {
+                throw std::runtime_error("Cannot heal a truncated JSON that stopped in an unknown location");
+            }
+            auto last_non_sp_char = str[last_non_sp_pos];
+            // Detects a stop on a possibly incomplete number. Chars are cast to unsigned char first: <cctype> calls are UB on negative values (e.g. UTF-8 bytes).
+            auto was_maybe_number = [&]() {
+                if (!str.empty() && std::isspace(static_cast<unsigned char>(str.back()))) {
+                    return false;
+                }
+                return std::isdigit(static_cast<unsigned char>(last_non_sp_char)) ||
+                    last_non_sp_char == '.' ||
+                    last_non_sp_char == 'e' ||
+                    last_non_sp_char == 'E' ||
+                    last_non_sp_char == '-';
+            };
+
+            std::string closing;
+            for (size_t i = err_loc.stack.size(); i > 0; i--) {
+                auto & el = err_loc.stack[i - 1];
+                if (el.type == COMMON_JSON_STACK_ELEMENT_OBJECT) {
+                    closing += "}";
+                } else if (el.type == COMMON_JSON_STACK_ELEMENT_ARRAY) {
+                    closing += "]";
+                } else if (el.type != COMMON_JSON_STACK_ELEMENT_KEY) {
+                    throw std::runtime_error("Unexpected stack element type");
+                }
+            }
+
+            // Matches a potentially partial unicode escape sequence, e.g. \u, \uX, \uXX, \uXXX, \uXXXX
+            static const std::regex partial_unicode_regex(R"(\\u(?:[0-9a-fA-F](?:[0-9a-fA-F](?:[0-9a-fA-F](?:[0-9a-fA-F])?)?)?)?$)");
+
+            auto is_high_surrogate = [&](const std::string & s) {
+                // Check whether this is a (possibly partial) escape of a high surrogate (U+D800-U+DBFF)
+                return s.length() >= 4 &&
+                    s[0] == '\\' && s[1] == 'u' &&
+                    std::tolower(s[2]) == 'd' &&
+                    (s[3] == '8' || s[3] == '9' || std::tolower(s[3]) == 'a' || std::tolower(s[3]) == 'b');
+            };
+
+            // Initialize the unicode marker to a low surrogate to handle the edge case
+            // where a high surrogate (U+D800-U+DBFF) is immediately followed by a
+            // backslash (\)
+            std::string unicode_marker_padding = "udc00";
+            std::smatch last_unicode_seq;
+
+            if (std::regex_search(str, last_unicode_seq, partial_unicode_regex)) {
+                std::smatch second_last_seq;
+                std::string prelude = str.substr(0, last_unicode_seq.position());
+
+                // Pad the escape sequence with 0s until it forms a complete sequence of 6 characters
+                unicode_marker_padding = std::string(6 - last_unicode_seq.length(), '0');
+
+                if (is_high_surrogate(last_unicode_seq.str())) {
+                    // If the sequence is a partial match for a high surrogate, add a low surrogate (U+DC00-U+DFFF)
+                    unicode_marker_padding += "\\udc00";
+                } else if (std::regex_search(prelude, second_last_seq, partial_unicode_regex)) {
+                    if (is_high_surrogate(second_last_seq.str())) {
+                        // If this follows a high surrogate, pad it to be a low surrogate
+                        if (last_unicode_seq.length() == 2) {
+                            unicode_marker_padding = "dc00";
+                        } else if (last_unicode_seq.length() == 3) {
+                            unicode_marker_padding = "c00";
+                        } else {
+                            // The original unicode_marker_padding is already padded with 0s
+                        }
+                    }
+                }
+            }
+
+            const auto & magic_seed = out.healing_marker.marker = healing_marker;//"$llama.cpp.json$";
+
+            if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_KEY) {
+                // We're inside an object value
+                if (last_non_sp_char == ':' && can_parse(str + "1" + closing)) {
+                    // Was about to create an object value
+                    str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
+                } else if (can_parse(str + ": 1" + closing)) {
+                    str += (out.healing_marker.json_dump_marker = ":\"" + magic_seed) + "\"" + closing;
+                } else if (last_non_sp_char == '{' && can_parse(str + closing)) {
+                    // Was about to create an object
+                    str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\": 1" + closing;
+                } else if (can_parse(str + "\"" + closing)) {
+                    // Was inside an object value string
+                    str += (out.healing_marker.json_dump_marker = magic_seed) + "\"" + closing;
+                } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"" + closing)) {
+                    // Was inside an object value string after an escape
+                    str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"" + closing;
+                } else if (can_parse(str + unicode_marker_padding + "\"" + closing)) {
+                    // Was inside an object value string after a partial unicode escape
+                    str += (out.healing_marker.json_dump_marker = unicode_marker_padding + magic_seed) + "\"" + closing;
+                } else {
+                    // find last :
+                    auto last_pos = str.find_last_of(':');
+                    if (last_pos == std::string::npos) {
+                        throw std::runtime_error("Cannot heal a truncated JSON that stopped in an unknown location");
+                    }
+                    // Cutting back to opening : for object value
+                    str = str.substr(0, last_pos + 1) + (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
+                }
+            } else if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_ARRAY) {
+                if ((last_non_sp_char == ',' || last_non_sp_char == '[') && can_parse(str + "1" + closing)) {
+                    // Was about to create an array value
+                    str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
+                } else if (can_parse(str + "\"" + closing)) {
+                    // Was inside an array value string
+                    str += (out.healing_marker.json_dump_marker = magic_seed) + "\"" + closing;
+                } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"" + closing)) {
+                    // Was inside an array value string after an escape
+                    str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"" + closing;
+                } else if (can_parse(str + unicode_marker_padding + "\"" + closing)) {
+                    // Was inside an array value string after a partial unicode escape
+                    str += (out.healing_marker.json_dump_marker = unicode_marker_padding + magic_seed) + "\"" + closing;
+                } else if (!was_maybe_number() && can_parse(str + ", 1" + closing)) {
+                    // Had just finished a value
+                    str += (out.healing_marker.json_dump_marker = ",\"" + magic_seed) + "\"" + closing;
+                } else {
+                    auto last_pos = str.find_last_of("[,");
+                    if (last_pos == std::string::npos) {
+                        throw std::runtime_error("Cannot heal a truncated JSON array stopped in an unknown location");
+                    }
+                    // Cutting back to last [ or , for array value
+                    str = str.substr(0, last_pos + 1) + (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
+                }
+            } else if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_OBJECT) {
+                if ((last_non_sp_char == '{' && can_parse(str + closing)) ||
+                        (last_non_sp_char == ',' && can_parse(str + "\"\": 1" + closing))) {
+                    // Was about to create an object key+value
+                    str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\": 1" + closing;
+                } else if (!was_maybe_number() && can_parse(str + ",\"\": 1" + closing)) {
+                    // Had just finished a value: add a comma, then a new object key+value
+                    str += (out.healing_marker.json_dump_marker = ",\"" + magic_seed) + "\": 1" + closing;
+                } else if (can_parse(str + "\": 1" + closing)) {
+                    // Was inside an object key string
+                    str += (out.healing_marker.json_dump_marker = magic_seed) + "\": 1" + closing;
+                } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\": 1" + closing)) {
+                    // Was inside an object key string after an escape
+                    str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\": 1" + closing;
+                } else if (can_parse(str + unicode_marker_padding + "\": 1" + closing)) {
+                    // Was inside an object key string after a partial unicode escape
+                    str += (out.healing_marker.json_dump_marker = unicode_marker_padding + magic_seed) + "\": 1" + closing;
+                } else {
+                    auto last_pos = str.find_last_of(':');
+                    if (last_pos == std::string::npos) {
+                        throw std::runtime_error("Cannot heal a truncated JSON object stopped in an unknown location");
+                    }
+                    // fprintf(stderr, "Cutting back to last : for object key+value\n");
+                    str = str.substr(0, last_pos + 1) + (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
+                }
+            } else {
+                throw std::runtime_error("Cannot heal a truncated JSON object stopped in an unknown location");
+            }
+            // fprintf(stderr, "HEALED:\nSTRING <<<\n%s\n>>>\n\nmagic_cut: <<<\n%s\n>>>\n\n", str.c_str(), out.healing_marker.json_dump_marker.c_str());
+            out.json = json::parse(str);
+            it = temptative_end;
+            return true;
+        }
+        // handle unclosed top-level primitive
+        if (err_loc.position != 0 && !healing_marker.empty() && err_loc.stack.empty()) {
+            std::string str(it, temptative_end);
+            const auto & magic_seed = out.healing_marker.marker = healing_marker;
+            if (can_parse(str + "\"")) {
+                // Was inside a string
+                str += (out.healing_marker.json_dump_marker = magic_seed) + "\"";
+            } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"")) {
+                // Was inside a string after an escape
+                str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"";
+            } else {
+                // TODO: handle more unclosed top-level primitive if the stack was empty but we got an error (e.g. "tru", "\"", etc...)
+                // fprintf(stderr, "Closing: TODO\n");
+                return false;
+            }
+            out.json = json::parse(str);
+            it = temptative_end;
+            return true;
+        }
+        return false;
+    }
+    out.json = json::parse(it, end);
+    it = end;
+    return true;
+}
@@ -0,0 +1,39 @@
+#pragma once
+
+// TODO: use json_fwd.hpp when possible
+#include <nlohmann/json.hpp>
+
+// Healing marker (empty if the JSON was fully parsed / wasn't healed).
+struct common_healing_marker {
+    // Raw marker.
+    std::string marker;
+
+    // Cutting the `common_json.json.dump()` string at the (only) occurrence of this marker should yield the original partial JSON string (modulo spaces / if it had the same dump format).
+    std::string json_dump_marker;
+};
+
+// Represents a parsed JSON object, with its optional healing marker (a JSON dump fragment that can be used to find the position of healing in the JSON dump string)
+struct common_json {
+    nlohmann::ordered_json json;
+
+    common_healing_marker healing_marker;
+};
+
+// Parse the JSON string, healing (closing) any partial JSON if `healing_marker` is not empty.
+//
+// Healing completes partial JSON strings by adding a (possibly modified) healing marker, then whatever is needed to close the JSON.
+// This allows parsing the resulting healed JSON string while still being able to cut it again at the healing marker if needed.
+// (this is used when parsing JSON outputs from the models, then crafting partial JSONs for the partial tool calls in OAI format).
+//
+// For instance, parsing `{` with a healing marker `foo` will produce a healed JSON `{"foo":1}`, w/ json_dump_marker = `"foo"` (which can be used to break the JSON again).
+bool common_json_parse(
+    const std::string & input,
+    const std::string & healing_marker,
+    common_json & out);
+
+// Parse the JSON string (see overload above), but advancing an iterator to the end of the input when the (potentially partial) parsing succeeds.
+bool common_json_parse(
+    std::string::const_iterator & it,
+    const std::string::const_iterator & end,
+    const std::string & healing_marker,
+    common_json & out);
@@ -233,27 +233,27 @@ struct BuiltinRule {
 };

 static std::unordered_map<std::string, BuiltinRule> PRIMITIVE_RULES = {
-    {"boolean", {"(\"true\" | \"false\")", {}}},
+    {"boolean", {"(\"true\" | \"false\") space", {}}},
    {"decimal-part", {"[0-9]{1,16}", {}}},
    {"integral-part", {"[0] | [1-9] [0-9]{0,15}", {}}},
-    {"number", {"(\"-\"? integral-part) (\".\" decimal-part)? ([eE] [-+]? integral-part)?", {"integral-part", "decimal-part"}}},
-    {"integer", {"(\"-\"? integral-part)", {"integral-part"}}},
+    {"number", {"(\"-\"? integral-part) (\".\" decimal-part)? ([eE] [-+]? integral-part)? space", {"integral-part", "decimal-part"}}},
+    {"integer", {"(\"-\"? integral-part) space", {"integral-part"}}},
    {"value", {"object | array | string | number | boolean | null", {"object", "array", "string", "number", "boolean", "null"}}},
-    {"object", {"\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? space \"}\"", {"string", "value"}}},
-    {"array", {"\"[\" space ( value (\",\" space value)* )? space \"]\"", {"value"}}},
-    {"uuid", {"\"\\\"\" [0-9a-fA-F]{8} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{12} \"\\\"\"", {}}},
+    {"object", {"\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? \"}\" space", {"string", "value"}}},
+    {"array", {"\"[\" space ( value (\",\" space value)* )? \"]\" space", {"value"}}},
+    {"uuid", {"\"\\\"\" [0-9a-fA-F]{8} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{12} \"\\\"\" space", {}}},
    {"char",   {"[^\"\\\\\\x7F\\x00-\\x1F] | [\\\\] ([\"\\\\bfnrt] | \"u\" [0-9a-fA-F]{4})", {}}},
-    {"string", {"\"\\\"\" char* \"\\\"\"", {"char"}}},
-    {"null", {"\"null\"", {}}},
+    {"string", {"\"\\\"\" char* \"\\\"\" space", {"char"}}},
+    {"null", {"\"null\" space", {}}},
 };

 static std::unordered_map<std::string, BuiltinRule> STRING_FORMAT_RULES = {
    {"date", {"[0-9]{4} \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )", {}}},
    {"time", {"([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9]{3} )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )", {}}},
    {"date-time", {"date \"T\" time", {"date", "time"}}},
-    {"date-string", {"\"\\\"\" date \"\\\"\"", {"date"}}},
-    {"time-string", {"\"\\\"\" time \"\\\"\"", {"time"}}},
-    {"date-time-string", {"\"\\\"\" date-time \"\\\"\"", {"date-time"}}}
+    {"date-string", {"\"\\\"\" date \"\\\"\" space", {"date"}}},
+    {"time-string", {"\"\\\"\" time \"\\\"\" space", {"time"}}},
+    {"date-time-string", {"\"\\\"\" date-time \"\\\"\" space", {"date-time"}}}
 };

 static bool is_reserved_name(const std::string & name) {
@@ -551,16 +551,16 @@ private:
            }
            return join_seq();
        };
-        return _add_rule(name, "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\"");
+        return _add_rule(name, "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\" space");
    }

    /*
        Returns a rule that matches a JSON string that is none of the provided strings

        not_strings({"a"})
-            -> ["] ( [a] char+ | [^"a] char* )? ["]
+            -> ["] ( [a] char+ | [^"a] char* )? ["] space
        not_strings({"and", "also"})
-            -> ["] ( [a] ([l] ([s] ([o] char+ | [^"o] char*) | [^"s] char*) | [n] ([d] char+ | [^"d] char*) | [^"ln] char*) | [^"a] char* )? ["]
+            -> ["] ( [a] ([l] ([s] ([o] char+ | [^"o] char*) | [^"s] char*) | [n] ([d] char+ | [^"d] char*) | [^"ln] char*) | [^"a] char* )? ["] space
    */
    std::string _not_strings(const std::vector<std::string> & strings) {

@@ -619,7 +619,7 @@ private:
        if (!trie.is_end_of_string) {
            out << "?";
        }
-        out << " [\"]";
+        out << " [\"] space";
        return out.str();
    }

@@ -725,7 +725,7 @@ private:
            rule += " )?";
        }

-        rule += " space \"}\"";
+        rule += " \"}\" space";

        return rule;
    }
@@ -858,14 +858,14 @@ public:
            return _add_rule(rule_name, _generate_union_rule(name, schema_types));
        }
        if (schema.contains("const")) {
-            return _add_rule(rule_name, _generate_constant_rule(schema["const"]));
+            return _add_rule(rule_name, _generate_constant_rule(schema["const"]) + " space");
        }
        if (schema.contains("enum")) {
            std::vector<std::string> enum_values;
            for (const auto & v : schema["enum"]) {
                enum_values.push_back(_generate_constant_rule(v));
            }
-            return _add_rule(rule_name, "(" + string_join(enum_values, " | ") + ")");
+            return _add_rule(rule_name, "(" + string_join(enum_values, " | ") + ") space");
        }
        if ((schema_type.is_null() || schema_type == "object")
                && (schema.contains("properties") ||
@@ -933,7 +933,7 @@ public:
                    }
                }
                if (!enum_intersection.empty()) {
-                    return _add_rule(rule_name, "(" + string_join(enum_intersection, " | ") + ")");
+                    return _add_rule(rule_name, "(" + string_join(enum_intersection, " | ") + ") space");
                }
            }
            return _add_rule(rule_name, _build_object_rule(properties, required, hybrid_name, json()));
@@ -948,7 +948,7 @@ public:
                    }
                    rule += visit(items[i], name + (name.empty() ? "" : "-") + "tuple-" + std::to_string(i));
                }
-                rule += " space \"]\"";
+                rule += " \"]\" space";
                return _add_rule(rule_name, rule);
            }
            std::string item_rule_name = visit(items, name + (name.empty() ? "" : "-") + "item");
@@ -956,7 +956,7 @@ public:
            json max_items_json = schema.contains("maxItems") ? schema["maxItems"] : json();
            int max_items = max_items_json.is_number_integer() ? max_items_json.get<int>() : std::numeric_limits<int>::max();

-            return _add_rule(rule_name, "\"[\" space " + build_repetition(item_rule_name, min_items, max_items, "\",\" space") + " space \"]\"");
+            return _add_rule(rule_name, "\"[\" space " + build_repetition(item_rule_name, min_items, max_items, "\",\" space") + " \"]\" space");
        }
        if ((schema_type.is_null() || schema_type == "string") && schema.contains("pattern")) {
            return _visit_pattern(schema["pattern"], rule_name);
@@ -972,7 +972,7 @@ public:
            std::string char_rule = _add_primitive("char", PRIMITIVE_RULES.at("char"));
            int min_len = schema.contains("minLength") ? schema["minLength"].get<int>() : 0;
            int max_len = schema.contains("maxLength") ? schema["maxLength"].get<int>() : std::numeric_limits<int>::max();
-            return _add_rule(rule_name, "\"\\\"\" " + build_repetition(char_rule, min_len, max_len) + " \"\\\"\"");
+            return _add_rule(rule_name, "\"\\\"\" " + build_repetition(char_rule, min_len, max_len) + " \"\\\"\" space");
        }
        if (schema_type == "integer" && (schema.contains("minimum") || schema.contains("exclusiveMinimum") || schema.contains("maximum") || schema.contains("exclusiveMaximum"))) {
            int64_t min_value = std::numeric_limits<int64_t>::min();
@@ -990,7 +990,7 @@ public:
            std::stringstream out;
            out << "(";
            build_min_max_int(min_value, max_value, out);
-            out << ")";
+            out << ") space";
            return _add_rule(rule_name, out.str());
        }
        if (schema.empty() || schema_type == "object") {
@@ -11,13 +11,8 @@
 #include <sstream>
 #include <thread>
 #include <vector>
-#include <algorithm>

 #if defined(_WIN32)
-#    define WIN32_LEAN_AND_MEAN
-#    ifndef NOMINMAX
-#       define NOMINMAX
-#    endif
 #    include <io.h>
 #    include <windows.h>
 #    define isatty _isatty
@@ -67,15 +62,16 @@ static const char* g_col[] = {
 };

 struct common_log_entry {
-    enum ggml_log_level level {GGML_LOG_LEVEL_INFO};
+    enum ggml_log_level level;
+
+    bool prefix;
+
+    int64_t timestamp;

    std::vector<char> msg;

-    int64_t timestamp { 0 };
-    bool is_end       { false }; // signals the worker thread to stop
-    bool prefix       { false };
-
-    common_log_entry(size_t size = 256) : msg(size) { }
+    // signals the worker thread to stop
+    bool is_end;

    void print(FILE * file = nullptr) const {
        FILE * fcur = file;
@@ -126,15 +122,22 @@ struct common_log_entry {
 };

 struct common_log {
-    // default capacity
-    common_log(size_t capacity = 512) {
-        file       = nullptr;
-        prefix     = false;
-        timestamps = false;
-        running    = false;
-        t_start    = t_us();
+    // default capacity - will be expanded if needed
+    common_log() : common_log(256) {}
+
+    common_log(size_t capacity) {
+        file = nullptr;
+        prefix = false;
+        timestamps = false;
+        running = false;
+        t_start = t_us();
+
+        // initial message size - will be expanded if longer messages arrive
+        entries.resize(capacity);
+        for (auto & entry : entries) {
+            entry.msg.resize(256);
+        }

-        queue.resize(capacity, common_log_entry(256));
        head = 0;
        tail = 0;

@@ -149,10 +152,9 @@ struct common_log {
    }

 private:
-    std::mutex              mtx;
-    std::thread             thrd;
-    std::condition_variable cv_new;  // new entry
-    std::condition_variable cv_full; // wait on full
+    std::mutex mtx;
+    std::thread thrd;
+    std::condition_variable cv;

    FILE * file;

@@ -162,53 +164,24 @@ private:

    int64_t t_start;

-    // queue of entries
-    std::vector<common_log_entry> queue;
+    // ring buffer of entries
+    std::vector<common_log_entry> entries;
    size_t head;
    size_t tail;

-    bool print_entry(const common_log_entry & e) const {
-        if (e.is_end) return true;
-
-        e.print();
-        if (file) {
-            e.print(file);
-        }
-        return false;
-    }
-
-    bool flush_queue(size_t start_head, size_t end_tail, size_t & out_head) const {
-        bool stop = false;
-        size_t h = start_head;
-        while (h != end_tail && !stop) {
-            stop = print_entry(queue[h]);
-            h = (h + 1) % queue.size();
-        }
-        out_head = h;
-        return stop;
-    }
+    // worker thread copies into this
+    common_log_entry cur;

 public:
-    bool is_full() const {
-        return ((tail + 1) % queue.size()) == head;
-    }
-
-    bool is_empty() const {
-        return head == tail;
-    }
-
    void add(enum ggml_log_level level, const char * fmt, va_list args) {
-        std::unique_lock<std::mutex> lock(mtx);
-
-        // block if the queue is full
-        cv_full.wait(lock, [this]() { return !running || !is_full(); });
+        std::lock_guard<std::mutex> lock(mtx);

        if (!running) {
            // discard messages while the worker thread is paused
            return;
        }

-        auto & entry = queue[tail];
+        auto & entry = entries[tail];

        {
            // cannot use args twice, so make a copy in case we need to expand the buffer
@@ -243,16 +216,38 @@ public:
            va_end(args_copy);
        }

-        entry.is_end    = false;
-        entry.level     = level;
-        entry.prefix    = prefix;
+        entry.level = level;
+        entry.prefix = prefix;
        entry.timestamp = 0;
        if (timestamps) {
            entry.timestamp = t_us() - t_start;
        }
+        entry.is_end = false;

-        tail = (tail + 1) % queue.size();
-        cv_new.notify_one();
+        tail = (tail + 1) % entries.size();
+        if (tail == head) {
+            // expand the buffer
+            std::vector<common_log_entry> new_entries(2*entries.size());
+
+            size_t new_tail = 0;
+
+            do {
+                new_entries[new_tail] = std::move(entries[head]);
+
+                head     = (head     + 1) % entries.size();
+                new_tail = (new_tail + 1);
+            } while (head != tail);
+
+            head = 0;
+            tail = new_tail;
+
+            for (size_t i = tail; i < new_entries.size(); i++) {
+                new_entries[i].msg.resize(256);
+            }
+
+            entries = std::move(new_entries);
+        }
+        cv.notify_one();
    }

    void resume() {
@@ -266,24 +261,23 @@ public:

        thrd = std::thread([this]() {
            while (true) {
-                std::unique_lock<std::mutex> lock(mtx);
-                cv_new.wait(lock, [this]() { return !is_empty(); });
+                {
+                    std::unique_lock<std::mutex> lock(mtx);
+                    cv.wait(lock, [this]() { return head != tail; });
+                    cur = entries[head];

-                size_t cached_head = head;
-                size_t cached_tail = tail;
+                    head = (head + 1) % entries.size();
+                }

-                lock.unlock(); // drop the lock during flush
-
-                size_t next_head;
-                bool stop = flush_queue(cached_head, cached_tail, next_head);
-
-                lock.lock();
-                head = next_head;
-                cv_full.notify_all();
-
-                if (stop) {
+                if (cur.is_end) {
                    break;
                }
+
+                cur.print(); // stdout and stderr
+
+                if (file) {
+                    cur.print(file);
+                }
            }
        });
    }
@@ -299,13 +293,13 @@ public:
            running = false;

            // push an entry to signal the worker thread to stop
-            auto & entry = queue[tail];
-            entry.is_end = true;
-            tail = (tail + 1) % queue.size();
+            {
+                auto & entry = entries[tail];
+                entry.is_end = true;

-            // wakeup everyone
-            cv_new.notify_one();
-            cv_full.notify_all();
+                tail = (tail + 1) % entries.size();
+            }
+            cv.notify_one();
        }

        thrd.join();
@@ -6,14 +6,13 @@
 #include "unicode.h"

 #include <algorithm>
-#include <deque>
 #include <initializer_list>
 #include <map>
 #include <memory>
 #include <nlohmann/json.hpp>
 #include <regex>
-#include <set>
 #include <stdexcept>
+#include <unordered_set>

 // Trick to catch missing branches
 template <typename T>
@@ -89,7 +88,40 @@ struct trie {
        return match_result{match_result::NO_MATCH};
    }

+    struct prefix_and_next {
+        std::vector<uint32_t> prefix;
+        std::vector<uint32_t> next_chars;
+    };
+
+    std::vector<prefix_and_next> collect_prefix_and_next() {
+        std::vector<uint32_t>        prefix;
+        std::vector<prefix_and_next> result;
+        collect_prefix_and_next(0, prefix, result);
+        return result;
+    }
+
  private:
+    void collect_prefix_and_next(size_t index, std::vector<uint32_t> & prefix, std::vector<prefix_and_next> & out) {
+        if (!nodes[index].is_word) {
+            if (!nodes[index].children.empty()) {
+                std::vector<uint32_t> chars;
+                chars.reserve(nodes[index].children.size());
+                for (const auto & p : nodes[index].children) {
+                    chars.push_back(p.first);
+                }
+                out.emplace_back(prefix_and_next{prefix, chars});
+            }
+        }
+
+        for (const auto & p : nodes[index].children) {
+            uint32_t ch = p.first;
+            auto child = p.second;
+            prefix.push_back(ch);
+            collect_prefix_and_next(child, prefix, out);
+            prefix.pop_back();
+        }
+    }
+
    size_t create_node() {
        size_t index = nodes.size();
        nodes.emplace_back();
@@ -121,65 +153,6 @@ struct trie {
    }
 };

-// Aho-Corasick automaton
-struct aho_corasick {
-    trie                t;
-    std::vector<size_t> fail;      // failure links
-    std::vector<size_t> order;     // states in BFS order
-    std::vector<bool>   terminal;  // match states (directly or via a suffix link)
-    std::set<uint32_t>  alphabet;  // every character with a transition
-
-    aho_corasick(const std::vector<std::string> & strings) : t(strings) {
-        const auto & nodes = t.nodes;
-        const size_t n = nodes.size();
-
-        fail.assign(n, 0);
-        order.reserve(n);
-
-        std::deque<size_t> queue{ 0 };
-        while (!queue.empty()) {
-            size_t u = queue.front();
-            queue.pop_front();
-            order.push_back(u);
-            for (const auto & [ch, v] : nodes[u].children) {
-                if (u != 0) {
-                    size_t f = fail[u];
-                    while (f && nodes[f].children.find(ch) == nodes[f].children.end()) {
-                        f = fail[f];
-                    }
-                    auto it = nodes[f].children.find(ch);
-                    fail[v] = (it != nodes[f].children.end() && it->second != v) ? it->second : 0;
-                }
-                queue.push_back(v);
-            }
-        }
-
-        terminal.assign(n, false);
-        for (size_t u : order) {
-            terminal[u] = nodes[u].is_word || (u != 0 && terminal[fail[u]]);
-        }
-
-        for (const auto & node : nodes) {
-            for (const auto & [ch, v] : node.children) {
-                alphabet.insert(ch);
-            }
-        }
-    }
-
-    size_t num_states()          const { return t.nodes.size(); }
-    bool   is_terminal(size_t s) const { return terminal[s]; }
-
-    // follow failure links until a transition on `ch` exists.
-    size_t next(size_t state, uint32_t ch) const {
-        const auto & nodes = t.nodes;
-        while (state && nodes[state].children.find(ch) == nodes[state].children.end()) {
-            state = fail[state];
-        }
-        auto it = nodes[state].children.find(ch);
-        return it != nodes[state].children.end() ? it->second : 0;
-    }
-};
-
 static std::pair<uint32_t, size_t> parse_hex_escape(const std::string & str, size_t pos, int hex_count) {
    if (pos + hex_count > str.length()) {
        return {0, 0};
@@ -921,10 +894,6 @@ struct parser_executor {
    common_peg_parse_result operator()(const common_peg_gbnf_parser & p) {
        return arena.parse(p.child, ctx, start_pos);
    }
-
-    common_peg_parse_result operator()(const common_peg_ac_parser & p) {
-        return arena.parse(p.child, ctx, start_pos);
-    }
 };

 common_peg_parse_result common_peg_arena::parse(common_peg_parse_context & ctx, size_t start) const {
@@ -993,8 +962,7 @@ void common_peg_arena::resolve_refs() {
                                 std::is_same_v<T, common_peg_not_parser> ||
                                 std::is_same_v<T, common_peg_tag_parser> ||
                                 std::is_same_v<T, common_peg_atomic_parser> ||
-                                 std::is_same_v<T, common_peg_gbnf_parser> ||
-                                 std::is_same_v<T, common_peg_ac_parser>) {
+                                 std::is_same_v<T, common_peg_gbnf_parser>) {
                p.child = resolve_ref(p.child);
            } else if constexpr (std::is_same_v<T, common_peg_rule_parser>) {
                p.child = resolve_ref(p.child);
@@ -1024,12 +992,12 @@ void common_peg_arena::resolve_refs() {
 }

 std::string common_peg_arena::dump(common_peg_parser_id id) const {
-    std::set<common_peg_parser_id> visited;
+    std::unordered_set<common_peg_parser_id> visited;
    return dump_impl(id, visited);
 }

 std::string common_peg_arena::dump_impl(common_peg_parser_id                       id,
-                                        std::set<common_peg_parser_id> & visited) const {
+                                        std::unordered_set<common_peg_parser_id> & visited) const {
    // Check for cycles
    if (visited.count(id)) {
        return "[cycle]";
@@ -1075,8 +1043,6 @@ std::string common_peg_arena::dump_impl(common_peg_parser_id
            return "Atomic(" + dump_impl(p.child, visited) + ")";
        } else if constexpr (std::is_same_v<T, common_peg_gbnf_parser>) {
            return "Gbnf(" + p.grammar + ", " + dump_impl(p.child, visited) + ")";
-        } else if constexpr (std::is_same_v<T, common_peg_ac_parser>) {
-            return "Ac(" + string_join(p.delimiters, " | ") + ", " + dump_impl(p.child, visited) + ")";
        } else if constexpr (std::is_same_v<T, common_peg_any_parser>) {
            return "Any";
        } else if constexpr (std::is_same_v<T, common_peg_space_parser>) {
@@ -1306,13 +1272,13 @@ common_peg_parser common_peg_parser_builder::string_content(char delimiter) {

 common_peg_parser common_peg_parser_builder::double_quoted_string() {
    return rule("double-quoted-string", [this]() {
-        return sequence({literal("\""), string_content('"'), literal("\"")});
+        return sequence({literal("\""), string_content('"'), literal("\""), space()});
    });
 }

 common_peg_parser common_peg_parser_builder::single_quoted_string() {
    return rule("single-quoted-string", [this]() {
-        return sequence({literal("'"), string_content('\''), literal("'")});
+        return sequence({literal("'"), string_content('\''), literal("'"), space()});
    });
 }

@@ -1335,25 +1301,25 @@ common_peg_parser common_peg_parser_builder::json_number() {
        // At EOF in partial mode, chars returns NEED_MORE → negate propagates NEED_MORE → number not committed.
        // This prevents premature commits of partial numbers (e.g. "3" when "3.14" is incoming).
        auto not_number_continuation = negate(chars("[0-9.eE+-]", 1, 1));
-        return sequence({ optional(literal("-")), int_part, optional(frac), optional(exp), not_number_continuation });
+        return sequence({ optional(literal("-")), int_part, optional(frac), optional(exp), not_number_continuation, space() });
    });
 }

 common_peg_parser common_peg_parser_builder::json_string() {
    return rule("json-string", [this]() {
-        return sequence({literal("\""), string_content('"'), literal("\"")});
+        return sequence({literal("\""), string_content('"'), literal("\""), space()});
    });
 }

 common_peg_parser common_peg_parser_builder::json_bool() {
    return rule("json-bool", [this]() {
-        return choice({literal("true"), literal("false")});
+        return sequence({choice({literal("true"), literal("false")}), space()});
    });
 }

 common_peg_parser common_peg_parser_builder::json_null() {
    return rule("json-null", [this]() {
-        return literal("null");
+        return sequence({literal("null"), space()});
    });
 }

@@ -1368,7 +1334,8 @@ common_peg_parser common_peg_parser_builder::json_object() {
            choice({
                literal("}"),
                sequence({members, ws, literal("}")})
-            })
+            }),
+            ws
        });
    });
 }
@@ -1376,14 +1343,15 @@ common_peg_parser common_peg_parser_builder::json_object() {
 common_peg_parser common_peg_parser_builder::json_array() {
    return rule("json-array", [this]() {
        auto ws = space();
-        auto elements = sequence({json(), zero_or_more(sequence({ws, literal(","), ws, json()}))});
+        auto elements = sequence({json(), zero_or_more(sequence({literal(","), ws, json()}))});
        return sequence({
            literal("["),
            ws,
            choice({
                literal("]"),
                sequence({elements, ws, literal("]")})
-            })
+            }),
+            ws
        });
    });
 }
@@ -1413,13 +1381,16 @@ common_peg_parser common_peg_parser_builder::python_number() {

 common_peg_parser common_peg_parser_builder::python_bool() {
    return rule("python-bool", [this]() {
-        return choice({literal("True"), literal("False")});
+        return sequence({
+            choice({literal("True"), literal("False")}),
+            space()
+        });
    });
 }

 common_peg_parser common_peg_parser_builder::python_null() {
    return rule("python-none", [this]() {
-        return literal("None");
+        return sequence({literal("None"), space()});
    });
 }

@@ -1486,13 +1457,6 @@ common_peg_parser common_peg_parser_builder::json_member(const std::string & key
    });
 }

-common_peg_parser common_peg_parser_builder::ac(const common_peg_parser & p, const std::vector<std::string> & delimiters) {
-    if (delimiters.empty()) {
-        throw std::runtime_error("ac parser requires at least one delimiter");
-    }
-    return add(common_peg_ac_parser{p, delimiters});
-}
-
 static std::string gbnf_escape_char_class(uint32_t c) {
    if (c == '-' || c == ']' || c == '[' || c == '\\') {
        return "\\" + std::string(1, (char) c);
@@ -1543,118 +1507,41 @@ static std::string gbnf_escape_char_class(uint32_t c) {
    return std::string(buf);
 }

-static std::string gbnf_char_class(const std::vector<uint32_t> & chars, bool negate) {
-    std::string s = negate ? "[^" : "[";
-    for (uint32_t ch : chars) {
-        s += gbnf_escape_char_class(ch);
-    }
-    return s + "]";
-}
+static std::string gbnf_excluding_pattern(const std::vector<std::string> & strings) {
+    trie matcher(strings);
+    auto pieces = matcher.collect_prefix_and_next();

-static std::string gbnf_ac_grammar(
-    const common_grammar_builder &   builder,
-    const std::string &              prefix,
-    const std::vector<std::string> & strings,
-    const std::function<std::string(const std::vector<uint32_t> &,
-                                    const std::map<size_t, std::vector<uint32_t>> &,
-                                    const std::vector<uint32_t> &,
-                                    const std::function<std::string(size_t)> &)> & build_rule) {
-    aho_corasick ac(strings);
-
-    auto state_name = [&](size_t s) -> std::string {
-        if (s == 0) {
-            return prefix;
-        }
-        std::string num = std::to_string(s);
-        num = num.size() == 1 ? ("0" + num) : num;
-        return prefix + "-" + num;
-    };
-
-    for (size_t q = 0; q < ac.num_states(); q++) {
-        if (ac.is_terminal(q)) {
-            continue; // match states
+    std::string pattern;
+    for (size_t i = 0; i < pieces.size(); ++i) {
+        if (i > 0) {
+            pattern += " | ";
        }

-        std::map<size_t, std::vector<uint32_t>> buckets;
-        std::vector<uint32_t> completing;  // chars that complete a delimiter
-        std::vector<uint32_t> specific;    // chars with an explicit transition
-        for (uint32_t c : ac.alphabet) {
-            size_t d = ac.next(q, c);
-            if (ac.is_terminal(d)) {
-                completing.push_back(c);
-                specific.push_back(c);
-            } else if (d != 0) {
-                buckets[d].push_back(c); // specific non-root destination
-                specific.push_back(c);
-            }
+        const auto & pre = pieces[i].prefix;
+        const auto & chars = pieces[i].next_chars;
+
+        std::string cls;
+        cls.reserve(chars.size());
+        for (uint32_t ch : chars) {
+            cls += gbnf_escape_char_class(ch);
        }

-        builder.add_rule(state_name(q), build_rule(completing, buckets, specific, state_name));
+        if (!pre.empty()) {
+            pattern += gbnf_format_literal(common_unicode_cpts_to_utf8(pre)) + " [^" + cls + "]";
+        } else {
+            pattern += "[^" + cls + "]";
+        }
    }

-    // An empty delimiter makes the start state terminal. Emit an entry rule
-    // that matches the empty string so the returned reference stays valid.
-    if (ac.is_terminal(0)) {
-        builder.add_rule(prefix, "|");
-    }
-
-    return state_name(0);
+    return "(" + pattern + ")*";
 }

-// GBNF grammar matching strings that contain no string in `strings` as a
-// substring. Emits the complement of an Aho-Corasick automaton DFA and returns
-// the start state rule name.
-//
-// ref: https://github.com/ggml-org/llama.cpp/pull/24839
-static std::string gbnf_excluding_grammar(const common_grammar_builder & builder,
-                                          const std::string &            prefix,
-                                          const std::vector<std::string> & strings) {
-    return gbnf_ac_grammar(builder, prefix, strings,
-        [](const std::vector<uint32_t> & /*completing*/,
-           const std::map<size_t, std::vector<uint32_t>> & buckets,
-           const std::vector<uint32_t> & specific,
-           const std::function<std::string(size_t)> & state_name) {
-            // every state is accepting and completing chars get no
-            // alternative, so a forbidden string can never be matched
-            std::string rhs = "|";
-            for (const auto & [d, chars] : buckets) {
-                rhs += " " + gbnf_char_class(chars, false) + " " + state_name(d) + " |";
-            }
-            rhs += " " + gbnf_char_class(specific, true) + " " + state_name(0);
-            return rhs;
-        });
-}
-
-// GBNF grammar matching everything up to and including the first occurrence of
-// any string in `strings`. Emits the Aho-Corasick automaton DFA and returns
-// the start state rule name.
-static std::string gbnf_including_grammar(const common_grammar_builder & builder,
-                                          const std::string &            prefix,
-                                          const std::vector<std::string> & strings) {
-    return gbnf_ac_grammar(builder, prefix, strings,
-        [](const std::vector<uint32_t> & completing,
-           const std::map<size_t, std::vector<uint32_t>> & buckets,
-           const std::vector<uint32_t> & specific,
-           const std::function<std::string(size_t)> & state_name) {
-            std::vector<std::string> alts;
-            if (!completing.empty()) {
-                alts.push_back(gbnf_char_class(completing, false)); // terminate on match
-            }
-            for (const auto & [d, chars] : buckets) {
-                alts.push_back(gbnf_char_class(chars, false) + " " + state_name(d));
-            }
-            // every other character keeps scanning from the start state
-            alts.push_back(gbnf_char_class(specific, true) + " " + state_name(0));
-            return string_join(alts, " | ");
-        });
-}
-
-static std::set<std::string> collect_reachable_rules(
+static std::unordered_set<std::string> collect_reachable_rules(
    const common_peg_arena & arena,
    const common_peg_parser_id & rule
 ) {
-    std::set<std::string> reachable;
-    std::set<std::string> visited;
+    std::unordered_set<std::string> reachable;
+    std::unordered_set<std::string> visited;

    std::function<void(common_peg_parser_id)> visit = [&](common_peg_parser_id id) {
        const auto & parser = arena.get(id);
@@ -1686,7 +1573,6 @@ static std::set<std::string> collect_reachable_rules(
                                 std::is_same_v<T, common_peg_tag_parser> ||
                                 std::is_same_v<T, common_peg_atomic_parser> ||
                                 std::is_same_v<T, common_peg_gbnf_parser> ||
-                                 std::is_same_v<T, common_peg_ac_parser> ||
                                 std::is_same_v<T, common_peg_schema_parser>) {
                visit(p.child);
            } else if constexpr (std::is_same_v<T, common_peg_rule_parser>) {
@@ -1864,7 +1750,7 @@ void common_peg_arena::build_grammar(const common_grammar_builder & builder, boo
                if (p.delimiters.empty()) {
                    return ".*";
                }
-                return gbnf_excluding_grammar(builder, "until-" + std::to_string(id), p.delimiters);
+                return gbnf_excluding_pattern(p.delimiters);
            } else if constexpr (std::is_same_v<T, common_peg_schema_parser>) {
                if (schema_delegates(p)) {
                    return to_gbnf(p.child);
@@ -1881,8 +1767,6 @@ void common_peg_arena::build_grammar(const common_grammar_builder & builder, boo
                return to_gbnf(p.child);
            } else if constexpr (std::is_same_v<T, common_peg_gbnf_parser>) {
                return p.grammar;
-            } else if constexpr (std::is_same_v<T, common_peg_ac_parser>) {
-                return gbnf_including_grammar(builder, "ac-" + std::to_string(id), p.delimiters);
            } else {
                static_assert(is_always_false_v<T>);
            }
@@ -1890,7 +1774,7 @@ void common_peg_arena::build_grammar(const common_grammar_builder & builder, boo
    };

    // Collect reachable rules
-    std::set<std::string> reachable_rules;
+    std::unordered_set<std::string> reachable_rules;

    if (lazy) {
        // Collect rules reachable from trigger rules
@@ -2019,8 +1903,6 @@ static nlohmann::json serialize_parser_variant(const common_peg_parser_variant &
            };
        } else if constexpr (std::is_same_v<T, common_peg_gbnf_parser>) {
            return json{{"type", "gbnf"}, {"child", p.child}, {"grammar", p.grammar}};
-        } else if constexpr (std::is_same_v<T, common_peg_ac_parser>) {
-            return json{{"type", "ac"}, {"child", p.child}, {"delimiters", p.delimiters}};
        }
    }, variant);
 }
@@ -2193,16 +2075,6 @@ static common_peg_parser_variant deserialize_parser_variant(const nlohmann::json
        };
    }

-    if (type == "ac") {
-        if (!j.contains("child") || !j.contains("delimiters") || !j["delimiters"].is_array() || j["delimiters"].empty()) {
-            throw std::runtime_error("ac parser requires 'child' and a non-empty 'delimiters' array");
-        }
-        return common_peg_ac_parser{
-            j["child"].get<common_peg_parser_id>(),
-            j["delimiters"].get<std::vector<std::string>>(),
-        };
-    }
-
    throw std::runtime_error("Unknown parser type: " + type);
 }

@@ -3,8 +3,8 @@
 #include <nlohmann/json_fwd.hpp>

 #include <memory>
-#include <set>
 #include <unordered_map>
+#include <unordered_set>
 #include <string>
 #include <string_view>
 #include <functional>
@@ -275,11 +275,6 @@ struct common_peg_gbnf_parser {
    std::string grammar;
 };

-struct common_peg_ac_parser {
-    common_peg_parser_id child;
-    std::vector<std::string> delimiters;
-};
-
 // Variant holding all parser types
 using common_peg_parser_variant = std::variant<
    common_peg_epsilon_parser,
@@ -301,8 +296,7 @@ using common_peg_parser_variant = std::variant<
    common_peg_ref_parser,
    common_peg_atomic_parser,
    common_peg_tag_parser,
-    common_peg_gbnf_parser,
-    common_peg_ac_parser
+    common_peg_gbnf_parser
 >;

 class common_peg_arena {
@@ -341,7 +335,7 @@ class common_peg_arena {
    friend class common_peg_parser_builder;

  private:
-    std::string dump_impl(common_peg_parser_id id, std::set<common_peg_parser_id> & visited) const;
+    std::string dump_impl(common_peg_parser_id id, std::unordered_set<common_peg_parser_id> & visited) const;

    common_peg_parser_id add_parser(common_peg_parser_variant parser);
    void add_rule(const std::string & name, common_peg_parser_id id);
@@ -520,13 +514,6 @@ class common_peg_parser_builder {
    // the child's grammar. Parsing delegates entirely to the child.
    common_peg_parser gbnf(const common_peg_parser & p, const std::string & grammar) { return add(common_peg_gbnf_parser{p, grammar}); }

-    // Wraps a child parser but emits a GBNF grammar built from the Aho-Corasick
-    // automaton of `delimiters`, matching everything up to and including the
-    // first delimiter. Parsing delegates entirely to the child, which is
-    // responsible for consuming the delimiter (e.g. until(D) + literal(D)).
-    common_peg_parser ac(const common_peg_parser & p, const std::vector<std::string> & delimiters);
-    common_peg_parser ac(const common_peg_parser & p, const std::string & delimiter) { return ac(p, std::vector<std::string>{delimiter}); }
-
    void set_root(const common_peg_parser & p);

    common_peg_arena build();
@@ -7,7 +7,6 @@
 #include <fstream>
 #include <sstream>
 #include <filesystem>
-#include <regex>

 static std::string rm_leading_dashes(const std::string & str) {
    size_t pos = 0;
@@ -17,21 +16,46 @@ static std::string rm_leading_dashes(const std::string & str) {
    return str.substr(pos);
 }

-static std::string canonical_tag(const std::string & tag) {
-    static const std::regex re_tag("[-.]([A-Z0-9_]+)$", std::regex::icase);
-    std::smatch m;
-    if (std::regex_search(tag, m, re_tag)) {
-        std::string canon = m[1].str();
-        for (char & c : canon) {
-            c = (char) std::toupper((unsigned char) c);
+// only allow a subset of args for remote presets for security reasons
+// do not add more args unless absolutely necessary
+// args that output to files are strictly prohibited
+static std::set<std::string> get_remote_preset_whitelist(const std::map<std::string, common_arg> & key_to_opt) {
+    static const std::set<std::string> allowed_options = {
+        "model-url",
+        "hf-repo",
+        "hf-repo-draft",
+        "hf-repo-v", // vocoder
+        "hf-file-v", // vocoder
+        "mmproj-url",
+        "pooling",
+        "jinja",
+        "batch-size",
+        "ubatch-size",
+        "cache-reuse",
+        "chat-template-kwargs",
+        "mmap",
+        // note: sampling params are automatically allowed by default
+        // negated args will be added automatically if the positive arg is specified above
+    };
+
+    std::set<std::string> allowed_keys;
+
+    for (const auto & it : key_to_opt) {
+        const std::string & key = it.first;
+        const common_arg & opt = it.second;
+        if (allowed_options.find(key) != allowed_options.end() || opt.is_sampling) {
+            allowed_keys.insert(key);
+            // also add variant keys (args without leading dashes and env vars)
+            for (const auto & arg : opt.get_args()) {
+                allowed_keys.insert(rm_leading_dashes(arg));
+            }
+            for (const auto & env : opt.get_env()) {
+                allowed_keys.insert(env);
+            }
        }
-        return canon;
    }
-    std::string upper = tag;
-    for (char & c : upper) {
-        c = (char) std::toupper((unsigned char) c);
-    }
-    return upper;
+
+    return allowed_keys;
 }

 std::vector<std::string> common_preset::to_args(const std::string & bin_path) const {
@@ -276,10 +300,16 @@ static std::string parse_bool_arg(const common_arg & arg, const std::string & ke
    return value;
 }

-common_preset_context::common_preset_context(llama_example ex)
+common_preset_context::common_preset_context(llama_example ex, bool only_remote_allowed)
        : ctx_params(common_params_parser_init(default_params, ex)) {
    common_params_add_preset_options(ctx_params.options);
    key_to_opt = get_map_key_opt(ctx_params);
+
+    // setup allowed keys if only_remote_allowed is true
+    if (only_remote_allowed) {
+        filter_allowed_keys = true;
+        allowed_keys = get_remote_preset_whitelist(key_to_opt);
+    }
 }

 common_presets common_preset_context::load_from_ini(const std::string & path, common_preset & global) const {
@@ -288,18 +318,11 @@ common_presets common_preset_context::load_from_ini(const std::string & path, co

    for (auto section : ini_data) {
        common_preset preset;
-        std::string section_name = section.first.empty() ? std::string(COMMON_PRESET_DEFAULT_NAME) : section.first;
-        if (section_name != "*" && section_name != COMMON_PRESET_DEFAULT_NAME) {
-            auto colon_idx = section_name.rfind(':');
-            if (colon_idx != std::string::npos) {
-                std::string tag       = section_name.substr(colon_idx + 1);
-                std::string canon_tag = canonical_tag(tag);
-                if (canon_tag != tag) {
-                    section_name = section_name.substr(0, colon_idx + 1) + canon_tag;
-                }
-            }
+        if (section.first.empty()) {
+            preset.name = COMMON_PRESET_DEFAULT_NAME;
+        } else {
+            preset.name = section.first;
        }
-        preset.name = section_name;
        LOG_DBG("loading preset: %s\n", preset.name.c_str());
        for (const auto & [key, value] : section.second) {
            if (key == "version") {
@@ -60,7 +60,7 @@ struct common_preset_context {
    std::set<std::string> allowed_keys;

    // if only_remote_allowed is true, only accept whitelisted keys
-    common_preset_context(llama_example ex);
+    common_preset_context(llama_example ex, bool only_remote_allowed = false);

    // load presets from INI file
    common_presets load_from_ini(const std::string & path, common_preset & global) const;
@@ -65,12 +65,12 @@ static void common_reasoning_budget_accept(struct llama_sampler * smpl, llama_to
            if (ctx->start_matcher.advance(token)) {
                ctx->state = REASONING_BUDGET_COUNTING;
                ctx->remaining = ctx->budget;
-                COM_TRC("activated, budget=%d tokens\n", ctx->budget);
+                LOG_INF("reasoning-budget: activated, budget=%d tokens\n", ctx->budget);

                if (ctx->remaining <= 0) {
                    ctx->state = REASONING_BUDGET_FORCING;
                    ctx->force_pos = 0;
-                    COM_TRC("%s", "budget=0, forcing immediately\n");
+                    LOG_INF("reasoning-budget: budget=0, forcing immediately\n");
                }
            }
            break;
@@ -80,7 +80,7 @@ static void common_reasoning_budget_accept(struct llama_sampler * smpl, llama_to
        {
            if (ctx->end_matcher.advance(token)) {
                ctx->state = REASONING_BUDGET_DONE;
-                COM_TRC("%s", "deactivated (natural end)\n");
+                LOG_INF("reasoning-budget: deactivated (natural end)\n");
                break;
            }

@@ -95,7 +95,7 @@ static void common_reasoning_budget_accept(struct llama_sampler * smpl, llama_to
                    ctx->state = REASONING_BUDGET_FORCING;
                    ctx->force_pos = 0;
                    ctx->end_matcher.reset();
-                    COM_TRC("%s", "UTF-8 complete, now forcing end sequence\n");
+                    LOG_INF("reasoning-budget: UTF-8 complete, now forcing end sequence\n");
                }
            } else if (ctx->state == REASONING_BUDGET_COUNTING) {
                ctx->remaining--;
@@ -104,11 +104,11 @@ static void common_reasoning_budget_accept(struct llama_sampler * smpl, llama_to
                        ctx->state = REASONING_BUDGET_FORCING;
                        ctx->force_pos = 0;
                        ctx->end_matcher.reset();
-                        COM_TRC("%s", "budget exhausted, forcing end sequence\n");
+                        LOG_INF("reasoning-budget: budget exhausted, forcing end sequence\n");
                    } else {
                        ctx->state = REASONING_BUDGET_WAITING_UTF8;
                        ctx->end_matcher.reset();
-                        COM_TRC("%s", "budget exhausted, waiting for UTF-8 completion\n");
+                        LOG_INF("reasoning-budget: budget exhausted, waiting for UTF-8 completion\n");
                    }
                }
            }
@@ -118,7 +118,7 @@ static void common_reasoning_budget_accept(struct llama_sampler * smpl, llama_to
            ctx->force_pos++;
            if (ctx->force_pos >= ctx->forced_tokens.size()) {
                ctx->state = REASONING_BUDGET_DONE;
-                COM_TRC("%s", "forced sequence complete, done\n");
+                LOG_INF("reasoning-budget: forced sequence complete, done\n");
            }
            break;
        case REASONING_BUDGET_DONE:
@@ -128,12 +128,12 @@ static void common_reasoning_budget_accept(struct llama_sampler * smpl, llama_to
                ctx->state = REASONING_BUDGET_COUNTING;
                ctx->remaining = ctx->budget;
                ctx->end_matcher.reset();
-                COM_TRC("re-activated on new start tag, budget=%d tokens\n", ctx->budget);
+                LOG_INF("reasoning-budget: re-activated on new start tag, budget=%d tokens\n", ctx->budget);

                if (ctx->remaining <= 0) {
                    ctx->state = REASONING_BUDGET_FORCING;
                    ctx->force_pos = 0;
-                    COM_TRC("%s", "budget=0, forcing immediately\n");
+                    LOG_INF("reasoning-budget: budget=0, forcing immediately\n");
                }
            }
            break;
@@ -264,7 +264,7 @@ bool common_reasoning_budget_force(struct llama_sampler * smpl) {
    ctx->state = REASONING_BUDGET_FORCING;
    ctx->force_pos = 0;
    ctx->end_matcher.reset();
-    COM_TRC("%s", "forced into forcing state (manual transition)\n");
+    LOG_INF("reasoning-budget: forced into forcing state (manual transition)\n");

    return true;
 }
@@ -0,0 +1,204 @@
+#include "regex-partial.h"
+#include "common.h"
+#include <functional>
+#include <optional>
+
+// Stores the pattern text and compiles two regexes from it:
+//  - rx: the pattern itself, used for full matches
+//  - rx_reversed_partial: the pattern produced by regex_to_reversed_partial_regex(), used by search()
+//    on the reversed input to detect a partial match at the end of the input
+// Errors raised by regex_to_reversed_partial_regex() (std::runtime_error) propagate to the caller;
+// presumably std::regex construction can also throw on an invalid pattern (std::regex_error).
+common_regex::common_regex(const std::string & pattern) :
+    pattern(pattern),
+    rx(pattern),
+    rx_reversed_partial(regex_to_reversed_partial_regex(pattern)) {}
+
+// Looks for the pattern in `input`, starting at offset `pos`.
+//
+//  - input:    the text to search
+//  - pos:      offset of the first character to consider; must be <= input.size()
+//  - as_match: if true use std::regex_match (the whole of input[pos..] must match),
+//              otherwise use std::regex_search
+//
+// Returns:
+//  - a COMMON_REGEX_MATCH_TYPE_FULL result with one range per sub-match (index 0 is the whole match)
+//    when `rx` matches; ranges are offsets into `input` (already shifted by `pos`)
+//  - a COMMON_REGEX_MATCH_TYPE_PARTIAL result with a single range [begin, input.size()) when the end of
+//    the input matches `rx_reversed_partial`
+//  - a default-constructed result otherwise
+//
+// Throws std::runtime_error if `pos` is out of bounds or the computed partial range is invalid.
+common_regex_match common_regex::search(const std::string & input, size_t pos, bool as_match) const {
+    std::smatch match;
+    if (pos > input.size()) {
+        throw std::runtime_error("Position out of bounds");
+    }
+    auto start = input.begin() + pos;
+    auto found = as_match
+        ? std::regex_match(start, input.end(), match, rx)
+        : std::regex_search(start, input.end(), match, rx);
+    if (found) {
+        common_regex_match res;
+        res.type = COMMON_REGEX_MATCH_TYPE_FULL;
+        for (size_t i = 0; i < match.size(); ++i) {
+            // match positions are relative to `start`, so shift them by `pos` to index into `input`
+            auto begin = pos + match.position(i);
+            res.groups.emplace_back(begin, begin + match.length(i));
+        }
+        return res;
+    }
+    // no full match: run the reversed partial regex over the reversed input, restricted to the
+    // characters at index >= pos; match_continuous anchors it at the first reversed character,
+    // i.e. at the end of the input
+    std::match_results<std::string::const_reverse_iterator> srmatch;
+    if (std::regex_search(input.rbegin(), input.rend() - pos, srmatch, rx_reversed_partial, std::regex_constants::match_continuous)) {
+        auto group = srmatch[1].str();
+        if (group.length() != 0) {
+            // end of the capture in reversed order == start of the partial match in forward order
+            auto it = srmatch[1].second.base();
+            // auto position = static_cast<size_t>(std::distance(input.begin(), it));
+            // NOTE(review): in as_match mode the partial match must start at input.begin(), not at
+            // input.begin() + pos - confirm this is intended when pos > 0
+            if ((!as_match) || it == input.begin()) {
+                common_regex_match res;
+                res.type = COMMON_REGEX_MATCH_TYPE_PARTIAL;
+                const size_t begin = std::distance(input.begin(), it);
+                const size_t end = input.size();
+                if (begin == std::string::npos || end == std::string::npos || begin > end) {
+                    throw std::runtime_error("Invalid range");
+                }
+                res.groups.push_back({begin, end});
+                return res;
+            }
+        }
+    }
+    return {};
+}
+
+/*
+  Transforms a regex pattern to a partial match pattern that operates on a reversed input string to find partial final matches of the original pattern.
+
+  Ideally we'd like to use boost::match_partial (https://beta.boost.org/doc/libs/1_59_0/libs/regex/doc/html/boost_regex/partial_matches.html)
+  to see if a string ends with a partial regex match, but it's not in std::regex yet.
+  Instead, we'll transform the regex into a partial match regex operating as a full match on the reverse iterators of the input.
+
+  - /abcd/ -> ^(dcba|cba|ba|a) -> ^((?:(?:(?:d)?c)?b)?a)
+  - /a|b/ -> ^(a|b)
+  - /a*?/ -> error, could match ""
+  - /a*b/ -> ^((?:b)?a*+) (final repetitions become eager)
+  - /.*?ab/ -> ^((?:b)?a) (omit .*)
+  - /a.*?b/ -> ^((?:b)?.*?a) (keep reluctant matches)
+  - /a(bc)d/ -> ^((?:(?:d)?(?:(?:c)?b))?a)
+  - /a(bc|de)/ -> ^((?:(?:(?:e)?d)?|(?:(?:c)?b)?)?a)
+  - /ab{2,4}c/ -> ^cbbb?b?a -> ^((?:(?:(?:(?:(?:c)?b)?b)?b?)?b?)?a)
+
+  The regex will match a reversed string fully, and the end of the first (and only) capturing group will indicate the reversed start of the original partial pattern.
+  All other groups are turned into non-capturing groups, and reluctant quantifiers are ignored.
+*/
+// Converts `pattern` into a regex meant to be run (anchored) on the *reversed* input, so that a
+// non-empty match of its single capturing group means the input ends with a prefix of `pattern`.
+//
+// The pattern is split into "parts" (a literal character, an escape, a character class, a group, each
+// with its optional quantifier). For every alternative the parts are emitted in reverse order, nested
+// in optional non-capturing groups: [a, b, c, d] -> (?:(?:(?:d)?c)?b)?a
+// The result is "^(" + <alternatives joined by '|'> + ")".
+//
+// All groups of the original pattern become non-capturing, and a '?' following '*' is dropped.
+//
+// Throws std::runtime_error on an unmatched '[', '{' or '(', on a quantifier or repetition without a
+// preceding element, and on an invalid repetition range. A non-numeric repetition bound is passed to
+// std::stoi, which reports the error with its own exception type.
+std::string regex_to_reversed_partial_regex(const std::string & pattern) {
+    auto it = pattern.begin();
+    const auto end = pattern.end();
+
+    // parses one level of the pattern (up to the matching ')' or the end of the pattern) and returns
+    // the reversed partial regex for it; recurses for nested groups and advances the shared `it`
+    std::function<std::string()> process = [&]() {
+        std::vector<std::vector<std::string>> alternatives(1);
+        std::vector<std::string> * sequence = &alternatives.back();
+
+        while (it != end) {
+            if (*it == '[') {
+                // character class: copied verbatim as a single part, skipping escaped characters
+                auto start = it;
+                ++it;
+                while (it != end) {
+                    if ((*it == '\\') && (++it != end)) {
+                        ++it;
+                    } else if ((it != end) && (*it == ']')) {
+                        break;
+                    } else {
+                        ++it;
+                    }
+                }
+                if (it == end) {
+                    throw std::runtime_error("Unmatched '[' in pattern");
+                }
+                ++it;
+                sequence->push_back(std::string(start, it));
+            } else if (*it == '*' || *it == '?' || *it == '+') {
+                // quantifier: attached to the previous part
+                if (sequence->empty()) {
+                    throw std::runtime_error("Quantifier without preceding element");
+                }
+                sequence->back() += *it;
+                auto is_star = *it == '*';
+                ++it;
+                if (is_star) {
+                    // drop the reluctant marker of "*?"
+                    if (it != end && *it == '?') {
+                        ++it;
+                    }
+                }
+            } else if (*it == '{') {
+                if (sequence->empty()) {
+                    throw std::runtime_error("Repetition without preceding element");
+                }
+                ++it;
+                auto start = it;
+                while (it != end && *it != '}') {
+                    ++it;
+                }
+                if (it == end) {
+                    throw std::runtime_error("Unmatched '{' in pattern");
+                }
+                auto parts = string_split(std::string(start, it), ",");
+                ++it;
+                if (parts.size() > 2) {
+                    throw std::runtime_error("Invalid repetition range in pattern");
+                }
+
+                auto parseOptInt = [&](const std::string & s, const std::optional<int> & def = std::nullopt) -> std::optional<int> {
+                    if (s.empty()) {
+                        return def;
+                    }
+                    return std::stoi(s);
+                };
+                // {n} -> min == max == n, {n,} -> no max, {,m} -> min defaults to 0
+                auto min = parseOptInt(parts[0], 0);
+                auto max = parts.size() == 1 ? min : parseOptInt(parts[1]);
+                if (min && max && *max < *min) {
+                    throw std::runtime_error("Invalid repetition range in pattern");
+                }
+                // Brutal but... let's repeat at least min times, then ? for the delta between min & max (or * for unbounded)
+                auto part = sequence->back();
+                sequence->pop_back();
+                for (int i = 0; i < *min; i++) {
+                    sequence->push_back(part);
+                }
+                if (max) {
+                    for (int i = *min; i < *max; i++) {
+                        sequence->push_back(part + "?");
+                    }
+                } else {
+                    sequence->push_back(part + "*");
+                }
+            } else if (*it == '(') {
+                ++it;
+                // "(?:" is accepted as well: every group is emitted as non-capturing anyway
+                if (it != end && *it == '?' && (it + 1 != end) && *(it + 1) == ':') {
+                    it += 2;
+                }
+                auto sub = process();
+                // the nested call stops either on ')' or at the end of the pattern:
+                // check for the end first so that the end iterator is never dereferenced
+                if (it == end || *it != ')') {
+                    throw std::runtime_error("Unmatched '(' in pattern");
+                }
+                ++it;
+                auto & part = sequence->emplace_back("(?:");
+                part += sub;
+                part += ")";
+            } else if (*it == ')') {
+                // end of the current group: let the caller consume the ')'
+                break;
+            } else if (*it == '|') {
+                ++it;
+                alternatives.emplace_back();
+                sequence = &alternatives.back();
+            } else if (*it == '\\' && (++it != end)) {
+                // escape sequence: kept together with the escaped character as a single part
+                auto str = std::string("\\") + *it;
+                sequence->push_back(str);
+                ++it;
+            } else if (it != end) {
+                sequence->push_back(std::string(1, *it));
+                ++it;
+            }
+        }
+
+        // /abcd/ -> ^(dcba|cba|ba|a) -> ^((?:(?:(?:d)?c)?b)?a)
+        // if n(=4) parts, opening n-1(=3) non-capturing groups after the 1 capturing group
+        // We'll do the outermost capturing group and final .* in the enclosing function.
+        std::vector<std::string> res_alts;
+        for (const auto & parts : alternatives) {
+            auto & res = res_alts.emplace_back();
+            // `i + 1 < size()` instead of `i < size() - 1`: an empty alternative (e.g. "a|" or "()")
+            // has no parts, and `size() - 1` would wrap around to SIZE_MAX
+            for (size_t i = 0; i + 1 < parts.size(); i++) {
+                res += "(?:";
+            }
+            for (auto it = parts.rbegin(); it != parts.rend(); ++it) {
+                res += *it;
+                if (it != parts.rend() - 1) {
+                    res += ")?";
+                }
+            }
+        }
+        return string_join(res_alts, "|");
+    };
+    auto res = process();
+    // process() only returns early on a ')' that has no matching '('
+    if (it != end) {
+        throw std::runtime_error("Unmatched '(' in pattern");
+    }
+
+    return "^(" + res + ")";
+}
@@ -0,0 +1,56 @@
+#pragma once
+
+#include <regex>
+#include <string>
+
+enum common_regex_match_type {
+    COMMON_REGEX_MATCH_TYPE_NONE,
+    COMMON_REGEX_MATCH_TYPE_PARTIAL,
+    COMMON_REGEX_MATCH_TYPE_FULL,
+};
+
+// A pair of offsets [begin, end) delimiting a span of a string (presumably half-open: the range is
+// empty when begin == end). The constructor rejects begin > end with std::runtime_error.
+struct common_string_range {
+    size_t begin;
+    size_t end;
+    common_string_range(size_t begin, size_t end) : begin(begin), end(end) {
+        if (begin > end) {
+            throw std::runtime_error("Invalid range");
+        }
+    }
+    // prevent default ctor
+    common_string_range() = delete;
+    // true when the range covers no characters
+    bool empty() const {
+        return begin == end;
+    }
+    bool operator==(const common_string_range & other) const {
+        return begin == other.begin && end == other.end;
+    }
+};
+
+// Result of a regex search: the kind of match and the matched ranges.
+// A default-constructed value has type COMMON_REGEX_MATCH_TYPE_NONE and no groups.
+struct common_regex_match {
+    common_regex_match_type type = COMMON_REGEX_MATCH_TYPE_NONE;
+    // matched ranges, as offsets into the searched string
+    std::vector<common_string_range> groups;
+
+    bool operator==(const common_regex_match & other) const {
+        return type == other.type && groups == other.groups;
+    }
+    bool operator!=(const common_regex_match & other) const {
+        return !(*this == other);
+    }
+};
+
+// A regex that keeps, next to the compiled pattern, a second compiled regex named
+// rx_reversed_partial (presumably derived from the same pattern via
+// regex_to_reversed_partial_regex(); the constructor body is not part of this header).
+class common_regex {
+    std::string pattern;             // original pattern text, returned by str()
+    std::regex rx;                   // NOTE(review): presumably compiled from `pattern` - confirm in the .cpp
+    std::regex rx_reversed_partial;  // see the class comment
+
+  public:
+    explicit common_regex(const std::string & pattern);
+
+    // searches `input` starting at offset `pos`; `as_match` presumably requires the whole remaining
+    // input to match instead of any substring (see the implementation)
+    common_regex_match search(const std::string & input, size_t pos, bool as_match = false) const;
+
+    // returns the original pattern text
+    const std::string & str() const { return pattern; }
+};
+
+// For testing only (pretty print of failures).
+std::string regex_to_reversed_partial_regex(const std::string & pattern);
@@ -259,9 +259,6 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
             }
        }
    }
-    if (!grmr && !grammar_str.empty()) {
-        throw std::runtime_error("failed to parse grammar");
-    }

    // Compute prefill tokens from the generation prompt
    std::vector<llama_token> prefill_tokens;
@@ -772,63 +769,54 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
    }
 }

-std::vector<common_sampler_type> common_sampler_types_from_names(const std::vector<std::string> & names) {
-    // sampler names can be written multiple ways; generate aliases from canonical names
-    static const auto sampler_name_map = []{
-        // canonical sampler name mapping
-        std::unordered_map<std::string, common_sampler_type> canonical_name_map {
-            { "dry",         COMMON_SAMPLER_TYPE_DRY         },
-            { "top_k",       COMMON_SAMPLER_TYPE_TOP_K       },
-            { "top_p",       COMMON_SAMPLER_TYPE_TOP_P       },
-            { "top_n_sigma", COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
-            { "typ_p",       COMMON_SAMPLER_TYPE_TYPICAL_P   },
-            { "min_p",       COMMON_SAMPLER_TYPE_MIN_P       },
-            { "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
-            { "xtc",         COMMON_SAMPLER_TYPE_XTC         },
-            { "infill",      COMMON_SAMPLER_TYPE_INFILL      },
-            { "penalties",   COMMON_SAMPLER_TYPE_PENALTIES   },
-            { "adaptive_p",  COMMON_SAMPLER_TYPE_ADAPTIVE_P  }
-        };
-        std::unordered_map<std::string, common_sampler_type> alias_name_map;
-        for (const auto & entry : canonical_name_map) {
-            const std::string & canonical = entry.first;
-            if (canonical.find('_') == std::string::npos) {
-                continue;
-            }
-            // kebab-case: "top-k", "min-p", etc.
-            {
-                std::string kebab_case = canonical;
-                std::replace(kebab_case.begin(), kebab_case.end(), '_', '-');
-                alias_name_map.insert({kebab_case, entry.second});
-            }
-            // no dash: "topk", "minp", etc.
-            {
-                std::string no_dash = canonical;
-                no_dash.erase(std::remove(no_dash.begin(), no_dash.end(), '_'), no_dash.end());
-                alias_name_map.insert({no_dash, entry.second});
-            }
-        }
-        // misc. aliases
-        alias_name_map.insert({"nucleus", COMMON_SAMPLER_TYPE_TOP_P});
-        alias_name_map.insert({"temp",    COMMON_SAMPLER_TYPE_TEMPERATURE});
-        alias_name_map.insert({"typ",     COMMON_SAMPLER_TYPE_TYPICAL_P});
-        // include aliases + canonical names in the complete mapping
-        alias_name_map.merge(canonical_name_map);
-        return alias_name_map;
-    }();
+std::vector<common_sampler_type> common_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
+    std::unordered_map<std::string, common_sampler_type> sampler_canonical_name_map {
+        { "dry",         COMMON_SAMPLER_TYPE_DRY },
+        { "top_k",       COMMON_SAMPLER_TYPE_TOP_K },
+        { "top_p",       COMMON_SAMPLER_TYPE_TOP_P },
+        { "top_n_sigma", COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
+        { "typ_p",       COMMON_SAMPLER_TYPE_TYPICAL_P },
+        { "min_p",       COMMON_SAMPLER_TYPE_MIN_P },
+        { "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
+        { "xtc",         COMMON_SAMPLER_TYPE_XTC },
+        { "infill",      COMMON_SAMPLER_TYPE_INFILL },
+        { "penalties",   COMMON_SAMPLER_TYPE_PENALTIES },
+        { "adaptive_p",  COMMON_SAMPLER_TYPE_ADAPTIVE_P },
+    };
+
+    // sampler names can be written in multiple ways,
+    // so accept alternative spellings in addition to the canonical names
+    std::unordered_map<std::string, common_sampler_type> sampler_alt_name_map {
+        { "top-k",       COMMON_SAMPLER_TYPE_TOP_K },
+        { "top-p",       COMMON_SAMPLER_TYPE_TOP_P },
+        { "top-n-sigma", COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
+        { "nucleus",     COMMON_SAMPLER_TYPE_TOP_P },
+        { "typical-p",   COMMON_SAMPLER_TYPE_TYPICAL_P },
+        { "typical",     COMMON_SAMPLER_TYPE_TYPICAL_P },
+        { "typ-p",       COMMON_SAMPLER_TYPE_TYPICAL_P },
+        { "typ",         COMMON_SAMPLER_TYPE_TYPICAL_P },
+        { "min-p",       COMMON_SAMPLER_TYPE_MIN_P },
+        { "temp",        COMMON_SAMPLER_TYPE_TEMPERATURE },
+        { "adaptive-p",  COMMON_SAMPLER_TYPE_ADAPTIVE_P },
+    };

    std::vector<common_sampler_type> samplers;
    samplers.reserve(names.size());

    for (const auto & name : names) {
-        std::string name_lower = name;
-        std::transform(name_lower.begin(), name_lower.end(), name_lower.begin(), ::tolower);
-        auto sampler = sampler_name_map.find(name_lower);
-        if (sampler != sampler_name_map.end()) {
+        auto sampler = sampler_canonical_name_map.find(name);
+        if (sampler != sampler_canonical_name_map.end()) {
            samplers.push_back(sampler->second);
            continue;
        }
-        LOG_WRN("%s: unable to match sampler by name '%s'\n", __func__, name_lower.c_str());
+        if (allow_alt_names) {
+            sampler = sampler_alt_name_map.find(name);
+            if (sampler != sampler_alt_name_map.end()) {
+                samplers.push_back(sampler->second);
+                continue;
+            }
+        }
+        LOG_WRN("%s: unable to match sampler by name '%s'\n", __func__, name.c_str());
    }

    return samplers;
@@ -109,7 +109,7 @@ std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx,
 char        common_sampler_type_to_chr(enum common_sampler_type cnstr);
 std::string common_sampler_type_to_str(enum common_sampler_type cnstr);

-std::vector<enum common_sampler_type> common_sampler_types_from_names(const std::vector<std::string> & names);
+std::vector<enum common_sampler_type> common_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
 std::vector<enum common_sampler_type> common_sampler_types_from_chars(const std::string & chars);

 llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab,
@@ -68,10 +68,6 @@ void common_speculative_draft(common_speculative * spec);
 // informs the speculative context that n_accepted tokens were accepted by the target model
 void common_speculative_accept(common_speculative * spec, llama_seq_id, uint16_t n_accepted);

-// (optional) get/set internal state
-bool common_speculative_get_state(common_speculative * spec, llama_seq_id seq_id, std::vector<uint8_t> & data);
-void common_speculative_set_state(common_speculative * spec, llama_seq_id seq_id, const std::vector<uint8_t> & data);
-
 // print statistics about the speculative decoding
 void common_speculative_print_stats(const common_speculative * spec);

@@ -40,18 +40,14 @@ TEXT_MODEL_MAP: dict[str, str] = {
    "ChatGLMModel": "chatglm",
    "CodeShellForCausalLM": "codeshell",
    "CogVLMForCausalLM": "cogvlm",
-    "Cohere2MoeForCausalLM": "command_r",
    "Cohere2ForCausalLM": "command_r",
    "CohereForCausalLM": "command_r",
    "DbrxForCausalLM": "dbrx",
    "DeciLMForCausalLM": "deci",
    "DeepseekForCausalLM": "deepseek",
-    "DeepseekOCRForCausalLM": "deepseek",
    "DeepseekV2ForCausalLM": "deepseek",
    "DeepseekV3ForCausalLM": "deepseek",
    "DeepseekV32ForCausalLM": "deepseek",
-    "DFlashDraftModel": "qwen",
-    "DeepseekV4ForCausalLM": "deepseek",
    "DistilBertForMaskedLM": "bert",
    "DistilBertForSequenceClassification": "bert",
    "DistilBertModel": "bert",
@@ -99,7 +95,6 @@ TEXT_MODEL_MAP: dict[str, str] = {
    "GraniteMoeHybridForCausalLM": "granite",
    "GraniteMoeSharedForCausalLM": "granite",
    "GraniteSpeechForConditionalGeneration": "granite",
-    "GraniteSpeechPlusForConditionalGeneration": "granite",
    "Grok1ForCausalLM": "grok",
    "GrokForCausalLM": "grok",
    "GroveMoeForCausalLM": "grovemoe",
@@ -127,7 +122,6 @@ TEXT_MODEL_MAP: dict[str, str] = {
    "LLaDAModelLM": "llada",
    "LLaMAForCausalLM": "llama",
    "Lfm25AudioTokenizer": "lfm2",
-    "Lfm2BidirectionalModel": "lfm2",
    "Lfm2ForCausalLM": "lfm2",
    "Lfm2Model": "lfm2",
    "Lfm2MoeForCausalLM": "lfm2",
@@ -136,10 +130,6 @@ TEXT_MODEL_MAP: dict[str, str] = {
    "LlamaBidirectionalModel": "llama",
    "LlamaForCausalLM": "llama",
    "LlamaModel": "llama",
-    "Eagle3DraftModel": "llama",
-    "Eagle3Speculator": "llama",
-    "Eagle3LlamaForCausalLM": "llama",
-    "LlamaForCausalLMEagle3": "llama",
    "LlavaForConditionalGeneration": "llama",
    "LlavaStableLMEpochForCausalLM": "stablelm",
    "MPTForCausalLM": "mpt",
@@ -237,7 +227,6 @@ TEXT_MODEL_MAP: dict[str, str] = {
    "UMT5ForConditionalGeneration": "t5",
    "UMT5Model": "t5",
    "UltravoxModel": "ultravox",
-    "UnlimitedOCRForCausalLM": "deepseek",
    "VLlama3ForCausalLM": "llama",
    "VoxtralForConditionalGeneration": "llama",
    "WavTokenizerDec": "wavtokenizer",
@@ -266,9 +255,7 @@ MMPROJ_MODEL_MAP: dict[str, str] = {
    "Glm4vMoeForConditionalGeneration": "qwen3vl",
    "GlmOcrForConditionalGeneration": "qwen3vl",
    "GlmasrModel": "ultravox",
-    "Granite4VisionForConditionalGeneration": "granite",
    "GraniteSpeechForConditionalGeneration": "granite",
-    "GraniteSpeechPlusForConditionalGeneration": "granite",
    "HunYuanVLForConditionalGeneration": "hunyuan",
    "Idefics3ForConditionalGeneration": "smolvlm",
    "InternVisionModel": "internvl",
@@ -304,7 +291,6 @@ MMPROJ_MODEL_MAP: dict[str, str] = {
    "StepVLForConditionalGeneration": "step3",
    "Step3p7ForConditionalGeneration": "step3",
    "UltravoxModel": "ultravox",
-    "UnlimitedOCRForCausalLM": "deepseek",
    "VoxtralForConditionalGeneration": "ultravox",
    "YoutuVLForConditionalGeneration": "youtuvl",
 }
@@ -126,7 +126,7 @@ class BailingMoeV2Model(TextModel):
        if (rope_dim := hparams.get("head_dim")) is None:
            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]

-        self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.rope_parameters.get("partial_rotary_factor", 0.5)))
+        self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
        self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
        self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
@@ -94,7 +94,6 @@ class ModelBase:
    metadata: gguf.Metadata
    dir_model_card: Path
    remote_hf_model_id: str | None
-    target_model_dir: Path | None

    # subclasses should define this!
    model_arch: gguf.MODEL_ARCH
@@ -120,7 +119,6 @@ class ModelBase:
                 small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None,
                 disable_mistral_community_chat_template: bool = False,
                 sentence_transformers_dense_modules: bool = False,
-                 target_model_dir: Path | None = None,
                 fuse_gate_up_exps: bool = False,
                 fp8_as_q8: bool = False):
        if type(self) is ModelBase or \
@@ -141,7 +139,6 @@ class ModelBase:
        self.dry_run = dry_run
        self.remote_hf_model_id = remote_hf_model_id
        self.sentence_transformers_dense_modules = sentence_transformers_dense_modules
-        self.target_model_dir = target_model_dir
        self.fuse_gate_up_exps = fuse_gate_up_exps
        self._gate_exp_buffer: dict[int, Tensor] = {}
        self._up_exp_buffer: dict[int, Tensor] = {}
@@ -1119,10 +1116,8 @@ class TextModel(ModelBase):

        rope_theta = self.find_hparam(["global_rope_theta", "rope_global_theta", "rope_theta_global", "rope_theta", "rotary_emb_base"], optional=True)
        local_rope_theta = self.find_hparam(["local_rope_theta", "rope_local_theta", "rope_theta_local", "swa_rope_theta", "rope_local_base_freq"], optional=True)
-        partial_rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct", "rope_percent"], optional=True)
-        original_max_position_embeddings = self.find_hparam(["original_max_position_embeddings"], optional=True)

-        # Ensure global params are mirrored in rope_parameters
+        # Ensure "rope_theta" and "rope_type" are mirrored in rope_parameters
        if "full_attention" not in self.rope_parameters and "sliding_attention" not in self.rope_parameters:
            if local_rope_theta is not None:
                self.rope_parameters["sliding_attention"] = {"rope_theta": local_rope_theta}
@@ -1130,10 +1125,6 @@ class TextModel(ModelBase):
                self.rope_parameters["rope_theta"] = rope_theta
            if "rope_type" not in self.rope_parameters and (rope_type := self.rope_parameters.get("type")) is not None:
                self.rope_parameters["rope_type"] = rope_type
-            if "partial_rotary_factor" not in self.rope_parameters and partial_rotary_factor is not None:
-                self.rope_parameters["partial_rotary_factor"] = partial_rotary_factor
-            if "original_max_position_embeddings" not in self.rope_parameters and original_max_position_embeddings is not None:
-                self.rope_parameters["original_max_position_embeddings"] = original_max_position_embeddings

    @classmethod
    def __init_subclass__(cls):
@@ -1201,7 +1192,7 @@ class TextModel(ModelBase):
            self.gguf_writer.add_embedding_length(n_embd)
            logger.info(f"gguf: embedding length = {n_embd}")

-        if (n_ff := self.find_hparam(["prefix_dense_intermediate_size", "intermediate_size", "n_inner", "hidden_dim"], optional=True)) is not None:
+        if (n_ff := self.find_hparam(["intermediate_size", "n_inner", "hidden_dim"], optional=True)) is not None:
            self.gguf_writer.add_feed_forward_length(n_ff)
            logger.info(f"gguf: feed forward length = {n_ff}")

@@ -1273,7 +1264,7 @@ class TextModel(ModelBase):
        if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None:
            self.gguf_writer.add_layer_norm_eps(f_norm_eps)
            logger.info(f"gguf: layer norm epsilon = {f_norm_eps}")
-        if (n_experts := self.find_hparam(["num_local_experts", "num_experts", "n_routed_experts"], optional=True)) is not None:
+        if (n_experts := self.find_hparam(["num_local_experts", "num_experts"], optional=True)) is not None:
            self.gguf_writer.add_expert_count(n_experts)
            logger.info(f"gguf: expert count = {n_experts}")
        if (n_experts_used := self.find_hparam(["num_experts_per_tok", "num_experts_per_token", "top_k_experts"], optional=True)) is not None:
@@ -1286,13 +1277,11 @@ class TextModel(ModelBase):
            self.gguf_writer.add_expert_group_used_count(n_group_used)
            logger.info(f"gguf: expert groups used count = {n_group_used}")

-        if (score_func := self.find_hparam(["score_function", "scoring_func", "score_func", "moe_router_activation", "moe_router_activation_func", "expert_selection_fn"], optional=True)) is not None:
+        if (score_func := self.find_hparam(["score_function", "scoring_func", "score_func", "moe_router_activation", "moe_router_activation_func"], optional=True)) is not None:
            if score_func == "sigmoid":
                self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
            elif score_func == "softmax":
                self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX)
-            elif score_func == "sqrtsoftplus":
-                self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SQRTSOFTPLUS)
            else:
                raise ValueError(f"Unsupported expert score gating function value: {score_func}")
            logger.info(f"gguf: expert score gating function = {score_func}")
@@ -1503,9 +1492,6 @@ class TextModel(ModelBase):
        if chkhsh == "d772b220ace2baec124bed8cfafce0ead7d6c38a4b65ef11261cf9d5d62246d1":
            # ref: https://huggingface.co/CohereLabs/tiny-aya-base
            res = "tiny_aya"
-        if chkhsh == "52df12b4c8d4176e7481aab4b6e8454d1fd0a210a04a574f6d4e067d10e23c3e":
-            # ref: https://huggingface.co/CohereLabs/North-Mini-Code-1.0
-            res = "cohere2moe"
        if chkhsh == "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea":
            # ref: https://huggingface.co/Qwen/Qwen1.5-7B
            res = "qwen2"
@@ -2495,7 +2481,6 @@ class LazyTorchTensor(gguf.LazyBase):
        torch.float16: np.float16,
        torch.float32: np.float32,
        torch.uint8: np.uint8,
-        torch.int64: np.int64,
    }

    # only used when byteswapping data. Only correct size is needed
@@ -2602,17 +2587,6 @@ class LazyTorchTensor(gguf.LazyBase):
        return cls._wrap_fn(func)(*args, **kwargs)


-if hasattr(torch, "float8_e8m0fnu"):
-    _torch_float8_e8m0 = torch.float8_e8m0fnu
-    LazyTorchTensor._dtype_map[_torch_float8_e8m0] = np.uint8
-    LazyTorchTensor._dtype_byteswap_map[_torch_float8_e8m0] = np.uint8
-    LazyTorchTensor._dtype_str_map["F8_E8M0"] = _torch_float8_e8m0
-else:
-    # Older torch builds do not expose F8_E8M0. Keep the raw bytes so callers
-    # that know the format can decode them explicitly.
-    LazyTorchTensor._dtype_str_map["F8_E8M0"] = torch.uint8
-
-
 def get_model_architecture(hparams: dict[str, Any], model_type: ModelType) -> str:
    # TODO @ngxson : this won't work correctly if the model has both audio & vision encoders
    # maybe we should fallback to text model's arch in that case, since not many models have both
@@ -148,7 +148,7 @@ class ChatGLMModel(TextModel):
            rope_dim = self.hparams["attention_dim"]
        else:
            rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
-        self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.rope_parameters.get("partial_rotary_factor", 0.5)))
+        self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
        self.gguf_writer.add_add_bos_token(False)
        rope_freq = 10000
        if "rope_ratio" in self.hparams:
@@ -1,6 +1,5 @@
 from __future__ import annotations

-import re
 from typing import Iterable, TYPE_CHECKING

 import torch
@@ -56,122 +55,3 @@ class Cohere2Model(TextModel):
            return

        yield from super().modify_tensors(data_torch, name, bid)
-
-
-@ModelBase.register("Cohere2MoeForCausalLM")
-class Cohere2MoeModel(TextModel):
-    model_arch = gguf.MODEL_ARCH.COHERE2MOE
-    _n_main_layers: int | None = None
-    _expert_tensor_re = re.compile(
-        r"model\.layers\.(\d+)\.mlp\.experts\.(\d+)\.(down_proj|gate_proj|up_proj)\.weight"
-    )
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        if (n_nextn := int(self.hparams.get("num_nextn_predict_layers", 0) or 0)) > 0 and not self.no_mtp:
-            self.block_count += n_nextn
-            self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
-        self._experts: list[dict[str, Tensor]] = [{} for _ in range(self.block_count)]
-
-    def _set_vocab_gpt2(self) -> None:
-        tokens, toktypes, tokpre = self.get_vocab_base()
-        self.gguf_writer.add_tokenizer_model("gpt2")
-        self.gguf_writer.add_tokenizer_pre(tokpre)
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_types(toktypes)
-
-        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
-        special_vocab.add_to_gguf(self.gguf_writer)
-
-    def set_gguf_parameters(self):
-        hparams = self.hparams
-        expert_intermediate_size = hparams["intermediate_size"]
-        mlp_layer_types = hparams.get("mlp_layer_types")
-        n_dense_lead = hparams.get("first_k_dense_replace", 0)
-        if mlp_layer_types is not None:
-            n_dense_lead = next((i for i, t in enumerate(mlp_layer_types) if t != "dense"), len(mlp_layer_types))
-
-        super().set_gguf_parameters()
-
-        self.gguf_writer.add_logit_scale(hparams["logit_scale"])
-        self.gguf_writer.add_sliding_window(hparams["sliding_window"])
-        self.gguf_writer.add_sliding_window_pattern([t == "sliding_attention" for t in hparams["layer_types"]])
-        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
-        self.gguf_writer.add_expert_feed_forward_length(expert_intermediate_size)
-        self.gguf_writer.add_leading_dense_block_count(n_dense_lead)
-        self.gguf_writer.add_expert_weights_norm(hparams.get("norm_topk_prob", False))
-        if (num_shared_experts := hparams.get("num_shared_experts", 0)) > 0:
-            if hparams.get("shared_expert_combination_strategy", "average") != "average":
-                raise ValueError("Cohere2 MoE only supports average shared expert combination")
-            self.gguf_writer.add_expert_shared_count(num_shared_experts)
-            self.gguf_writer.add_expert_shared_feed_forward_length(expert_intermediate_size * num_shared_experts)
-        if (n_nextn := hparams.get("num_nextn_predict_layers", 0)) > 0 and not self.no_mtp:
-            self.gguf_writer.add_nextn_predict_layers(n_nextn)
-        self.gguf_writer.add_rope_dimension_count(hparams["head_dim"])
-        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
-
-    def index_tensors(self, remote_hf_model_id: str | None = None):
-        hparams = {**self.hparams, **self.hparams.get("text_config", {})}
-        self._n_main_layers = hparams.get("num_hidden_layers")
-        type(self)._n_main_layers = self._n_main_layers
-        return super().index_tensors(remote_hf_model_id=remote_hf_model_id)
-
-    @classmethod
-    def filter_tensors(cls, item):
-        if (titem := super().filter_tensors(item)) is None:
-            return None
-        name, gen = titem
-
-        if cls._n_main_layers is not None:
-            is_mtp = (m := re.match(r"model\.layers\.(\d+)\.", name)) is not None and int(m.group(1)) >= cls._n_main_layers
-            if is_mtp and cls.no_mtp:
-                return None
-            if cls.mtp_only and not is_mtp and name not in (
-                "model.embed_tokens.weight", "model.norm.weight", "lm_head.weight",
-            ):
-                return None
-
-        return name, gen
-
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        if name.endswith(".bias"):
-            if torch.any(data_torch != 0):
-                raise ValueError(f"Bias tensor {name!r} is not zero.")
-            logger.debug(f"Skipping bias tensor {name!r}.")
-            return
-
-        if (m := self._expert_tensor_re.fullmatch(name)) is not None:
-            n_experts = self.hparams["num_experts"]
-            layer_idx = int(m.group(1))
-            assert bid is None or bid == layer_idx
-
-            self._experts[layer_idx][name] = data_torch
-
-            expected = {
-                f"model.layers.{layer_idx}.mlp.experts.{xid}.{w_name}.weight"
-                for xid in range(n_experts)
-                for w_name in ("down_proj", "gate_proj", "up_proj")
-            }
-            if expected.issubset(self._experts[layer_idx]):
-                for w_name in ["down_proj", "gate_proj", "up_proj"]:
-                    datas: list[Tensor] = []
-
-                    for xid in range(n_experts):
-                        ename = f"model.layers.{layer_idx}.mlp.experts.{xid}.{w_name}.weight"
-                        datas.append(self._experts[layer_idx][ename])
-                        del self._experts[layer_idx][ename]
-
-                    data_torch = torch.stack(datas, dim=0)
-                    merged_name = f"model.layers.{layer_idx}.mlp.experts.{w_name}.weight"
-
-                    yield from super().modify_tensors(data_torch, merged_name, layer_idx)
-            return
-
-        yield from super().modify_tensors(data_torch, name, bid)
-
-    def prepare_tensors(self):
-        super().prepare_tensors()
-
-        experts = [k for d in self._experts for k in d.keys()]
-        if len(experts) > 0:
-            raise ValueError(f"Unprocessed experts: {experts}")
@@ -161,7 +161,7 @@ class DeciModel(TextModel):
                factor = rope_params.get("factor", 8.0)
                low_freq_factor = rope_params.get("low_freq_factor", 1.0)
                high_freq_factor = rope_params.get("high_freq_factor", 4.0)
-                old_context_len = rope_params.get("original_max_position_embeddings", 8192)
+                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)

                low_freq_wavelen = old_context_len / low_freq_factor
                high_freq_wavelen = old_context_len / high_freq_factor
@@ -1,23 +1,20 @@
 from __future__ import annotations

-import json
 import re
-from pathlib import Path

 from typing import Any, Callable, Iterable, TYPE_CHECKING

-import numpy as np
 import torch

 if TYPE_CHECKING:
    from torch import Tensor

-from .base import LazyTorchTensor, MmprojModel, ModelBase, TextModel, gguf, logger
+from .base import MmprojModel, ModelBase, TextModel, gguf, logger

 from .qwen import QwenModel


-@ModelBase.register("DeepseekOCRForCausalLM", "UnlimitedOCRForCausalLM")
+@ModelBase.register("DeepseekOCRForCausalLM")
 class DeepseekOCRVisionModel(MmprojModel):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
@@ -208,8 +205,6 @@ class DeepseekModel(TextModel):
@ModelBase.register(
    "DeepseekV2ForCausalLM",
    "DeepseekV3ForCausalLM",
-    "DeepseekOCRForCausalLM",
-    "UnlimitedOCRForCausalLM",
    "KimiVLForConditionalGeneration",
    "KimiK25ForConditionalGeneration",
    "YoutuForCausalLM",
@@ -229,7 +224,7 @@ class DeepseekV2Model(TextModel):
        self.origin_hf_arch = hparams.get('architectures', [None])[0]

        # special handling for Deepseek OCR
-        if self.origin_hf_arch in ("DeepseekOCRForCausalLM", "DeepseekOCR2ForCausalLM", "UnlimitedOCRForCausalLM"):
+        if self.origin_hf_arch in ("DeepseekOCRForCausalLM", "DeepseekOCR2ForCausalLM"):
            self.model_arch = gguf.MODEL_ARCH.DEEPSEEK2OCR
            self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch]
            self.gguf_writer.add_architecture()
@@ -355,12 +350,6 @@ class DeepseekV2Model(TextModel):

        self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])

-        # Unlimited-OCR sliding window; written for metadata, the decoder ignores it (full MHA)
-        if is_ocr:
-            sliding_window = hparams.get("sliding_window_size") or hparams.get("sliding_window")
-            if sliding_window:
-                self.gguf_writer.add_sliding_window(sliding_window)
-
        if (rope_mscale_all := self.rope_parameters.get("mscale_all_dim")) is not None:
            # [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
            # note: for legacy reasons, this is not consistent with the other usages of self.gguf_writer.add_rope_scaling_yarn_log_mul
@@ -470,307 +459,3 @@ class DeepseekV32Model(DeepseekV2Model):
        self.gguf_writer.add_indexer_head_count(self.hparams["index_n_heads"])
        self.gguf_writer.add_indexer_key_length(self.hparams["index_head_dim"])
        self.gguf_writer.add_indexer_top_k(self.hparams["index_topk"])
-
-
-@ModelBase.register("DeepseekV4ForCausalLM")
-class DeepseekV4Model(TextModel):
-    model_arch = gguf.MODEL_ARCH.DEEPSEEK4
-    _skipped_mtp_tensors = 0
-
-    def __init__(self, *args, **kwargs):
-        type(self)._skipped_mtp_tensors = 0
-        super().__init__(*args, **kwargs)
-
-        with open(self.dir_model / "config.json", "r", encoding="utf-8") as f:
-            raw_hparams = json.load(f)
-        for key, value in raw_hparams.items():
-            self.hparams.setdefault(key, value)
-
-        self.block_count = self.hparams["num_hidden_layers"]
-        self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
-
-        self._dsv4_fp8_dequantized: set[str] = set()
-        self._dsv4_bf16_tensors: set[str] = set()
-        self._dsv4_f32_tensors: set[str] = set()
-        self._dsv4_mxfp4_generated = False
-        self._collect_source_dtypes()
-
-        if type(self)._skipped_mtp_tensors:
-            logger.info("Skipping %d DeepSeek-V4 MTP tensor(s) for conversion v0", type(self)._skipped_mtp_tensors)
-
-        # add a default chat template; if the model has a built-in template, it will be overridden later
-        template_path = Path(__file__).parent.parent / "models" / "templates" / "deepseek-ai-DeepSeek-V4.jinja"
-        if template_path.is_file():
-            with open(template_path, "r", encoding="utf-8") as f:
-                self.gguf_writer.add_chat_template(f.read())
-
-    @classmethod
-    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
-        name, _ = item
-        if name.startswith("mtp."):
-            cls._skipped_mtp_tensors += 1
-            return None
-        return super().filter_tensors(item)
-
-    @staticmethod
-    def _float8_dtypes() -> tuple[torch.dtype, ...]:
-        return tuple(
-            dtype for dtype in (
-                getattr(torch, "float8_e4m3fn", None),
-                getattr(torch, "float8_e5m2", None),
-            ) if dtype is not None
-        )
-
-    @staticmethod
-    def _e8m0_to_float(scale: Tensor) -> Tensor:
-        torch_float8_e8m0 = getattr(torch, "float8_e8m0fnu", None)
-        if torch_float8_e8m0 is not None and scale.dtype == torch_float8_e8m0:
-            return scale.float()
-
-        bits = scale.view(torch.uint8).float()
-        return torch.exp2(bits - 127.0)
-
-    def _collect_source_dtypes(self) -> None:
-        for name, gen in self.model_tensors.items():
-            dtype = gen().dtype
-            if dtype == torch.bfloat16:
-                self._dsv4_bf16_tensors.add(name)
-            elif dtype == torch.float32:
-                self._dsv4_f32_tensors.add(name)
-
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-        hparams = self.hparams
-
-        self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
-        self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
-        self.gguf_writer.add_sliding_window(hparams["sliding_window"])
-
-        self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
-        self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
-        self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
-        self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])
-        self.gguf_writer.add_swiglu_clamp_exp([hparams["swiglu_limit"]] * self.block_count)
-        self.gguf_writer.add_swiglu_clamp_shexp([hparams["swiglu_limit"]] * self.block_count)
-
-        self.gguf_writer.add_indexer_head_count(hparams["index_n_heads"])
-        self.gguf_writer.add_indexer_key_length(hparams["index_head_dim"])
-        self.gguf_writer.add_indexer_top_k(hparams["index_topk"])
-
-        self.gguf_writer.add_attention_output_group_count(hparams["o_groups"])
-        self.gguf_writer.add_attention_output_lora_rank(hparams["o_lora_rank"])
-        self.gguf_writer.add_attention_compress_ratios(hparams["compress_ratios"])
-        self.gguf_writer.add_attention_compress_rope_freq_base(hparams["compress_rope_theta"])
-        self.gguf_writer.add_hyper_connection_count(hparams["hc_mult"])
-        self.gguf_writer.add_hyper_connection_sinkhorn_iterations(hparams["hc_sinkhorn_iters"])
-        self.gguf_writer.add_hyper_connection_epsilon(hparams["hc_eps"])
-        self.gguf_writer.add_hash_layer_count(hparams["num_hash_layers"])
-
-    def dequant_model(self):
-        fp8_dtypes = self._float8_dtypes()
-        tensors_to_remove: list[str] = []
-
-        def dequant_fp8_weight(weight: Tensor, scale: Tensor) -> Tensor:
-            out_features, in_features = weight.shape
-            scale_f = self._e8m0_to_float(scale)
-            scale_f = scale_f.repeat_interleave(128, 0)[:out_features]
-            scale_f = scale_f.repeat_interleave(128, 1)[:, :in_features]
-            return weight.float() * scale_f
-
-        for name in list(self.model_tensors.keys()):
-            if not name.endswith(".scale"):
-                continue
-            weight_name = name.removesuffix(".scale") + ".weight"
-            if weight_name not in self.model_tensors:
-                continue
-
-            weight = self.model_tensors[weight_name]
-            scale = self.model_tensors[name]
-            if weight().dtype not in fp8_dtypes:
-                continue
-
-            self.model_tensors[weight_name] = lambda w=weight, s=scale: dequant_fp8_weight(w(), s())
-            self._dsv4_fp8_dequantized.add(weight_name)
-            tensors_to_remove.append(name)
-
-        for name in tensors_to_remove:
-            del self.model_tensors[name]
-
-    @staticmethod
-    def _pack_mxfp4_blocks(weight: Tensor, scale: Tensor) -> np.ndarray:
-        packed = weight.contiguous().view(torch.uint8)
-        scale_u8 = scale.contiguous().view(torch.uint8)
-
-        out_features, packed_cols = packed.shape
-        logical_cols = packed_cols * 2
-        if logical_cols % 32 != 0:
-            raise ValueError(f"MXFP4 source row has {logical_cols} values, expected a multiple of 32")
-
-        n_blocks = logical_cols // 32
-        if tuple(scale_u8.shape) != (out_features, n_blocks):
-            raise ValueError(f"MXFP4 scale shape {tuple(scale_u8.shape)} does not match {(out_features, n_blocks)}")
-
-        src = packed.reshape(out_features, n_blocks, 16)
-        low = src & 0x0F
-        high = (src >> 4) & 0x0F
-
-        # The safetensors bytes store adjacent values as low/high nibbles.
-        # ggml MXFP4 blocks store values 0..15 in low nibbles and 16..31 in high nibbles.
-        vals = torch.stack((low, high), dim=-1).reshape(out_features, n_blocks, 32)
-        qs = vals[:, :, :16] | (vals[:, :, 16:] << 4)
-        raw = torch.cat((scale_u8.unsqueeze(-1), qs.to(torch.uint8)), dim=-1)
-        return raw.reshape(out_features, n_blocks * 17).cpu().numpy()
-
-    def _write_mxfp4_expert_tensor(self, bid: int, proj: str, tensor_key: gguf.MODEL_TENSOR) -> list[str]:
-        n_experts = self.hparams["n_routed_experts"]
-        data: np.ndarray | None = None
-        consumed: list[str] = []
-
-        for eid in range(n_experts):
-            weight_name = f"layers.{bid}.ffn.experts.{eid}.{proj}.weight"
-            scale_name = f"layers.{bid}.ffn.experts.{eid}.{proj}.scale"
-            if weight_name not in self.model_tensors or scale_name not in self.model_tensors:
-                raise KeyError(f"Missing routed expert tensors for {weight_name}")
-
-            weight = LazyTorchTensor.to_eager(self.model_tensors[weight_name]())
-            scale = LazyTorchTensor.to_eager(self.model_tensors[scale_name]())
-            packed = self._pack_mxfp4_blocks(weight, scale)
-            if data is None:
-                data = np.empty((n_experts, *packed.shape), dtype=packed.dtype)
-            data[eid] = packed
-            consumed.extend((weight_name, scale_name))
-
-        assert data is not None
-        new_name = self.format_tensor_name(tensor_key, bid)
-        shape = gguf.quant_shape_from_byte_shape(data.shape, gguf.GGMLQuantizationType.MXFP4)
-        logger.info(f"{new_name}: repacked routed experts to MXFP4, shape = {{{', '.join(str(n) for n in reversed(shape))}}}")
-        self.gguf_writer.add_tensor(new_name, data, raw_dtype=gguf.GGMLQuantizationType.MXFP4)
-
-        return consumed
-
-    def _write_hash_routing_tensors(self) -> list[str]:
-        consumed: list[str] = []
-
-        for bid in range(self.hparams["num_hash_layers"]):
-            name = f"layers.{bid}.ffn.gate.tid2eid"
-            if name not in self.model_tensors:
-                raise KeyError(f"Missing hash routing tensor {name}")
-
-            data_torch = LazyTorchTensor.to_eager(self.model_tensors[name]())
-            data = data_torch.to(torch.int32).cpu().numpy()
-            new_name = self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_TID2EID, bid, ".weight")
-            logger.info(f"{new_name}: converted hash routing table to I32, shape = {{{', '.join(str(n) for n in reversed(data.shape))}}}")
-            self.gguf_writer.add_tensor(new_name, data)
-            consumed.append(name)
-
-        return consumed
-
-    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
-        if self._dsv4_mxfp4_generated:
-            return ()
-
-        consumed: list[str] = self._write_hash_routing_tensors()
-        for bid in range(self.block_count):
-            consumed.extend(self._write_mxfp4_expert_tensor(bid, "w1", gguf.MODEL_TENSOR.FFN_GATE_EXP))
-            consumed.extend(self._write_mxfp4_expert_tensor(bid, "w2", gguf.MODEL_TENSOR.FFN_DOWN_EXP))
-            consumed.extend(self._write_mxfp4_expert_tensor(bid, "w3", gguf.MODEL_TENSOR.FFN_UP_EXP))
-
-        for name in consumed:
-            del self.model_tensors[name]
-
-        self._dsv4_mxfp4_generated = True
-        return ()
-
-    def _format_dsv4_tensor_name(self, key: gguf.MODEL_TENSOR, bid: int | None, suffix: str = ".weight") -> str:
-        return self.format_tensor_name(key, bid, suffix)
-
-    def _map_dsv4_tensor_name(self, name: str, bid: int | None) -> tuple[gguf.MODEL_TENSOR, str]:
-        root_map: dict[str, tuple[gguf.MODEL_TENSOR, str]] = {
-            "embed.weight": (gguf.MODEL_TENSOR.TOKEN_EMBD, ".weight"),
-            "norm.weight": (gguf.MODEL_TENSOR.OUTPUT_NORM, ".weight"),
-            "head.weight": (gguf.MODEL_TENSOR.OUTPUT, ".weight"),
-            "hc_head_fn": (gguf.MODEL_TENSOR.HC_HEAD_FN, ".weight"),
-            "hc_head_base": (gguf.MODEL_TENSOR.HC_HEAD_BASE, ".weight"),
-            "hc_head_scale": (gguf.MODEL_TENSOR.HC_HEAD_SCALE, ".weight"),
-        }
-        if name in root_map:
-            return root_map[name]
-
-        match = re.match(r"layers\.(\d+)\.(.+)$", name)
-        if match is None:
-            raise ValueError(f"Unsupported DeepSeek-V4 tensor {name!r}")
-
-        layer = int(match.group(1))
-        if bid != layer:
-            raise ValueError(f"Tensor {name!r} parsed bid {bid} but layer name has {layer}")
-
-        layer_map: dict[str, tuple[gguf.MODEL_TENSOR, str]] = {
-            "hc_attn_fn": (gguf.MODEL_TENSOR.HC_ATTN_FN, ".weight"),
-            "hc_attn_base": (gguf.MODEL_TENSOR.HC_ATTN_BASE, ".weight"),
-            "hc_attn_scale": (gguf.MODEL_TENSOR.HC_ATTN_SCALE, ".weight"),
-            "hc_ffn_fn": (gguf.MODEL_TENSOR.HC_FFN_FN, ".weight"),
-            "hc_ffn_base": (gguf.MODEL_TENSOR.HC_FFN_BASE, ".weight"),
-            "hc_ffn_scale": (gguf.MODEL_TENSOR.HC_FFN_SCALE, ".weight"),
-            "attn.attn_sink": (gguf.MODEL_TENSOR.ATTN_SINKS, ".weight"),
-            "attn.wq_a.weight": (gguf.MODEL_TENSOR.ATTN_Q_A, ".weight"),
-            "attn.wq_b.weight": (gguf.MODEL_TENSOR.ATTN_Q_B, ".weight"),
-            "attn.q_norm.weight": (gguf.MODEL_TENSOR.ATTN_Q_A_NORM, ".weight"),
-            "attn.wkv.weight": (gguf.MODEL_TENSOR.ATTN_KV, ".weight"),
-            "attn.kv_norm.weight": (gguf.MODEL_TENSOR.ATTN_KV_NORM, ".weight"),
-            "attn.wo_a.weight": (gguf.MODEL_TENSOR.ATTN_OUT_A, ".weight"),
-            "attn.wo_b.weight": (gguf.MODEL_TENSOR.ATTN_OUT_B, ".weight"),
-            "attn.compressor.ape": (gguf.MODEL_TENSOR.ATTN_COMPRESSOR_APE, ".weight"),
-            "attn.compressor.wkv.weight": (gguf.MODEL_TENSOR.ATTN_COMPRESSOR_WKV, ".weight"),
-            "attn.compressor.wgate.weight": (gguf.MODEL_TENSOR.ATTN_COMPRESSOR_WGATE, ".weight"),
-            "attn.compressor.norm.weight": (gguf.MODEL_TENSOR.ATTN_COMPRESSOR_NORM, ".weight"),
-            "attn.indexer.wq_b.weight": (gguf.MODEL_TENSOR.INDEXER_ATTN_Q_B, ".weight"),
-            "attn.indexer.weights_proj.weight": (gguf.MODEL_TENSOR.INDEXER_PROJ, ".weight"),
-            "attn.indexer.compressor.ape": (gguf.MODEL_TENSOR.INDEXER_COMPRESSOR_APE, ".weight"),
-            "attn.indexer.compressor.wkv.weight": (gguf.MODEL_TENSOR.INDEXER_COMPRESSOR_WKV, ".weight"),
-            "attn.indexer.compressor.wgate.weight": (gguf.MODEL_TENSOR.INDEXER_COMPRESSOR_WGATE, ".weight"),
-            "attn.indexer.compressor.norm.weight": (gguf.MODEL_TENSOR.INDEXER_COMPRESSOR_NORM, ".weight"),
-            "attn_norm.weight": (gguf.MODEL_TENSOR.ATTN_NORM, ".weight"),
-            "ffn_norm.weight": (gguf.MODEL_TENSOR.FFN_NORM, ".weight"),
-            "ffn.gate.weight": (gguf.MODEL_TENSOR.FFN_GATE_INP, ".weight"),
-            "ffn.gate.bias": (gguf.MODEL_TENSOR.FFN_EXP_PROBS_B, ".bias"),
-            "ffn.gate.tid2eid": (gguf.MODEL_TENSOR.FFN_GATE_TID2EID, ".weight"),
-            "ffn.shared_experts.w1.weight": (gguf.MODEL_TENSOR.FFN_GATE_SHEXP, ".weight"),
-            "ffn.shared_experts.w2.weight": (gguf.MODEL_TENSOR.FFN_DOWN_SHEXP, ".weight"),
-            "ffn.shared_experts.w3.weight": (gguf.MODEL_TENSOR.FFN_UP_SHEXP, ".weight"),
-        }
-
-        tensor_name = match.group(2)
-        if tensor_name in layer_map:
-            return layer_map[tensor_name]
-
-        if re.match(r"ffn\.experts\.\d+\.w[123]\.(weight|scale)$", tensor_name):
-            return gguf.MODEL_TENSOR.FFN_GATE_EXP, ".weight"
-
-        raise ValueError(f"Unsupported DeepSeek-V4 tensor {name!r}")
-
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        if re.match(r"layers\.\d+\.ffn\.experts\.\d+\.w[123]\.(weight|scale)$", name):
-            return []
-
-        tensor_key, suffix = self._map_dsv4_tensor_name(name, bid)
-        if tensor_key == gguf.MODEL_TENSOR.FFN_GATE_TID2EID:
-            return []
-
-        return [(self._format_dsv4_tensor_name(tensor_key, bid, suffix), data_torch)]
-
-    def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool:
-        del new_name, bid  # unused
-
-        if name in self._dsv4_fp8_dequantized and n_dims >= 2:
-            return gguf.GGMLQuantizationType.Q8_0
-        if name in self._dsv4_f32_tensors:
-            return gguf.GGMLQuantizationType.F32
-        if name in self._dsv4_bf16_tensors and n_dims >= 2:
-            return gguf.GGMLQuantizationType.BF16
-
-        return False
-
-    def prepare_tensors(self):
-        super().prepare_tensors()
-        self._is_mxfp4 = True
-        self.ftype = gguf.LlamaFileType.MOSTLY_MXFP4_MOE
@@ -24,7 +24,7 @@ class ExaoneModel(TextModel):

        assert (hparams["activation_function"] == "silu")

-        rotary_factor = self.rope_parameters.get("partial_rotary_factor")
+        rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"], optional=True)
        rotary_factor = rotary_factor if rotary_factor is not None else 1.0
        self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))

@@ -39,7 +39,7 @@ class ExaoneModel(TextModel):
                factor = rope_params.get("factor", 8.0)
                low_freq_factor = rope_params.get("low_freq_factor", 1.0)
                high_freq_factor = rope_params.get("high_freq_factor", 4.0)
-                old_context_len = rope_params.get("original_max_position_embeddings", 8192)
+                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)

                low_freq_wavelen = old_context_len / low_freq_factor
                high_freq_wavelen = old_context_len / high_freq_factor
@@ -104,7 +104,7 @@ class Exaone4Model(TextModel):
                factor = rope_params.get("factor", 16.0)
                low_freq_factor = rope_params.get("low_freq_factor", 1.0)
                high_freq_factor = rope_params.get("high_freq_factor", 4.0)
-                old_context_len = rope_params.get("original_max_position_embeddings", 8192)
+                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)

                low_freq_wavelen = old_context_len / low_freq_factor
                high_freq_wavelen = old_context_len / high_freq_factor
@@ -693,7 +693,7 @@ class Gemma4Model(Gemma3Model):
            self.gguf_writer.add_head_count_kv(value_arr)

        # handle n_rot differently for global vs swa layers
-        partial_rotary_factor_swa = self.rope_parameters.get("partial_rotary_factor", 1.0)
+        partial_rotary_factor_swa = self.hparams.get("partial_rotary_factor", 1.0)
        n_rot_full = int(head_dim_full) # "proportional" is used, see generate_extra_tensors
        n_rot_swa = int(head_dim_swa * partial_rotary_factor_swa)
        self.gguf_writer.add_rope_dimension_count(n_rot_full)
@@ -789,16 +789,6 @@ class Gemma4UnifiedModel(Gemma4Model):
 class Gemma4AssistantModel(Gemma4Model):
    model_arch = gguf.MODEL_ARCH.GEMMA4_ASSISTANT

-    @classmethod
-    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
-        name, gen = item
-
-        if "masked_embedding" in name:
-            logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.")
-            return None
-
-        return super().filter_tensors(item)
-
    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        self.gguf_writer.add_embedding_length_out(self.hparams["backbone_hidden_size"])
@@ -832,11 +822,10 @@ class Gemma4VisionAudioModel(MmprojModel):
        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("layer_norm_eps", 1e-6))

        # audio params
-        if self.has_audio_encoder:
-            assert self.hparams_audio is not None
-            self.gguf_writer.add_clip_audio_projector_type(gguf.VisionProjectorType.GEMMA4A)
-            self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"])
-            self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams_audio.get("layer_norm_eps", 1e-6))
+        assert self.hparams_audio is not None
+        self.gguf_writer.add_clip_audio_projector_type(gguf.VisionProjectorType.GEMMA4A)
+        self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"])
+        self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams_audio.get("layer_norm_eps", 1e-6))

    def is_audio_tensor(self, name: str) -> bool:
        return "audio_tower" in name or "embed_audio" in name
@@ -124,7 +124,7 @@ class Glm4MoeModel(TextModel):
                self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
            )
        self.gguf_writer.add_rope_dimension_count(
-            int(rope_dim * self.rope_parameters.get("partial_rotary_factor", 0.5))
+            int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5))
        )

        # MoE parameters - Use only routed expert count (shared experts handled separately)
@@ -226,7 +226,7 @@ class GlmMoeDsaModel(DeepseekV2Model):
        super().set_gguf_parameters()

        rope_dim = self.hparams["qk_rope_head_dim"]
-        partial_rotary_factor = self.rope_parameters.get("partial_rotary_factor", 1.0)
+        partial_rotary_factor = self.hparams.get("partial_rotary_factor", 1.0)
        self.gguf_writer.add_rope_dimension_count(int(rope_dim * partial_rotary_factor))

        # NextN/MTP prediction layers
@@ -1,6 +1,5 @@
 from __future__ import annotations

-import re
 from typing import Any, Callable, Iterable, TYPE_CHECKING

 import torch
@@ -14,7 +13,7 @@ from .llama import LlamaModel
 from .mamba import Mamba2Model


-@ModelBase.register("GraniteForCausalLM")
+@ModelBase.register("GraniteForCausalLM", "GraniteSpeechForConditionalGeneration")
 class GraniteModel(LlamaModel):
    """Conversion for IBM's GraniteForCausalLM"""
    model_arch = gguf.MODEL_ARCH.GRANITE
@@ -47,29 +46,11 @@ class GraniteModel(LlamaModel):
            self.gguf_writer.add_logit_scale(logits_scale)
            logger.info("gguf: (granite) logits_scale = %s", logits_scale)

-        # If being used as the base for Granite4 Vision, add deepstack_layer_arr
-        if self.hparams.get("spatial_target_layers") or self.hparams.get("deepstack_layer_map"):
-            normalized_projector_map = Granite4VisionMmprojModel.get_normalized_projector_map(self.hparams)
-            deepstack_mapping_arr = [-1 for _ in range(self.block_count)] # Populate with -1 sentinels
-            for proj_idx, (_, llm_layer, _, _) in enumerate(normalized_projector_map):
-                # Skip the first projector which is handled as the base embedding
-                # stream like normal
-                if proj_idx == 0:
-                    continue
-                deepstack_mapping_arr[llm_layer] = proj_idx
-            self.gguf_writer.add_deepstack_mapping(deepstack_mapping_arr)
-
    @classmethod
    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
        name, gen = item
-        # Skip multimodal tensors
-        if (
-            name.startswith(("encoder."))
-            or "image_" in name
-            or "layerwise_projectors" in name
-            or "spatial_projectors" in name
-        ):
-            return
+        if name.startswith("encoder."):
+            return None
        return super().filter_tensors(item)


@@ -260,8 +241,7 @@ class GraniteHybridModel(Mamba2Model, GraniteMoeModel):
        assert self.d_inner % d_head == 0, f"SSM inner size {self.d_inner} not a multiple of head dim {d_head}"

    def set_vocab(self):
-        # For models with no ssm layers, don't pad for mamba2
-        self.hparams["pad_vocab_size_multiple"] = 8 if self._ssm_layers else 1
+        self.hparams["pad_vocab_size_multiple"] = 8
        Mamba2Model.set_vocab(self)


@@ -346,161 +326,3 @@ class GraniteSpeechMmprojModel(MmprojModel):
                data_torch = data_torch.squeeze(1)

        yield from super().modify_tensors(data_torch, name, bid)
-
-
-@ModelBase.register("GraniteSpeechPlusForConditionalGeneration")
-class GraniteSpeechPlusMmprojModel(GraniteSpeechMmprojModel):
-    """Conversion for GraniteSpeechPlus - extends GraniteSpeech with feature layer concatenation"""
-    has_vision_encoder = False
-    has_audio_encoder = True
-
-    def set_gguf_parameters(self):
-        assert self.hparams_audio is not None
-        super().set_gguf_parameters()
-
-        # Add feature_layer if present in encoder config
-        if feature_layers := self.hparams_audio.get("cat_hidden_layers"):
-            self.gguf_writer.add_audio_feature_layers(feature_layers)
-            logger.info(f"gguf: audio feature_layers = {feature_layers}")
-
-            # Validate projector dimension matches concatenated encoder output
-            hidden_dim = self.hparams_audio["hidden_dim"]
-            expected_dim = hidden_dim * (len(feature_layers) + 1)
-            projector_dim = self.global_config["projector_config"]["encoder_hidden_size"]
-
-            if projector_dim != expected_dim:
-                raise ValueError(
-                    f"Projector encoder_hidden_size ({projector_dim}) does not match "
-                    f"expected concatenated dimension ({expected_dim}). "
-                    f"Expected: hidden_dim ({hidden_dim}) * (len(feature_layers) + 1) = {expected_dim}"
-                )
-
-
-@ModelBase.register("Granite4VisionForConditionalGeneration")
-class Granite4VisionMmprojModel(MmprojModel):
-    has_vision_encoder = True
-    has_audio_encoder = False
-
-    @staticmethod
-    def get_normalized_projector_map(global_config: dict) -> list[tuple[int, int, str, int]]:
-        """Normalize both deepstack and spatial projector maps to the form:
-        (vision_layer, llm_layer, <type>, type_index)
-
-        This is then used to populate the following mappings:
-        - vision_feature_layers (mmproj hparam): ordered list of all
-          vision_layer values where order corresponds with the order of the
-          stacked projector tensors
-          NOTE: Values may appear multiple times for spatial projectors
-        - tensor_prefix_map (mmproj tensors): mapping from tensor prefixes to
-          the index of the corresponding projector in the stacked tensors
-        - deepstack_layer_arr (llm hparam): per-text-layer array indicating
-          which input vision feature should be injected at that layer
-          (-1 if none)
-
-        Output: (vision_layer, llm_layer, <type>, type_index)
-        """
-        deepstack_map = global_config.get("deepstack_layer_map", [])  # [[vis_layer, llm_layer], ...]
-        spatial_layers = global_config.get("spatial_target_layers", [])  # [llm_layer, ...]
-        n_text_layers = global_config["text_config"]["num_hidden_layers"]
-        n_vision_layers = global_config["vision_config"]["num_hidden_layers"]
-        normalized_projector_map = []
-        if deepstack_map:
-            for deepstack_idx, (vision_layer, llm_layer) in enumerate(sorted(deepstack_map)):
-                if vision_layer < 0:
-                    vision_layer = n_vision_layers + vision_layer
-                if llm_layer < 0:
-                    llm_layer = n_text_layers + llm_layer
-                normalized_projector_map.append((vision_layer, llm_layer, "layerwise", deepstack_idx))
-        if spatial_layers:
-            spatial_vision_layer = global_config.get("spatial_vision_layer", -1)
-            if spatial_vision_layer < 0:
-                spatial_vision_layer = n_vision_layers + spatial_vision_layer
-            for spatial_idx, llm_layer in enumerate(spatial_layers):
-                normalized_projector_map.append((spatial_vision_layer, llm_layer, "spatial", spatial_idx))
-        return list(sorted(normalized_projector_map, key=(lambda entry: entry[1])))
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        normalized_projector_map = self.get_normalized_projector_map(self.global_config)
-        self._n_proj = len(normalized_projector_map)
-
-        self._tensor_prefix_map = {
-            f"model.{proj_type}_projectors.{type_idx}": proj_idx
-            for proj_idx, (_, _, proj_type, type_idx) in enumerate(normalized_projector_map)
-        }
-        self._vision_feature_layers = [vision_layer for vision_layer, _, _, _ in normalized_projector_map]
-        self._spatial_offsets = [
-            type_idx if proj_type == "spatial" else -1
-            for _, _, proj_type, type_idx in normalized_projector_map
-        ]
-
-    def set_gguf_parameters(self):
-        assert self.hparams_vision is not None
-        super().set_gguf_parameters()
-
-        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GRANITE4_VISION)
-
-        # SigLIP encoder hparams
-        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6))
-        self.gguf_writer.add_vision_use_gelu(True)
-
-        # Preprocessor
-        self.gguf_writer.add_vision_preproc_image_size(self.hparams.get("image_size", 384))
-
-        # QFormer projector config
-        ds_rate = self.global_config["downsample_rate"]
-        ds_parts = ds_rate.split("/")
-        assert len(ds_parts) == 2, f"Invalid 'downsample_rate' value: {ds_rate}"
-        query_side, window_side = [int(p) for p in ds_parts]
-        self.gguf_writer.add_vision_projector_query_side(query_side)
-        self.gguf_writer.add_vision_projector_window_side(window_side)
-
-        # Set vision feature layers
-        self.gguf_writer.add_vision_feature_layers(self._vision_feature_layers)
-
-        # Set the spatial offests per projector
-        self.gguf_writer.add_vision_spatial_offsets(self._spatial_offsets)
-
-        # Add flattened image grind pinpoints (resolution candidates internally)
-        if pinpoints := self.global_config.get("image_grid_pinpoints"):
-            # Flatten with h, w -> w, h inversion
-            pinpoints = [val for h, w in pinpoints for val in (w, h)]
-            self.gguf_writer.add_vision_image_grid_pinpoints(pinpoints)
-
-    @classmethod
-    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
-        name, _ = item
-        if ("vision_model.head" in name or name.startswith("lm_head")):
-            return None
-        return super().filter_tensors(item)
-
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-
-        # Detect projector tensors and bin them
-        projector_idx = None
-        for prefix, proj_idx in self._tensor_prefix_map.items():
-            if name.startswith(prefix):
-                projector_idx = proj_idx
-                break
-        if projector_idx is not None:
-            # If this projector tensor has a block id within the projector,
-            # alias the bid to projector_idx
-            #
-            # TODO: currently, none of the Granite 4 Vision models have
-            # projectors with multiple QFormer layers, so the `layer.{}` index
-            # is always 0. This allows us to simply map to a single `bid` that
-            # matches the projector index. If this changes, we'll need a
-            # convention that merges the two IDs.
-            id_matches = list(re.finditer(r"\.([0-9]+)\.", name))
-            all_ids = [int(m.group(1)) for m in id_matches]
-            assert len(all_ids) >= 1 and len(all_ids) <= 2, "Must have at least 1 and at most 2 ids in tensor names"
-            # If not layer id, just use the projector index
-            new_bid = projector_idx
-            if len(all_ids) == 1:
-                new_name = name[:id_matches[0].span(1)[0]] + str(new_bid) + name[id_matches[0].span(1)[1]:]
-            else: # len(all_ids) == 2
-                new_bid = projector_idx # + all_ids[1]
-                new_name = name[:id_matches[0].span(0)[0]] + name[id_matches[0].span(1)[1]:id_matches[1].span(1)[0]] + str(new_bid) + name[id_matches[1].span(1)[1]:]
-            yield from super().modify_tensors(data_torch, new_name, new_bid)
-            return
-        yield from super().modify_tensors(data_torch, name, bid)
@@ -64,17 +64,11 @@ class LFM2Model(TextModel):
        yield from super().modify_tensors(data_torch, name, bid)


-@ModelBase.register("Lfm2Model", "Lfm2BidirectionalModel")
+@ModelBase.register("Lfm2Model")
 class LFM2ColBertModel(LFM2Model):
    model_arch = gguf.MODEL_ARCH.LFM2
    dense_tensor_name = "dense_2"

-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-        if self.hf_arch == "Lfm2BidirectionalModel":
-            self.gguf_writer.add_causal_attention(False)
-        self._try_set_pooling_type()
-
    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        if not name.startswith(self.dense_tensor_name):
            name = "model." + name
@@ -82,11 +76,10 @@ class LFM2ColBertModel(LFM2Model):
        yield from super().modify_tensors(data_torch, name, bid)

    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
-        # optional dense tensor is stored in a separate safetensors file
+        # dense tensor is stored in a separate safetensors file
        from safetensors.torch import load_file
        tensors_file = self.dir_model / "1_Dense" / "model.safetensors"
-        if not tensors_file.is_file():
-            return
+        assert tensors_file.is_file()
        tensor = load_file(tensors_file)["linear.weight"]
        self.gguf_writer.add_embedding_length_out(tensor.shape[0])
        yield f"{self.dense_tensor_name}.weight", tensor.clone()
@@ -5,13 +5,12 @@ import math

 from typing import Callable, Iterable, TYPE_CHECKING

-import numpy as np
 import torch

 if TYPE_CHECKING:
    from torch import Tensor

-from .base import ModelBase, TextModel, gguf, logger
+from .base import ModelBase, TextModel, gguf


@ModelBase.register(
@@ -22,10 +21,6 @@ from .base import ModelBase, TextModel, gguf, logger
    "VLlama3ForCausalLM",
    "LlavaForConditionalGeneration",
    "VoxtralForConditionalGeneration",
-    "LlamaForCausalLMEagle3",
-    "Eagle3LlamaForCausalLM",
-    "Eagle3Speculator",
-    "Eagle3DraftModel",
    "IQuestCoderForCausalLM",
    "LlamaModel")
 class LlamaModel(TextModel):
@@ -44,61 +39,7 @@ class LlamaModel(TextModel):
            hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False)
            self.origin_hf_arch = hparams.get('architectures', [None])[0]

-        # Detect eagle3 draft checkpoint by hparams (some models don't use a distinct HF arch name)
-        if "draft_vocab_size" in self.hparams and self.hparams["num_hidden_layers"] == 1:
-            self.is_eagle3 = True
-            self.model_arch = gguf.MODEL_ARCH.EAGLE3
-            logger.info("Detected EAGLE-3 draft model, switching to EAGLE3 architecture")
-            # Re-initialize tensor_map with eagle3 architecture
-            self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
-            # Update gguf_writer architecture
-            self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch]
-            self.gguf_writer.add_architecture()
-            if self.target_model_dir is None:
-                raise ValueError(
-                    "EAGLE-3 model requires --target-model-dir to be specified. "
-                    "Please provide the path to the target model directory to read config.json"
-                )
-            # Read both eagle3 raw config and target model config
-            with open(self.dir_model / "config.json", 'r', encoding='utf-8') as f:
-                eagle3_raw_config = json.load(f)
-            with open(self.target_model_dir / "config.json", 'r', encoding='utf-8') as f:
-                target_config = json.load(f)
-
-            if "text_config" in target_config:
-                target_config = {**target_config, **target_config["text_config"]}
-            self.target_vocab_size = target_config["vocab_size"]
-
-            # target_layers: derived from target model layer count (low/mid/high)
-            target_num_layers = target_config["num_hidden_layers"]
-            target_layers = [2, target_num_layers // 2, target_num_layers - 3]
-            logger.info(f"EAGLE-3: target_layers = {target_layers} (target model has {target_num_layers} layers)")
-            self.gguf_writer.add_target_layers(target_layers)
-
-            # target_hidden_size: prefer eagle3 config, fallback to target config
-            if eagle3_raw_config.get("target_hidden_size") is not None:
-                target_hidden_size = eagle3_raw_config["target_hidden_size"]
-                src = "EAGLE-3 config"
-            else:
-                target_hidden_size = target_config["hidden_size"]
-                src = "target model config"
-            logger.info(f"EAGLE-3: target_hidden_size = {target_hidden_size} (from {src})")
-            self.gguf_writer.add_target_hidden_size(target_hidden_size)
-
-            # norm_before_residual (RedHat-style eagle3 specific)
-            norm_before_residual = eagle3_raw_config.get("norm_before_residual", False)
-            logger.info(f"EAGLE-3: norm_before_residual = {norm_before_residual}")
-            self.gguf_writer.add_norm_before_residual(norm_before_residual)
-
    def set_vocab(self):
-        # eagle3: use tokenizer from target model if provided
-        original_dir_model = None
-        if getattr(self, 'is_eagle3', False):
-            assert self.target_model_dir is not None
-            logger.info(f"EAGLE-3: Using tokenizer from target model: {self.target_model_dir}")
-            original_dir_model = self.dir_model
-            self.dir_model = self.target_model_dir
-
        if self.origin_hf_arch == "GlmasrModel":
            return self._set_vocab_glmedge()

@@ -144,10 +85,6 @@ class LlamaModel(TextModel):
        if self.hparams.get("vocab_size", 32000) == 49152:
            self.gguf_writer.add_add_bos_token(False)

-        # eagle3: Restore original dir_model
-        if original_dir_model is not None:
-            self.dir_model = original_dir_model
-
    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        hparams = self.hparams
@@ -192,49 +129,7 @@ class LlamaModel(TextModel):

        return super().filter_tensors((name, gen))

-    def index_tensors(self, remote_hf_model_id: str | None = None) -> dict[str, Callable[[], Tensor]]:
-        tensors = super().index_tensors(remote_hf_model_id)
-
-        # Handle Eagle3Speculator nested config
-        if "transformer_layer_config" in self.hparams:
-            self.hparams = {**self.hparams, **self.hparams["transformer_layer_config"]}
-
-        # eagle3 detection
-        if "draft_vocab_size" in self.hparams and self.hparams["num_hidden_layers"] == 1:
-            logger.info("EAGLE-3: renaming midlayer.* / layers.0.* to model.layers.0.*")
-            new_tensors = {}
-            for name, gen in tensors.items():
-                if name.startswith("midlayer."):
-                    new_name = "model.layers.0." + name[len("midlayer."):]
-                    new_tensors[new_name] = gen
-                elif name.startswith("layers.0."):  # Eagle3Speculator format
-                    new_name = "model." + name
-                    new_tensors[new_name] = gen
-                else:
-                    new_tensors[name] = gen
-            return new_tensors
-
-        return tensors
-
    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        # eagle3: special tensors that bypass standard llama mapping
-        if getattr(self, 'is_eagle3', False):
-            if name == "fc.weight":
-                yield (name, data_torch)
-                return
-            if name == "d2t":
-                # store for manual int64 handling in prepare_tensors (avoid F32 conversion)
-                if not hasattr(self, '_eagle3_int_tensors'):
-                    self._eagle3_int_tensors = {}
-                self._eagle3_int_tensors[name] = data_torch
-                return
-            if name == "t2d":
-                # not used at runtime, skip
-                return
-            if name.endswith(".hidden_norm.weight"):
-                yield (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_NORM_2, bid), data_torch)
-                return
-
        n_head = self.find_hparam(["n_heads", "num_attention_heads"])
        n_kv_head = self.find_hparam(["n_kv_heads", "num_key_value_heads"])

@@ -290,7 +185,7 @@ class LlamaModel(TextModel):
                factor = rope_params.get("factor", 8.0)
                low_freq_factor = rope_params.get("low_freq_factor", 1.0)
                high_freq_factor = rope_params.get("high_freq_factor", 4.0)
-                old_context_len = rope_params.get("original_max_position_embeddings", 8192)
+                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)

                low_freq_wavelen = old_context_len / low_freq_factor
                high_freq_wavelen = old_context_len / high_freq_factor
@@ -310,33 +205,8 @@ class LlamaModel(TextModel):
                yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))

    def prepare_tensors(self):
-        # eagle3: collect d2t original dtype before parent converts tensors to F32
-        eagle3_original_dtypes = {}
-        if getattr(self, 'is_eagle3', False):
-            for name, data_torch in self.get_tensors():
-                if name == "d2t":
-                    eagle3_original_dtypes[name] = data_torch.dtype
-
        super().prepare_tensors()

-        # eagle3: write d2t as absolute target token ids
-        if getattr(self, 'is_eagle3', False) and hasattr(self, '_eagle3_int_tensors'):
-            for name, data_torch in self._eagle3_int_tensors.items():
-                old_dtype = eagle3_original_dtypes.get(name, data_torch.dtype)
-                data = data_torch.to(torch.int64).cpu().numpy()
-                if name == "d2t":
-                    data = data.reshape(-1)
-                    data = data + np.arange(data.size, dtype=np.int64)
-                    if np.any((data < 0) | (data >= self.target_vocab_size)):
-                        raise ValueError(f"EAGLE-3 d2t target ids out of range for target vocab size {self.target_vocab_size}")
-                    if np.unique(data).size != data.size:
-                        raise ValueError("EAGLE-3 d2t contains duplicate target ids")
-                data_qtype = gguf.GGMLQuantizationType.I64
-
-                shape_str = f"{{{', '.join(str(n) for n in reversed(data.shape))}}}"
-                logger.info(f"{name + ',':<30} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")
-                self.gguf_writer.add_tensor(name, data, raw_dtype=data_qtype)
-
        if self._experts is not None:
            # flatten `list[dict[str, Tensor]]` into `list[str]`
            experts = [k for d in self._experts for k in d.keys()]
@@ -114,8 +114,7 @@ class Mamba2Model(TextModel):
            hparams["text_config"] = hparams["llm_config"]
        super().__init__(dir_model, *args, hparams=hparams, **kwargs)
        self.d_model = self.find_hparam(["hidden_size", "d_model", "dim"])
-        self.expand = self.find_hparam(["mamba_expand", "expand"], optional=True) or 2
-        self.d_inner = self.find_hparam(["mamba_d_ssm", "intermediate_size", "d_inner"], optional=True) or self.expand * self.d_model
+        self.d_inner = self.find_hparam(["mamba_d_ssm", "intermediate_size", "d_inner"], optional=True) or 2 * self.d_model
        self.n_group = self.find_hparam(["n_groups"], optional=True) or 1

    def set_vocab(self):
@@ -145,9 +144,11 @@ class Mamba2Model(TextModel):

        rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5

+        # Fail early for models which don't have a block expansion factor of 2
+        # TODO: does this really matter?
        # skip the assertion for FalconH1 Model
        if self.model_arch != gguf.MODEL_ARCH.FALCON_H1:
-            assert self.d_inner == self.expand * self.d_model
+            assert self.d_inner == 2 * self.d_model
            assert self.d_inner % head_dim == 0

        self.gguf_writer.add_context_length(2**20)  # arbitrary value; for those who use the default
@@ -154,7 +154,7 @@ class MimoV2Model(TextModel):
        self.gguf_writer.add_expert_count(self.hparams["n_routed_experts"])
        self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"])

-        rope_dim = int(self.hparams["head_dim"] * self.rope_parameters["partial_rotary_factor"])
+        rope_dim = int(self.hparams["head_dim"] * self.hparams["partial_rotary_factor"])
        self.gguf_writer.add_rope_dimension_count(rope_dim)

        self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("layernorm_epsilon", 1e-5))
@@ -32,9 +32,11 @@ class MiniCPMModel(TextModel):
    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
        rope_dims = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]

-        long_factors = self.rope_parameters.get('long_factor')
-        short_factors = self.rope_parameters.get('short_factor')
-        if long_factors or short_factors:
+        rope_scaling = self.find_hparam(['rope_scaling'], True)
+        if rope_scaling is not None:
+            long_factors = rope_scaling.get('long_factor', None)
+            short_factors = rope_scaling.get('short_factor', None)
+
            if long_factors is None or short_factors is None:
                raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')

@@ -83,11 +85,13 @@ class MiniCPM3Model(TextModel):
        self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])

    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
-        long_factors = self.rope_parameters.get('long_factor')
-        short_factors = self.rope_parameters.get('short_factor')
-        if long_factors or short_factors:
+        rope_scaling = self.find_hparam(['rope_scaling'], True)
+        if rope_scaling is not None:
            rope_dims = self.hparams["qk_rope_head_dim"]

+            long_factors = rope_scaling.get('long_factor', None)
+            short_factors = rope_scaling.get('short_factor', None)
+
            if long_factors is None or short_factors is None:
                raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')

@@ -105,9 +105,8 @@ class MistralModel(LlamaModel):
            gguf_writer.add_rope_scaling_yarn_log_mul(mscale_all_dim)
            gguf_writer.add_rope_scaling_orig_ctx_len(yarn_params["original_max_position_embeddings"])

-        llama_4_scaling = hparams.get("llama_4_scaling")
-        if llama_4_scaling is not None:
-            gguf_writer.add_attn_temperature_scale(llama_4_scaling["beta"])
+        if "llama_4_scaling" in hparams:
+            gguf_writer.add_attn_temperature_scale(hparams["llama_4_scaling"]["beta"])


 class MistralMoeModel(DeepseekV2Model):
@@ -125,18 +125,17 @@ class NemotronModel(TextModel):
        self.gguf_writer.add_layer_norm_eps(f_norm_eps)

        # * Partial RoPE
-        rot_pct = self.rope_parameters["partial_rotary_factor"]
+        rot_pct = self.find_hparam(["partial_rotary_factor", "rope_pct", "rope_percent"])
        n_embd = self.find_hparam(["hidden_size", "n_embd"])
        n_head = self.find_hparam(["num_attention_heads", "n_head"])
        self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)

        # * RopeScaling for Nemotron
-        factor = self.hparams.get("factor") or self.rope_parameters.get("factor")
-        if factor is None:
+        if "rope_scaling" not in self.hparams or self.hparams["rope_scaling"] is None:
            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
        else:
            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
-            self.gguf_writer.add_rope_scaling_factor(factor)
+            self.gguf_writer.add_rope_scaling_factor(self.hparams["factor"])

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        # * Adding +1 to LayerNorm's weights here to implement layernorm1p w/o changing anything on the GGML engine side
@@ -18,7 +18,7 @@ class Phi2Model(TextModel):
    model_arch = gguf.MODEL_ARCH.PHI2

    def set_gguf_parameters(self):
-        rot_pct = self.rope_parameters["partial_rotary_factor"]
+        rot_pct = self.find_hparam(["partial_rotary_factor"])
        n_embd = self.find_hparam(["hidden_size", "n_embd"])
        n_head = self.find_hparam(["num_attention_heads", "n_head"])

@@ -149,8 +149,8 @@ class Phi3MiniModel(TextModel):
        n_head_kv = self.find_hparam(["num_key_value_heads", "n_head_kv"])
        rms_eps = self.find_hparam(["rms_norm_eps"])
        max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
-        orig_max_pos_embds = self.rope_parameters["original_max_position_embeddings"]
-        rot_pct = self.rope_parameters.get("partial_rotary_factor", 1.0)
+        orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
+        rot_pct = self.hparams.get("partial_rotary_factor", 1.0)
        rope_dims = int(rot_pct * n_embd) // n_head

        self.gguf_writer.add_context_length(max_pos_embds)
@@ -174,19 +174,18 @@ class Phi3MiniModel(TextModel):
        n_embd = self.find_hparam(["hidden_size", "n_embd"])
        n_head = self.find_hparam(["num_attention_heads", "n_head"])
        max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
-        orig_max_pos_embds = self.rope_parameters["original_max_position_embeddings"]
-        rot_pct = self.rope_parameters.get("partial_rotary_factor", 1.0)
+        orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
+        rot_pct = self.hparams.get("partial_rotary_factor", 1.0)
        rope_dims = int(rot_pct * n_embd) // n_head

        # write rope scaling for long context (128k) model
-        long_factors = self.rope_parameters.get('long_factor')
-        short_factors = self.rope_parameters.get('short_factor')
-        if not long_factors:
+        rope_scaling = self.find_hparam(['rope_scaling'], True)
+        if rope_scaling is None:
            return

        scale = max_pos_embds / orig_max_pos_embds

-        rope_scaling_type = self.rope_parameters.get('rope_type', '').lower()
+        rope_scaling_type = rope_scaling.get('rope_type', rope_scaling.get('type', '')).lower()
        if len(rope_scaling_type) == 0:
            raise KeyError('Missing the required key rope_scaling.type')

@@ -199,6 +198,9 @@ class Phi3MiniModel(TextModel):

        self.gguf_writer.add_rope_scaling_attn_factors(attn_factor)

+        long_factors = rope_scaling.get('long_factor', None)
+        short_factors = rope_scaling.get('short_factor', None)
+
        if long_factors is None or short_factors is None:
            raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')

@@ -280,7 +280,7 @@ class Qwen3NextModel(Qwen2MoeModel):
        self.gguf_writer.add_full_attention_interval(self.hparams.get("full_attention_interval", 4))
        if (rope_dim := self.hparams.get("head_dim")) is None:
            rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
-        self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.rope_parameters.get("partial_rotary_factor", 0.25)))
+        self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.25)))

    @classmethod
    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
@@ -625,51 +625,3 @@ class Qwen3_5TextModel(_Qwen35MtpMixin, _Qwen35MRopeMixin, _LinearAttentionVReor
@ModelBase.register("Qwen3_5MoeForConditionalGeneration", "Qwen3_5MoeForCausalLM")
 class Qwen3_5MoeTextModel(_Qwen35MtpMixin, _Qwen35MRopeMixin, _LinearAttentionVReorderBase):
    model_arch = gguf.MODEL_ARCH.QWEN35MOE
-
-
-@ModelBase.register("DFlashDraftModel")
-class DFlashModel(Qwen3Model):
-    model_arch = gguf.MODEL_ARCH.DFLASH
-
-    def set_vocab(self):
-        if self.target_model_dir is None:
-            raise ValueError(
-                "DFlash draft model requires --target-model-dir to be specified. "
-                "Please provide the path to the target model directory containing the tokenizer."
-            )
-        logger.info(f"DFlash: Using tokenizer from target model: {self.target_model_dir}")
-        original_dir = self.dir_model
-        self.dir_model = self.target_model_dir
-        super().set_vocab()
-        self.dir_model = original_dir
-
-        mask_token_id = self.hparams.get("dflash_config", {}).get("mask_token_id")
-        if mask_token_id is not None:
-            self.gguf_writer.add_mask_token_id(mask_token_id)
-
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-
-        block_size = self.hparams.get("block_size", 16)
-        self.gguf_writer.add_block_size(block_size)
-        dflash_config = self.hparams.get("dflash_config", {})
-
-        target_layer_ids = dflash_config.get("target_layer_ids", [])
-        if target_layer_ids:
-            extract_layer_ids = [i + 1 for i in target_layer_ids]
-            self.gguf_writer.add_target_layers(extract_layer_ids)
-
-        use_sliding_window = self.hparams.get("use_sliding_window", False)
-        sliding_window = self.hparams.get("sliding_window")
-        layer_types = self.hparams.get("layer_types")
-        if use_sliding_window and sliding_window and layer_types:
-            is_swa = [lt == "sliding_attention" for lt in layer_types]
-            self.gguf_writer.add_sliding_window(sliding_window)
-            self.gguf_writer.add_sliding_window_pattern(is_swa)
-
-    @classmethod
-    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
-        name, gen = item
-        if not name.startswith("model."):
-            name = "model." + name
-        return super().filter_tensors((name, gen))
@@ -28,7 +28,7 @@ class StableLMModel(TextModel):
        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
        self.gguf_writer.add_block_count(self.block_count)
        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
-        rotary_factor = self.rope_parameters["partial_rotary_factor"]
+        rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"])
        self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
        self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"])
@@ -314,7 +314,7 @@ class Step35Model(TextModel):
        factor = float(rope_params.get("factor", 8.0))
        low_freq_factor = float(rope_params.get("low_freq_factor", 1.0))
        high_freq_factor = float(rope_params.get("high_freq_factor", 4.0))
-        old_context_len = int(rope_params.get("original_max_position_embeddings", 8192))
+        old_context_len = int(rope_params.get("original_max_position_embeddings", self.hparams.get("original_max_position_embeddings", 8192)))

        low_freq_wavelen = old_context_len / low_freq_factor
        high_freq_wavelen = old_context_len / high_freq_factor
@@ -153,15 +153,6 @@ def parse_args() -> argparse.Namespace:
        help="Store tensors dequantized from FP8 as Q8_0 instead of BF16/F16.",
    )

-    parser.add_argument(
-        "--target-model-dir", type=str, default=None,
-        help=(
-            "path to the target model directory; required when converting a standalone draft model "
-            "(e.g. EAGLE3 / DFlash) that needs target-model metadata such as tokenizer, hidden size, and "
-            "layer count to populate its GGUF."
-        ),
-    )
-
    args = parser.parse_args()
    if not args.print_supported_models and args.model is None:
        parser.error("the following arguments are required: model")
@@ -247,7 +238,7 @@ def main() -> None:
            assert hparams.get("vision_encoder") is not None, "This model does not support multimodal"
            from conversion.pixtral import PixtralModel
            model_class = PixtralModel
-        elif hparams.get("moe") is not None:
+        elif "moe" in hparams:
            from conversion.mistral import MistralMoeModel
            model_class = MistralMoeModel
        else:
@@ -278,7 +269,6 @@ def main() -> None:
                                     small_first_shard=args.no_tensor_first_split,
                                     remote_hf_model_id=hf_repo_id, disable_mistral_community_chat_template=disable_mistral_community_chat_template,
                                     sentence_transformers_dense_modules=args.sentence_transformers_dense_modules,
-                                     target_model_dir=Path(args.target_model_dir) if args.target_model_dir else None,
                                     fuse_gate_up_exps=args.fuse_gate_up_exps,
                                     fp8_as_q8=args.fp8_as_q8,
                                     )
@@ -100,7 +100,6 @@ models = [
    {"name": "refact",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
    {"name": "command-r",        "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
    {"name": "tiny_aya",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereLabs/tiny-aya-base", },
-    {"name": "cohere2moe",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereLabs/North-Mini-Code-1.0", },
    {"name": "qwen2",            "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
    {"name": "olmo",             "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
    {"name": "dbrx",             "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
@@ -25,7 +25,7 @@ import gguf
 from gguf.constants import GGUFValueType

 # reuse model definitions from the conversion/ package
-from conversion import LazyTorchTensor, ModelBase, get_model_class, ModelType, get_model_architecture
+from conversion import LazyTorchTensor, ModelBase, get_model_class

 logger = logging.getLogger("lora-to-gguf")

@@ -311,10 +311,6 @@ def parse_args() -> argparse.Namespace:
        "--base-model-id", type=str,
        help="the model ID of the base model, if it is not available locally or in the adapter config. If specified, it will ignore --base and load the base model config from the Hugging Face hub (Example: 'meta-llama/Llama-3.2-1B-Instruct')",
    )
-    parser.add_argument(
-        "--trust-remote-code", default=False, action="store_true",
-        help="trust remote code in the model",
-    )
    parser.add_argument(
        "lora_path", type=Path,
        help="directory containing Hugging Face PEFT LoRA config (adapter_model.json) and weights (adapter_model.safetensors or adapter_model.bin)",
@@ -323,11 +319,11 @@ def parse_args() -> argparse.Namespace:
    return parser.parse_args()


-def load_hparams_from_hf(hf_model_id: str, trust_remote_code: bool) -> tuple[dict[str, Any], Path | None]:
+def load_hparams_from_hf(hf_model_id: str) -> tuple[dict[str, Any], Path | None]:
    from huggingface_hub import try_to_load_from_cache

    # normally, adapter does not come with base model config, we need to load it from AutoConfig
-    config = AutoConfig.from_pretrained(hf_model_id, trust_remote_code=trust_remote_code)
+    config = AutoConfig.from_pretrained(hf_model_id)
    cache_dir = try_to_load_from_cache(hf_model_id, "config.json")
    cache_dir = Path(cache_dir).parent if isinstance(cache_dir, str) else None

@@ -376,13 +372,13 @@ if __name__ == '__main__':
    # load base model
    if base_model_id is not None:
        logger.info(f"Loading base model from Hugging Face: {base_model_id}")
-        hparams, dir_base_model = load_hparams_from_hf(base_model_id, args.trust_remote_code)
+        hparams, dir_base_model = load_hparams_from_hf(base_model_id)
    elif dir_base_model is None:
        if "base_model_name_or_path" in lparams:
            model_id = lparams["base_model_name_or_path"]
            logger.info(f"Loading base model from Hugging Face: {model_id}")
            try:
-                hparams, dir_base_model = load_hparams_from_hf(model_id, args.trust_remote_code)
+                hparams, dir_base_model = load_hparams_from_hf(model_id)
            except OSError as e:
                logger.error(f"Failed to load base model config: {e}")
                logger.error("Please try downloading the base model and add its path to --base")
@@ -396,12 +392,10 @@ if __name__ == '__main__':
        hparams = ModelBase.load_hparams(dir_base_model, False)

    with torch.inference_mode():
-        model_arch = get_model_architecture(hparams, ModelType.TEXT)
        try:
-            model_class = get_model_class(model_arch)
-            logger.info("Using model architecture: %s", model_arch)
+            model_class = get_model_class(hparams["architectures"][0])
        except NotImplementedError:
-            logger.error(f"Model {model_arch} is not supported")
+            logger.error(f"Model {hparams['architectures'][0]} is not supported")
            sys.exit(1)

        class LoraModel(model_class):  # ty: ignore[unsupported-base]
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Georgi Gerganov	37c56c245e	wip	2026-06-06 16:30:41 +03:00
Georgi Gerganov	1c4a91c0f3	wip	2026-06-06 10:48:36 +03:00
Georgi Gerganov	65eef9549c	Merge branch 'master' into pr/23398	2026-06-05 17:47:19 +03:00
Georgi Gerganov	f0438b1b15	cont : avoid computations on the CPU	2026-06-05 14:39:03 +03:00
Georgi Gerganov	d78a3864f0	cont : adjust to hparams changes	2026-06-05 14:38:41 +03:00
Georgi Gerganov	5954f196ed	Merge branch 'master' into pr/23398	2026-06-05 14:02:53 +03:00
Aman Gupta	4eaa3cee66	add unified assistant	2026-06-05 14:59:44 +08:00
Aman Gupta	dd97604fc4	move assistant to separate file	2026-06-04 18:56:48 +08:00
Aman Gupta	c0da00af04	add exception in test-llama-archs	2026-06-04 18:54:12 +08:00
Aman Gupta	777af6af54	add temp hack to not use fit with gemma4, rm later	2026-06-04 18:54:12 +08:00
Aman Gupta	27461cd888	add Q rot when cache is quantized	2026-06-04 18:54:12 +08:00
Aman Gupta	7b87cd3598	add assert that draft + shared kv should be on same device	2026-06-04 18:54:12 +08:00
Aman Gupta	9af0434d8c	fix multi-seq	2026-06-04 18:54:12 +08:00
Aman Gupta	f268966d49	llama: Gemma 4 MTP	2026-06-04 18:51:14 +08:00